In [None]:
import nltk
import spacy
import string
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from wordcloud import WordCloud

In [None]:
# Download required NLTK data, one package at a time, in the order listed.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'punkt_tab',
    'stopwords',
)
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

In [None]:
from nltk.corpus import stopwords

In [None]:
# spaCy pipeline kept at module level as `nlp`; it is the handle intended for
# Named Entity Recognition (NER).
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

def text_eda(text, top_n=20, top_bigrams=5, include_entities=False):
    """Performs exploratory data analysis on a given text.

    Prints the word and sentence counts and the average word length, lists and
    plots the most frequent words, renders a word cloud, and lists the most
    frequent bigrams. "Words" are the lower-cased tokens that are alphanumeric
    and are not English stopwords; sentences are counted on the original text.

    Args:
        text: The raw text to analyse.
        top_n: How many of the most frequent words to print and plot.
        top_bigrams: How many of the most frequent bigrams to print.
        include_entities: If True, also run the module-level spaCy pipeline
            ``nlp`` over ``text`` and print every named entity with its label.
            Disabled by default, so a plain ``text_eda(text)`` call produces
            the same sections as before.

    Returns:
        None. All results are printed or shown as matplotlib figures.
    """

    # Tokenization
    tokens = word_tokenize(text.lower())
    sentences = sent_tokenize(text)

    # Remove punctuation and stopwords
    stop_words = set(stopwords.words('english'))
    words = [token for token in tokens if token.isalnum() and token not in stop_words]

    # Basic statistics
    total_words = len(words)
    total_sentences = len(sentences)
    # Guard the division: empty or stopword-only text leaves no words at all.
    avg_word_length = sum(len(word) for word in words) / total_words if total_words else 0.0

    print(f"Total Words: {total_words}")
    print(f"Total Sentences: {total_sentences}")
    print(f"Average Word Length: {avg_word_length:.2f}")

    if words:
        # Word Frequency Distribution
        word_freq = Counter(words)
        common_words = word_freq.most_common(top_n)
        print("\nMost Common Words:")
        for word, freq in common_words:
            print(f"{word}: {freq}")

        # Plot Word Frequency
        labels, counts = zip(*common_words)
        plt.figure(figsize=(10, 5))
        plt.bar(labels, counts)
        plt.xlabel("Words")
        plt.ylabel("Frequency")
        plt.xticks(rotation=90, ha='right')
        # Report the number of bars actually drawn; it can be smaller than
        # top_n when the text has fewer distinct words.
        plt.title(f"Top {len(common_words)} Most Common Words")
        plt.show()

        # Generate Word Cloud
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.title("Word Cloud")
        plt.show()

        # Bigram Analysis
        bigram_freq = Counter(ngrams(words, 2)).most_common(top_bigrams)
        print("\nMost Common Bigrams:")
        for bigram, freq in bigram_freq:
            print(f"{' '.join(bigram)}: {freq}")
    else:
        # With no words, unpacking the empty frequency list would call
        # plt.bar() without arguments and the word cloud has nothing to draw,
        # so skip these sections instead of raising.
        print("\nNo words left after filtering; skipping frequency plot, word cloud and bigrams.")

    if include_entities:
        # Named Entity Recognition (NER)
        doc = nlp(text)
        print("\nNamed Entities:")
        for ent in doc.ents:
            print(f"{ent.text} ({ent.label_})")

In [None]:
import pandas as pd
import numpy as np

# Read the ECB speeches CSV from the working directory.
# NOTE(review): presumably one paragraph per row — confirm against the file.
ECB_SPEECHES_CSV = 'all_ECB_speeches_para.csv'
df = pd.read_csv(ECB_SPEECHES_CSV)


In [None]:
# Build one corpus string from the 'contents_para' column. Non-string cells
# (presumably NaN for missing paragraphs) are skipped. Every kept paragraph is
# followed by a single space, so a non-empty corpus ends with a trailing space.
text = ''.join(
    f'{paragraph} '
    for paragraph in df['contents_para']
    if isinstance(paragraph, str)
)

# Show the size and a short preview only: echoing the whole corpus would
# flood the cell output and bloat the saved notebook.
print(f"Corpus length: {len(text):,} characters")
text[:500]

In [None]:
# spaCy rejects texts longer than `nlp.max_length`. Size the limit from the
# corpus itself rather than a hard-coded character count, so this cell keeps
# working when the input CSV changes; never lower a limit that is already larger.
nlp.max_length = max(nlp.max_length, len(text))
text_eda(text)