# Loading libraries

##### Import pandas for data manipulation and Counter from collections for counting elements.

In [None]:
import pandas as pd
from collections import Counter

# Read the CSV file
##### Load the data from 'simpsons_script_lines.zip', a zipped CSV file located in the working directory
##### Pass `compression='zip'` so pandas decompresses the archive while reading it
##### Display the first two rows using head(2) to get a glimpse of the data


In [None]:
# Load the zipped script-lines CSV into a DataFrame, then preview the first two rows
data = pd.read_csv(
    'simpsons_script_lines.zip',
    compression='zip',
)
data.head(2)

In [None]:
# Extract the raw text column and build a word-frequency table for the whole script.
# Missing entries are dropped and the remainder coerced to str, so that .split()
# below cannot fail with AttributeError on NaN or other non-string cells.
raw_text = data["raw_text"].dropna().astype(str)

# Tokenize on whitespace only (punctuation stays attached to the words)
words = [word for text in raw_text for word in text.split()]

# Use Counter to calculate the frequency of each word
word_freq = Counter(words)

# Print the 10 most common words
print(word_freq.most_common(10))

## Create a WordCloud visualization

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Expects `word_freq` (a word -> count mapping) to have been built by an earlier cell.
# Canvas is 800x400 pixels on a white background.
wordcloud = WordCloud(width=800, height=400, background_color='white')

# Size each word according to its count in `word_freq`
wordcloud.generate_from_frequencies(word_freq)

# Draw the cloud on an explicit figure/axes pair with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')

# Render the figure explicitly so the cell does not end on axis('off'),
# whose return value (the axis limits) would otherwise be echoed as output.
plt.show()

##### Analyze how often each character appears (number of rows per character)

In [None]:
# Count the rows attributed to each distinct value of 'raw_character_text',
# most frequent first (value_counts sorts in descending order by default).
speaker_column = data['raw_character_text']
speaker_column.value_counts()

##### Analyze word frequency for specific characters
##### Loop through characters ('Homer Simpson', 'Marge Simpson', etc.)

In [None]:
# Report the ten most frequent whitespace-separated tokens for each main character.
for ch in ['Homer Simpson','Marge Simpson','Bart Simpson','Lisa Simpson']:

    # Select this character's rows with a single .loc lookup (no chained indexing).
    # Missing text is dropped and the rest coerced to str so .split() cannot fail
    # on NaN or other non-string cells.
    raw_text = data.loc[data['raw_character_text'] == ch, "raw_text"].dropna().astype(str)

    # Tokenize the text into words (whitespace split; case and punctuation kept)
    words = [word for text in raw_text for word in text.split()]

    # Calculate word frequency
    word_freq = Counter(words)

    # Print the most common words, followed by a blank separator line
    print(ch)
    print(word_freq.most_common(10))
    print()

##### Perform word frequency analysis with stop words removal

In [None]:
import nltk
from nltk import corpus

# Fetch the 'stopwords' corpus through nltk's downloader so it can be read below
nltk.download('stopwords')

# English stop-word list as provided by nltk
stop_words = corpus.stopwords.words('english')

# Preview the first 10 stop words
stop_words[:10]

In [None]:
# Load stop words and add character names (optional customization).
# Everything is lower-cased so it compares correctly against the lower-cased tokens.
stop_words = [
    word.lower()
    for word in corpus.stopwords.words('english') + ['Homer','Marge','Bart','Lisa','Simpson']
]
# Set copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = set(stop_words)

# Repeat the analysis for each character, filtering out stop words
for ch in ['Homer Simpson','Marge Simpson','Bart Simpson','Lisa Simpson']:

    # Select this character's rows with a single .loc lookup; drop missing text and
    # coerce to str so .split() cannot fail on NaN or other non-string cells.
    raw_text = data.loc[data['raw_character_text'] == ch, "raw_text"].dropna().astype(str)

    # Tokenize on whitespace and lower-case each token.
    # Punctuation stays attached, so a token such as "homer," is NOT filtered below.
    words = [word.lower() for text in raw_text for word in text.split()]

    # Filter out stop words
    filtered_words = [word for word in words if word not in stop_word_set]

    # Calculate word frequency
    word_freq = Counter(filtered_words)

    # Print the most common words, followed by a blank separator line
    print(ch)
    print(word_freq.most_common(10))
    print()

### Find occurrences of "simpson" and variations

In [None]:
import re
import numpy as np

# Matches "simpson" plus everything after it in the token (e.g. a suffix or punctuation)
pattern = r"simpson.*"
simpson_re = re.compile(pattern, re.IGNORECASE)

# NOTE(review): `words` is not defined in this cell. When the notebook is run top to
# bottom it is the list left behind by the last iteration of the preceding loop, i.e.
# the lower-cased tokens for 'Lisa Simpson' only, not the whole script.
matches = []
for word in words:
    # Case-sensitive pre-filter: only tokens containing lower-case "simpson" are searched
    if 'simpson' in word:
        matches.append(simpson_re.findall(word))

# Distinct matched strings, sorted (np.unique flattens the nested lists)
np.unique(matches)

# Convolution

In [None]:
# Minimal 1-D convolution example.
# The kernel [0, 1, 0] is a centred unit impulse, so it passes the signal through unchanged.
kernel = np.array([0, 1, 0])
input_signal = np.arange(1, 6)  # the values 1, 2, 3, 4, 5

# mode='valid' keeps only positions where the kernel fully overlaps the signal,
# giving len(input_signal) - len(kernel) + 1 = 3 output samples.
output = np.convolve(input_signal, kernel, mode='valid')

print(output)