In [None]:
import pandas as pd
import pathlib
import os
import nltk

In [None]:
# Resolve the project directories from the BASE_DIR environment variable.
# os.getenv returns None when the variable is unset, and pathlib.Path(None)
# then fails with an unhelpful TypeError, so check explicitly and fail fast.
if os.getenv('BASE_DIR') is None:
    raise RuntimeError(
        "The BASE_DIR environment variable is not set; "
        "point it at the directory that contains 'data/raw'."
    )
base_dir = pathlib.Path(os.environ['BASE_DIR'])
data = base_dir / 'data'
data_raw = data / 'raw'
images = data_raw / 'images'

In [None]:
# Read seasons46to50.csv from the raw-data directory into a DataFrame.
df = pd.read_csv(data_raw.joinpath('seasons46to50.csv'))

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def preprocess(series):
    """Normalize a Series of raw text into cleaned, lemmatized strings.

    Steps, in order: lowercase; drop the literal "{html}" marker; strip
    HTML-like tags, URLs and digit runs; tokenize on word characters; discard
    tokens of two characters or fewer and English stop words; lemmatize the
    remaining tokens; re-join them with single spaces.

    Parameters
    ----------
    series : pandas.Series
        Text values. Missing values are treated as empty strings.

    Returns
    -------
    pandas.Series
        Same index as ``series``; each value is a space-separated string of
        lemmatized tokens (possibly empty).
    """
    series = series.fillna('').str.lower()

    # "{html}" is a literal marker, not a pattern.
    series = series.str.replace('{html}', '', regex=False)

    # regex=True is spelled out on purpose: since pandas 2.0 str.replace
    # matches literally by default, which would silently turn these
    # substitutions into no-ops.
    series = series.str.replace(r'<.*?>', '', regex=True)     # HTML-like tags
    series = series.str.replace(r'http\S+', '', regex=True)   # URLs
    series = series.str.replace(r'[0-9]+', '', regex=True)    # digit runs

    tokenizer = RegexpTokenizer(r'\w+')
    # Load the stop-word list once per call instead of once per token, and
    # keep it in a set for constant-time membership checks.
    stop_words = set(stopwords.words('english'))

    def clean_tokens(text):
        kept = [
            word for word in tokenizer.tokenize(text)
            if len(word) > 2 and word not in stop_words
        ]
        return ' '.join(lemmatizer.lemmatize(word) for word in kept)

    return series.apply(clean_tokens)

In [None]:
# Concatenate all 'Description' values of each episode into one document.
episode_summaries = (
    df.groupby('Episode')['Description']
      .agg(list)
      .apply(' '.join)
)

from joblib import Parallel, delayed
import numpy as np

# Run the text-cleaning pipeline over the per-episode documents.
cleaned = preprocess(episode_summaries)

In [None]:
# Preview a few cleaned documents rather than dumping the whole Series.
cleaned.head(10)

In [None]:
# Word frequency distribution
from nltk import FreqDist
from itertools import chain
# Count tokens across all cleaned documents. split() with no argument skips
# the empty-string tokens that split(' ') produces when a document is empty.
fdist = FreqDist(' '.join(cleaned).split())

In [None]:
import matplotlib.pyplot as plt
# Rebuild the distribution here so the plotting cell can be re-run on its own.
# (The original line was missing its two closing parentheses.) split() with no
# argument avoids counting empty-string tokens from empty documents.
fdist = FreqDist(' '.join(cleaned).split())
fig, ax = plt.subplots(figsize=(16, 4), dpi=300)
fdist.plot(100)

### A couple of first thoughts:
- "Monster" is a common "last name" for the puppets (Cookie Monster, Elmo Monster, etc.). It may be overused and could be removed.
- Why is the word "first" in the top-3 list? Perhaps the summaries say things like "First, Elmo did this".
- The word "day" probably comes from "number of the day" and "letter of the day"; it won't provide much value.


In [None]:
from collections import Counter

def most_common_ngrams(col: pd.Series, n: int=2):
    """Tally every n-gram found in the token sequences held by ``col``.

    Each element of ``col`` is handed to ``ngram_generator`` with the given
    ``n``; the resulting n-gram tuples from all elements are pooled and
    counted.

    Returns a ``collections.Counter`` mapping n-gram tuple -> occurrences.
    """
    per_row_ngrams = (ngram_generator(tokens, n=n) for tokens in col)
    return Counter(chain.from_iterable(per_row_ngrams))

def ngram_generator(s: list, n: int=2):
    """Return an iterator over the consecutive n-grams of a token sequence.

    Parameters
    ----------
    s : list
        Sliceable sequence of tokens (callers pass the word list of one
        document).
    n : int, default 2
        Number of tokens per n-gram.

    Returns
    -------
    zip
        Lazy iterator of ``n``-tuples, e.g. ``['a', 'b', 'c']`` with ``n=2``
        gives ``('a', 'b'), ('b', 'c')``. Empty when ``s`` has fewer than
        ``n`` tokens.
    """
    # Offsets must run 0 .. n-1. Starting at 1 (as the previous version did)
    # skips the first token of every sequence and so loses its first n-gram.
    return zip(*(s[offset:] for offset in range(n)))


# Count bigrams over the tokenized documents. The function is called directly
# on the Series: going through Series.aggregate only reached it via pandas'
# fallback after an element-wise attempt raised, which is fragile.
count = most_common_ngrams(cleaned.str.split(), n=2)
for text, cnt in count.most_common(10):
    print(cnt, '-', ' '.join(text))
    

### Looking at the top 10 bigrams, we can probably remove a few phrases that are common to every episode
- First Episode (not much added info here)
- letter day (in every episode)
- number day (in every episode)
- introduce letter (letter of the day related)


In [None]:
# Boilerplate phrases that were flagged as common to every episode.
removeCommon = [
    'first episode',
    'letter day',
    'number day',
    'introduce letter'
]
# Escape each phrase and anchor the alternation on word boundaries so only
# whole phrases are removed (not, say, the tail of "newsletter day").
pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in removeCommon) + r')\b'
# regex=True must be explicit: since pandas 2.0 str.replace matches literally
# by default, which would leave the text untouched.
cleaned2 = cleaned.str.replace(pattern, '', regex=True)

# Collapse the extra whitespace left behind by the removals.
cleaned2 = cleaned2.str.split().str.join(' ')

In [None]:
# Re-plot token frequencies after the boilerplate phrases were removed.
# split() with no argument avoids counting empty-string tokens that
# split(' ') would produce for documents that ended up empty.
fdist = FreqDist(' '.join(cleaned2).split())
fig, ax = plt.subplots(figsize=(16, 4), dpi=300)
fdist.plot(100)

In [None]:
# Per-episode token counts, then the number of times "grover" occurs in each
# episode, ordered from most to fewest mentions.
counter = cleaned2.apply(lambda doc: Counter(doc.split()))
episodeNorm = (
    counter
    .apply(lambda tally: tally['grover'])
    .sort_values(ascending=False)
)
episodeNorm.head(25)

In [None]:
def getEpisodesImages(episodeNumbers, episodes=None):
    """Return the segments and descriptions of the requested episodes.

    Despite the name, no image data is read here: the result only contains
    the 'Segment' and 'Description' columns.

    Parameters
    ----------
    episodeNumbers : list-like
        Values matched against the 'Episode' column.
    episodes : pandas.DataFrame, optional
        Already-loaded frame with 'Episode', 'Segment' and 'Description'
        columns. When omitted, 'seasons46to50.csv' is read from ``data_raw``
        (the previous behavior), which costs a disk read per call.

    Returns
    -------
    dict
        ``{episode: {'Segment': [...], 'Description': [...]}}`` with one
        entry per matching episode.
    """
    if episodes is None:
        episodes = pd.read_csv(data_raw / 'seasons46to50.csv')
    selected = episodes[episodes['Episode'].isin(episodeNumbers)]
    grouped = selected.groupby(['Episode']).agg({'Segment': list, 'Description': list})
    return grouped.to_dict(orient='index')

# Spot-check the lookup with a single episode (4702) and display the result.
a = getEpisodesImages([4702])
a

## Topic Modeling

In [None]:
import gensim
# Build the token <-> id mapping from the whitespace-tokenized documents.
dictionary = gensim.corpora.Dictionary(documents=cleaned2.str.split())

In [None]:
# Prune the vocabulary: drop tokens seen in fewer than 15 documents or in
# more than half of them, then cap the vocabulary at 100,000 tokens.
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100_000,
)

In [None]:
# Convert every tokenized document into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, cleaned2.str.split()))

In [None]:
from gensim import corpora, models
from pprint import pprint

# Fit TF-IDF weights on the bag-of-words corpus, then apply them to it.
tfidf = models.TfidfModel(corpus=bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# Fit LDA on raw term counts. random_state pins the initialization so the
# topics are comparable between runs.
# NOTE(review): with several worker processes the result may still vary
# slightly from run to run — confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [None]:
# List every topic of the count-based model with its top weighted words.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Fit a second LDA model on the TF-IDF-weighted corpus for comparison with the
# count-based one. random_state pins the initialization so the topics are
# comparable between runs.
# NOTE(review): with several worker processes the result may still vary
# slightly from run to run — confirm if exact reproducibility matters.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=42,
)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))