In [33]:
import pandas as pd
import string
import time
import re

from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec as w2v
from nltk import word_tokenize
from nltk.stem import wordnet
from collections import defaultdict


In [34]:
df = pd.read_csv("simpsons_dataset.csv")
df

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158314 entries, 0 to 158313
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  140500 non-null  object
 1   spoken_words        131855 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


In [36]:
df.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [37]:
simpson = df.dropna().reset_index(drop = True)

In [38]:
simpson.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131853 entries, 0 to 131852
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  131853 non-null  object
 1   spoken_words        131853 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [39]:
clean_data = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in simpson['spoken_words'])

In [40]:
df_clean = pd.DataFrame({'clean': clean_data})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(119516, 1)

Bigram We are using Gensim Phases package to automatically detect common phases(bigrams) from a list of sentences. Using the Bigram model to catch or capture words like "mr_burns" or "bart_simpson"!

In [41]:
from gensim.models.phrases import Phrases, Phraser

In [42]:
sent = [row.split() for row in df_clean['clean']]

Creates the relevant phrases from the list of sentences:

In [43]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [44]:
bigram = Phraser(phrases)

tokenizing sentences

In [45]:
sentences = bigram[sent]

In [46]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

40991

getting frequent words from documents

In [47]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['the', 'you', 'i', 'a', 'to', 'and', 'of', 'it', 'my', 'that']

training the model using the gensim word2vec model

In [48]:
import multiprocessing
from gensim.models import Word2Vec

In [49]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [50]:
w2v_model = Word2Vec(min_count=20, #Ignore all words with total absolute frequency lower than  (2, 100) 
                    window=6,
                    #size=300, #Dimensionality of the feature vectors. (50, 300)
                    sample=6e-5, #The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial. (0, 1e-5)
                    alpha=0.03, #The initial learning rate - (0.01, 0.05)
                    min_alpha=0.0007, #Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
                    negative=20, #If > 0, negative sampling will be used, the ini for negatives specifies how "noise words" should be drown. If set to 0, no negative sampling is used. (5, 20)
                    workers=cores-1)

In [51]:
# t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
# print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [52]:
#t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
#print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

(12599626, 37627050)

In [53]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [54]:
w2v_model.wv.most_similar(positive=['homer'])

[('marge', 0.6404784321784973),
 ('homie', 0.601138174533844),
 ('you', 0.5326716303825378),
 ('dad', 0.5080299377441406),
 ('mom', 0.46384236216545105),
 ('moe', 0.459073930978775),
 ('honey', 0.4552690386772156),
 ('becky', 0.44478437304496765),
 ('bart', 0.44301554560661316),
 ('sweetie', 0.4404270648956299)]

checking for similarities between words

In [59]:
w2v_model.similiarity("good","bad")

AttributeError: 'Word2Vec' object has no attribute 'similiarity'

checking for words that did not match from an array of words

In [None]:
w2v_model.doesnt_match(['good', 'failure', 'cute'])

AttributeError: 'Word2Vec' object has no attribute 'doesnt_match'