In [16]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [26]:
# https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial
data = pd.read_csv('simpsons_dataset.csv')
data

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
...,...,...
158309,Miss Hoover,I'm back.
158310,Miss Hoover,"You see, class, my Lyme disease turned out to ..."
158311,Miss Hoover,Psy-cho-so-ma-tic.
158312,Ralph Wiggum,Does that mean you were crazy?


In [27]:
# удаляем пустые значения
data = data.dropna().reset_index(drop=True)
data.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [28]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [29]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in data['spoken_words'])

In [30]:
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.75 mins


In [31]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85955, 1)

In [32]:
from gensim.models.phrases import Phrases, Phraser

In [33]:
sent = [row.split() for row in df_clean['clean']]

In [34]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 16:52:05: collecting all words and their counts
INFO - 16:52:05: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:52:05: PROGRESS: at sentence #10000, processed 63557 words and 52723 word types
INFO - 16:52:05: PROGRESS: at sentence #20000, processed 130936 words and 99612 word types
INFO - 16:52:05: PROGRESS: at sentence #30000, processed 192961 words and 138181 word types
INFO - 16:52:05: PROGRESS: at sentence #40000, processed 249832 words and 172156 word types
INFO - 16:52:05: PROGRESS: at sentence #50000, processed 311269 words and 207943 word types
INFO - 16:52:05: PROGRESS: at sentence #60000, processed 373578 words and 242950 word types
INFO - 16:52:05: PROGRESS: at sentence #70000, processed 436424 words and 277852 word types
INFO - 16:52:06: PROGRESS: at sentence #80000, processed 497887 words and 310927 word types
INFO - 16:52:06: collected 329641 token types (unigram + bigrams) from a corpus of 537095 words and 85955 sentences
INFO - 16:52:06: m

In [35]:
bigram = Phraser(phrases)

INFO - 16:52:06: exporting phrases from Phrases<329641 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 16:52:06: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<128 phrases, min_count=30, threshold=10.0> from Phrases<329641 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.35s', 'datetime': '2023-05-14T16:52:06.410696', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [36]:
sentences = bigram[sent]

In [37]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

29694

In [38]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [39]:
import multiprocessing

from gensim.models import Word2Vec

In [40]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [41]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)

INFO - 16:52:06: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-05-14T16:52:06.830794', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [42]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:52:06: collecting all words and their counts
INFO - 16:52:06: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:52:06: PROGRESS: at sentence #10000, processed 61697 words, keeping 9518 word types
INFO - 16:52:06: PROGRESS: at sentence #20000, processed 127312 words, keeping 14384 word types
INFO - 16:52:07: PROGRESS: at sentence #30000, processed 187772 words, keeping 17442 word types
INFO - 16:52:07: PROGRESS: at sentence #40000, processed 243265 words, keeping 20121 word types
INFO - 16:52:07: PROGRESS: at sentence #50000, processed 303120 words, keeping 22551 word types
INFO - 16:52:07: PROGRESS: at sentence #60000, processed 363858 words, keeping 24820 word types
INFO - 16:52:07: PROGRESS: at sentence #70000, processed 425311 words, keeping 26987 word types
INFO - 16:52:07: PROGRESS: at sentence #80000, processed 485433 words, keeping 28822 word types
INFO - 16:52:07: collected 29694 word types from a corpus of 523538 raw words and 85955 sentence

Time to build vocab: 0.01 mins


In [43]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 16:52:07: Word2Vec lifecycle event {'msg': 'training model with 15 workers on 3325 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-05-14T16:52:07.365016', 'gensim': '4.3.1', 'python': '3.11.3 (tags/v3.11.3:f3909b8, Apr  4 2023, 23:49:59) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
INFO - 16:52:08: EPOCH 0: training on 523538 raw words (199584 effective words) took 0.8s, 260772 effective words/s
INFO - 16:52:08: EPOCH 1: training on 523538 raw words (199638 effective words) took 0.7s, 299671 effective words/s
INFO - 16:52:09: EPOCH 2: training on 523538 raw words (199401 effective words) took 0.8s, 255415 effective words/s
INFO - 16:52:10: EPOCH 3: training on 523538 raw words (199017 effective words) took 0.8s, 244027 effective words/s
INFO - 16:52:11: EPOCH 4: training on 523538 raw words (199060 effective words) took 0.8s, 239077 effective words/s
INFO - 16:52:12

Time to train the model: 0.44 mins


In [44]:
w2v_model.init_sims(replace=True)

  w2v_model.init_sims(replace=True)


In [45]:
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.7253799438476562),
 ('depressed', 0.6885277628898621),
 ('convince', 0.6841305494308472),
 ('fault', 0.6789838671684265),
 ('sweetheart', 0.6788270473480225),
 ('tab', 0.6756338477134705),
 ('teeny', 0.6755796074867249),
 ('ask', 0.6725385785102844),
 ('straighten', 0.6678181886672974),
 ('crummy', 0.6669281721115112)]

In [83]:
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('congratulation', 0.7026652693748474),
 ('select', 0.6953061819076538),
 ('council', 0.6940925717353821),
 ('versus', 0.6886987686157227),
 ('robert', 0.673720121383667),
 ('charles', 0.6649235486984253),
 ('request', 0.6641824245452881),
 ('brief', 0.6627007722854614),
 ('simon', 0.6583764553070068),
 ('threat', 0.6580336689949036)]

In [84]:
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.7770687341690063),
 ('homie', 0.7143275141716003),
 ('becky', 0.7051325440406799),
 ('darling', 0.6850861310958862),
 ('ned', 0.6842876672744751),
 ('want', 0.6806071996688843),
 ('snuggle', 0.6760526895523071),
 ('ashamed', 0.6747106313705444),
 ('grownup', 0.6661184430122375),
 ('fault', 0.6644595861434937)]

In [85]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.7836252450942993),
 ('homework', 0.7634544372558594),
 ('mom_dad', 0.7293103337287903),
 ('pay_attention', 0.7222058176994324),
 ('mom', 0.7120025753974915),
 ('substitute', 0.7105140686035156),
 ('hearing', 0.7104999423027039),
 ('creepy', 0.7030847072601318),
 ('milhouse', 0.6877204775810242),
 ('bedtime', 0.6859204173088074)]

In [86]:
w2v_model.wv.similarity('bart', 'nelson')

0.57336015

In [87]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])



'jimbo'

In [88]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [89]:
w2v_model.wv.doesnt_match(['homer', 'patty', 'selma'])

'homer'

In [90]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('admire', 0.5814297199249268),
 ('reason', 0.5814090967178345),
 ('man', 0.5605944395065308)]

In [91]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.6816440224647522),
 ('pay_attention', 0.6672130823135376),
 ('pregnant', 0.631349503993988)]