In [121]:
import polars as pl
import gensim
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
import string

In [253]:
df = pl.scan_csv('simpsons_script_lines.csv',
                 ignore_errors = True).select(['character_id', 'raw_character_text', 'spoken_words']).drop_nulls().collect()
lemmatizer = WordNetLemmatizer()
token_list = [word_tokenize(str.lower(phrase), language='english') for phrase in df['spoken_words'].to_list()]

stop_words = list(stopwords.words('english')) + list(string.punctuation) + ['...']
for phrase in token_list:
    for word in phrase:
        if word not in stop_words:
            word = lemmatizer.lemmatize(word)
        else:
            phrase.remove(word)

df.insert_column(3, pl.Series('tokenized', token_list))

character_id,raw_character_text,spoken_words,tokenized
i64,str,str,list[str]
464,"""Miss Hoover""","""No, actually, it was a little …","[""actually"", ""a"", … "".""]"
9,"""Lisa Simpson""","""Where's Mr. Bergstrom?""","[""'s"", ""mr."", ""bergstrom""]"
464,"""Miss Hoover""","""I don't know. Although I'd sur…","[""do"", ""n't"", … ""?""]"
9,"""Lisa Simpson""","""That life is worth living.""","[""life"", ""worth"", ""living""]"
40,"""Edna Krabappel-Flanders""","""The polls will be open from no…","[""polls"", ""be"", … ""martin""]"
…,…,…,…
464,"""Miss Hoover""","""I'm back.""","[""'m"", ""back""]"
464,"""Miss Hoover""","""You see, class, my Lyme diseas…","[""see"", ""class"", … ""...""]"
464,"""Miss Hoover""","""Psy-cho-so-ma-tic.""","[""psy-cho-so-ma-tic""]"
119,"""Ralph Wiggum""","""Does that mean you were crazy?""","[""that"", ""mean"", … ""crazy""]"


In [254]:
w2v = gensim.models.Word2Vec(token_list, 
                            vector_size=64, 
                            window=5,
                            min_count=1, 
                            workers=16)

top_n = 1000
words = [word for word in w2v.wv.index_to_key[:top_n]]
vectors = np.array([w2v.wv[word] for word in words])

tsne = TSNE(n_components=2, 
            random_state=42, 
            perplexity=30, 
            n_iter=3000)
vectors_2d = tsne.fit_transform(vectors)

vectors_df = pl.from_numpy(vectors_2d, schema=["X", "Y"]).with_columns(pl.Series(name="words", values=words))



In [325]:
# output_notebook()

source = vectors_df.to_pandas()

TOOLTIPS = [("word", "@words")]

p = figure(width=800, height=600, tooltips=TOOLTIPS,
           title='Эмбеддинги Word2Vec (t-SNE)')

p.hex('X', 'Y', size=15, color="forestgreen", alpha=0.5, source=source)

show(p)

In [333]:
from bokeh.io import export_png
export_png(p, filename="plot.png")

'e:\\python_projects\\notebooks\\Deep learning\\Deep learning homeworks\\HW11\\plot.png'

In [226]:
emb_sum1 = w2v.wv['homer'] + w2v.wv['marge'] + w2v.wv['bart']
print(w2v.wv.most_similar(emb_sum1, topn=5))
emb_sum2 = w2v.wv['bart'] - w2v.wv['lisa'] + w2v.wv['school']
print(w2v.wv.most_similar(emb_sum2, topn=5))
emb_sum3 = w2v.wv['marge'] - w2v.wv['homer'] + w2v.wv['home']
print(w2v.wv.most_similar(emb_sum3, topn=5))

[('homer', 0.9710502028465271), ('marge', 0.9563987255096436), ('bart', 0.9552832245826721), ('lisa', 0.9360835552215576), ('milhouse', 0.8553897738456726)]
[('school', 0.9496179223060608), ('game', 0.851958155632019), ('christmas', 0.830085813999176), ('town', 0.8289662599563599), ('dentist', 0.8202036023139954)]
[('home', 0.9131438732147217), ('back', 0.8254780769348145), ('bed', 0.8186049461364746), ('sleep', 0.798312246799469), ('free', 0.7947134971618652)]


In [242]:
emb_sum2 = w2v.wv['cat'] + w2v.wv['fly']
print(w2v.wv.most_similar(emb_sum2, topn=5))

[('imagination', 0.9547594785690308), ('cuff', 0.9528318643569946), ('memory', 0.9526265263557434), ('kwik-e-mart', 0.9463292360305786), ('cat', 0.9458079934120178)]
