In [1]:
import pandas as pd
import gensim
import re

from nltk.tokenize import word_tokenize

# Task 1: Read The Dataset and Apply The Data Preprocessing

In [2]:
# Load the Simpsons dialogue CSV into a DataFrame and preview the first rows.
dataset_path = "./data/simpsons_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
def clean_text(text: str) -> str:
    """Normalise one line of dialogue before tokenisation.

    Deletes every punctuation character (underscore included), deletes every
    digit, and lower-cases what is left. Whitespace is not collapsed, so
    removing a token can leave consecutive spaces behind.

    Args:
        text: Raw text to clean.

    Returns:
        The lower-cased text with punctuation and digits removed.
    """
    # "_" is a word character for the regex engine, so [^\w\s] on its own
    # lets it through; it is listed explicitly alongside the digits.
    stripped = re.sub(r"[^\w\s]|[\d_]", "", text)
    return stripped.lower()

In [4]:
# Rows without dialogue cannot be cleaned, so drop them from df first.
df.dropna(subset=["spoken_words"], inplace=True)
# Keep the normalised dialogue next to the raw text for comparison.
df["cleaned_text"] = df["spoken_words"].map(clean_text)
# Preview cleaned vs. original lines side by side.
preview_columns = ["cleaned_text", "spoken_words"]
df[preview_columns].head()

Unnamed: 0,cleaned_text,spoken_words
0,no actually it was a little of both sometimes ...,"No, actually, it was a little of both. Sometim..."
1,wheres mr bergstrom,Where's Mr. Bergstrom?
2,i dont know although id sure like to talk to h...,I don't know. Although I'd sure like to talk t...
3,that life is worth living,That life is worth living.
4,the polls will be open from now until the end ...,The polls will be open from now until the end ...


In [5]:
# Plain Python list holding the cleaned dialogue, one string per row of df.
cleaned_lines = df["cleaned_text"]
data = cleaned_lines.to_list()

In [6]:
# Tokenise every cleaned line: the result is one list of word tokens per line.
tokens = [word_tokenize(line) for line in data]

In [7]:
# Train word embeddings on the tokenised dialogue.
# sg=1 selects the skip-gram training algorithm; min_count=1 keeps every word
# in the vocabulary, including words that occur only once.
# NOTE(review): per the gensim docs, runs with workers > 1 are not fully
# deterministic, so similarity scores may differ slightly between re-runs.
word2vec_params = {
    "min_count": 1,
    "vector_size": 120,
    "window": 8,
    "sg": 1,
    "workers": 6,
}
model = gensim.models.Word2Vec(tokens, **word2vec_params)

# Task 2: Find similar words using Word2Vec

In [8]:
def most_similar(
    word: str,
    model: gensim.models.Word2Vec,
    topn: int = 10,
) -> list:
    """Return the nearest neighbours of ``word`` in the model's vector space.

    Args:
        word: Query word, looked up through ``model.wv``.
        model: Trained Word2Vec model.
        topn: Number of neighbours to request. The default of 10 matches the
            library default, so existing two-argument calls are unaffected.

    Returns:
        Whatever ``model.wv.most_similar`` returns for the query; the calls in
        this notebook print ``(word, score)`` tuples.
    """
    return model.wv.most_similar(word, topn=topn)


def print_most_similar(
    word: str,
    model: gensim.models.Word2Vec,
) -> None:
    """Print a header line, then each neighbour of ``word`` on its own line.

    Args:
        word: Query word.
        model: Trained Word2Vec model passed through to ``most_similar``.
    """
    print(f"Most Similar word to '{word}' is: ")
    neighbours = most_similar(word, model)
    for neighbour in neighbours:
        print(neighbour)

In [9]:
# Nearest neighbours of "homer".
print_most_similar(word="homer", model=model)

Most Similar word to 'homer' is: 
('abe', 0.8551385998725891)
('marge', 0.8428040146827698)
('bartholomew', 0.8117152452468872)
('barney', 0.7930422425270081)
('karl', 0.7785205841064453)
('grampa', 0.7784044742584229)
('cooties', 0.7783496975898743)
('lurleen', 0.7759321928024292)
('eliza', 0.7751312255859375)
('herb', 0.7743209004402161)


In [10]:
# Nearest neighbours of "marge".
print_most_similar(word="marge", model=model)

Most Similar word to 'marge' is: 
('homer', 0.842803955078125)
('abe', 0.8176416158676147)
('sweetie', 0.7895161509513855)
('sweetheart', 0.7819197177886963)
('maude', 0.7737510800361633)
('honey', 0.7725657820701599)
('blanche', 0.7671554684638977)
('becky', 0.7669074535369873)
('marjorie', 0.7643067240715027)
('allison', 0.7614040374755859)


In [11]:
# Nearest neighbours of "bart".
print_most_similar(word="bart", model=model)

Most Similar word to 'bart' is: 
('lisa', 0.8110323548316956)
('jessica', 0.7966541051864624)
('milhouse', 0.7905529737472534)
('grampa', 0.7844950556755066)
('abe', 0.7810661196708679)
('saxophone', 0.7726356387138367)
('eliza', 0.7610217332839966)
('bartholomew', 0.7591866850852966)
('janey', 0.7553672194480896)
('jimbo', 0.7542879581451416)


In [12]:
# Nearest neighbours of "bart".
# NOTE(review): this is the same query as the preceding cell; possibly a
# different word was intended here - confirm.
print_most_similar(word="bart", model=model)

Most Similar word to 'bart' is: 
('lisa', 0.8110323548316956)
('jessica', 0.7966541051864624)
('milhouse', 0.7905529737472534)
('grampa', 0.7844950556755066)
('abe', 0.7810661196708679)
('saxophone', 0.7726356387138367)
('eliza', 0.7610217332839966)
('bartholomew', 0.7591866850852966)
('janey', 0.7553672194480896)
('jimbo', 0.7542879581451416)


# Task 3: Similarity between two words in the dataset

In [13]:
def similarity(
    word1: str,
    word2: str,
    model: gensim.models.Word2Vec,
) -> float:
    """Return the similarity score of two words as a built-in ``float``.

    Args:
        word1: First word, looked up through ``model.wv``.
        word2: Second word, looked up through ``model.wv``.
        model: Trained Word2Vec model.

    Returns:
        The value of ``model.wv.similarity(word1, word2)`` converted to
        ``float``.
    """
    # gensim computes the score with NumPy, so the raw result is a NumPy
    # scalar; convert it so the declared return type actually holds.
    return float(model.wv.similarity(word1, word2))


def print_cosine_similarity(
    word1: str,
    word2: str,
    model: gensim.models.Word2Vec,
) -> None:
    """Print one line reporting the similarity score of ``word1`` and ``word2``.

    Args:
        word1: First word.
        word2: Second word.
        model: Trained Word2Vec model passed through to ``similarity``.
    """
    score = similarity(word1, word2, model)
    print(f"Cosine Similarity between '{word1}' and '{word2}' is: {score}")

In [14]:
# Spot-check a few word pairs. "moes" has no apostrophe because clean_text
# strips punctuation before training.
word_pairs = [
    ("moes", "tavern"),
    ("maggie", "baby"),
    ("bart", "nelson"),
]
for first_word, second_word in word_pairs:
    print_cosine_similarity(first_word, second_word, model)

Cosine Similarity between 'moes' and 'tavern' is: 0.8600816130638123
Cosine Similarity between 'maggie' and 'baby' is: 0.6984161138534546
Cosine Similarity between 'bart' and 'nelson' is: 0.7291463017463684


# Task 4: Doesn't match

In [15]:
def does_not_match(
    words: list,
    model: gensim.models.Word2Vec,
) -> str:
    """Return the entry of ``words`` that the model considers the odd one out.

    Args:
        words: Words to compare with each other.
        model: Trained Word2Vec model.

    Returns:
        The result of ``model.wv.doesnt_match(words)``.
    """
    keyed_vectors = model.wv
    return keyed_vectors.doesnt_match(words)


def print_does_not_match(
    words: list,
    model: gensim.models.Word2Vec,
) -> None:
    """Print one line naming the word in ``words`` that does not fit.

    Args:
        words: Words to compare with each other.
        model: Trained Word2Vec model passed through to ``does_not_match``.
    """
    odd_one_out = does_not_match(words, model)
    print(f"The word that does not match in the list '{words}' is: {odd_one_out}")

In [16]:
# For each group of character names, report the one the model singles out.
word_groups = [
    ["jimbo", "milhouse", "kearney"],
    ["nelson", "bart", "milhouse"],
    ["bart", "homer", "milhouse"],
]
for group in word_groups:
    print_does_not_match(words=group, model=model)

The word that does not match in the list '['jimbo', 'milhouse', 'kearney']' is: milhouse
The word that does not match in the list '['nelson', 'bart', 'milhouse']' is: nelson
The word that does not match in the list '['bart', 'homer', 'milhouse']' is: homer


# Task 5.1: Which word is to woman as homer is to marge

In [17]:
def analogy(
    word1: str,
    word2: str,
    word_to_compare: str,
    model: gensim.models.Word2Vec,
    topn: int = 3,
) -> list:
    """Return the top candidates for ``word1 - word2 + word_to_compare``.

    ``word1`` and ``word_to_compare`` are passed as positive terms and
    ``word2`` as the negative term, i.e. the query answers
    "word2 is to word1 as word_to_compare is to ?".

    Args:
        word1: Positive term.
        word2: Negative term.
        word_to_compare: Second positive term.
        model: Trained Word2Vec model.
        topn: Number of candidates to request. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        Whatever ``model.wv.most_similar`` returns for the query; the calls in
        this notebook print ``(word, score)`` tuples.
    """
    return model.wv.most_similar(
        positive=[word1, word_to_compare],
        negative=[word2],
        topn=topn,
    )


def print_analogy(
    word1: str,
    word2: str,
    word_to_compare: str,
    model: gensim.models.Word2Vec,
) -> None:
    """Print the analogy being solved, then each candidate answer.

    Args:
        word1: Positive term of the analogy query.
        word2: Negative term of the analogy query.
        word_to_compare: Second positive term of the analogy query.
        model: Trained Word2Vec model passed through to ``analogy``.
    """
    # analogy() adds word1 and word_to_compare and subtracts word2, so the
    # relation it solves is "word2 is to word1 as word_to_compare is to ?".
    # The header names the words in that order so it matches the computation.
    print(f"Analogy: {word2} is to {word1} as {word_to_compare} is to: ")
    for candidate in analogy(word1, word2, word_to_compare, model):
        print(candidate)

In [18]:
# Query: homer - marge + woman.
print_analogy("homer", "marge", "woman", model)

Analogy: homer is to marge as woman is to: 
('bear', 0.760161817073822)
('hostage', 0.7541569471359253)
('grimes', 0.7411158680915833)


# Task 5.2: Which word is to woman as bart is to man

In [19]:
# Query: bart - man + woman.
print_analogy("bart", "man", "woman", model)

Analogy: bart is to man as woman is to: 
('lisa', 0.7667612433433533)
('herself', 0.7279494404792786)
('juliet', 0.7161649465560913)
