In [None]:
# Download the two filtered WikiText archives from Dropbox: the full corpus and the smaller "10k" one.
# -O sets the local file name each download is saved under.
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"

In [None]:
# Extract both archives into the working directory.
# NOTE(review): load_dataset() reads "wikitext-filtered-full" and "wikitext-filtered-10k";
# presumably those directories are created by these two commands -- confirm.
!unzip wikitext-filtered-full.zip
!unzip wikitext-filtered-10k.zip

In [None]:
# !pip install datasets
import datasets

In [None]:
from datasets import Dataset

def load_dataset():
  """Load the small and the large WikiText datasets from disk.

  Reads the "wikitext-filtered-10k" and "wikitext-filtered-full" paths with
  ``Dataset.load_from_disk`` and prints the number of documents in each.

  Returns:
    tuple: ``(small_docs, large_docs)`` -- the small dataset first.
  """
  small_docs = Dataset.load_from_disk("wikitext-filtered-10k")
  large_docs = Dataset.load_from_disk("wikitext-filtered-full")

  summary = "wikitext_small: {} docs, wikitext_large: {} docs"
  print(summary.format(len(small_docs), len(large_docs)))
  return small_docs, large_docs

wikitext_small, wikitext_large = load_dataset()

In [None]:
def normalize_text(text: str) -> str:
    """Return ``text`` without special characters, whitespace-collapsed and lowercased.

    Only alphanumeric and whitespace characters are kept. Every run of
    whitespace is then reduced to a single space (leading and trailing
    whitespace disappears) and the result is lowercased.
    """
    # Keep letters, digits and whitespace; everything else is dropped outright.
    kept_chars = [ch for ch in text if ch.isalnum() or ch.isspace()]
    # split() with no argument discards empty pieces, which collapses whitespace runs.
    words = "".join(kept_chars).split()
    return " ".join(words).lower()

In [None]:
# Boundary markers placed at the start and end of every tokenised document.
START_TOKEN = "<s>"
END_TOKEN = "</s>"

def read_corpus(files) -> list[list[str]]:
    """Tokenise every document in ``files`` and wrap it in boundary tokens.

    Args:
        files: iterable of records that expose a ``'text'`` field (for
            example the rows of a dataset).

    Returns:
        One token list per record: ``START_TOKEN``, the words of the
        normalised text, then ``END_TOKEN``.
    """
    corpus = []
    for record in files:
        # split() with no argument never yields empty strings, so a document
        # that normalises to "" becomes [START_TOKEN, END_TOKEN] instead of
        # carrying a spurious "" token into the vocabulary.
        words = normalize_text(record["text"]).split()
        corpus.append([START_TOKEN, *words, END_TOKEN])
    return corpus

In [None]:
# Normalise and tokenise the small corpus, then print the first document's tokens as a sanity check.
wikitext_smallNormalToken = read_corpus(wikitext_small)
print(wikitext_smallNormalToken[0])

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the small corpus: 50-dimensional vectors, context window of 5,
# min_count=5 (words below that total frequency are ignored), 4 worker threads.
# NOTE(review): with several workers the trained vectors may differ slightly between runs --
# confirm against the gensim docs if exact reproducibility is needed.
model1 = Word2Vec(sentences=wikitext_smallNormalToken, vector_size=50, window=5, min_count=5, workers=4)
# Persist the trained model to "word2vec.model".
model1.save("word2vec.model")

Importing WordSim-353

In [None]:
# Download the WordSim-353 archive and save it as wordsim353.zip.
!wget -O wordsim353.zip "https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip"

In [None]:
# Extract the WordSim-353 archive into the working directory.
!unzip wordsim353.zip

In [None]:
import pandas as pd

In [None]:
# Load the word pairs; later cells read the "Word 1", "Word 2" and "Human (mean)" columns.
# NOTE(review): "combined.csv" is presumably one of the files extracted from wordsim353.zip -- confirm.
wordsim = pd.read_csv("combined.csv")
wordsim.head()

Step 3 Cosine Similarity

In [None]:
import numpy as np

In [None]:
def cosine_similarity(model, pairs=None):
  """Score every evaluation word pair with the model's cosine similarity.

  Args:
    model: object whose ``wv`` attribute supports ``word in wv`` and
      ``wv.similarity(w1, w2)`` (e.g. a trained gensim ``Word2Vec``).
    pairs: DataFrame with "Word 1" and "Word 2" columns. Defaults to the
      notebook-level ``wordsim`` frame.

  Returns:
    list: one entry per row of ``pairs``, in row order -- the similarity, or
    ``np.nan`` when either word is missing from the vocabulary.
  """
  if pairs is None:
    pairs = wordsim
  vectors = model.wv

  def vocab_form(word):
    # The models trained in this notebook only ever see lowercased text
    # (normalize_text), so a capitalised evaluation word can only match in
    # its lowercase form. The exact spelling is tried first so that
    # case-sensitive vocabularies keep working unchanged.
    if word in vectors:
      return word
    if isinstance(word, str) and word.lower() in vectors:
      return word.lower()
    return None

  scores = []
  # Walk the two columns positionally so the frame's index labels do not
  # matter (a filtered or re-indexed frame is scored correctly too).
  for first, second in zip(pairs["Word 1"], pairs["Word 2"]):
    w1, w2 = vocab_form(first), vocab_form(second)
    if w1 is None or w2 is None:
      scores.append(np.nan)
    else:
      scores.append(vectors.similarity(w1, w2))
  return scores

In [None]:
# Similarity of every WordSim pair under the small-corpus model (NaN where a word is out of vocabulary).
model1Cosine = cosine_similarity(model1)

In [None]:
# Question A
planeCar = model1.wv.similarity("plane", "car")
print(planeCar)

planetSun = model1.wv.similarity("planet", "sun")
print(planetSun)

cupArticle = model1.wv.similarity("cup", "article")
print(cupArticle)

sugarApproach = model1.wv.similarity("sugar", "approach")
print(sugarApproach)

Step 4 Evaluate Semantic Relatedness

In [None]:
import scipy.stats as scistats

In [None]:
# Spearman rank correlation between the human ratings and model1's similarities.
# nan_policy='omit' ignores the NaN entries that cosine_similarity() emits for pairs with an
# out-of-vocabulary word, so the coefficient only covers the pairs this model could score.
scistats.spearmanr(wordsim["Human (mean)"], model1Cosine, nan_policy = 'omit')

Repeating for wikitext_large

In [None]:
#Normalising and Tokenising
wikitext_largeNormalToken = read_corpus(wikitext_large)
print(wikitext_largeNormalToken[0])

#Training new word2vec model with larger dataset
model2 = Word2Vec(sentences=wikitext_largeNormalToken, vector_size=50, window=5, min_count=5, workers=4)
model2.save("word2vecLarge.model")

In [None]:
# Similarity of every WordSim pair under the large-corpus model (NaN where a word is out of vocabulary).
model2Cosine = cosine_similarity(model2)

In [None]:
# Spearman rank correlation between the human ratings and model2's similarities.
# NaN entries are omitted, so the set of pairs evaluated here can differ from the set
# evaluated for model1 if the two vocabularies differ.
scistats.spearmanr(wordsim["Human (mean)"], model2Cosine, nan_policy = 'omit')

In [None]:
# Question A Large
planeCar = model2.wv.similarity("plane", "car")
print(planeCar)

planetSun = model2.wv.similarity("planet", "sun")
print(planetSun)

cupArticle = model2.wv.similarity("cup", "article")
print(cupArticle)

sugarApproach = model2.wv.similarity("sugar", "approach")
print(sugarApproach)

Step 5 Pre Trained Model

In [None]:
import gensim.downloader as downloader

In [None]:
# Load the pre-trained "word2vec-google-news-300" vectors through gensim's downloader.
# The returned object is queried directly (no .wv) by cosine_similarity_pretrained().
preTrainedModel = downloader.load("word2vec-google-news-300")

In [None]:
def cosine_similarity_pretrained(model, pairs=None):
    """Score every evaluation word pair with a vectors object queried directly.

    Unlike ``cosine_similarity``, ``model`` itself is used for the lookups
    (no ``.wv`` attribute is accessed).

    Args:
        model: object supporting ``word in model`` and
            ``model.similarity(w1, w2)``.
        pairs: DataFrame with "Word 1" and "Word 2" columns. Defaults to the
            notebook-level ``wordsim`` frame.

    Returns:
        list: one entry per row of ``pairs``, in row order -- the similarity,
        or ``np.nan`` when either word is missing from the vocabulary.
    """
    if pairs is None:
        pairs = wordsim

    scores = []
    # Walk the two columns positionally so the frame's index labels do not
    # matter (a filtered or re-indexed frame is scored correctly too).
    for w1, w2 in zip(pairs["Word 1"], pairs["Word 2"]):
        both_known = w1 in model and w2 in model
        scores.append(model.similarity(w1, w2) if both_known else np.nan)
    return scores

In [None]:
# Similarity of every WordSim pair under the pre-trained vectors (NaN where a word is out of vocabulary).
preTrainedCosine = cosine_similarity_pretrained(preTrainedModel)

In [None]:
# Spearman rank correlation between the human ratings and the pre-trained model's similarities;
# pairs whose score is NaN (a word missing from the vocabulary) are omitted.
scistats.spearmanr(wordsim["Human (mean)"], preTrainedCosine, nan_policy = 'omit')

# Response Section
a.  What are the cosine similarity scores for the following pairs: 
-  plane / car 
-  planet / sun 
-  cup / article 
-  sugar / approach

b.  What is the value of the Spearman correlation coefficients computed in Step 4?

c.  How do you interpret each coefficient value with respect to the word similarity task? 
Are the coefficient values for the two vector space models you created different, and 
if so, why?

d.  What is the value of the Spearman correlation coefficient and how do you interpret it?

e.  Create a table of results that summarises your experiments. Tips: In case you have 
run several experiments with different hyperparameters choose to present the most 
significant. It might be worth presenting more than one experiment with only a single 
hyperparameter change if you want to emphasise a striking difference worth 
discussing. Finally, apart from the Spearman correlation coefficients, make sure you 
also include the most significant hyperparameters as separate columns (for example, 
see Table 2 from Merity et al., 2016).

f.  Using the table of results from your answer to question e., write a short discussion 
section that answers the following questions:  
i.  Does a bigger corpus yield better representations?   
ii.  Does a bigger vocabulary yield better representations?  
iii.  Do bigger word vectors yield better representations?  
iv.  Does a bigger context window yield better representations? 
v.  Step 6 requires you to look for the best combination of hyperparameters using 
the same two datasets for evaluation. Is this a good practice?

g.  What are the top-5 analogies for the following configurations: 
-  man is to woman as king is to ___? 
-  Athens is to Greece as Rome is to ___? 
-  reading is to read as playing is to ___? 
-  Greece is to souvlaki as Italy is to ___? 
-  airplane is to propeller as car is to ___? 
Is the top-1 answer always the “correct” one? What about the rest of the results?

h.  What are the top-5 analogies for the following configurations? Can you identify any 
gender-based stereotypes? Briefly discuss your findings: 
-  man is to woman as computer programmer is to ___? 
-  man is to woman as superstar is to ___? 
-  man is to woman as guitarist is to ___? 
-  man is to woman as boss is to ___?