# Step 1 (Enviroment Setup)


---


## Importing the Datasets
wikitext (*full* ) - 859955 docs

wikitext (*small* ) - 10000 docs

In [1]:
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"

--2025-10-20 08:38:49--  https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc3b588031cc1d50bea7a00d4a6d.dl-eu.dropboxusercontent.com/cd/0/inline/CzmPZgxgYO63SV9LrYxYUMaZXXH2lb6pywYviRNGFwFFMeFYDt7wwR-9wcA99pZwJYBBx5jsKjTkyN8gYsz5fOv6gV7XctGXxk36oFssNZ8YQyfDCC_j140SbkEVpAgKV7BnD56hBB4eRBM5tzB1bQWY/file?dl=1# [following]
--2025-10-20 08:38:50--  https://uc3b588031cc1d50bea7a00d4a6d.dl-eu.dropboxusercontent.com/cd/0/inline/CzmPZgxgYO63SV9LrYxYUMaZXXH2lb6pywYviRNGFwFFMeFYDt7wwR-9wcA99pZwJYBBx5jsKjTkyN8gYsz5fOv6gV7XctGXxk36oFssNZ8YQyfDCC_j140SbkEVpAgKV7BnD56hBB4eRBM5tzB1bQWY/file?dl=1
Resolving uc3b588031cc1d50bea7a00d4a6d.dl-eu.dropboxusercontent.com (uc3b588031cc1d50bea7a00d4a6d.dl-eu.dropbo

In [2]:
!unzip wikitext-filtered-full.zip
!unzip wikitext-filtered-10k.zip

Archive:  wikitext-filtered-full.zip
   creating: wikitext-filtered-full/
  inflating: wikitext-filtered-full/dataset_info.json  
  inflating: wikitext-filtered-full/state.json  
  inflating: wikitext-filtered-full/data-00000-of-00001.arrow  
Archive:  wikitext-filtered-10k.zip
   creating: wikitext-filtered-10k/
  inflating: wikitext-filtered-10k/dataset_info.json  
  inflating: wikitext-filtered-10k/state.json  
  inflating: wikitext-filtered-10k/data-00000-of-00001.arrow  


In [3]:
# datasets package provides dataset tools from hugginface
!pip install datasets
import datasets



In [4]:
from datasets import load_dataset, Dataset

def load_dataset():
  wikitext_small = "wikitext-filtered-10k"
  wikitext_large = "wikitext-filtered-full"

  dataset_small = Dataset.load_from_disk(wikitext_small)
  dataset_large = Dataset.load_from_disk(wikitext_large)
  print("wikitext_small: {} docs, wikitext_large: {} docs".format(len(dataset_small), len(dataset_large)))
  return dataset_small, dataset_large

wikitext_small, wikitext_large = load_dataset()

wikitext_small: 10000 docs, wikitext_large: 859955 docs


## Understanding the Dataset
Summary statistics

In [5]:
wt = wikitext_small
#wt = wikitext_large

print('# TYPE OF THE DATASET:', '\n', type(wt))
print(wt, '\n')
print('# ENTRIES LOOK LIKE:')
print(wt.features, '\n', wt[0], '\n', wt[1], '\n')

print('# DATASET STATISTICS:')
print('No. of docs:', len(wt))
lengths = [len(doc['text'].split()) for doc in wt]
print('Mean doc length:', sum(lengths)/len(lengths), 'words')

# TYPE OF THE DATASET: 
 <class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['text'],
    num_rows: 10000
}) 

# ENTRIES LOOK LIKE:
{'text': Value('string')} 
 {'text': 'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .'} 
 {'text': "The game began development in 2010 , carrying over a large portion of the work done on Valkyria Ch

# Step 2 (Train Baselines)

---
Installing dependancies
- gensim - word2vec models
- nltk (natural language tool kit) - stopwords removal

In [6]:
!pip install gensim nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [7]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_dataset(dataset):
    text_col = 'text' if 'text' in dataset.column_names else dataset.column_names[0]
    tokenized = []

    for i in range(len(dataset)):
        text = dataset[i][text_col]
        if not isinstance(text, str):
            continue
        tokens = [t.lower() for t in text.split() if t.isalpha() and t.lower() not in stop_words]
        if tokens:
            tokenized.append(tokens)

    return tokenized

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
tokens_small = preprocess_dataset(wikitext_small)
tokens_large = preprocess_dataset(wikitext_large)

In [9]:
def train_word2vec(tokens, model_name, vector_size=50, window=5, min_count=5, epochs=5):
    print(f"Training {model_name} ...")
    model = Word2Vec(
        sentences=tokens,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs
    )
    model.save(f"{model_name}.model")
    model.wv.save(f"{model_name}.kv")
    print(f"{model_name} saved.")
    return model

In [48]:
# explicit manual model training
model_small = train_word2vec(tokens_small, "word2vec_small")
model_large = train_word2vec(tokens_large, "word2vec_large")

Training word2vec_small ...
word2vec_small saved.
Training word2vec_large ...
word2vec_large saved.


In [None]:
# writing model to disk
model_small.save("word2vec_small.model")
model_large.save("word2vec_large.model")

In [None]:
model_small = Word2Vec.load("word2vec_small.model")
model_large = Word2Vec.load("word2vec_large.model")

In [103]:
vocab_size = len(model_small.wv)
print(f"Vocab size (learned by model):", vocab_size, '\n')

example_tokens = ['plane', 'car', 'planet', 'nurse', 'city', 'country']
for token in example_tokens:
    if token in model_small.wv:
        print(f"Top-10 similar to '{token}':", model_small.wv.most_similar(token, topn=10))
    else:
        print(f"'{token}' not in vocabulary.")

Vocab size (learned by model): 13838 

Top-10 similar to 'plane': [('immediate', 0.9978976845741272), ('observed', 0.997430682182312), ('replacement', 0.9972765445709229), ('channel', 0.9971292614936829), ('successfully', 0.9968360662460327), ('evening', 0.996701717376709), ('trip', 0.9966968297958374), ('costs', 0.9966406226158142), ('assisted', 0.996527910232544), ('registered', 0.9965097308158875)]
Top-10 similar to 'car': [('paper', 0.9985315799713135), ('painting', 0.9984121918678284), ('figure', 0.998360812664032), ('binding', 0.9983581304550171), ('identity', 0.9983425140380859), ('glass', 0.9983089566230774), ('powerful', 0.9982881546020508), ('solid', 0.9982327818870544), ('typical', 0.998124361038208), ('apparently', 0.9980954527854919)]
Top-10 similar to 'planet': [('tree', 0.997928261756897), ('depicts', 0.9977642297744751), ('typically', 0.9977415204048157), ('atmosphere', 0.9975147843360901), ('normally', 0.9975024461746216), ('whereas', 0.9973671436309814), ('enough', 0.

# Task 3

In [11]:
import numpy as np
import math

In [12]:
def mag(v):
  s = sum((e*e) for e in v)
  s = math.sqrt(s)
  return s

def cosineSimilarity(v1, v2):
  dotProd = np.dot(v1, v2)
  cos = dotProd/(mag(v1)*mag(v2))
  return cos

In [63]:
w1, w2 = 'sugar', 'approach'

v1 = model_small.wv[w1]
v2 = model_small.wv[w2]

print("small:", cosineSimilarity(v1, v2))

v1 = model_large.wv[w1]
v2 = model_large.wv[w2]

print("large:", cosineSimilarity(v1, v2))

small: 0.9862285
large: -0.17874575


# Step 4

In [14]:
!wget -O wordsim353.zip "www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip"
!unzip wordsim353.zip -d wordsim353

--2025-10-20 08:41:46--  http://www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip
Resolving www.gabrilovich.com (www.gabrilovich.com)... 173.236.137.139
Connecting to www.gabrilovich.com (www.gabrilovich.com)|173.236.137.139|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip [following]
--2025-10-20 08:41:46--  https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip
Resolving gabrilovich.com (gabrilovich.com)... 173.236.137.139
Connecting to gabrilovich.com (gabrilovich.com)|173.236.137.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23257 (23K) [application/zip]
Saving to: ‘wordsim353.zip’


2025-10-20 08:41:47 (389 KB/s) - ‘wordsim353.zip’ saved [23257/23257]

Archive:  wordsim353.zip
  inflating: wordsim353/combined.csv  
  inflating: wordsim353/set1.csv     
  inflating: wordsim353/set2.csv     
  inflating: wordsim353/combined.ta

In [15]:
import pandas as pd
from scipy.stats import spearmanr

In [64]:
df = pd.read_csv("wordsim353/combined.csv")
print("Loaded: wordsim353/combined.csv | shape:", df.shape)
print(df.head())

Loaded: wordsim353/combined.csv | shape: (353, 3)
     Word 1    Word 2  Human (mean)
0      love       sex          6.77
1     tiger       cat          7.35
2     tiger     tiger         10.00
3      book     paper          7.46
4  computer  keyboard          7.62


In [65]:
# compare vocabularies
wordsim_vocab = set(df["Word 1"].str.lower()).union(set(df["Word 2"].str.lower()))

smallModel_vocab = set(model_small.wv.key_to_index)
largeModel_vocab = set(model_large.wv.key_to_index)

sharedVocab = wordsim_vocab.intersection(largeModel_vocab)
OOV = wordsim_vocab.difference(largeModel_vocab)

print(OOV) # should be empty

set()


In [66]:
def sim_or_nan_strict(model, a, b):
    if a not in model.wv.key_to_index or b not in model.wv.key_to_index:
        return np.nan
    v1 = model.wv[a]
    v2 = model.wv[b]
    return float(cosineSimilarity(v1, v2))


In [67]:
def ws_spearman(model, df):
    sims = df.apply(lambda r: sim_or_nan_strict(model, r['Word 1'], r['Word 2']), axis=1)
    gold = df['Human (mean)']
    valid = sims.notna()


    if not valid.any():
        return np.nan, np.nan, 0, len(df), len(df)

    rho, p = spearmanr(sims[valid], gold[valid])

    used = int(valid.sum())
    total = int(len(df))
    n_oov = total - used
    return float(rho), float(p), used, total, n_oov


# alternative simpler implamentation
def applySpearman_to_WordSim(model, df):
  df = df.copy()
  df["Word 1"] = df["Word 1"].str.lower()
  df["Word 2"] = df["Word 2"].str.lower()

  similarities = df.apply(
      lambda r: model.wv.similarity(r["Word 1"], r["Word 2"]),
      axis = 1
  )

  human_standard = df["Human (mean)"].astype(float)
  rho, p_value = spearmanr(similarities, human_standard)

  return float(rho), float(p_value)

In [71]:
for label, m in [("wikitext_small", model_small), ("wikitext_large", model_large)]:
    rho, p, used, total, n_oov = ws_spearman(m, df)
    print(f"{label}: Spearman ρ = {rho}, p-value {p}")


wikitext_small: Spearman ρ = 0.11157656314685778, p-value 0.07707069233537944
wikitext_large: Spearman ρ = 0.6283015932937069, p-value 3.395172611754527e-38


Note that p-value of small model is ~22%. Indicating probability that our result is due to chance and not statistically significant.

# Step 5

In [21]:
import gensim.downloader as gensimDownloader

googleNews_model = gensimDownloader.load("word2vec-google-news-300")



In [22]:
print(googleNews_model)

KeyedVectors<vector_size=300, 3000000 keys>


Note the massive model that google trained on news data is imported as a KeyedVectors object (not a full model with functionality)

had to make a version of ws_spearman for keyedVectors instead of a full word2vec model

In [23]:
def ws_spearman_google(model, df):
    df = df.copy()
    df["Word 1"] = df["Word 1"].str.lower()
    df["Word 2"] = df["Word 2"].str.lower()

    def sim_or_nan(a, b):
        if a not in model.key_to_index or b not in model.key_to_index:
          return np.nan
        return float(model.similarity(a, b))

    similarities = df.apply(lambda r: sim_or_nan(r["Word 1"], r["Word 2"]), axis=1)
    human_standard = df["Human (mean)"].astype(float)

    mask = ~similarities.isna()
    if mask.sum() == 0:
      return np.nan, np.nan, 0, len(df), mask.sum()

    rho, p_value = spearmanr(similarities[mask], human_standard[mask])

    n_oov = len(df) - mask.sum()
    return float(rho), float(p_value), int(mask.sum()), int(len(df)), int(n_oov)


In [104]:
rho, p, used, total, n_oov = ws_spearman_google(googleNews_model, df)
print(f"googleNews_model Spearman\n ρ (rho) = {rho},\n p-value = {p:.40f}")

googleNews_model Spearman
 ρ (rho) = 0.6941224810339758,
 p-value = 0.0000000000000000000000000000000000000000


# Hyperparameter tuning
This section is for experimenting with hyperparameter configurations, and documenting results (ensure to save document outside this runtime or it will disappear).

In [25]:
parameter_configurations = [
    {"vector_size": 50, "window": 5, "min_count": 5, "epochs": 5}, # baseline
    {"vector_size": 100, "window": 5, "min_count": 5, "epochs": 5}, # larger vector size
    {"vector_size": 50, "window": 10, "min_count": 5, "epochs": 5}, # larger window
    {"vector_size": 50, "window": 5, "min_count": 2, "epochs": 5}, # lower min_count (more words)
    {"vector_size": 50, "window": 5, "min_count": 5, "epochs": 10} # (more training epochs)
]

In [26]:
import time

results = [] # store all results

In [35]:
for corpus_name, tokens in [("small", tokens_small), ("large", tokens_large)]:
  for configs in parameter_configurations:
    model_name = (
        "Word2Vec" + corpus_name +
        "-vs" + str(configs["vector_size"]) +
        "_w" + str(configs["window"]) +
        "_mc" + str(configs["min_count"]) +
        "_ep" + str(configs["epochs"])
    )
    print("Training model: ", model_name)

    start = time.time()
    model = Word2Vec(
        sentences = tokens,
        vector_size = configs["vector_size"],
        window = configs["window"],
        min_count = configs["min_count"],
        epochs = configs["epochs"]
    )

    train_time = round(time.time() - start, 2)

    rho, p, used, total, n_oov = ws_spearman(model, df)

    results.append({
        "Dataset": corpus_name,
        "Vector size": configs["vector_size"],
        "Window": configs["window"],
        "Min count": configs["min_count"],
        "Epochs": configs["epochs"],
        "Vocab size": len(model.wv),
        "Spearman rho": rho,
        "p-value": p,
        "OOV pairs": n_oov,
        "Train time (s)": train_time
    })

Training model:  Word2Vecsmall-vs50_w5_mc5_ep5
Training model:  Word2Vecsmall-vs100_w5_mc5_ep5
Training model:  Word2Vecsmall-vs50_w10_mc5_ep5
Training model:  Word2Vecsmall-vs50_w5_mc2_ep5
Training model:  Word2Vecsmall-vs50_w5_mc5_ep10
Training model:  Word2Veclarge-vs50_w5_mc5_ep5
Training model:  Word2Veclarge-vs100_w5_mc5_ep5
Training model:  Word2Veclarge-vs50_w10_mc5_ep5
Training model:  Word2Veclarge-vs50_w5_mc2_ep5
Training model:  Word2Veclarge-vs50_w5_mc5_ep10


In [39]:
results_df = pd.DataFrame(results)

display(results_df)

Unnamed: 0,Dataset,Vector size,Window,Min count,Epochs,Vocab size,Spearman rho,p-value,OOV pairs,Train time (s)
0,small,50,5,5,5,13838,0.092654,0.1424635,101,6.52
1,small,100,5,5,5,13838,0.077806,0.2183734,101,4.72
2,small,50,10,5,5,13838,0.121472,0.05412182,101,7.08
3,small,50,5,2,5,26508,0.061705,0.2859219,52,5.98
4,small,50,5,5,10,13838,0.187886,0.002749047,101,11.12
5,large,50,5,5,5,162898,0.63623,2.077819e-39,18,613.96
6,large,100,5,5,5,162898,0.650294,1.191345e-41,18,555.76
7,large,50,10,5,5,162898,0.653907,3.026365e-42,18,681.11
8,large,50,5,2,5,290287,0.635704,2.50741e-39,18,633.58
9,large,50,5,5,10,162898,0.633475,5.5355250000000004e-39,18,1154.1


In [47]:
results_df["p-value (decimal)"] = results_df["p-value"].apply(lambda p: f"{p:.20f}")
display(results_df)

Unnamed: 0,Dataset,Vector size,Window,Min count,Epochs,Vocab size,Spearman rho,p-value,OOV pairs,Train time (s),p-value (decimal)
0,small,50,5,5,5,13838,0.092654,0.1424635,101,6.52,0.1424634896457367
1,small,100,5,5,5,13838,0.077806,0.2183734,101,4.72,0.2183734026567162
2,small,50,10,5,5,13838,0.121472,0.05412182,101,7.08,0.0541218190801185
3,small,50,5,2,5,26508,0.061705,0.2859219,52,5.98,0.2859219159502586
4,small,50,5,5,10,13838,0.187886,0.002749047,101,11.12,0.0027490474051209
5,large,50,5,5,5,162898,0.63623,2.077819e-39,18,613.96,0.0
6,large,100,5,5,5,162898,0.650294,1.191345e-41,18,555.76,0.0
7,large,50,10,5,5,162898,0.653907,3.026365e-42,18,681.11,0.0
8,large,50,5,2,5,290287,0.635704,2.50741e-39,18,633.58,0.0
9,large,50,5,5,10,162898,0.633475,5.5355250000000004e-39,18,1154.1,0.0


In [40]:
results_df.to_csv("hyperparameterTuningResults.csv", index=False)

# Analogies

In [100]:
def analogy(model, w1, w2, w3):
  try:
    results = model.most_similar(positive=[w2,w3], negative=[w1], topn=5)
    print("\n", w1, " is to ", w2, " as ", w3, " is to ?")
    for rank, (word, score) in enumerate(results, 1):
      print(f"{rank}. {word} ({score:.4f})")
    return results
  except KeyError as e:
    print("word not in vocab")
    return []

Analogies

In [101]:
analogy(googleNews_model, "man", "woman", "king")
analogy(googleNews_model, "Athens", "Greece", "Rome")
analogy(googleNews_model, "reading", "read", "playing")
analogy(googleNews_model, "Greece", "souvlaki", "Italy")
analogy(googleNews_model, "airplane", "propeller", "car")


 man  is to  woman  as  king  is to ?
1. queen (0.7118)
2. monarch (0.6190)
3. princess (0.5902)
4. crown_prince (0.5499)
5. prince (0.5377)

 Athens  is to  Greece  as  Rome  is to ?
1. Italy (0.6826)
2. Sicily (0.5808)
3. Portugal (0.5467)
4. Italian (0.5194)
5. ANSA (0.5114)

 reading  is to  read  as  playing  is to ?
1. played (0.7010)
2. play (0.6676)
3. Playing (0.5651)
4. playin (0.5131)
5. toplay (0.4870)

 Greece  is to  souvlaki  as  Italy  is to ?
1. quiche_Lorraine (0.5962)
2. gelati (0.5909)
3. pesce (0.5880)
4. pizza_margherita (0.5872)
5. porchetta (0.5866)

 airplane  is to  propeller  as  car  is to ?
1. steering_wheel (0.5703)
2. front_fender (0.5417)
3. fender (0.5393)
4. Ford_Festiva (0.5320)
5. Nissan_###ZX (0.5274)


[('steering_wheel', 0.5702899694442749),
 ('front_fender', 0.5416868925094604),
 ('fender', 0.5393295884132385),
 ('Ford_Festiva', 0.5320079922676086),
 ('Nissan_###ZX', 0.5273997187614441)]

Gender bias checking

In [102]:

analogy(googleNews_model, "man", "woman", "computer_programmer")
analogy(googleNews_model, "man", "woman", "superstar")
analogy(googleNews_model, "man", "woman", "guitarist")
analogy(googleNews_model, "man", "woman", "boss")



 man  is to  woman  as  computer_programmer  is to ?
1. homemaker (0.5627)
2. housewife (0.5105)
3. graphic_designer (0.5052)
4. schoolteacher (0.4979)
5. businesswoman (0.4935)

 man  is to  woman  as  superstar  is to ?
1. megastar (0.6585)
2. diva (0.6237)
3. pop_diva (0.5787)
4. star (0.5745)
5. songstress (0.5550)

 man  is to  woman  as  guitarist  is to ?
1. vocalist (0.7626)
2. drummer (0.6976)
3. bassist (0.6961)
4. singer_guitarist (0.6777)
5. guitarist_vocalist (0.6640)

 man  is to  woman  as  boss  is to ?
1. bosses (0.5523)
2. manageress (0.4915)
3. exec (0.4594)
4. Manageress (0.4560)
5. receptionist (0.4474)


[('bosses', 0.5522644519805908),
 ('manageress', 0.49151360988616943),
 ('exec', 0.45940810441970825),
 ('Manageress', 0.4559843838214874),
 ('receptionist', 0.4474116563796997)]