In [None]:
# Download the two zipped WikiText archives used below: the full corpus and the 10k-document sample.
# `-O` fixes the local file name so the unzip cell can refer to it.
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"

In [None]:
!unzip wikitext-filtered-full.zip
!unzip wikitext-filtered-10k.zip

In [None]:
# !pip install datasets
import datasets

In [1]:
from datasets import Dataset

def load_dataset():
  """Load both WikiText datasets from their on-disk directories.

  Reads ``wikitext-filtered-10k`` and ``wikitext-filtered-full`` (paths
  relative to the working directory) with ``Dataset.load_from_disk``,
  prints how many documents each one holds, and returns them.

  Returns:
    tuple: ``(dataset_small, dataset_large)``.
  """
  small_dir, large_dir = "wikitext-filtered-10k", "wikitext-filtered-full"

  dataset_small = Dataset.load_from_disk(small_dir)
  dataset_large = Dataset.load_from_disk(large_dir)

  n_small, n_large = len(dataset_small), len(dataset_large)
  print("wikitext_small: {} docs, wikitext_large: {} docs".format(n_small, n_large))
  return dataset_small, dataset_large

wikitext_small, wikitext_large = load_dataset()

  from .autonotebook import tqdm as notebook_tqdm


wikitext_small: 10000 docs, wikitext_large: 859955 docs


In [2]:
def normalize_text(text: str) -> str:
    """Return ``text`` stripped of punctuation, whitespace-collapsed and lowercased.

    Steps, in order:
      1. keep only characters that are alphanumeric or whitespace;
      2. collapse every run of whitespace into a single space and trim the ends;
      3. lowercase the result.
    """
    kept_chars = [ch for ch in text if ch.isalnum() or ch.isspace()]
    # str.split() with no argument splits on any whitespace run and drops empties,
    # which is what collapses and trims the spacing.
    words = "".join(kept_chars).split()
    return " ".join(words).lower()

In [3]:
START_TOKEN = "<s>"
END_TOKEN = "</s>"

def read_corpus(files) -> list[list[str]]:
    """Tokenise every document of a dataset into a list of normalised words.

    Args:
        files: Iterable of records that are indexed as ``record['text']``.

    Returns:
        One token list per record: ``START_TOKEN``, then the words of
        ``normalize_text(record['text'])``, then ``END_TOKEN``.  A record
        whose text normalises to the empty string yields just
        ``[START_TOKEN, END_TOKEN]``.
    """
    # Build the token list directly instead of formatting one string and
    # splitting it on " ": with empty normalised text that approach produces
    # "<s>  </s>" and therefore a spurious "" token between the markers.
    return [
        [START_TOKEN, *normalize_text(record["text"]).split(), END_TOKEN]
        for record in files
    ]

In [4]:
wikitext_smallNormalToken = read_corpus(wikitext_small)
print(wikitext_smallNormalToken[0])

['<s>', 'senjō', 'no', 'valkyria', '3', 'unrecorded', 'chronicles', 'japanese', '戦場のヴァルキュリア3', 'lit', 'valkyria', 'of', 'the', 'battlefield', '3', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', 'is', 'a', 'tactical', 'role', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'mediavision', 'for', 'the', 'playstation', 'portable', 'released', 'in', 'january', '2011', 'in', 'japan', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', 'employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', 'time', 'gameplay', 'as', 'its', 'predecessors', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', 'nameless', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', 'war', 'who', 'perform', 'secret', 'black', 'operations', 'and', 'are', 'pitted', 'against', 'the', 'imperial', 'unit', 'calamaty', 'raven', '</s>']


In [5]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenised 10k-document sample (50-dimensional vectors,
# context window of 5, min_count=5) and persist the model to disk.
# NOTE(review): no `seed` is passed and workers=4, so vectors (and every score
# derived from them) are presumably not reproducible run-to-run — confirm
# against the gensim documentation and consider seed=<int>, workers=1.
model1 = Word2Vec(sentences=wikitext_smallNormalToken, vector_size=50, window=5, min_count=5, workers=4)
model1.save("word2vec.model")

Importing WordSim-353

In [6]:
!wget -O wordsim353.zip "https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip"

--2025-10-19 11:13:05--  https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip
Resolving gabrilovich.com (gabrilovich.com)... 173.236.137.139
Connecting to gabrilovich.com (gabrilovich.com)|173.236.137.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23257 (23K) [application/zip]
Saving to: ‘wordsim353.zip’


2025-10-19 11:13:06 (252 KB/s) - ‘wordsim353.zip’ saved [23257/23257]



In [7]:
!unzip wordsim353.zip

Archive:  wordsim353.zip
  inflating: combined.csv            
  inflating: set1.csv                
  inflating: set2.csv                
  inflating: combined.tab            
  inflating: set1.tab                
  inflating: set2.tab                
  inflating: instructions.txt        


In [8]:
import pandas as pd

In [9]:
wordsim = pd.read_csv("combined.csv")
wordsim.head()

Unnamed: 0,Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


Step 3 Cosine Similarity

In [10]:
import numpy as np

In [11]:
def cosine_similarity(model, pairs=None):
  """Compute the model's similarity score for every word pair.

  Args:
    model: Either an object exposing its vectors as ``model.wv`` (e.g. a
      trained ``Word2Vec``) or the vector store itself.  The store must
      support ``word in store`` and ``store.similarity(w1, w2)``.
    pairs: Frame with ``"Word 1"`` and ``"Word 2"`` columns.  Defaults to
      the notebook-level ``wordsim`` frame, which keeps existing
      ``cosine_similarity(model)`` calls working unchanged.

  Returns:
    list: One entry per row of ``pairs``, in row order: the similarity
    score, or ``np.nan`` when either word is missing from the vocabulary.
  """
  # Fall back to the object itself when it has no `.wv` attribute.
  vectors = getattr(model, "wv", model)
  if pairs is None:
    pairs = wordsim

  scores = []
  # Iterate the two columns positionally: label-based `.loc[i]` lookups only
  # work on a default 0..n-1 index and build a full row Series per pair.
  for w1, w2 in zip(pairs["Word 1"], pairs["Word 2"]):
    if w1 in vectors and w2 in vectors:
      scores.append(vectors.similarity(w1, w2))
    else:
      scores.append(np.nan)
  return scores

In [12]:
model1Cosine = cosine_similarity(model1)

In [13]:
# Question A: similarity scores of the four probe pairs under model1 (small-corpus model).
planeCar = model1.wv.similarity("plane", "car")
print(planeCar)

planetSun = model1.wv.similarity("planet", "sun")
print(planetSun)

cupArticle = model1.wv.similarity("cup", "article")
print(cupArticle)

sugarApproach = model1.wv.similarity("sugar", "approach")
print(sugarApproach)

0.8853891
0.8932499
0.54733366
0.7980922


Step 4 Evaluate Semantic Relatedness

In [14]:
import scipy.stats as scistats

In [15]:
scistats.spearmanr(wordsim["Human (mean)"], model1Cosine, nan_policy = 'omit')

SignificanceResult(statistic=0.1203052723914138, pvalue=0.05649148344997391)

Repeating for wiki_large

In [16]:
#Normalising and Tokenising
wikitext_largeNormalToken = read_corpus(wikitext_large)
print(wikitext_largeNormalToken[0])

#Training new word2vec model with larger dataset
model2 = Word2Vec(sentences=wikitext_largeNormalToken, vector_size=50, window=5, min_count=5, workers=4)
model2.save("word2vecLarge.model")

['<s>', 'senjō', 'no', 'valkyria', '3', 'unrecorded', 'chronicles', 'japanese', '戦場のヴァルキュリア3', 'lit', 'valkyria', 'of', 'the', 'battlefield', '3', 'commonly', 'referred', 'to', 'as', 'valkyria', 'chronicles', 'iii', 'outside', 'japan', 'is', 'a', 'tactical', 'role', 'playing', 'video', 'game', 'developed', 'by', 'sega', 'and', 'mediavision', 'for', 'the', 'playstation', 'portable', 'released', 'in', 'january', '2011', 'in', 'japan', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'valkyria', 'series', 'employing', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', 'time', 'gameplay', 'as', 'its', 'predecessors', 'the', 'story', 'runs', 'parallel', 'to', 'the', 'first', 'game', 'and', 'follows', 'the', 'nameless', 'a', 'penal', 'military', 'unit', 'serving', 'the', 'nation', 'of', 'gallia', 'during', 'the', 'second', 'europan', 'war', 'who', 'perform', 'secret', 'black', 'operations', 'and', 'are', 'pitted', 'against', 'the', 'imperial', 'unit', 'calamaty', 'raven', '</s>']


In [17]:
model2Cosine = cosine_similarity(model2)

In [18]:
scistats.spearmanr(wordsim["Human (mean)"], model2Cosine, nan_policy = 'omit')

SignificanceResult(statistic=0.6074315423385077, pvalue=3.653239708480032e-35)

In [19]:
# Question A (large): the same four probe pairs scored under model2 (full-corpus model).
# Note: this rebinds the planeCar / planetSun / cupArticle / sugarApproach names set by the small-model cell.
planeCar = model2.wv.similarity("plane", "car")
print(planeCar)

planetSun = model2.wv.similarity("planet", "sun")
print(planetSun)

cupArticle = model2.wv.similarity("cup", "article")
print(cupArticle)

sugarApproach = model2.wv.similarity("sugar", "approach")
print(sugarApproach)

0.6524783
0.63540334
0.10199884
-0.14861105


Step 5 Pre Trained Model

In [20]:
import gensim.downloader as downloader

In [21]:
preTrainedModel = downloader.load("word2vec-google-news-300")



In [22]:
def cosine_similarity_pretrained(model):
    """Score every WordSim-353 pair with a pre-trained vector store.

    ``model`` is queried directly (``word in model`` and
    ``model.similarity``).  Rows are read from the notebook-level
    ``wordsim`` frame by integer label ``0 .. len(wordsim) - 1``.

    Returns:
        list: One entry per row, in row order: the similarity score, or
        ``np.nan`` when either word of the pair is not in ``model``.
    """

    def _score(row_label):
        row = wordsim.loc[row_label]
        first, second = row.get("Word 1"), row.get("Word 2")
        if first in model and second in model:
            return model.similarity(first, second)
        return np.nan

    return [_score(row_label) for row_label in range(len(wordsim))]

In [23]:
preTrainedCosine = cosine_similarity_pretrained(preTrainedModel)

In [24]:
scistats.spearmanr(wordsim["Human (mean)"], preTrainedCosine, nan_policy = 'omit')

SignificanceResult(statistic=0.7000166486272194, pvalue=2.8686666605142608e-53)

In [25]:
# Hyperparameter sweep on the large corpus.
# Each tuple is (window, vector_size): windows 5/10/15 crossed with vector sizes 50/100/200.
params = [
    (5, 50), (10, 50), (15, 50),
    (5, 100), (10, 100), (15, 100),
    (5, 200), (10, 200), (15, 200)
]
print("Large Model Hyper-parameter Test")
for window, vector_size in params:
    # Train a fresh model per configuration; `model` is overwritten on each iteration, so only the last one survives the loop.
    model = Word2Vec(sentences=wikitext_largeNormalToken, vector_size=vector_size, window=window, min_count=5, workers=4)
    # Spearman rank correlation against the human scores; pairs scored as NaN are dropped by nan_policy='omit'.
    score = scistats.spearmanr(wordsim["Human (mean)"], cosine_similarity(model), nan_policy='omit')
    print(f'Window={window}, Vector={vector_size}: {score}')

Large Model Hyper-parameter Test
Window=5, Vector=50: SignificanceResult(statistic=0.6118080510433198, pvalue=8.821653866113175e-36)
Window=5, Vector=50: SignificanceResult(statistic=0.6118080510433198, pvalue=8.821653866113175e-36)
Window=10, Vector=50: SignificanceResult(statistic=0.6338436494812426, pvalue=4.8588282405768905e-39)
Window=10, Vector=50: SignificanceResult(statistic=0.6338436494812426, pvalue=4.8588282405768905e-39)
Window=15, Vector=50: SignificanceResult(statistic=0.6489743805643045, pvalue=1.9562697793877926e-41)
Window=15, Vector=50: SignificanceResult(statistic=0.6489743805643045, pvalue=1.9562697793877926e-41)
Window=5, Vector=100: SignificanceResult(statistic=0.6279410603944227, pvalue=3.8476799879517744e-38)
Window=5, Vector=100: SignificanceResult(statistic=0.6279410603944227, pvalue=3.8476799879517744e-38)
Window=10, Vector=100: SignificanceResult(statistic=0.6687969868900207, pvalue=8.697439763044538e-45)
Window=10, Vector=100: SignificanceResult(statistic=0

In [27]:
# Hyperparameter sweep on the small corpus (same grid as the large-corpus sweep).
# Each tuple is (window, vector_size): windows 5/10/15 crossed with vector sizes 50/100/200.
params = [
    (5, 50), (10, 50), (15, 50),
    (5, 100), (10, 100), (15, 100),
    (5, 200), (10, 200), (15, 200)
]
print("Small Model Hyper-parameter Test")
for (window, vector_size) in params:
    # Train a fresh model per configuration; `model` is overwritten on each iteration, so only the last one survives the loop.
    model = Word2Vec(sentences=wikitext_smallNormalToken, vector_size=vector_size, window=window, min_count=5, workers=4)
    # Spearman rank correlation against the human scores; pairs scored as NaN are dropped by nan_policy='omit'.
    score = scistats.spearmanr(wordsim["Human (mean)"], cosine_similarity(model), nan_policy='omit')
    print(f'Window={window}, Vector={vector_size}: {score}')


Small Model Hyper-parameter Test
Window=5, Vector=50: SignificanceResult(statistic=0.0993008784103129, pvalue=0.11585684219911782)
Window=5, Vector=50: SignificanceResult(statistic=0.0993008784103129, pvalue=0.11585684219911782)
Window=10, Vector=50: SignificanceResult(statistic=0.15408177770011589, pvalue=0.014347198561918623)
Window=10, Vector=50: SignificanceResult(statistic=0.15408177770011589, pvalue=0.014347198561918623)
Window=15, Vector=50: SignificanceResult(statistic=0.20274245701573984, pvalue=0.001211709851426948)
Window=15, Vector=50: SignificanceResult(statistic=0.20274245701573984, pvalue=0.001211709851426948)
Window=5, Vector=100: SignificanceResult(statistic=0.05373011717391517, pvalue=0.3957075259348354)
Window=5, Vector=100: SignificanceResult(statistic=0.05373011717391517, pvalue=0.3957075259348354)
Window=10, Vector=100: SignificanceResult(statistic=0.15254300524566322, pvalue=0.015362988294003371)
Window=10, Vector=100: SignificanceResult(statistic=0.1525430052456

In [None]:
def getTopKAnalogyResult(base1, analogy1, base2, modelVW, k=5):
    """Answer the analogy "base1 is to analogy1 as base2 is to ___".

    The query vector is ``modelVW[analogy1] - modelVW[base1] + modelVW[base2]``;
    its nearest neighbours are fetched with ``modelVW.similar_by_vector``.

    Args:
        base1, analogy1, base2: The three words of the analogy.
        modelVW: Vector store supporting ``word in modelVW``,
            ``modelVW[word]`` and ``similar_by_vector(vector, topn=...)``.
        k: Maximum number of answers to return.

    Returns:
        list: Up to ``k`` neighbour words in ranked order, with the three
        input words removed.

    Raises:
        ValueError: If any of the three words is missing from ``modelVW``.
    """
    query_words = (base1, analogy1, base2)
    for word in query_words:
        if word not in modelVW:
            raise ValueError("One or more words not in vocabulary.")

    target = modelVW[analogy1] - modelVW[base1] + modelVW[base2]

    # Ask for three extra neighbours: at most three results can be the input
    # words themselves, so k answers remain after they are dropped.
    neighbours = modelVW.similar_by_vector(target, topn=k + 3)

    answers = []
    for candidate, _similarity in neighbours:
        if candidate in query_words:
            continue
        answers.append(candidate)
    return answers[:k]

In [40]:
# man is to woman as king is to ___?
print(getTopKAnalogyResult("man", "woman", "king", preTrainedModel))
# Athens is to Greece as Rome is to ___?
print(getTopKAnalogyResult("Athens", "Greece", "Rome", preTrainedModel))
# reading is to read as playing is to ___? 
print(getTopKAnalogyResult("reading", "read", "playing", preTrainedModel))
# Greece is to souvlaki as Italy is to ___? 
print(getTopKAnalogyResult("Greece", "souvlaki", "Italy", preTrainedModel))
# airplane is to propeller as car is to ___? 
print(getTopKAnalogyResult("airplane", "propeller", "car", preTrainedModel))

['queen', 'monarch', 'princess', 'crown_prince', 'prince']
['Italy', 'Sicily', 'Portugal', 'Italian', 'ANSA']
['played', 'play', 'Playing', 'playin', 'toplay']
['quiche_Lorraine', 'banh_mi_sandwiches', 'porchetta', 'veal_parmigiana', 'panino']
['steering_wheel', 'front_fender', 'fender', 'brake_rotor', 'headlight']


In [45]:
# man is to woman as computer programmer is to ___?
# NOTE(review): the question asks about "computer programmer", but the query word is
# "programmer". The printed result for this cell includes 'computer_programmer', so that
# token exists in the vocabulary — confirm which query word is intended.
print(getTopKAnalogyResult("man", "woman", "programmer", preTrainedModel))
# man is to woman as superstar is to ___?
print(getTopKAnalogyResult("man", "woman", "superstar", preTrainedModel))
# man is to woman as guitarist is to ___?
print(getTopKAnalogyResult("man", "woman", "guitarist", preTrainedModel))
# man is to woman as boss is to ___?
print(getTopKAnalogyResult("man", "woman", "boss", preTrainedModel))


['programmers', 'computer_programmer', 'coder', 'Programmer', 'programer']
['megastar', 'diva', 'star', 'pop_diva', 'songstress']
['vocalist', 'drummer', 'bassist', 'singer_guitarist', 'guitarist_vocalist']
['bosses', 'manageress', 'coworker', 'receptionist', 'exec']


# Response Section
a.  What are the cosine similarity scores for the following pairs: 

-  plane / car 
-  planet / sun 
-  cup / article 
-  sugar / approach

---

wikitext-small

plane / car
0.8853891

planet / sun
0.8932499

cup/article
0.54733366

sugar / approach
0.7980922


wikitext-large

plane / car
0.6524783

planet / sun
0.63540334

cup/article
0.10199884

sugar / approach
-0.14861105

---

b.  What is the value of the Spearman correlation coefficients computed in Step 4?

---

wikitext-small
Value = .1203

wikitext-large
Value = .60743

---

c.  How do you interpret each coefficient value with respect to the word similarity task? 
Are the coefficient values for the two vector space models you created different, and 
if so, why?

---

For the word similarity task, the coefficient measures how closely the model's ranking of word-pair similarities agrees with the ranking that humans gave to the same pairs. A high coefficient means that the model computes similarities in a way that resembles how our group of humans perceives the similarity of the given words. The value of .1203 for the small wikitext vector space model and .60743 for the large wikitext vector space model is therefore the rank correlation between each model's scores and the human control group: very weak agreement for the small model and moderately strong agreement for the large one.

These values differ because of the reduction in bias that occurs as you gain more data from a larger corpus. When less data is available, word co-occurrences that are not representative of the language as a whole may occur and skew the vectors, whereas in a larger corpus individual abnormal co-occurrences have less impact on the learned similarities.

---

d.  What is the value of the Spearman correlation coefficient and how do you interpret it?

---

pretrained
Value = .7000

You can interpret this correlation coefficient to mean that there is a moderately strong positive correlation (about .70) between the model's ranking of word similarities and that of the human control group.

---

e.  Create a table of results that summarises your experiments. 
Tips: In case you have 
run several experiments with different hyperparameters choose to present the most 
significant. It might be worth presenting more than one experiment with only a single 
hyperparameter change if you want to emphasise a striking difference worth 
discussing. Finally, apart from the Spearman correlation coefficients, make sure you 
also include the most significant hyperparameters as separate columns (for example, 
see Table 2 from Merity et al., 2016).

---

_Table formatting generated with help from ChatGPT_

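*All experiments use a minimum word count of 5 in the corpus (`min_count=5`) when building the vocabulary. Word2Vec training was not seeded, so scores vary slightly between runs; the small-corpus values below come from a different run than the cell output shown earlier in the notebook.*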

### Large Corpus

| Vector Size | Window Size = 5 | Window Size = 10 | Window Size = 15 |
|--------------|-----------------|------------------|------------------|
| 50           | 0.6118          | 0.6338           | 0.6490           |
| 100          | 0.6279          | 0.6688           | 0.6742           |
| 200          | 0.6274          | 0.6679           | 0.6802           |



### Small Corpus

| Vector Size | Window Size = 5 | Window Size = 10 | Window Size = 15 |
|--------------|-----------------|------------------|------------------|
| 50           | 0.1213          | 0.1523           | 0.1939           |
| 100          | 0.0625          | 0.1443           | 0.1792           |
| 200          | 0.1056          | 0.1580           | 0.1714           |


### Combined View

| Dataset | Vector Size | Window Size | Coefficient Score |
|----------|--------------|--------------|--------------------|
| Large    | 50           | 5            | 0.6118             |
| Large    | 50           | 10           | 0.6338             |
| Large    | 50           | 15           | 0.6490             |
| Large    | 100          | 5            | 0.6279             |
| Large    | 100          | 10           | 0.6688             |
| Large    | 100          | 15           | 0.6742             |
| Large    | 200          | 5            | 0.6274             |
| Large    | 200          | 10           | 0.6679             |
| Large    | 200          | 15           | 0.6802             |
| Small    | 50           | 5            | 0.1213             |
| Small    | 50           | 10           | 0.1523             |
| Small    | 50           | 15           | 0.1939             |
| Small    | 100          | 5            | 0.0625             |
| Small    | 100          | 10           | 0.1443             |
| Small    | 100          | 15           | 0.1792             |
| Small    | 200          | 5            | 0.1056             |
| Small    | 200          | 10           | 0.1580             |
| Small    | 200          | 15           | 0.1714             |

---

f.  Using the table of results from your answer to question g., write a short discussion section that answers the following questions:  
i.  Does a bigger corpus yield better representations?   
ii.  Does a bigger vocabulary yield better representations?  
iii.  Do bigger word vectors yield better representations?  
iv.  Does a bigger context window yield better representations? 
v.  Step 6 requires you to look for the best combination of hyperparameters using 
the same two datasets for evaluation. Is this a good practice?

---

There are several trends and conclusions that we can draw from the data collected in these experiments. Firstly, training a Word2Vec model on a larger corpus generally leads to much more accurate representations, as seen in the massively higher coefficient scores for all of our large-corpus experiments compared to the small-corpus ones. That being said, a larger vocabulary doesn't necessarily lead to better representations, as for the most part extremely rare words don't tend to influence our understanding of most words outside of very specific contexts.

Moving on to the hyperparameter analysis, we see several factors at play in our results. Firstly, larger word vectors do seem to give a marginal improvement on the large dataset, although with diminishing returns as we increase to 200 components per vector. A notable exception is our small dataset, which gives its best results with a mere 50 components. This makes sense: with less training data we are less likely to capture the specific nuances of words, so the extra capacity of the larger vectors is not needed. Moving on to the window hyperparameter, we see a similar trend in which larger windows give better results. As we increase the number of words in the context window, we see a marginal improvement in the correlation scores for these datasets. It should be noted, however, that as the window size increases we capture less of the semantic meaning of individual words and more of the overall context of the document.

Lastly, it should be understood that when searching for general purpose representations of words, you should avoid looking for best combinations of hyperparameters based on two datasets. While we see positive results from certain combinations of hyperparameters for these two datasets, it cannot be concluded that these hyperparameters are the best ones for any arbitrary dataset.

---

g.  What are the top-5 analogies for the following configurations: 
-  man is to woman as king is to ___? 
-  Athens is to Greece as Rome is to ___? 
-  reading is to read as playing is to ___? 
-  Greece is to souvlaki as Italy is to ___? 
-  airplane is to propeller as car is to ___? 
Is the top-1 answer always the “correct”? What about the rest of the results?

---

man is to woman as king is to ___?
['queen', 'monarch', 'princess', 'crown_prince', 'prince']

Athens is to Greece as Rome is to ___?
['Italy', 'Sicily', 'Portugal', 'Italian', 'ANSA']

reading is to read as playing is to ___?
['played', 'play', 'Playing', 'playin', 'toplay']

Greece is to souvlaki as Italy is to ___?
['quiche_Lorraine', 'banh_mi_sandwiches', 'porchetta', 'veal_parmigiana', 'panino']

airplane is to propeller as car is to ___?
['steering_wheel', 'front_fender', 'fender', 'brake_rotor', 'headlight']

The top-1 answer is not always correct, as sometimes context and differing meanings can make other answers just as valid. Take into consideration 'reading is to read as playing is to _'. In this analogy 'read' can be either present or past tense, so both 'played' and 'play' are valid answers. The rest of the results are generally somewhat correct or related, but the top-1 answer generally does a good job of completing the analogies.

---

h.  What are the top-5 analogies for the following configurations? Can you identify any 
gender-based stereotypes? Briefly discuss your findings: 
-  man is to woman as computer programmer is to ___? 
-  man is to woman as superstar is to ___? 
-  man is to woman as guitarist is to ___? 
-  man is to woman as boss is to ___?

---

man is to woman as computer programmer is to ___?
['programmers', 'computer_programmer', 'coder', 'Programmer', 'programer']

man is to woman as superstar is to ___?
['megastar', 'diva', 'star', 'pop_diva', 'songstress']

man is to woman as guitarist is to ___?
['vocalist', 'drummer', 'bassist', 'singer_guitarist', 'guitarist_vocalist']

man is to woman as boss is to ___?
['bosses', 'manageress', 'coworker', 'receptionist', 'exec']


There are a few gender-based stereotypes present in these results, such as 'songstress', 'vocalist', 'coworker', and 'receptionist'. All of these terms are related to the prompt, but tend to carry connotations that are held in lower regard than the original title. This goes to show how, in the era of AI, we must be careful not to propagate these stereotypes unconsciously in our language, as doing so can influence and bias important systems in ways that disadvantage certain groups.

---
