# Step 1 (Environment Setup)


---


## Importing the Datasets
wikitext (*full*) - 859955 docs

wikitext (*small*) - 10000 docs

In [None]:
# Download the two pre-filtered WikiText archives (full corpus and 10k-document subset).
# -O fixes the local file name, since the URL carries query parameters after the archive name.
!wget -O wikitext-filtered-full.zip "https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1"
!wget -O wikitext-filtered-10k.zip "https://www.dropbox.com/scl/fi/ek174r3sg7qjx0aa9atop/wikitext-filtered-10k.zip?rlkey=zy6jqxv6qsc16lr9qm3ki9uhf&dl=1"

--2025-10-17 23:04:24--  https://www.dropbox.com/scl/fi/ibd4cmixckghx6hhb361c/wikitext-filtered-full.zip?rlkey=q71cebf0k5fvvwhmcntoswzhq&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uceea71edf314d203d1ecea54905.dl-eu.dropboxusercontent.com/cd/0/inline/CzaM-GcUFl25wnxZLuuZEjPle2s8cbOBXGUdauuTx4bVyCkbHbKJUfztLG3QrW5FRDVIUwJSHM88MN9fg7ti5opxoNh6rR4SszpW_0stmIH7BuoPSPh_MbPtg1K0JHjBqlKuDbSk_AEyCcpoIy8yHpzC/file?dl=1# [following]
--2025-10-17 23:04:25--  https://uceea71edf314d203d1ecea54905.dl-eu.dropboxusercontent.com/cd/0/inline/CzaM-GcUFl25wnxZLuuZEjPle2s8cbOBXGUdauuTx4bVyCkbHbKJUfztLG3QrW5FRDVIUwJSHM88MN9fg7ti5opxoNh6rR4SszpW_0stmIH7BuoPSPh_MbPtg1K0JHjBqlKuDbSk_AEyCcpoIy8yHpzC/file?dl=1
Resolving uceea71edf314d203d1ecea54905.dl-eu.dropboxusercontent.com (uceea71edf314d203d1ecea54905.dl-eu.dropbo

In [None]:
!unzip wikitext-filtered-full.zip
!unzip wikitext-filtered-10k.zip

Archive:  wikitext-filtered-full.zip
replace wikitext-filtered-full/dataset_info.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: Archive:  wikitext-filtered-10k.zip
replace wikitext-filtered-10k/dataset_info.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# datasets package provides dataset tools from hugginface
!pip install datasets
import datasets



In [None]:
from datasets import load_dataset, Dataset

def load_dataset(small_path="wikitext-filtered-10k", large_path="wikitext-filtered-full"):
    """Load the small and the full filtered WikiText corpora from disk.

    NOTE: this definition rebinds the name ``load_dataset`` that this cell also
    imports from ``datasets``; once the cell has run, ``load_dataset`` refers to
    this helper and not to the library function.

    Args:
        small_path: Directory of the saved small dataset.
        large_path: Directory of the saved full dataset.

    Returns:
        A ``(dataset_small, dataset_large)`` tuple of ``Dataset`` objects.
    """
    dataset_small = Dataset.load_from_disk(small_path)
    dataset_large = Dataset.load_from_disk(large_path)
    print("wikitext_small: {} docs, wikitext_large: {} docs".format(len(dataset_small), len(dataset_large)))
    return dataset_small, dataset_large

# Load both corpora once; every later cell reads these two notebook-level names.
wikitext_small, wikitext_large = load_dataset()

wikitext_small: 10000 docs, wikitext_large: 859955 docs


## Understanding the Dataset
Summary statistics

In [None]:
# Corpus under inspection; swap the two assignments to look at the full corpus instead.
wt = wikitext_small
#wt = wikitext_large

print('# TYPE OF THE DATASET:', '\n', type(wt))
print(wt, '\n')

# Show the schema followed by the first two records.
print('# ENTRIES LOOK LIKE:')
first_doc, second_doc = wt[0], wt[1]
print(wt.features, '\n', first_doc, '\n', second_doc, '\n')

print('# DATASET STATISTICS:')
num_docs = len(wt)
print('No. of docs:', num_docs)

# Document length = number of whitespace-separated tokens in the 'text' field.
lengths = []
for doc in wt:
    lengths.append(len(doc['text'].split()))
mean_length = sum(lengths) / len(lengths)
print('Mean doc length:', mean_length, 'words')

# TYPE OF THE DATASET: 
 <class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['text'],
    num_rows: 10000
}) 

# ENTRIES LOOK LIKE:
{'text': Value('string')} 
 {'text': 'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .'} 
 {'text': "The game began development in 2010 , carrying over a large portion of the work done on Valkyria Ch

# Step 2 (Train Baselines)

---
Installing dependencies
- gensim - word2vec models
- nltk (natural language tool kit) - stopwords removal

In [None]:
!pip install gensim nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk



In [None]:
# Fetch the NLTK stopword corpus (a download is needed before stopwords.words() can read it).
nltk.download('stopwords')
# Keep the English stopwords in a set; preprocess_dataset checks every token against it.
stop_words = set(stopwords.words('english'))

def preprocess_dataset(dataset, stopword_set=None):
    """Turn each document of ``dataset`` into a list of cleaned tokens.

    A document is split on whitespace; a token is kept only if it is purely
    alphabetic (``str.isalpha``) and, once lowercased, is not a stopword.
    Rows whose text is not a ``str`` and rows that end up with no tokens are
    skipped, so the result can be shorter than the dataset.

    Args:
        dataset: Object exposing ``column_names`` and yielding one mapping per
            row when iterated. The ``'text'`` column is used when present,
            otherwise the first column.
        stopword_set: Container of lowercase words to drop. Defaults to the
            notebook-level ``stop_words`` set, which keeps existing
            one-argument calls working unchanged.

    Returns:
        A list of token lists, one per retained document.
    """
    if stopword_set is None:
        # Resolved at call time so the notebook-level set is still the default.
        stopword_set = stop_words

    text_col = 'text' if 'text' in dataset.column_names else dataset.column_names[0]
    tokenized = []

    # Iterate rows directly rather than indexing dataset[i] once per document.
    for row in dataset:
        text = row[text_col]
        if not isinstance(text, str):
            continue
        tokens = []
        for raw in text.split():
            if not raw.isalpha():
                continue
            lowered = raw.lower()  # lowercase once, reuse for the check and the output
            if lowered not in stopword_set:
                tokens.append(lowered)
        if tokens:
            tokenized.append(tokens)

    return tokenized

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Tokenize both corpora; each result is a list of token lists, one per retained document.
tokens_small = preprocess_dataset(wikitext_small)
tokens_large = preprocess_dataset(wikitext_large)

In [None]:
def train_word2vec(tokens, model_name, vector_size=50, window=5, min_count=5, epochs=5):
    """Train a Word2Vec model on ``tokens`` and save it under ``model_name``.

    Two files are written to the working directory: ``<model_name>.model``
    (the full model) and ``<model_name>.kv`` (the word vectors only).

    Args:
        tokens: Training sentences, passed to ``Word2Vec`` as ``sentences``.
        model_name: Base name used for the saved files and the progress messages.
        vector_size, window, min_count, epochs: Forwarded unchanged to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    print(f"Training {model_name} ...")

    hyperparams = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
    )
    w2v = Word2Vec(sentences=tokens, **hyperparams)

    # Save the full model first, then the standalone keyed vectors.
    full_model_path = f"{model_name}.model"
    vectors_path = f"{model_name}.kv"
    w2v.save(full_model_path)
    w2v.wv.save(vectors_path)

    print(f"{model_name} saved.")
    return w2v

In [None]:
# Train one model per corpus with the function's default hyperparameters;
# each call also writes <name>.model and <name>.kv to the working directory.
model_small = train_word2vec(tokens_small, "word2vec_small")
model_large = train_word2vec(tokens_large, "word2vec_large")

Training word2vec_small ...
word2vec_small saved.
Training word2vec_large ...
word2vec_large saved.


In [None]:
# Write the models to disk.
# NOTE(review): train_word2vec() already saves to these same file names, so this
# overwrites the files written during training; it looks redundant — confirm before removing.
model_small.save("word2vec_small.model")
model_large.save("word2vec_large.model")

In [17]:
# Rebind both names to the models read back from the saved .model files.
model_small = Word2Vec.load("word2vec_small.model")
model_large = Word2Vec.load("word2vec_large.model")

In [18]:
# Inspect the small model: vocabulary size, then nearest neighbours of a few probe words.
small_vectors = model_small.wv
vocab_size = len(small_vectors)
print(f"Vocab size (learned by model):", vocab_size, '\n')

example_tokens = ['plane', 'car', 'planet', 'nurse', 'city', 'country']
for token in example_tokens:
    # Report out-of-vocabulary words instead of querying them.
    if token not in small_vectors:
        print(f"'{token}' not in vocabulary.")
        continue
    neighbours = small_vectors.most_similar(token, topn=10)
    print(f"Top-10 similar to '{token}':", neighbours)

Vocab size (learned by model): 13838 

Top-10 similar to 'plane': [('channel', 0.9976177215576172), ('probe', 0.997555136680603), ('threatened', 0.9975177049636841), ('briefly', 0.9974639415740967), ('recovery', 0.9973675608634949), ('completion', 0.9973640441894531), ('disaster', 0.997336208820343), ('immediate', 0.9971866011619568), ('repairs', 0.9970893859863281), ('arrive', 0.9970740675926208)]
Top-10 similar to 'car': [('bring', 0.9986448287963867), ('demon', 0.9985685348510742), ('becomes', 0.9983323812484741), ('possibility', 0.9982467889785767), ('view', 0.9980642795562744), ('reality', 0.9980185627937317), ('leaves', 0.9980082511901855), ('direct', 0.997999906539917), ('fully', 0.9979443550109863), ('owen', 0.9979330897331238)]
Top-10 similar to 'planet': [('slightly', 0.9979651570320129), ('indicated', 0.9977326393127441), ('eye', 0.9974536299705505), ('classification', 0.9972626566886902), ('capable', 0.997245728969574), ('engines', 0.9970810413360596), ('offers', 0.99705696

# Step 3 (Cosine Similarity)

In [19]:
import numpy as np
import math

In [20]:
def mag(v):
    """Return the Euclidean norm of ``v`` as a Python float.

    Each element is converted to a Python float (double precision) before it
    is squared, and the squares are added with ``math.fsum``. Squaring and
    summing in the input's own dtype (e.g. float32 word vectors) loses
    precision and can overflow to ``inf``.

    Args:
        v: Iterable of real numbers (list, tuple, 1-D NumPy array, ...).

    Returns:
        The norm as a ``float``; ``0.0`` for an empty or all-zero input.
    """
    return math.sqrt(math.fsum(float(e) * float(e) for e in v))


def cosineSimilarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Args:
        v1, v2: 1-D sequences of real numbers with the same length.

    Returns:
        A Python float in ``[-1.0, 1.0]``, or ``nan`` when either vector has
        zero magnitude (the cosine is undefined there).

    Raises:
        ValueError: From ``np.dot`` when the two shapes are not aligned.
    """
    # Work in float64 so the dot product and the norms use the same precision.
    a = np.asarray(v1, dtype=np.float64)
    b = np.asarray(v2, dtype=np.float64)

    denom = mag(a) * mag(b)
    if denom == 0.0:
        # Same value a 0/0 division would give, but explicit and without a
        # NumPy runtime warning.
        return float('nan')

    cos = float(np.dot(a, b)) / denom

    # Rounding can push the ratio marginally outside [-1, 1]; clamp it.
    # Plain comparisons are used so that a nan result stays nan.
    if cos > 1.0:
        cos = 1.0
    elif cos < -1.0:
        cos = -1.0
    return cos

In [None]:
# Look up the vocabulary index of a few probe words in the small model.
# key_to_index is indexed directly, so a word missing from the vocabulary raises KeyError.
# NOTE(review): these indices are not used in the visible part of the notebook, and the
# next cell reads vectors from model_large rather than model_small — confirm they are still needed.
planeIndex = model_small.wv.key_to_index['plane']
carIndex = model_small.wv.key_to_index['car']
planetIndex = model_small.wv.key_to_index['planet']
sunIndex = model_small.wv.key_to_index['sun']
passengerIndex = model_small.wv.key_to_index['passenger']

In [21]:
# Take the vectors for 'planet' and 'sun' from the large model.
v1 = model_large.wv['planet']
v2 = model_large.wv['sun']

# Bare last expression: the notebook displays the similarity as the cell output.
cosineSimilarity(v1, v2)

0.6430315907730422

# Step 4 (WordSim-353 Evaluation)

In [23]:
!wget -O wordsim353.zip "www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip"
!unzip wordsim353.zip -d wordsim353

--2025-10-17 23:52:10--  http://www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip
Resolving www.gabrilovich.com (www.gabrilovich.com)... 173.236.137.139
Connecting to www.gabrilovich.com (www.gabrilovich.com)|173.236.137.139|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip [following]
--2025-10-17 23:52:10--  https://gabrilovich.com/resources/data/wordsim353/wordsim353.zip
Resolving gabrilovich.com (gabrilovich.com)... 173.236.137.139
Connecting to gabrilovich.com (gabrilovich.com)|173.236.137.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23257 (23K) [application/zip]
Saving to: ‘wordsim353.zip’


2025-10-17 23:52:10 (842 KB/s) - ‘wordsim353.zip’ saved [23257/23257]

Archive:  wordsim353.zip
replace wordsim353/combined.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: wordsim353/combined.csv  
replace wordsim353/set1.csv? [y]es, [n

In [24]:
import pandas as pd
from scipy.stats import spearmanr

In [26]:
!wget -q -O combined.zip "https://raw.githubusercontent.com/kavgan/nlp-text-mining-working-examples/master/wordSimilarity/resources/wordsim353/combined.csv"

!unzip -o combined.zip

import pandas as pd

df = pd.read_csv("wordsim353/combined.csv")  # columns: 'Word 1', 'Word 2', 'Human (mean)'
print("Loaded: wordsim353/combined.csv | shape:", df.shape)
print(df.head())

Archive:  combined.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of combined.zip or
        combined.zip.zip, and cannot find combined.zip.ZIP, period.
Loaded: wordsim353/combined.csv | shape: (353, 3)
     Word 1    Word 2  Human (mean)
0      love       sex          6.77
1     tiger       cat          7.35
2     tiger     tiger         10.00
3      book     paper          7.46
4  computer  keyboard          7.62
