In [1]:
!pip install --upgrade gensim
!pip install smart_open==5.2.1




In [2]:
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, FastText
from gensim.models.callbacks import CallbackAny2Vec
import sys, time
from csv import writer

class Progress(CallbackAny2Vec):
    def __init__(self, name, total):
        self.name = name
        self.total = total
        self.epoch = 0
        self.start = time.time()
    def on_epoch_begin(self, model):
        self.epoch += 1
        print(f"\n{self.name} → Epoch {self.epoch}/{self.total} started...")
        self.epoch_time = time.time()
    def on_epoch_end(self, model):
        e = time.time() - self.epoch_time
        t = time.time() - self.start
        sys.stdout.write(f"\r{self.name} → Epoch {self.epoch}/{self.total} completed ({e:.1f}s, total {t:.1f}s)")
        sys.stdout.flush()
    def on_train_end(self, model):
        print(f"\n{self.name} training complete!\n")

data = load_dataset("lucadiliello/newsqa")
docs = []
for part in ['train', 'validation']:
    for item in data[part]:
        docs.append(str(item['context']))
        docs.append(str(item['question']))
        for ans in item['answers']:
            if isinstance(ans, list):
                for a in ans:
                    docs.append(str(a))
            else:
                docs.append(str(ans))

def clean(txt):
    txt = txt.lower()
    txt = re.sub(r'[^a-z0-9\s]', '', txt)
    return txt

docs = [clean(t) for t in docs]
tokens = [word_tokenize(t) for t in docs]

epochs = 5
print("\nTraining Word2Vec...")
w2v = Word2Vec(sentences=tokens, vector_size=150, window=4, min_count=2, workers=4, sg=1, epochs=epochs, callbacks=[Progress("Word2Vec", epochs)])
with open("w2v_embeddings.csv", "w", newline="") as f:
    w = writer(f)
    for i, word in enumerate(w2v.wv.index_to_key):
        w.writerow([word]+w2v.wv[word].tolist())
        if i % max(1, len(w2v.wv)//100) == 0:
            sys.stdout.write(f"\rWord2Vec saving progress: {int(i/len(w2v.wv)*100)}%")
            sys.stdout.flush()
    print("\rWord2Vec saving progress: 100%")

print("\nTraining FastText...")
ft = FastText(sentences=tokens, vector_size=120, window=6, min_count=1, workers=4, epochs=epochs, callbacks=[Progress("FastText", epochs)])
with open("fasttext_embeddings.csv", "w", newline="") as f:
    w = writer(f)
    for i, word in enumerate(ft.wv.index_to_key):
        w.writerow([word]+ft.wv[word].tolist())
        if i % max(1, len(ft.wv)//100) == 0:
            sys.stdout.write(f"\rFastText saving progress: {int(i/len(ft.wv)*100)}%")
            sys.stdout.flush()
    print("\rFastText saving progress: 100%")

print("\nAll embeddings saved successfully!")



Training Word2Vec...

Word2Vec → Epoch 1/5 started...
Word2Vec → Epoch 1/5 completed (110.1s, total 116.5s)
Word2Vec → Epoch 2/5 started...
Word2Vec → Epoch 2/5 completed (109.4s, total 225.9s)
Word2Vec → Epoch 3/5 started...
Word2Vec → Epoch 3/5 completed (109.9s, total 335.8s)
Word2Vec → Epoch 4/5 started...
Word2Vec → Epoch 4/5 completed (110.2s, total 446.0s)
Word2Vec → Epoch 5/5 started...
Word2Vec → Epoch 5/5 completed (110.1s, total 556.2s)
Word2Vec training complete!

Word2Vec saving progress: 100%

Training FastText...

FastText → Epoch 1/5 started...
FastText → Epoch 1/5 completed (177.1s, total 189.4s)
FastText → Epoch 2/5 started...
FastText → Epoch 2/5 completed (177.8s, total 367.2s)
FastText → Epoch 3/5 started...
FastText → Epoch 3/5 completed (167.8s, total 535.0s)
FastText → Epoch 4/5 started...
FastText → Epoch 4/5 completed (167.6s, total 702.6s)
FastText → Epoch 5/5 started...
FastText → Epoch 5/5 completed (168.7s, total 871.3s)
FastText training complete!

FastT

In [3]:
words_to_test = ["president", "government", "war", "peace", "city", "village"]

print("Word2Vec evaluation:")
for w in words_to_test:
    if w in w2v.wv:
        sim = w2v.wv.most_similar(w, topn=5)
        print(f"\nTop words similar to '{w}':")
        for s, score in sim:
            print(f"{s} → {score:.3f}")
    else:
        print(f"{w} not in Word2Vec vocab")

print("\nFastText evaluation:")
for w in words_to_test:
    if w in ft.wv:
        sim = ft.wv.most_similar(w, topn=5)
        print(f"\nTop words similar to '{w}':")
        for s, score in sim:
            print(f"{s} → {score:.3f}")
    else:
        print(f"{w} not in FastText vocab")

print("\nExample word similarity scores:")
pairs = [("king","queen"), ("man","woman"), ("city","village"), ("war","peace")]
for a,b in pairs:
    if a in w2v.wv and b in w2v.wv:
        score = w2v.wv.similarity(a,b)
        print(f"Word2Vec similarity({a},{b}) = {score:.3f}")
    if a in ft.wv and b in ft.wv:
        score = ft.wv.similarity(a,b)
        print(f"FastText similarity({a},{b}) = {score:.3f}")


Word2Vec evaluation:

Top words similar to 'president':
barack → 0.676
obama → 0.665
exboss → 0.660
sjon → 0.660
egyptain → 0.658

Top words similar to 'government':
arabdominated → 0.675
backpedal → 0.665
sunnibacked → 0.658
ethiopianbacked → 0.656
fatahhamas → 0.653

Top words similar to 'war':
iraniraq → 0.703
19501953 → 0.642
longplanned → 0.637
romuald → 0.636
ii → 0.632

Top words similar to 'peace':
naypidaw → 0.662
bedeviled → 0.649
nobel → 0.632
corecipient → 0.625
egyptianisraeli → 0.621

Top words similar to 'city':
moreheadbeaufort → 0.621
apia → 0.609
cabanatuan → 0.607
milehigh → 0.596
grenoble → 0.595

Top words similar to 'village':
okayama → 0.641
sharqi → 0.631
kaniguran → 0.618
hayagay → 0.612
galachipa → 0.612

FastText evaluation:

Top words similar to 'president':
nowpresident → 0.957
copresident → 0.956
presidente → 0.949
presidenta → 0.943
techpresident → 0.939

Top words similar to 'government':
governmenttogovernment → 0.988
usgovernment → 0.967
progovernment 