In [1]:
!pip install --upgrade gensim
!pip install smart_open==5.2.1


Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.3
    Uninstalling gensim-4.3.3:
      Successfully uninstalled gensim-4.3.3
Successfully installed gensim-4.4.0
Collecting smart_open==5.2.1
  Downloading smart_open-5.2.1-py3-none-any.whl.metadata (22 kB)
Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.6/58.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: smart_open
  Attempting uninstall: smart_open
    Found existing installation: smart_open 7.3.0.post1
    Uni

In [2]:
from datasets import load_dataset
import re
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, FastText
from gensim.models.callbacks import CallbackAny2Vec
import sys, time
from csv import writer

class Progress(CallbackAny2Vec):
    def __init__(self, name, total):
        self.name = name
        self.total = total
        self.epoch = 0
        self.start = time.time()
    def on_epoch_begin(self, model):
        self.epoch += 1
        print(f"\n{self.name} → Epoch {self.epoch}/{self.total} started...")
        self.epoch_time = time.time()
    def on_epoch_end(self, model):
        e = time.time() - self.epoch_time
        t = time.time() - self.start
        sys.stdout.write(f"\r{self.name} → Epoch {self.epoch}/{self.total} completed ({e:.1f}s, total {t:.1f}s)")
        sys.stdout.flush()
    def on_train_end(self, model):
        print(f"\n{self.name} training complete!\n")

data = load_dataset("lucadiliello/newsqa")
docs = []
for part in ['train', 'validation']:
    for item in data[part]:
        docs.append(str(item['context']))
        docs.append(str(item['question']))
        for ans in item['answers']:
            if isinstance(ans, list):
                for a in ans:
                    docs.append(str(a))
            else:
                docs.append(str(ans))

def clean(txt):
    txt = txt.lower()
    txt = re.sub(r'[^a-z0-9\s]', '', txt)
    return txt

docs = [clean(t) for t in docs]
tokens = [word_tokenize(t) for t in docs]

epochs = 5
print("\nWord2Vec traing is starting: ")
w2v = Word2Vec(sentences=tokens, vector_size=150, window=4, min_count=2, workers=4, sg=1, epochs=epochs, callbacks=[Progress("Word2Vec", epochs)])
with open("w2v_embeddings.csv", "w", newline="") as f:
    w = writer(f)
    for i, word in enumerate(w2v.wv.index_to_key):
        w.writerow([word]+w2v.wv[word].tolist())
        if i % max(1, len(w2v.wv)//100) == 0:
            sys.stdout.write(f"\rWord2Vec saving progress: {int(i/len(w2v.wv)*100)}%")
            sys.stdout.flush()
    print("\rWord2Vec saved")

print("\nFastText is Training")
ft = FastText(sentences=tokens, vector_size=120, window=6, min_count=1, workers=4, epochs=epochs, callbacks=[Progress("FastText", epochs)])
with open("fasttext_embeddings.csv", "w", newline="") as f:
    w = writer(f)
    for i, word in enumerate(ft.wv.index_to_key):
        w.writerow([word]+ft.wv[word].tolist())
        if i % max(1, len(ft.wv)//100) == 0:
            sys.stdout.write(f"\rFastText saving progress: {int(i/len(ft.wv)*100)}%")
            sys.stdout.flush()
    print("/nsaving fasttext")

print("\nEVerything is saved")


README.md:   0%|          | 0.00/681 [00:00<?, ?B/s]

data/train-00000-of-00001-ec54fbe500fc3b(…):   0%|          | 0.00/29.7M [00:00<?, ?B/s]

data/validation-00000-of-00001-3cf888b12(…):   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74160 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4212 [00:00<?, ? examples/s]


Word2Vec traing is starting: 

Word2Vec → Epoch 1/5 started...
Word2Vec → Epoch 1/5 completed (107.8s, total 114.3s)
Word2Vec → Epoch 2/5 started...
Word2Vec → Epoch 2/5 completed (107.1s, total 221.4s)
Word2Vec → Epoch 3/5 started...
Word2Vec → Epoch 3/5 completed (113.5s, total 334.9s)
Word2Vec → Epoch 4/5 started...
Word2Vec → Epoch 4/5 completed (105.5s, total 440.3s)
Word2Vec → Epoch 5/5 started...
Word2Vec → Epoch 5/5 completed (106.2s, total 546.5s)
Word2Vec training complete!

Word2Vec savedg progress: 99%

FastText is Training

FastText → Epoch 1/5 started...
FastText → Epoch 1/5 completed (174.2s, total 186.2s)
FastText → Epoch 2/5 started...
FastText → Epoch 2/5 completed (173.1s, total 359.3s)
FastText → Epoch 3/5 started...
FastText → Epoch 3/5 completed (176.3s, total 535.6s)
FastText → Epoch 4/5 started...
FastText → Epoch 4/5 completed (178.2s, total 713.9s)
FastText → Epoch 5/5 started...
FastText → Epoch 5/5 completed (175.9s, total 889.8s)
FastText training complete

In [3]:
words_to_test = ["obama","have","dihh", "damn","thats","crazy"]

print("Word2Vec testing:")
for w in words_to_test:
    if w in w2v.wv:
        sim = w2v.wv.most_similar(w, topn=5)
        print(f"\nTop words similar to '{w}':")
        for s, score in sim:
            print(f"{s} → {score:.3f}")
    else:
        print(f"{w} not in Word2Vec vocab")

print("\nFastText testing:")
for w in words_to_test:
    if w in ft.wv:
        sim = ft.wv.most_similar(w, topn=5)
        print(f"\nTop words similar to '{w}':")
        for s, score in sim:
            print(f"{s} → {score:.3f}")
    else:
        print(f"{w} not in FastText vocab")




Word2Vec testing:

Top words similar to 'obama':
barack → 0.862
obamas → 0.777
presidentelect → 0.730
mccain → 0.712
clinton → 0.706

Top words similar to 'have':
had → 0.752
has → 0.681
havent → 0.646
be → 0.589
theyve → 0.576
dihh not in Word2Vec vocab

Top words similar to 'damn':
peytons → 0.627
menopausal → 0.539
repressors → 0.537
abbreviate → 0.533
kook → 0.529

Top words similar to 'thats':
overanxious → 0.657
supergreat → 0.646
categorizes → 0.645
tasters → 0.635
unsporting → 0.631

Top words similar to 'crazy':
extrovert → 0.617
motson → 0.563
pawing → 0.562
geeked → 0.559
cheekiness → 0.547

FastText testing:

Top words similar to 'obama':
obamao → 0.969
clintonobama → 0.890
obamaclinton → 0.882
obamabiden → 0.877
obamathon → 0.863

Top words similar to 'have':
16have → 0.915
hav → 0.878
havel → 0.869
havent → 0.811
shave → 0.799

Top words similar to 'dihh':
dijk → 0.967
diy → 0.958
dikgacoi → 0.880
dix → 0.829
diazs → 0.790

Top words similar to 'damn':
dam → 0.756
damme →