In [None]:
! pip install gensim pandas



In [None]:
import pandas as pd
from gensim.models import Word2Vec, FastText
import re
from google.colab import drive

In [None]:
# Mount Google Drive first; every path below lives underneath the mount point.
drive.mount('/content/drive')

# Drive folders (trailing slash included) that the rest of the notebook
# concatenates file names onto.
drive_path_cleaned = "/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/"
drive_path_embeddings = "/content/drive/MyDrive/Colab Notebooks/nlp/embeddings/"

# Sub-folder of the embeddings directory holding the model pair to evaluate.
# NOTE(review): the folder name presumably encodes training hyper-parameters — confirm.
model_definer = "v200w2min1neg5epoc10fastt3_9/"
_model_dir = drive_path_embeddings + model_definer

# Load the previously saved Word2Vec and FastText models from that folder.
w2v = Word2Vec.load(_model_dir + "word2vec.model")
ft = FastText.load(_model_dir + "fasttext.model")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Probe words whose nearest neighbours are printed for both models further down.
seed_words = [
    "yaxşı",
    "pis",
    "çox",
    "bahalı",
    "ucuz",
    "mükəmməl",
    "dəhşət",
    "<PRICE>",
    "<RATING_POS>",
]

# Word pairs fed to `pair_sim`; reported under the "Synonyms" label.
syn_pairs = [
    ("yaxşı", "əla"),
    ("bahalı", "qiymətli"),
    ("ucuz", "sərfəli"),
]

# Word pairs fed to `pair_sim`; reported under the "Antonyms" label.
ant_pairs = [
    ("yaxşı", "pis"),
    ("bahalı", "ucuz"),
]


In [None]:
def lexical_coverage(model, tokens):
    """Return the fraction of `tokens` that are keys of the model's vocabulary.

    Args:
        model: object exposing `model.wv.key_to_index`, a mapping whose keys
            are the vocabulary entries.
        tokens: any iterable of tokens (list, tuple, generator, ...). It is
            consumed in a single pass, so one-shot iterables are supported.

    Returns:
        float in [0, 1]: covered tokens divided by total tokens. An empty
        input yields 0.0 rather than raising ZeroDivisionError.
    """
    vocab = model.wv.key_to_index
    total = 0
    covered = 0
    # Count both totals in one pass so `len()` is never needed on the input.
    for tok in tokens:
        total += 1
        if tok in vocab:
            covered += 1
    # max(1, ...) keeps the empty-input case well defined (0 / 1 == 0.0).
    return covered / max(1, total)


In [None]:
# Workbook file names, all located directly inside `drive_path_cleaned`.
_dataset_names = (
    "labeled-sentiment_2col.xlsx",
    "test__1__2col.xlsx",
    "train__3__2col.xlsx",
    "train-00000-of-00001_2col.xlsx",
    "merged_dataset_CSV__1__2col.xlsx",
)

# Full paths of the datasets whose lexical coverage is measured below.
files = [f"{drive_path_cleaned}{name}" for name in _dataset_names]

def read_tokens(f):
    """Read one workbook and return its text as a flat list of tokens.

    Args:
        f: path (or anything `pd.read_excel` accepts) of a workbook that has
            a `cleaned_text` column.

    Returns:
        list[str]: whitespace-separated tokens of every non-missing
        `cleaned_text` cell, in row order.
    """
    df = pd.read_excel(f, usecols=["cleaned_text"])
    # Drop missing cells before the string cast: `astype(str)` would otherwise
    # turn them into the literal tokens "nan"/"None" and distort the counts.
    texts = df["cleaned_text"].dropna().astype(str)
    return [tok for text in texts for tok in text.split()]


In [None]:
# Report, for every dataset file, the share of its tokens found in each
# model's vocabulary.
print("== Lexical coverage (per dataset) ==")
for f in files:
    toks = read_tokens(f)
    # Evaluate Word2Vec first, then FastText, on the same token list.
    cov_w2v, cov_ftv = (lexical_coverage(m, toks) for m in (w2v, ft))
    print(f"{f}: W2V={cov_w2v:.3f}, FT(vocab)={cov_ftv:.3f}")


== Lexical coverage (per dataset) ==
/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/labeled-sentiment_2col.xlsx: W2V=1.000, FT(vocab)=1.000
/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/test__1__2col.xlsx: W2V=1.000, FT(vocab)=1.000
/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/train__3__2col.xlsx: W2V=1.000, FT(vocab)=1.000
/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/train-00000-of-00001_2col.xlsx: W2V=1.000, FT(vocab)=1.000
/content/drive/MyDrive/Colab Notebooks/nlp/Cleaned/merged_dataset_CSV__1__2col.xlsx: W2V=1.000, FT(vocab)=1.000


In [None]:
# Number of entries in the FastText model's keyed vectors (`ft.wv`).
vocab_size = len(ft.wv)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 144459


In [None]:
# Number of entries in the Word2Vec model's keyed vectors (`w2v.wv`).
# NOTE: rebinds `vocab_size`, so any value stored under that name before is lost.
vocab_size = len(w2v.wv)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 144459


In [None]:
from numpy import dot
from numpy.linalg import norm

def cos(a, b):
    """Cosine similarity of two vectors, returned as a built-in float.

    Computed as dot(a, b) divided by the product of the two Euclidean norms.
    No guard is applied for zero-length vectors.
    """
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return float(numerator / denominator)

def pair_sim(model, pairs):
    """Mean of `model.wv.similarity(a, b)` over the given word pairs.

    Pairs for which `model.wv.similarity` raises KeyError are left out of the
    mean. They are no longer dropped silently: one warning lists every skipped
    pair, so a mean computed over a subset is visible to the reader.

    Args:
        model: object exposing `model.wv.similarity(a, b)`.
        pairs: iterable of `(a, b)` word pairs.

    Returns:
        The arithmetic mean of the similarities that could be computed, or
        `float('nan')` when none could (empty input or every pair skipped).
    """
    import warnings  # local import keeps this cell runnable on its own

    vals = []
    skipped = []
    for a, b in pairs:
        try:
            vals.append(model.wv.similarity(a, b))
        except KeyError:
            # Best effort: keep going, but remember what was left out.
            skipped.append((a, b))
    if skipped:
        warnings.warn(
            f"pair_sim: skipped {len(skipped)} pair(s) with out-of-vocabulary words: {skipped}",
            stacklevel=2,
        )
    return sum(vals) / len(vals) if vals else float('nan')


In [None]:
# Mean pair similarity per model: synonym pairs first, then antonym pairs.
syn_w2v, syn_ft = pair_sim(w2v, syn_pairs), pair_sim(ft, syn_pairs)
ant_w2v, ant_ft = pair_sim(w2v, ant_pairs), pair_sim(ft, ant_pairs)

# Summary: the two means per model plus their difference (synonyms minus antonyms).
print("\n== Similarity (higher better for synonyms; lower better for antonyms) ==")
print(f"Synonyms: W2V={syn_w2v:.3f}, FT={syn_ft:.3f}")
print(f"Antonyms: W2V={ant_w2v:.3f}, FT={ant_ft:.3f}")
print(f"Separation (Syn - Ant): W2V={(syn_w2v - ant_w2v):.3f}, FT={(syn_ft - ant_ft):.3f}")



== Similarity (higher better for synonyms; lower better for antonyms) ==
Synonyms: W2V=0.592, FT=0.608
Antonyms: W2V=0.490, FT=0.503
Separation (Syn - Ant): W2V=0.101, FT=0.106


In [None]:
def neighbors(model, word, k=5):
    """Return the `k` words that `model.wv.most_similar` ranks closest to `word`.

    Only the words are kept; the similarity scores are discarded. If the
    lookup raises KeyError, an empty list is returned instead.
    """
    try:
        ranked = model.wv.most_similar(word, topn=k)
        words = []
        for neighbor, _score in ranked:
            words.append(neighbor)
        return words
    except KeyError:
        return []

# For every seed word, print the neighbour list from each model on its own
# line (Word2Vec first, then FastText) so the two can be compared by eye.
# `neighbors` returns [] when the lookup raises KeyError, so such words print
# an empty list.
print("\n== Nearest neighbors (qualitative) ==")
for w in seed_words:
    print(f" W2V NN for '{w}':", neighbors(w2v, w))
    print(f" FT  NN for '{w}':", neighbors(ft,  w))
# (Optional, not implemented here) domain drift if you train domain-specific
# models separately:
# drift(word, model_a, model_b) = 1 - cos(vec_a, vec_b)



== Nearest neighbors (qualitative) ==
 W2V NN for 'yaxşı': ['<RATING_POS>', 'iyi', 'yaxşi', 'yaxshi', 'zor']
 FT  NN for 'yaxşı': ['yaxşıı', 'yaxşıfı', 'yaxşıkı', 'yaxşııdıı', 'yaxşıca']
 W2V NN for 'pis': ['<RATING_NEG>', 'bərbad', 'iyi', 'zay', 'berbat']
 FT  NN for 'pis': ['pispispis', 'pisdr', 'piss', 'piis', 'pissdii']
 W2V NN for 'çox': ['çoox', 'coox', 'temu', 'chox', 'bəyənirəm']
 FT  NN for 'çox': ['ççox', 'çoxçox', 'çoxx', 'çoh', 'ço']
 W2V NN for 'bahalı': ['acılı', 'nəhəng', 'kefiyyətsiz', 'növbələriniz', 'detallara']
 FT  NN for 'bahalı': ['bahalıı', 'bahalısı', 'baharlı', 'bahalıdı', 'bahaymış']
 W2V NN for 'ucuz': ['münasib', 'baha', 'qiymətə', 'qiymete', 'toyuğu']
 FT  NN for 'ucuz': ['ucuzlu', 'ucuzluk', 'ucuza', 'ucuzu', 'ucuzdu']
 W2V NN for 'mükəmməl': ['möhtəşəm', 'yararlı', 'süper', 'möhtəşəmm', 'süjetli']
 FT  NN for 'mükəmməl': ['mükəmməll', 'mükəmməldii', 'mükəmməldiçox', 'mükemməl', 'qeyrimükəmməl']
 W2V NN for 'dəhşət': ['hədsiz', 'çoox', 'gülməlidir', 'çoxx

In [None]:
# Seven nearest neighbours (with similarity scores) of the "<NUM>" token in the
# Word2Vec model. NOTE(review): "<NUM>" is presumably a placeholder that
# preprocessing substitutes for numbers — confirm.
w2v.wv.most_similar("<NUM>", topn=7)

[('gb', 0.6147559285163879),
 ('manata', 0.5996270775794983),
 ('kq', 0.5868653655052185),
 ('cü', 0.5773811936378479),
 ('lik', 0.5705621838569641),
 ('cu', 0.5645840167999268),
 ('tl', 0.563035249710083)]

In [None]:
# Ten nearest neighbours (with similarity scores) of "yox" in the FastText model.
ft.wv.most_similar("yox", topn=10)

[('yoxx', 0.9212023615837097),
 ('yoxh', 0.9197113513946533),
 ('yoxe', 0.8936871290206909),
 ('yoxmu', 0.8933278322219849),
 ('yoxmuw', 0.8917135000228882),
 ('yoxey', 0.8834620714187622),
 ('yoxtu', 0.8822392225265503),
 ('yoxdh', 0.8812844753265381),
 ('yoxd', 0.879164457321167),
 ('yoxdy', 0.8777079582214355)]

In [None]:
# 150 nearest neighbours (with similarity scores) of "iyi" in the FastText model.
# NOTE(review): topn=150 makes the displayed cell output very long; consider a
# smaller topn or slicing the result before sharing the notebook.
ft.wv.most_similar("iyi", topn=150)

[('iyiyi', 0.9759093523025513),
 ('qdpiyi', 0.9243342280387878),
 ('piyi', 0.9237945079803467),
 ('pipiyi', 0.9190375208854675),
 ('ciyi', 0.9042347073554993),
 ('seudiyi', 0.8943328857421875),
 ('çekdiyi', 0.87161785364151),
 ('gijdiyi', 0.8693091869354248),
 ('eniyi', 0.8650492429733276),
 ('kiciyi', 0.8582351207733154),
 ('beşiyi', 0.8506500124931335),
 ('kesdiyi', 0.8475185036659241),
 ('sidiyi', 0.8467510938644409),
 ('incitdiyi', 0.8465353846549988),
 ('niyi', 0.8427123427391052),
 ('içeriyi', 0.8413795828819275),
 ('pubgiyi', 0.8397108316421509),
 ('qəbzdiyi', 0.8360980749130249),
 ('ıyi', 0.8360335826873779),
 ('dyi', 0.8314380645751953),
 ('cekdiyi', 0.8302986025810242),
 ('tilidiyi', 0.8299102783203125),
 ('əkdiyi', 0.8295677304267883),
 ('yi', 0.8284099102020264),
 ('kəsdiyi', 0.8278812170028687),
 ('fyi', 0.8266210556030273),
 ('etydiyi', 0.8260728120803833),
 ('tepkiyi', 0.8254333734512329),
 ('kecdiyi', 0.8204808235168457),
 ('kiçiyi', 0.820457398891449),
 ('endiyi', 0.81