
# Embeddings Comparison: Word2Vec, FastText, GloVe, spaCy, SBERT, Raw BERT
Run cells top-to-bottom. Edit the **Data** section to plug in your own corpus.


## 1) Setup & Imports

In [None]:
!pip install gensim nltk sentence-transformers spacy scikit-learn matplotlib umap-learn transformers torch

In [None]:

import re
import numpy as np
from pprint import pprint

# Visualization & eval
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap

# Tokenization
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Gensim
import gensim.downloader as api
from gensim.models import Word2Vec, FastText

# Sentence Transformers (SBERT)
from sentence_transformers import SentenceTransformer

# spaCy (quick sentence/word vecs)
import spacy

# Transformers (raw BERT)
from transformers import AutoTokenizer, AutoModel
import torch

print("✅ Imports loaded")


## 2) Data

In [None]:

# Tiny corpus used to train the from-scratch Word2Vec / FastText models.
# It is sentence-split and tokenized (lower-cased, stopwords removed, no
# lemmatization) before training, so only the exact word forms written here
# end up in the trained vocabularies.
RAW_TEXT = """
King and queen rule a kingdom. A man and a woman walk to Rome in Italy.
Paris is the capital of France. Apple and banana are fruits.
The queen and the woman visited Paris. The king and the man visited Rome.
A royal family governs the kingdom, and their palace is in the capital.
Italy and France are European countries. Rome and Paris are famous cities.
Apples and bananas are often found in markets across the cities.
"""

# Sentences embedded by the sentence-level models (spaCy / SBERT / BERT) for
# semantic search, clustering and the sentence plots.
SENTENCES = [
    "Paris is the capital of France.",
    "Rome is the capital of Italy.",
    "Apples and bananas are common fruits.",
    "The king and the queen live in a palace.",
    "A woman walked to Rome.",
    "Bananas are sold in city markets.",
    "France and Italy are European countries.",
    "The royal family rules the kingdom.",
]

# Words shown in the word-level plots. The plotting helpers drop any word
# that is not in a model's vocabulary.
# NOTE: "country", "city" and "fruit" appear in RAW_TEXT only in plural form
# ("countries", "cities", "fruits"), so the models trained on RAW_TEXT will
# not contain them and they will be missing from the Word2Vec/FastText plots.
CONTENT_WORDS = [
    "king","queen","man","woman",
    "paris","france","rome","italy",
    "kingdom","palace","capital","country","city",
    "apple","banana","fruit","family","royal"
]

print("✅ Data loaded (edit RAW_TEXT/SENTENCES for your corpus)")


## 3) Sentence Split + Tokenization (stopwords removed)

In [None]:

# Fetch the NLTK resources used below; quiet=True is passed to the downloader.
# "punkt_tab" is presumably the tokenizer data word_tokenize needs on current
# NLTK releases -- confirm against the installed NLTK version.
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
# English stopwords as a set, used for membership tests in tokenize_line.
stops = set(stopwords.words("english"))

def sentence_split(text: str):
    """Split *text* into sentence-like chunks without any NLP dependency.

    The text is cut on every run of '.', '!', '?' or newline characters.
    Each piece is stripped of surrounding whitespace and empty pieces are
    discarded.
    """
    pieces = (chunk.strip() for chunk in re.split(r"[.\n!?]+", text))
    return [piece for piece in pieces if piece]

def tokenize_line(line: str):
    """Tokenize *line* into lower-cased alphabetic tokens without stopwords.

    Tokens come from NLTK's word_tokenize. A token is kept only if it is
    purely alphabetic and its lower-cased form is not in the module-level
    `stops` set.
    """
    kept = []
    for token in word_tokenize(line):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stops:  # remove stopwords
            kept.append(lowered)
    return kept

# Build the training corpus: one token list per sentence of RAW_TEXT.
sentences_raw = sentence_split(RAW_TEXT)
tokenized = [tokenize_line(s) for s in sentences_raw]

# Preview at most the first eight tokenized sentences, numbered from 01.
print("=== Tokenized sentences (first 8) ===")
for i, s in enumerate(tokenized[:8], 1):
    print(f"{i:02d}:", s)


## 4) Train Word2Vec & FastText (gensim)

In [None]:

# Hyper-parameters shared by Word2Vec and FastText so the two models are
# trained under identical settings and cannot drift apart by accident.
TRAIN_PARAMS = dict(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=1,       # keep low for demo; raise to >=5 on real corpora
    # A fixed seed only gives repeatable vectors with a single worker thread:
    # with several workers the OS thread scheduling changes the update order.
    # (gensim additionally recommends fixing PYTHONHASHSEED for full
    # reproducibility.) The corpus is tiny, so one worker costs nothing.
    workers=1,
    sg=1,              # skip-gram
    negative=10,
    sample=1e-3,
    epochs=200,
    seed=7,
)

w2v = Word2Vec(**TRAIN_PARAMS)
ft = FastText(**TRAIN_PARAMS)

# Keep only the keyed vectors; the rest of the notebook works on these.
w2v_kv = w2v.wv
ft_kv  = ft.wv

print("✅ Trained Word2Vec & FastText")


## 5) Load Pretrained: GloVe, spaCy, SBERT, Raw BERT

In [None]:

print("Loading pretrained models (GloVe, spaCy, SBERT, BERT)...")
!python -m spacy download en_core_web_md

# GloVe word vectors
glove = api.load("glove-wiki-gigaword-100")

# spaCy medium model. Only doc.vector is used, and embed_spacy runs the
# pipeline with the tagger disabled, so the rule-based lemmatizer would warn
# (W108) about missing POS tags. Exclude it at load time, as intended.
nlp = spacy.load("en_core_web_md", exclude=["lemmatizer"])

# SBERT sentence embeddings
sbert = SentenceTransformer("all-MiniLM-L6-v2")

# Raw BERT for mean-pooled sentence embeddings
bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")
bert_model.eval()  # inference mode

print("✅ Pretrained models ready")


## 6) Utilities

In [None]:

def cos(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    A small epsilon (1e-9) is added to each norm so that zero vectors yield
    0.0 instead of a division by zero.
    """
    u = np.asarray(a)
    v = np.asarray(b)
    denominator = (np.linalg.norm(u) + 1e-9) * (np.linalg.norm(v) + 1e-9)
    return float(np.dot(u, v) / denominator)

def vec_gensim_word(model, word):
    """Return ``model[word]``, or None when *word* is not in ``model.key_to_index``."""
    if word not in model.key_to_index:
        return None
    return model[word]

def vec_glove_word(word):
    """Return the vector of *word* from the module-level `glove` model, or None if absent."""
    if word not in glove.key_to_index:
        return None
    return glove[word]

def embed_spacy(texts):
    """Embed *texts* with spaCy and return the row-wise L2-normalized vectors.

    The module-level `nlp` pipeline is run with the tagger, parser and NER
    disabled. One row per text is returned; a small epsilon keeps the
    division safe for all-zero vectors.
    """
    # doc.vector: averaged pretrained word vectors
    vectors = [doc.vector for doc in nlp.pipe(texts, disable=["tagger","parser","ner"])]
    stacked = np.vstack(vectors)
    lengths = np.linalg.norm(stacked, axis=1, keepdims=True) + 1e-9
    return stacked / lengths

def sbert_embed(texts):
    """Encode *texts* with the module-level `sbert` model, requesting normalized embeddings."""
    embeddings = sbert.encode(texts, normalize_embeddings=True)
    return embeddings

def bert_sentence_embed(texts):
    """Return mask-weighted mean-pooled, L2-normalized BERT embeddings for *texts*.

    The texts are tokenized as one padded/truncated batch with the
    module-level `bert_tok`, run through `bert_model` with gradients
    disabled, and the last hidden states are averaged per sentence using the
    attention mask as weights (padding positions are presumably 0 there).
    The result is a NumPy array with one unit-length row per text.
    """
    with torch.no_grad():
        batch = bert_tok(texts, padding=True, truncation=True, return_tensors="pt")
        hidden = bert_model(**batch).last_hidden_state  # (B, T, H)
        weights = batch["attention_mask"].unsqueeze(-1)
        token_sum = (hidden * weights).sum(dim=1)
        # clamp guards against dividing by zero for an all-masked row
        token_count = weights.sum(dim=1).clamp(min=1)
        pooled = (token_sum / token_count).cpu().numpy()
    norms = np.linalg.norm(pooled, axis=1, keepdims=True) + 1e-9
    return pooled / norms

print("✅ Utilities ready")


## 7) Sample Embeddings (Word2Vec / FastText / GloVe)

In [None]:

def show_vector(label, vec, head=8):
    """Print the first *head* components of *vec*, or a notice when *vec* is None."""
    if vec is not None:
        print(f"\nVector for '{label}' (first {head} dims of {len(vec)}):")
        print(np.array2string(vec[:head], precision=4, suppress_small=True))
    else:
        print(f"(missing) '{label}' has no vector")

# For each sample word, print the head of its vector in all three word models.
# Words a model does not know are reported as missing by show_vector.
for w in ["king","queen","paris","france","rome","italy","apple","banana","palace","capital"]:
    show_vector(f"W2V:{w}", vec_gensim_word(w2v_kv, w))
    show_vector(f"FT :{w}", vec_gensim_word(ft_kv, w))
    show_vector(f"GloVe:{w}", vec_glove_word(w))


## 7b) Sentence-level vectors + cosine similarities + heatmaps (spaCy / SBERT / BERT)

In [None]:

# Four sample sentences used for the vector preview, cosine tables and heatmaps.
sent_samples = [
    "The king and the queen live in a palace.",
    "Paris is the capital of France.",
    "Apples and bananas are common fruits.",
    "A woman walked to Rome."
]

# One embedding matrix per sentence encoder (rows follow sent_samples order).
X_spacy_samp = embed_spacy(sent_samples)
X_sbert_samp = sbert_embed(sent_samples)
X_bert_samp  = bert_sentence_embed(sent_samples)

def show_sent_vectors(name, X, texts, head=8):
    """Print, for each text, the first *head* components of its row in *X*.

    *name* labels the section header; *texts* and the rows of *X* are paired
    positionally.
    """
    print(f"\n=== {name} sentence vectors ===")
    for sentence, vector in zip(texts, X):
        print(f"\n{sentence}")
        print(f"first {head} of {len(vector)} dims:")
        preview = np.array2string(vector[:head], precision=4, suppress_small=True)
        print(preview)

# Preview the sample-sentence vectors produced by each encoder.
show_sent_vectors("spaCy", X_spacy_samp, sent_samples)
show_sent_vectors("SBERT", X_sbert_samp, sent_samples)
show_sent_vectors("BERT ", X_bert_samp,  sent_samples)

def cosine_matrix(X):
    """Return the matrix product of *X* with its transpose.

    With unit-length rows (as produced by the embedding helpers in this
    notebook) the entries are the pairwise cosine similarities.
    """
    return np.matmul(X, X.T)

def print_cosine_table(name, sims, texts):
    """Print the square similarity matrix *sims* as a text table.

    Each row shows the row index, the similarities formatted to two decimals
    and the corresponding entry of *texts*.
    """
    print(f"\n=== Cosine similarities ({name}) ===")
    size = sims.shape[0]
    for row_idx in range(size):
        cells = [format(sims[row_idx, col_idx], ".2f") for col_idx in range(size)]
        row = " | ".join(cells)
        print(f"{row_idx:02d} {row}  <- {texts[row_idx]}")

# Pairwise similarity matrices of the sample sentences, one per encoder.
S_spacy = cosine_matrix(X_spacy_samp)
S_sbert = cosine_matrix(X_sbert_samp)
S_bert  = cosine_matrix(X_bert_samp)

# Text rendering of the same matrices.
print_cosine_table("spaCy", S_spacy, sent_samples)
print_cosine_table("SBERT", S_sbert, sent_samples)
print_cosine_table("BERT ", S_bert,  sent_samples)

def plot_heatmap(sim, title, labels):
    """Show *sim* as a heatmap with a colorbar.

    *labels* is only used for its length: both axes are ticked with the
    integer positions 0..len(labels)-1 rather than the label text.
    """
    tick_ids = range(len(labels))
    plt.figure(figsize=(5,4))
    plt.imshow(sim, aspect='auto')
    plt.title(title)
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(tick_ids, tick_ids)
    plt.colorbar()
    plt.tight_layout()
    plt.show()

# One heatmap per encoder; axis ticks are the sentence indices in sent_samples.
plot_heatmap(S_spacy, "Cosine (spaCy) – sentence samples", sent_samples)
plot_heatmap(S_sbert, "Cosine (SBERT) – sentence samples", sent_samples)
plot_heatmap(S_bert,  "Cosine (BERT) – sentence samples", sent_samples)


## 8) Cosine Similarities (word-level)

In [None]:

# Word pairs whose cosine similarity is reported by cosine_table for each
# word-embedding model.
pairs = [
    ("king","queen"),
    ("king","man"),
    ("queen","woman"),
    ("paris","france"),
    ("rome","italy"),
    ("apple","banana"),
    ("king","apple"),
]

def cosine_table(name, vec_fn):
    """Print the cosine similarity of every pair in the module-level `pairs`.

    *vec_fn* maps a word to its vector or None; pairs where either word has
    no vector are skipped without output.
    """
    print(f"\n=== Cosine similarities ({name}) ===")
    for left, right in pairs:
        left_vec = vec_fn(left)
        right_vec = vec_fn(right)
        if left_vec is None or right_vec is None:
            continue
        print(f"{left:>6} ~ {right:<6}: {cos(left_vec, right_vec):.3f}")

# Same pair list, three word-vector sources. The lambdas bind the trained
# keyed-vector objects; GloVe already has a one-argument lookup helper.
cosine_table("Word2Vec", lambda w: vec_gensim_word(w2v_kv, w))
cosine_table("FastText", lambda w: vec_gensim_word(ft_kv, w))
cosine_table("GloVe",   vec_glove_word)


## 9) Analogy: king - man + woman (no temp vectors)

In [None]:

def analogy(model, a, b, c, topn=5):
    """Return the *topn* answers to "a - b + c" from ``model.most_similar``.

    An empty list is returned when any of the three words is missing from
    ``model.key_to_index``.
    """
    if any(term not in model.key_to_index for term in (a, b, c)):
        return []
    return model.most_similar(positive=[a, c], negative=[b], topn=topn)

# Run the same analogy through the shared helper for all three models so each
# gets the vocabulary guard (an empty list instead of a KeyError when a word
# is missing) and the same topn.
print("\n=== Analogy: king - man + woman ≈ ? ===")
for label, kv in [("Word2Vec ->", w2v_kv), ("FastText ->", ft_kv), ("GloVe   ->", glove)]:
    print(label)
    pprint(analogy(kv, "king", "man", "woman"))


## 10) Sentence Semantic Search & Clustering

In [None]:

# Embed every entry of SENTENCES once per encoder; rows follow SENTENCES order.
X_sbert_all = sbert_embed(SENTENCES)
X_bert_all  = bert_sentence_embed(SENTENCES)
X_spacy_all = embed_spacy(SENTENCES)

def semantic_search(query, X, encoder, topk=3, sentences=None):
    """Return the *topk* sentences most similar to *query*.

    Args:
        query: Query text.
        X: Embedding matrix with one row per sentence.
        encoder: Callable mapping a list of texts to an embedding matrix in
            the same space as *X*.
        topk: Number of hits to return.
        sentences: The texts that the rows of *X* were computed from.
            Defaults to the module-level SENTENCES, which keeps existing
            calls working.

    Returns:
        A list of ``(sentence, cosine_similarity)`` tuples, best match first.

    Raises:
        ValueError: If *X* and the sentence list differ in length. Indexing
            by position would otherwise return the wrong sentences.
    """
    corpus = SENTENCES if sentences is None else sentences
    if len(corpus) != len(X):
        raise ValueError(
            f"X has {len(X)} rows but {len(corpus)} sentences were given"
        )
    query_vec = encoder([query])
    scores = cosine_similarity(query_vec, X)[0]
    ranked = np.argsort(scores)[::-1][:topk]
    return [(corpus[i], float(scores[i])) for i in ranked]

# Free-text queries run against SENTENCES with each sentence encoder.
queries = [
    "capital of a country",
    "royal family and palace",
    "common fruits in markets",
]

# For every encoder, print the top matches for every query. Each tuple pairs
# the precomputed sentence matrix with the encoder that produced it, so the
# query is embedded in the same space.
print("\n=== Sentence semantic search ===")
for name, X, enc in [
    ("SBERT", X_sbert_all, sbert_embed),
    ("BERT ", X_bert_all,  bert_sentence_embed),
    ("spaCy", X_spacy_all, embed_spacy),
]:
    print(f"\n-- {name} --")
    for q in queries:
        print(q, "->")
        pprint(semantic_search(q, X, enc))

# Cluster each embedding set with KMeans (k=3, fixed random_state) and report
# the silhouette score of the resulting labels.
# NOTE: both loops rebind the module-level names `name` and `X`.
for name, X in [("SBERT", X_sbert_all), ("BERT", X_bert_all), ("spaCy", X_spacy_all)]:
    k = 3
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    sil = silhouette_score(X, km.labels_)
    print(f"{name} clustering silhouette (k={k}): {sil:.3f}")


## 11) 3D PCA Word Plot with Cosine Edges

In [None]:

def word_plot_3d_with_edges(model_kv, title, words, edge_threshold=0.55):
    """Plot *words* in 3D PCA space and connect similar words with edges.

    Words missing from ``model_kv.key_to_index`` are dropped; fewer than
    three remaining words prints a notice and returns. An edge is drawn
    between two words when their cosine similarity (computed on the original
    vectors, not the PCA projection) is at least *edge_threshold*; line width
    and opacity grow with the margin above the threshold. A fixed set of
    showcase pairs additionally gets its similarity printed at the edge
    midpoint.
    """
    words = [w for w in words if w in model_kv.key_to_index]
    if len(words) < 3:
        print(f"Not enough words for plot: {title}")
        return
    vectors = np.stack([model_kv[w] for w in words])

    coords = PCA(n_components=3, random_state=0).fit_transform(vectors)

    unit = vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-9)
    sim = unit @ unit.T  # cosine sim matrix

    # Showcase pairs, stored unordered so (a, b) and (b, a) both match.
    annotated = {
        frozenset(pair)
        for pair in [("king","queen"), ("paris","france"),
                     ("rome","italy"), ("apple","banana")]
    }

    fig = plt.figure(figsize=(9,7))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(coords[:,0], coords[:,1], coords[:,2])
    for i, w in enumerate(words):
        ax.text(coords[i,0], coords[i,1], coords[i,2], w)

    count = len(words)
    for i in range(count):
        for j in range(i + 1, count):
            score = sim[i, j]
            if not score >= edge_threshold:
                continue
            margin = score - edge_threshold
            xs = [coords[i,0], coords[j,0]]
            ys = [coords[i,1], coords[j,1]]
            zs = [coords[i,2], coords[j,2]]
            ax.plot(xs, ys, zs, linewidth=1 + 2*margin,
                    alpha=min(0.2 + 0.8*margin, 1.0))

            if frozenset((words[i], words[j])) in annotated:
                mx = (xs[0]+xs[1])/2
                my = (ys[0]+ys[1])/2
                mz = (zs[0]+zs[1])/2
                ax.text(mx, my, mz, f"{score:.2f}")

    ax.set_title(title + "\nEdges = cosine similarity ≥ " + str(edge_threshold))
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    plt.tight_layout()
    plt.show()

# 3D word plots for the two models trained on RAW_TEXT.
print("Generating 3D plots...")
word_plot_3d_with_edges(w2v_kv, "Word2Vec Embeddings (PCA → 3D)", CONTENT_WORDS, edge_threshold=0.55)
word_plot_3d_with_edges(ft_kv,  "FastText Embeddings (PCA → 3D)", CONTENT_WORDS, edge_threshold=0.55)


## 11b) Sentence embeddings: 3D PCA (SBERT vs BERT)

In [None]:

def plot_sentences_3d(X, title, labels):
    """Scatter the rows of *X* in 3D PCA space, tagging each point with its index.

    After the figure is shown, an index -> label legend is printed so the
    numbers in the plot can be mapped back to *labels*.
    """
    coords = PCA(n_components=3, random_state=0).fit_transform(X)
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(coords[:,0], coords[:,1], coords[:,2])
    for idx in range(len(labels)):
        ax.text(coords[idx,0], coords[idx,1], coords[idx,2], str(idx))
    ax.set_title(title)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    plt.tight_layout()
    plt.show()
    for idx, sentence in enumerate(labels):
        print(f"{idx:02d}: {sentence}")

# Compare the layout of the same sentences under SBERT and raw BERT embeddings.
plot_sentences_3d(X_sbert_all, "SBERT – Sentences (PCA 3D)", SENTENCES)
plot_sentences_3d(X_bert_all,  "BERT  – Sentences (PCA 3D)", SENTENCES)


## 12) (Optional) UMAP 2D Visualizations (words & sentences)

In [None]:

def word_umap_plot(model_kv, title, words):
    """Scatter *words* in a 2D UMAP projection of their vectors.

    Words missing from ``model_kv.key_to_index`` are dropped; fewer than
    three remaining words prints a notice and returns.
    """
    known = [w for w in words if w in model_kv.key_to_index]
    if len(known) < 3:
        print(f"Not enough words for plot: {title}")
        return
    vectors = np.stack([model_kv[w] for w in known])
    reducer = umap.UMAP(n_components=2, random_state=0, n_neighbors=10, min_dist=0.1)
    coords = reducer.fit_transform(vectors)
    plt.figure(figsize=(6,5))
    plt.scatter(coords[:,0], coords[:,1])
    for idx, word in enumerate(known):
        # small offset so the label does not sit on top of its marker
        plt.text(coords[idx,0]+0.01, coords[idx,1]+0.01, word)
    plt.title(title)
    plt.xlabel("UMAP-1")
    plt.ylabel("UMAP-2")
    plt.tight_layout()
    plt.show()

def plot_sentences_umap(X, title, labels):
    """Scatter the rows of *X* in a 2D UMAP projection, tagging points by index.

    After the figure is shown, an index -> label legend is printed.
    """
    reducer = umap.UMAP(n_components=2, random_state=0, n_neighbors=8, min_dist=0.1)
    coords = reducer.fit_transform(X)
    plt.figure(figsize=(6,5))
    plt.scatter(coords[:,0], coords[:,1])
    for idx in range(len(labels)):
        # small offset so the index does not sit on top of its marker
        plt.text(coords[idx,0]+0.01, coords[idx,1]+0.01, str(idx))
    plt.title(title)
    plt.xlabel("UMAP-1")
    plt.ylabel("UMAP-2")
    plt.tight_layout()
    plt.show()
    for idx, sentence in enumerate(labels):
        print(f"{idx:02d}: {sentence}")

# Word 2D plots (optional: uncomment to run)
# word_umap_plot(w2v_kv, "Word2Vec (UMAP 2D)", CONTENT_WORDS)
# word_umap_plot(ft_kv,  "FastText (UMAP 2D)", CONTENT_WORDS)

# Sentence 2D plots (SBERT vs BERT), using the embeddings of SENTENCES
plot_sentences_umap(X_sbert_all, "SBERT – Sentences (UMAP 2D)", SENTENCES)
plot_sentences_umap(X_bert_all,  "BERT  – Sentences (UMAP 2D)", SENTENCES)


## 13) Word analogy arrows in 3D (PCA) and 2D (UMAP)

In [None]:

def analogy_target(model_kv, a, b, c, topn=5):
    """Solve the analogy "a - b + c" by brute-force cosine search.

    Works with any object exposing ``key_to_index`` (word -> row index),
    ``vectors`` (2D array) and ``__getitem__`` (word -> vector), so it
    accepts both gensim keyed vectors and the SimpleKV wrapper.

    Args:
        model_kv: The vector store described above.
        a, b, c: Analogy terms; the query vector is ``v[a] - v[b] + v[c]``
            built from the raw (un-normalized) vectors.
        topn: Maximum number of candidates to return (default 5).

    Returns:
        ``(pred, best)`` where *pred* is the raw query vector and *best* is a
        list of ``(word, cosine)`` tuples sorted by decreasing similarity,
        excluding *a*, *b* and *c*. Returns ``(None, [])`` when any term is
        missing from the vocabulary.
    """
    vocab = model_kv.key_to_index
    if not all(w in vocab for w in (a, b, c)):
        return None, []

    pred = model_kv[a] - model_kv[b] + model_kv[c]
    if topn <= 0:
        return pred, []

    V = model_kv.vectors
    unit_rows = V / (np.linalg.norm(V, axis=1, keepdims=True) + 1e-9)
    query = pred / (np.linalg.norm(pred) + 1e-9)
    sims = unit_rows @ query

    # Reverse lookup built from key_to_index so objects without gensim's
    # index_to_key are supported as well.
    index_to_word = {i: w for w, i in vocab.items()}
    excluded = {a, b, c}

    best = []
    for i in np.argsort(sims)[::-1]:
        word = index_to_word.get(int(i))
        # Skip rows with no vocabulary entry and the query terms themselves.
        if word is None or word in excluded:
            continue
        best.append((word, float(sims[i])))
        if len(best) >= topn:
            break
    return pred, best

def word_analogy_plot_3d(model_kv, title, words, triplet=("king","man","woman")):
    """Scatter *words* in 3D PCA space and trace the analogy a -> b -> c -> target.

    Args:
        model_kv: Vector store exposing ``key_to_index``, ``vectors`` and
            ``__getitem__``.
        title: Plot title prefix.
        words: Candidate words; those missing from the vocabulary are dropped.
        triplet: ``(a, b, c)`` for the analogy "a - b + c". The target is the
            best candidate returned by analogy_target.

    The scatter is always drawn when at least three words remain. The analogy
    path is drawn only when a, b, c and the predicted target are all among the
    plotted words; otherwise a short notice explains why it was skipped.
    """
    words = [w for w in words if w in model_kv.key_to_index]
    # PCA with three components needs at least three samples.
    if len(words) < 3:
        print("Not enough words")
        return
    X = np.stack([model_kv[w] for w in words])
    p3 = PCA(n_components=3, random_state=0).fit_transform(X)
    fig = plt.figure(figsize=(8,6))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(p3[:,0], p3[:,1], p3[:,2])
    for i, w in enumerate(words):
        ax.text(p3[i,0], p3[i,1], p3[i,2], w)

    a, b, c = triplet
    _, best = analogy_target(model_kv, a, b, c)  # returns [] if a term is unknown
    if best:
        target = best[0][0]
        # First occurrence wins, matching list.index semantics.
        row_of = {}
        for i, w in enumerate(words):
            row_of.setdefault(w, i)
        missing = [w for w in (a, b, c, target) if w not in row_of]
        if missing:
            print(f"Skipping analogy arrows ({title}): {missing} not among the plotted words")
        else:
            A, B, C, T = (p3[row_of[w]] for w in (a, b, c, target))
            ax.plot([A[0],B[0]],[A[1],B[1]],[A[2],B[2]])
            ax.plot([B[0],C[0]],[B[1],C[1]],[B[2],C[2]])
            ax.plot([C[0],T[0]],[C[1],T[1]],[C[2],T[2]])
            ax.text(T[0], T[1], T[2], f"≈ {target}")
    ax.set_title(title + " – Analogy arrows")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    plt.tight_layout()
    plt.show()

def word_analogy_plot_2d(model_kv, title, words, triplet=("king","man","woman")):
    """Scatter *words* in 2D UMAP space and trace the analogy a -> b -> c -> target.

    Args:
        model_kv: Vector store exposing ``key_to_index``, ``vectors`` and
            ``__getitem__``.
        title: Plot title prefix.
        words: Candidate words; those missing from the vocabulary are dropped.
        triplet: ``(a, b, c)`` for the analogy "a - b + c". The target is the
            best candidate returned by analogy_target.

    The analogy path is drawn only when a, b, c and the predicted target are
    all among the plotted words. The target is searched over the model's whole
    vocabulary, so it may not be plotted; in that case a notice is printed
    instead of failing on the lookup.
    """
    words = [w for w in words if w in model_kv.key_to_index]
    if len(words) < 3:
        print("Not enough words")
        return
    X = np.stack([model_kv[w] for w in words])
    u = umap.UMAP(n_components=2, random_state=0, n_neighbors=8, min_dist=0.1).fit_transform(X)
    plt.figure(figsize=(6,5))
    plt.scatter(u[:,0], u[:,1])
    for i, w in enumerate(words):
        plt.text(u[i,0]+0.01, u[i,1]+0.01, w)

    a, b, c = triplet
    if all(w in words for w in (a, b, c)):
        _, best = analogy_target(model_kv, a, b, c)
        if best:
            target = best[0][0]
            if target in words:
                A, B, Cp, T = (u[words.index(w)] for w in (a, b, c, target))
                plt.plot([A[0],B[0]],[A[1],B[1]])
                plt.plot([B[0],Cp[0]],[B[1],Cp[1]])
                plt.plot([Cp[0],T[0]],[Cp[1],T[1]])
                plt.text(T[0], T[1], f"≈ {target}")
            else:
                print(f"Skipping analogy arrows ({title}): target '{target}' is not among the plotted words")
    plt.title(title + " – Analogy arrows")
    plt.xlabel("UMAP-1")
    plt.ylabel("UMAP-2")
    plt.tight_layout()
    plt.show()

# Draw the 3D analogy plots for the trained W2V/FT models directly; GloVe is
# plotted further down through a small wrapper restricted to CONTENT_WORDS.
word_analogy_plot_3d(w2v_kv, "W2V (PCA 3D)", CONTENT_WORDS, ("king","man","woman"))
word_analogy_plot_3d(ft_kv,  "FastText (PCA 3D)", CONTENT_WORDS, ("king","man","woman"))

class SimpleKV:
    """Minimal keyed-vector view restricted to a fixed list of words.

    Exposes the three members the analogy helpers use (``key_to_index``,
    ``vectors`` and ``__getitem__``), so analogy search and plotting are
    limited to *words* instead of a model's whole vocabulary.

    Args:
        words: Words to include; row ``i`` of ``vectors`` belongs to
            ``words[i]``.
        source: Any object supporting ``source[word] -> vector``. Defaults to
            the module-level `glove` model, which keeps ``SimpleKV(words)``
            working as before.
    """

    def __init__(self, words, source=None):
        lookup = glove if source is None else source
        self.key_to_index = {w: i for i, w in enumerate(words)}
        self.vecs = np.stack([lookup[w] for w in words])

    def __getitem__(self, w):
        return self.vecs[self.key_to_index[w]]

    @property
    def vectors(self):
        # gensim-style alias for the stacked vector matrix
        return self.vecs

# GloVe's vocabulary is large, so wrap just the content words; the analogy
# search and the plots then stay within the words that are displayed.
glove_words = [w for w in CONTENT_WORDS if w in glove.key_to_index]
gkv = SimpleKV(glove_words) if len(glove_words) >= 4 else None

if gkv is not None:
    word_analogy_plot_3d(gkv, "GloVe (PCA 3D)", glove_words, ("king","man","woman"))

# The W2V/FastText 2D plots do not depend on GloVe, so they must not be gated
# on how many content words GloVe happens to know.
word_analogy_plot_2d(w2v_kv, "W2V (UMAP 2D)", CONTENT_WORDS, ("king","man","woman"))
word_analogy_plot_2d(ft_kv,  "FastText (UMAP 2D)", CONTENT_WORDS, ("king","man","woman"))

if gkv is not None:
    word_analogy_plot_2d(gkv, "GloVe (UMAP 2D)", glove_words, ("king","man","woman"))
