In [1]:
!pip -q install gensim numpy pandas scipy


In [2]:
import os, json, zipfile, urllib.request
os.makedirs("data", exist_ok=True)

def safe_download(url, out_path):
    if os.path.exists(out_path) and os.path.getsize(out_path) == 0:
        os.remove(out_path)
    if not os.path.exists(out_path):
        print(f"Downloading -> {out_path}")
        urllib.request.urlretrieve(url, out_path)
    else:
        print(f"Exists -> {out_path}")

# WordSim353
safe_download(
    "https://raw.githubusercontent.com/AdrienGuille/DistributionalSemantics/master/evaluation_data/wordsim353.tsv",
    "data/wordsim353.tsv"
)

# Win353 (we will evaluate as similarity-like with correlation)
safe_download(
    "https://raw.githubusercontent.com/kliegr/word_similarity_relatedness_datasets/master/win353.csv",
    "data/win353.csv"
)

# BATS via pCloud public link (vecto)
BATS_ZIP = "data/BATS_3.0.zip"
BATS_DIR = "data/BATS_3.0"

if not os.path.isdir(BATS_DIR) or len(os.listdir(BATS_DIR)) == 0:
    print("Downloading BATS via pCloud ...")
    code = "XZOn0J7Z8fzFMt7Tw1mGS6uI1SYfCfTyJQTV"
    api_url = f"https://api.pcloud.com/getpublinkdownload?code={code}&forcedownload=1"
    with urllib.request.urlopen(api_url) as r:
        meta = json.loads(r.read().decode("utf-8"))
    if meta.get("result") != 0:
        raise RuntimeError(f"pCloud API error: {meta}")

    hosts, path = meta["hosts"], meta["path"]
    last_err = None
    for h in hosts:
        try:
            urllib.request.urlretrieve(f"https://{h}{path}", BATS_ZIP)
            last_err = None
            break
        except Exception as e:
            last_err = e
    if last_err is not None:
        raise RuntimeError(f"Failed downloading BATS: {last_err}")

    print("Extracting BATS zip ...")
    with zipfile.ZipFile(BATS_ZIP, "r") as z:
        z.extractall("data")
else:
    print("Exists -> data/BATS_3.0/")

print("\nFiles in data/:", os.listdir("data"))
print("BATS top-level:", os.listdir("data/BATS_3.0"))


Exists -> data/wordsim353.tsv
Exists -> data/win353.csv
Exists -> data/BATS_3.0/

Files in data/: ['win353_like_wordsim.tsv', 'win353.csv', 'wordsim353.tsv', 'BATS_3.0.zip', 'BATS_3.0']
BATS top-level: ['1_Inflectional_morphology', '4_Lexicographic_semantics', '3_Encyclopedic_semantics', '2_Derivational_morphology', 'metadata.json']


In [3]:
import os, re, string, random, tempfile
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
import gensim.downloader as api
from IPython.display import display

# ---- Convert Win353 CSV -> TSV (w1, w2, score)
win_raw = pd.read_csv("data/win353.csv")

def find_col(df, keyword, number=None):
    keyword = keyword.lower()
    out = []
    for c in df.columns:
        name = c.lower().replace(" ", "")
        if keyword in name:
            if number is None or str(number) in name:
                out.append(c)
    if not out:
        raise ValueError(f"Column for {keyword}{number or ''} not found. Columns={df.columns.tolist()}")
    return out[0]

col_word1 = find_col(win_raw, "word", 1)
col_word2 = find_col(win_raw, "word", 2)
col_score = find_col(win_raw, "human")

WIN353_TSV = "data/win353_like_wordsim.tsv"
win_raw[[col_word1, col_word2, col_score]].to_csv(WIN353_TSV, sep="\t", header=False, index=False)

# ---- Paths
WORDSIM_PATH = "data/wordsim353.tsv"
WIN353_PATH  = WIN353_TSV
BATS_BASE    = "data/BATS_3.0"

# ---- Hyperparameter grid (12 models)
ARCHITECTURES = ["cbow", "skipgram"]
DIMENSIONS    = [50, 100, 300]
WINDOWS       = [5, 10]
NEGATIVE      = 5
EPOCHS        = 5

# ---- Fixed training params
MIN_COUNT = 5
WORKERS   = 4
SEED      = 42

# ---- Qualitative words
POLY_WORDS = ["bank", "apple", "run"]

# ---- Preprocess tools
PUNCT_TABLE = str.maketrans("", "", string.punctuation)
RNG = random.Random(SEED)

# ---- BATS control
MAX_BATS_PAIRS_PER_FILE     = 120
MAX_BATS_QUESTIONS_PER_FILE = 5000


In [4]:
def preprocess_token(tok: str):
    tok = tok.lower()
    if tok.isdigit():
        return "<NUM>"
    tok = tok.translate(PUNCT_TABLE)
    return tok if tok else None

def load_corpus_text8():
    dataset = api.load("text8")
    sents = list(dataset)
    print(f"[Corpus] Loaded text8 with {len(sents)} sentences.")
    return sents

def preprocess_sentences(sentences_raw):
    out = []
    for sent in sentences_raw:
        cleaned = []
        for tok in sent:
            t = preprocess_token(tok)
            if t:
                cleaned.append(t)
        if cleaned:
            out.append(cleaned)
    print(f"[Preprocess] {len(out)} sentences after cleaning.")
    return out


In [5]:
BATS_Q_DIR = "data/bats_qwords"
os.makedirs(BATS_Q_DIR, exist_ok=True)

def list_txt_files(root):
    out = []
    for r,_,fs in os.walk(root):
        for f in fs:
            if f.lower().endswith(".txt"):
                out.append(os.path.join(r,f))
    return sorted(out)

def clean_token_for_bats(t: str):
    t = t.strip().lower()
    t = t.translate(PUNCT_TABLE)
    if not t:
        return None
    # skip multiword / underscored forms to keep analogies stable
    if " " in t or "_" in t:
        return None
    return t

def read_pairs_strict(path):
    pairs = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = re.split(r"\s+", line)
            if len(parts) < 2:
                continue
            a = clean_token_for_bats(parts[0])
            b = clean_token_for_bats(parts[1])
            if a and b:
                pairs.append((a,b))
    # unique
    pairs = list(dict.fromkeys(pairs))
    return pairs

def build_questions(pairs, n_questions, seed):
    rng = random.Random(seed)
    if len(pairs) < 2:
        return []
    qs = []
    while len(qs) < n_questions:
        i = rng.randrange(len(pairs))
        j = rng.randrange(len(pairs))
        if i == j:
            continue
        a1,b1 = pairs[i]
        a2,b2 = pairs[j]
        qs.append((a1,b1,a2,b2))
    return qs

def pick_best_file_by_pairs(dir_path):
    best_p, best_n = None, -1
    for p in list_txt_files(dir_path):
        n = len(read_pairs_strict(p))
        if n > best_n:
            best_p, best_n = p, n
    if best_p is None or best_n < 2:
        raise FileNotFoundError(f"No usable file in {dir_path}")
    return best_p, best_n

# --- pick 4 categories compatible with your BATS layout
inf_dir = os.path.join(BATS_BASE, "1_Inflectional_morphology")
der_dir = os.path.join(BATS_BASE, "2_Derivational_morphology")
lex_dir = os.path.join(BATS_BASE, "4_Lexicographic_semantics")

inf_files = list_txt_files(inf_dir)
plural_reg = [p for p in inf_files if "plural_reg" in p.lower()]
plural_irr = [p for p in inf_files if "plural_irreg" in p.lower()]
if plural_reg and plural_irr:
    bats1_src, bats2_src = plural_reg[0], plural_irr[0]
else:
    plural_any = [p for p in inf_files if "plural" in p.lower()]
    if len(plural_any) < 2:
        raise FileNotFoundError("Not enough plural files in inflectional folder.")
    bats1_src, bats2_src = plural_any[0], plural_any[1]

bats3_src, bats3_n = pick_best_file_by_pairs(der_dir)

lex_files = list_txt_files(lex_dir)
ant_bin = [p for p in lex_files if "antonyms" in p.lower() and "binary" in p.lower()]
ant_any = [p for p in lex_files if "antonyms" in p.lower()]
if ant_bin:
    bats4_src = ant_bin[0]
elif ant_any:
    bats4_src = ant_any[0]
else:
    raise FileNotFoundError("No antonyms file found.")

SELECTED_BATS = [
    ("BATS_1_plural_reg", bats1_src),
    ("BATS_2_plural_irreg", bats2_src),
    ("BATS_3_deriv_best", bats3_src),
    ("BATS_4_antonyms", bats4_src),
]

print("Selected BATS sources:")
for tag, src in SELECTED_BATS:
    print(" ", tag, "->", src)

# --- build questions-words files ONCE
BATS_QFILES = []
for k, (tag, src_path) in enumerate(SELECTED_BATS, start=1):
    pairs = read_pairs_strict(src_path)

    if len(pairs) > MAX_BATS_PAIRS_PER_FILE:
        rng = random.Random(SEED + k)
        pairs = rng.sample(pairs, MAX_BATS_PAIRS_PER_FILE)

    qs = build_questions(pairs, MAX_BATS_QUESTIONS_PER_FILE, seed=SEED + k)
    if len(qs) == 0:
        raise ValueError(f"{tag}: not enough pairs to build analogy questions.")

    out_path = os.path.join(BATS_Q_DIR, f"{tag}.txt")
    with open(out_path, "w", encoding="utf-8") as out:
        out.write(f": {tag}\n")
        for a1,b1,a2,b2 in qs:
            out.write(f"{a1} {b1} {a2} {b2}\n")

    BATS_QFILES.append(out_path)
    print(f"Built {out_path} | pairs={len(pairs)} | questions={len(qs)}")

print("\n✅ BATS prepared. Evaluation will not fail due to format.")


Selected BATS sources:
  BATS_1_plural_reg -> data/BATS_3.0/1_Inflectional_morphology/I01 [noun - plural_reg].txt
  BATS_2_plural_irreg -> data/BATS_3.0/1_Inflectional_morphology/I02 [noun - plural_irreg].txt
  BATS_3_deriv_best -> data/BATS_3.0/2_Derivational_morphology/D01 [noun+less_reg].txt
  BATS_4_antonyms -> data/BATS_3.0/4_Lexicographic_semantics/L10 [antonyms - binary].txt
Built data/bats_qwords/BATS_1_plural_reg.txt | pairs=50 | questions=5000
Built data/bats_qwords/BATS_2_plural_irreg.txt | pairs=50 | questions=5000
Built data/bats_qwords/BATS_3_deriv_best.txt | pairs=50 | questions=5000
Built data/bats_qwords/BATS_4_antonyms.txt | pairs=50 | questions=5000

✅ BATS prepared. Evaluation will not fail due to format.


In [6]:
def train_word2vec_model(sentences, arch, dim, window):
    sg = 1 if arch == "skipgram" else 0
    return Word2Vec(
        sentences=sentences,
        vector_size=dim,
        window=window,
        sg=sg,
        negative=NEGATIVE,
        min_count=MIN_COUNT,
        workers=WORKERS,
        epochs=EPOCHS,
        seed=SEED,
    )

def train_all_models(sentences):
    models = {}
    for arch in ARCHITECTURES:
        for dim in DIMENSIONS:
            for win in WINDOWS:
                name = f"{arch}_dim{dim}_win{win}"
                print(f"\n[Train] {name}")
                models[name] = train_word2vec_model(sentences, arch, dim, win)
    return models


In [7]:
def eval_word_pairs_spearman(kv, filepath):
    pearson, spearman, oov_ratio = kv.evaluate_word_pairs(filepath)
    return float(spearman[0]), float(oov_ratio)

def eval_bats(model):
    scores = {}
    for i, qpath in enumerate(BATS_QFILES, start=1):
        print(f"  [Eval] BATS_{i} -> {os.path.basename(qpath)}")
        acc, _ = model.wv.evaluate_word_analogies(qpath)
        scores[i] = float(acc)
    return scores

def evaluate_all_models(models, out_csv="word2vec_param_search_results.csv"):
    rows = []
    for arch in ARCHITECTURES:
        for dim in DIMENSIONS:
            for win in WINDOWS:
                name = f"{arch}_dim{dim}_win{win}"
                print(f"\n=== Evaluating {name} ===")
                model = models[name]

                ws_s, ws_oov   = eval_word_pairs_spearman(model.wv, WORDSIM_PATH)
                win_s, win_oov = eval_word_pairs_spearman(model.wv, WIN353_PATH)
                bats_scores    = eval_bats(model)

                row = {
                    "ModelName": name,
                    "Architecture": arch,
                    "Window": win,
                    "Dim": dim,
                    "#Neg.samples": NEGATIVE,
                    "WordSim353_Spearman": ws_s,
                    "WordSim353_OOV%": ws_oov,
                    "Win353_Spearman": win_s,
                    "Win353_OOV%": win_oov,
                }
                for i in range(1,5):
                    row[f"BATS_{i}"] = bats_scores[i]

                # MeanScore for selecting best/worst
                ws01  = (ws_s + 1)/2
                win01 = (win_s + 1)/2
                mean  = pd.Series([ws01, win01] + [row[f"BATS_{i}"] for i in range(1,5)], dtype=float).mean()
                row["MeanScore"] = float(mean)

                rows.append(row)
                pd.DataFrame(rows).to_csv(out_csv, index=False)
                print(f"  [Checkpoint] saved {out_csv} (rows={len(rows)})")

    return pd.DataFrame(rows)


In [8]:
def nearest_neighbors_table(best_model, worst_model, words, topn=5):
    rows = []
    for w in words:
        w_p = preprocess_token(w)
        try:
            best = [x for (x, _) in best_model.wv.most_similar(w_p, topn=topn)]
        except KeyError:
            best = ["<OOV>"]
        try:
            worst = [x for (x, _) in worst_model.wv.most_similar(w_p, topn=topn)]
        except KeyError:
            worst = ["<OOV>"]
        rows.append({"Word": w, "BestModelNeighbors": ", ".join(best), "WorstModelNeighbors": ", ".join(worst)})
    df = pd.DataFrame(rows)
    df.to_csv("polysemous_neighbors.csv", index=False)
    print("Saved polysemous_neighbors.csv")
    return df


In [9]:
sentences_raw = load_corpus_text8()
sentences = preprocess_sentences(sentences_raw)

models = train_all_models(sentences)

results_df = evaluate_all_models(models, out_csv="word2vec_param_search_results.csv")
display(results_df)

best_name  = results_df.loc[results_df["MeanScore"].idxmax(), "ModelName"]
worst_name = results_df.loc[results_df["MeanScore"].idxmin(), "ModelName"]
print("\nBest:", best_name, "| Worst:", worst_name)

best_model  = models[best_name]
worst_model = models[worst_name]

best_model.wv.save("best_w2v.kv")
print("Saved best_w2v.kv (for Part 2)")

nn_df = nearest_neighbors_table(best_model, worst_model, POLY_WORDS, topn=5)
display(nn_df)


[Corpus] Loaded text8 with 1701 sentences.
[Preprocess] 1701 sentences after cleaning.

[Train] cbow_dim50_win5

[Train] cbow_dim50_win10

[Train] cbow_dim100_win5

[Train] cbow_dim100_win10

[Train] cbow_dim300_win5

[Train] cbow_dim300_win10

[Train] skipgram_dim50_win5

[Train] skipgram_dim50_win10

[Train] skipgram_dim100_win5

[Train] skipgram_dim100_win10

[Train] skipgram_dim300_win5

[Train] skipgram_dim300_win10

=== Evaluating cbow_dim50_win5 ===
  [Eval] BATS_1 -> BATS_1_plural_reg.txt
  [Eval] BATS_2 -> BATS_2_plural_irreg.txt
  [Eval] BATS_3 -> BATS_3_deriv_best.txt
  [Eval] BATS_4 -> BATS_4_antonyms.txt
  [Checkpoint] saved word2vec_param_search_results.csv (rows=1)

=== Evaluating cbow_dim50_win10 ===
  [Eval] BATS_1 -> BATS_1_plural_reg.txt
  [Eval] BATS_2 -> BATS_2_plural_irreg.txt
  [Eval] BATS_3 -> BATS_3_deriv_best.txt
  [Eval] BATS_4 -> BATS_4_antonyms.txt
  [Checkpoint] saved word2vec_param_search_results.csv (rows=2)

=== Evaluating cbow_dim100_win5 ===
  [Eval] 

Unnamed: 0,ModelName,Architecture,Window,Dim,#Neg.samples,WordSim353_Spearman,WordSim353_OOV%,Win353_Spearman,Win353_OOV%,BATS_1,BATS_2,BATS_3,BATS_4,MeanScore
0,cbow_dim50_win5,cbow,5,50,5,0.597904,0.566572,0.489252,0.566572,0.5122,0.2856,0.0,0.313725,0.442517
1,cbow_dim50_win10,cbow,10,50,5,0.654797,0.566572,0.509595,0.566572,0.5014,0.3046,0.0,0.248366,0.439427
2,cbow_dim100_win5,cbow,5,100,5,0.622663,0.566572,0.53171,0.566572,0.6068,0.384,0.0,0.372549,0.490089
3,cbow_dim100_win10,cbow,10,100,5,0.680943,0.566572,0.537974,0.566572,0.608,0.3682,0.00198,0.45098,0.506437
4,cbow_dim300_win5,cbow,5,300,5,0.632092,0.566572,0.539783,0.566572,0.6284,0.4286,0.0,0.405229,0.508028
5,cbow_dim300_win10,cbow,10,300,5,0.681008,0.566572,0.534627,0.566572,0.6292,0.4024,0.0,0.431373,0.511798
6,skipgram_dim50_win5,skipgram,5,50,5,0.656395,0.566572,0.512987,0.566572,0.5386,0.2782,0.0,0.248366,0.441643
7,skipgram_dim50_win10,skipgram,10,50,5,0.668606,0.566572,0.510041,0.566572,0.4594,0.2652,0.0,0.30719,0.436852
8,skipgram_dim100_win5,skipgram,5,100,5,0.68684,0.566572,0.533319,0.566572,0.5872,0.37,0.0,0.496732,0.510669
9,skipgram_dim100_win10,skipgram,10,100,5,0.699725,0.566572,0.523059,0.566572,0.5524,0.3294,0.0,0.411765,0.484159



Best: skipgram_dim300_win5 | Worst: skipgram_dim50_win10
Saved best_w2v.kv (for Part 2)
Saved polysemous_neighbors.csv


Unnamed: 0,Word,BestModelNeighbors,WorstModelNeighbors
0,bank,"monetary, banks, fund, suntrust, banking","banks, monetary, fund, banking, loans"
1,apple,"macintosh, iic, iigs, iie, amiga","macintosh, imac, iigs, iic, workstation"
2,run,"runs, running, ran, cooperatively, consecutively","running, drivers, lotteries, runs, backups"


In [10]:
import os
for f in ["word2vec_param_search_results.csv", "polysemous_neighbors.csv", "best_w2v.kv"]:
    print(f, "->", "OK" if os.path.exists(f) else "MISSING")


word2vec_param_search_results.csv -> OK
polysemous_neighbors.csv -> OK
best_w2v.kv -> OK


In [11]:
from google.colab import files
files.download("word2vec_param_search_results.csv")
files.download("polysemous_neighbors.csv")
files.download("best_w2v.kv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>