In [1]:
import os
import re
import sys
import json
import string
import warnings
from pathlib import Path
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ensure needed packages (lightweight, idempotent)
def _ensure_packages():
    """Install any third-party package this notebook needs but cannot import.

    Each entry is probed by importing a module; when that import fails, the
    matching distribution is installed with ``pip`` for the interpreter that
    runs the kernel (``sys.executable``).

    ``kagglehub`` is optional: if installing it fails, a ``RuntimeWarning`` is
    emitted and execution continues. The remaining packages are required, so a
    failed install propagates the exception raised by ``subprocess.check_call``.
    """
    import importlib
    import subprocess
    import warnings

    # (module to probe, pip distribution name, required?)
    wanted = [
        ("kagglehub", "kagglehub", False),
        ("gensim", "gensim", True),
        ("nltk", "nltk", True),
        ("sklearn.linear_model", "scikit-learn", True),
    ]
    for module_name, dist_name, required in wanted:
        try:
            importlib.import_module(module_name)
            continue
        except Exception:
            # Deliberately broad: a half-broken install can fail with errors
            # other than ImportError, and reinstalling is the remedy either way.
            pass

        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", dist_name])
        except Exception as exc:
            if required:
                raise
            # Best effort only, but say so instead of failing silently.
            warnings.warn(
                f"Optional package {dist_name!r} could not be installed: {exc!r}",
                RuntimeWarning,
            )
        else:
            # Make the freshly installed package visible to later imports.
            importlib.invalidate_caches()

_ensure_packages()

import gensim
import gensim.downloader as api
from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# NLTK data downloads
def _ensure_nltk_data():
    """Download each NLTK resource used by this notebook that is not found locally."""
    # lookup path inside the NLTK data directory -> name passed to nltk.download
    needed = {
        "tokenizers/punkt": "punkt",
        "tokenizers/punkt_tab": "punkt_tab",  # newer nltk sometimes needs this
        "corpora/stopwords": "stopwords",
    }
    for lookup_path, package in needed.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            # Only a failed lookup triggers a (quiet) download.
            nltk.download(package, quiet=True)

_ensure_nltk_data()

# Shared configuration.
RANDOM_STATE = 42  # fixed seed for reproducibility
STOPWORDS = set(stopwords.words("english"))

# Output directory for generated artifacts; created up front.
ARTIFACT_DIR = Path("./artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ==== PART 1: print all results in one go (and save to file) ====
import os, io
from pathlib import Path
from contextlib import redirect_stdout

import gensim
import gensim.downloader as api
from gensim.models.keyedvectors import KeyedVectors

# --- loader (works with GoogleNews or falls back to GloVe) ---
def load_pretrained_w2v():
    """Return pretrained word vectors, trying three sources in order.

    1. A local word2vec-format file named by the ``W2V_LOCAL_PATH``
       environment variable (used only when that path exists).
    2. ``word2vec-google-news-300`` via ``gensim.downloader``.
    3. ``glove-wiki-gigaword-100`` via ``gensim.downloader``.

    A failure in step 1 or 2 is reported on stdout and the next source is
    tried. A failure in step 3 propagates to the caller.

    Returns:
        The loaded vectors: the result of ``KeyedVectors.load_word2vec_format``
        for a local file, otherwise whatever ``api.load`` returns.
    """
    local_path = os.environ.get("W2V_LOCAL_PATH", "").strip()
    if local_path and Path(local_path).exists():
        print(f"[info] Loading local Word2Vec model from: {local_path}")
        # Binary dumps are often shipped compressed (for example "*.bin.gz") or
        # with an upper-case extension. Testing only for a literal ".bin"
        # suffix would parse those as text and fail.
        is_binary = local_path.lower().endswith((".bin", ".bin.gz", ".bin.bz2"))
        try:
            return KeyedVectors.load_word2vec_format(local_path, binary=is_binary)
        except Exception as e:
            # Fall through to the downloader rather than aborting.
            print("[warn] Failed loading local model:", repr(e))

    print("[info] Attempting to load 'word2vec-google-news-300' via gensim.downloader...")
    try:
        kv = api.load("word2vec-google-news-300")  # ~1.6GB
        print("[info] Loaded 'word2vec-google-news-300'.")
        return kv
    except Exception as e:
        print("[warn] Could not load 'word2vec-google-news-300':", repr(e))
        print("[info] Falling back to 'glove-wiki-gigaword-100'")
        return api.load("glove-wiki-gigaword-100")

# --- helpers for robust token presence (case variations) ---
def pick_variant_in_vocab(kv, token):
    """Return the first casing of ``token`` that is a key of ``kv.key_to_index``.

    Casings are tried in this order: as given, lower-case, title-case,
    upper-case. Returns ``None`` when none of them is present.
    """
    vocab = kv.key_to_index
    casings = [token, token.lower(), token.title(), token.upper()]
    return next((form for form in casings if form in vocab), None)

def safe_analogy(kv, a, b, c, topn=10):
    """Print the ``topn`` nearest neighbours of ``a - b + c``.

    Each word is first resolved to a casing present in the vocabulary via
    ``pick_variant_in_vocab``. If any word cannot be resolved, a ``[skip]``
    line is printed and nothing else happens. Always returns ``None``.
    """
    resolved = [pick_variant_in_vocab(kv, word) for word in (a, b, c)]
    aa, bb, cc = resolved

    if any(form is None for form in resolved):
        shown = ",".join(form or "X" for form in resolved)
        print(f"\n[skip] Missing tokens for analogy: {a}-{b}+{c} (variants: {shown})")
        return

    # a and c pull the query towards them, b pushes it away.
    neighbours = kv.most_similar(positive=[aa, cc], negative=[bb], topn=topn)
    print(f"\n{aa} - {bb} + {cc} ≈")
    for term, score in neighbours:
        print(f"{term:>20s}  {score:.4f}")

# --- run everything and also save full output to a file ---
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
out_txt = ARTIFACT_DIR / "part1_results.txt"

# Everything printed inside the redirect is collected in `buf`, so it can be
# shown in the cell and written to disk as one untruncated block.
buf = io.StringIO()
try:
    with redirect_stdout(buf):
        kv = load_pretrained_w2v()

        # choose 5 seeds; ensure they exist (try variants)
        seed_candidates = ["king", "movie", "love", "computer", "music", "actor", "woman", "man", "paris", "france"]
        seeds = []
        for w in seed_candidates:
            v = pick_variant_in_vocab(kv, w)
            if v and v not in seeds:
                seeds.append(v)
            if len(seeds) == 5:
                break

        print("\n[PART 1] Similar words for 5 seeds")
        print("[seeds used]:", seeds)
        for w in seeds:
            sims = kv.most_similar(w, topn=10)
            print(f"\nTop similar to '{w}':")
            for term, score in sims:
                print(f"{term:>20s}  {score:.4f}")

        print("\n[PART 1] Analogy tests (a - b + c)")
        # use robust case-aware variants; will skip if truly missing
        safe_analogy(kv, "king", "man", "woman")       # classic
        safe_analogy(kv, "Paris", "France", "Italy")   # capital-country swap
        safe_analogy(kv, "actor", "man", "woman")      # gendered profession
finally:
    # Runs on failure too: a crash part-way through still shows and saves
    # whatever was captured, instead of discarding it with the buffer.
    print(buf.getvalue())

    # and save full, untruncated output
    with open(out_txt, "w", encoding="utf-8") as f:
        f.write(buf.getvalue())
    print(f"\n[info] Saved full output to: {out_txt.resolve()}")


[info] Attempting to load 'word2vec-google-news-300' via gensim.downloader...
[info] Loaded 'word2vec-google-news-300'.

[PART 1] Similar words for 5 seeds
[seeds used]: ['king', 'movie', 'love', 'computer', 'music']

Top similar to 'king':
               kings  0.7138
               queen  0.6511
             monarch  0.6413
        crown_prince  0.6204
              prince  0.6160
              sultan  0.5865
               ruler  0.5798
             princes  0.5647
        Prince_Paras  0.5433
              throne  0.5422

Top similar to 'movie':
                film  0.8677
              movies  0.8013
               films  0.7363
               moive  0.6830
               Movie  0.6694
        horror_flick  0.6578
              sequel  0.6578
Guy_Ritchie_Revolver  0.6510
     romantic_comedy  0.6413
               flick  0.6322

Top similar to 'love':
               loved  0.6908
               adore  0.6817
               loves  0.6619
             passion  0.6101
              

In [6]:
# ==== PART 2 ONLY: IMDB sentiment — print ALL results & save full log (with fixes) ====
import os, io, sys, re, json, string, warnings
from pathlib import Path
from contextlib import redirect_stdout
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---- deps ----
import gensim.downloader as api
from gensim.models import Word2Vec, FastText
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# --- NLTK data (minimal) ---
# NLTK lookup path -> downloadable package; only missing resources are fetched.
_NLTK_RESOURCES = {
    "tokenizers/punkt": "punkt",
    "tokenizers/punkt_tab": "punkt_tab",
    "corpora/stopwords": "stopwords",
}
for path, pkg in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(pkg, quiet=True)

STOPWORDS = set(stopwords.words("english"))
RANDOM_STATE = 42
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------- helpers & pipeline -----------------------------
# Patterns used by clean_and_tokenize; each match is replaced by one space.
RE_HTML = re.compile(r"<.*?>")
RE_URL = re.compile(r"(https?://\S+|www\.\S+)")
RE_PUNCT = re.compile(rf"[{re.escape(string.punctuation)}]")

def load_imdb():
    """Load the IMDB review dataset as a DataFrame with ``review`` and ``label``.

    The CSV is fetched with ``kagglehub``. If kagglehub cannot be imported or
    the download fails, the current directory tree is searched for a CSV whose
    file name contains "imdb".

    Returns:
        pandas.DataFrame with two columns: ``review`` (as read from the CSV)
        and ``label`` (int, 1 = positive, 0 = negative).

    Raises:
        FileNotFoundError: if no suitable CSV can be located.
    """
    csv = None
    try:
        # Imported inside the try so that a missing kagglehub falls through to
        # the local search below instead of aborting before it can run.
        import kagglehub

        print("\n[info] Using kagglehub to fetch IMDB dataset...")
        p = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
        print("[info] kagglehub path:", p)
        base = Path(p)
        # Top-level files first, then the recursive walk.
        cands = list(base.glob("*.csv")) + list(base.rglob("*.csv"))
        for c in cands:
            if c.name.lower().startswith("imdb") and c.suffix.lower() == ".csv":
                csv = c
                break
        if csv is None and cands:
            csv = cands[0]
    except Exception as e:
        print("[warn] kagglehub unavailable/failure:", repr(e))
        base = Path(".")
        for c in list(base.glob("*.csv")) + list(base.rglob("*.csv")):
            if "imdb" in c.name.lower():
                csv = c
                break
    if csv is None:
        raise FileNotFoundError("Could not locate IMDB CSV. Ensure it's downloaded or available locally.")

    print(f"[info] Loading dataset: {csv}")
    df = pd.read_csv(csv)
    cols = {c.lower(): c for c in df.columns}
    review_col = cols.get("review")
    sent_col = cols.get("sentiment")
    if review_col is None or sent_col is None:
        # Unrecognised header: assume the first two columns are review, sentiment.
        df = df.iloc[:, :2]
        df.columns = ["review", "sentiment"]
        review_col = "review"
        sent_col = "sentiment"

    df = df[[review_col, sent_col]].rename(columns={review_col: "review", sent_col: "sentiment"})
    df["sentiment"] = df["sentiment"].astype(str).str.strip().str.lower()

    mapping = {"positive": 1, "pos": 1, "1": 1, "negative": 0, "neg": 0, "0": 0}
    df["label"] = df["sentiment"].map(mapping)
    unmapped = df["label"].isna()
    if unmapped.any():
        # Heuristic for unknown spellings: first letter 'p' -> 1, anything
        # else -> 0. Report it so mislabelled rows do not go unnoticed.
        print(f"[warn] {int(unmapped.sum())} sentiment value(s) not in the known label set; "
              "guessing from the first letter ('p' -> 1, otherwise 0)")
        df.loc[unmapped, "label"] = (df.loc[unmapped, "sentiment"].str[0].str.lower() == "p").astype(int)
    df["label"] = df["label"].astype(int)
    df = df.drop(columns=["sentiment"])
    print(f"[info] Dataset size: {len(df)}; positives={int(df['label'].sum())}; negatives={len(df)-int(df['label'].sum())}")
    return df

def clean_and_tokenize(text, remove_punct=True, remove_stop=True, lowercase=True):
    """Normalise one review and split it into tokens.

    Steps, in order: replace ``RE_HTML`` and ``RE_URL`` matches with a space,
    optionally lower-case, optionally replace ``RE_PUNCT`` matches with a
    space, tokenize with ``word_tokenize``, optionally drop stop words.

    Args:
        text: Raw review; converted with ``str()`` first, so non-string values
            are accepted.
        remove_punct: Replace punctuation characters with spaces before tokenizing.
        remove_stop: Drop tokens found in ``STOPWORDS`` (compared case-insensitively).
        lowercase: Lower-case the text before tokenizing.

    Returns:
        list[str]: the remaining tokens in their original order.
    """
    text = RE_HTML.sub(" ", str(text))
    text = RE_URL.sub(" ", text)
    if lowercase:
        text = text.lower()
    if remove_punct:
        text = RE_PUNCT.sub(" ", text)
    toks = word_tokenize(text)
    if remove_stop:
        # NLTK's English stop-word list is lower-case, so compare on the
        # lower-cased token; otherwise "The"/"And" survive when lowercase=False.
        toks = [t for t in toks if t.strip() and t.lower() not in STOPWORDS]
    return toks

def text_eda(df):
    """Tokenize the reviews and report basic corpus statistics.

    Adds ``tokens`` and ``length`` columns to ``df`` in place. Prints the
    token-length summary, saves a length histogram, and prints and saves the
    20 most frequent tokens for each label. Returns the same ``df``.
    """
    print("\n[info] Running EDA...")
    df["tokens"] = df["review"].apply(clean_and_tokenize)
    df["length"] = df["tokens"].apply(len)

    length_summary = df["length"].describe()
    print("\nToken length summary:")
    print(length_summary.to_string())

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(df["length"], bins=50)
    ax.set_title("Review Length (tokens)")
    ax.set_xlabel("Tokens")
    ax.set_ylabel("Count")
    fig.tight_layout()
    out_hist = ARTIFACT_DIR / "imdb_length_hist.png"
    fig.savefig(out_hist, dpi=150)
    plt.close(fig)
    print(f"[info] Saved histogram: {out_hist}")

    for label, name in ((1, "positive"), (0, "negative")):
        # Count tokens review by review, in row order.
        counts = Counter()
        for review_tokens in df.loc[df["label"] == label, "tokens"]:
            counts.update(review_tokens)
        top20 = counts.most_common(20)

        print(f"\nTop tokens ({name}):")
        for token, count in top20:
            print(f"{token:>20s}  {count}")
        top_frame = pd.DataFrame(top20, columns=["token", "count"])
        top_frame.to_csv(ARTIFACT_DIR / f"top_tokens_{name}.csv", index=False)
    return df

def build_doc_vectors(token_seqs, kv, vector_size):
    """Average-pool word vectors into one float32 row per document.

    Only tokens that are keys of ``kv.key_to_index`` contribute. A document
    with no such token keeps an all-zero row.

    Returns:
        numpy.ndarray of shape ``(len(token_seqs), vector_size)``, dtype float32.
    """
    doc_matrix = np.zeros((len(token_seqs), vector_size), dtype=np.float32)
    vocab = kv.key_to_index
    for row, tokens in enumerate(token_seqs):
        known = [kv.get_vector(tok) for tok in tokens if tok in vocab]
        if not known:
            continue  # leave the zero row in place
        doc_matrix[row] = np.mean(known, axis=0)
    return doc_matrix

def train_and_eval(name, X_train_vec, X_test_vec, y_train, y_test):
    """Fit a logistic regression on the training vectors and score it on the test vectors.

    Prints accuracy, macro-F1 and the classification report, and writes the
    same figures to ``classification_report_<name>.txt`` under ``ARTIFACT_DIR``
    (spaces in ``name`` become underscores and the name is lower-cased).

    Returns:
        tuple: ``(accuracy, macro_f1)``.
    """
    model = LogisticRegression(solver="liblinear", max_iter=1000, random_state=RANDOM_STATE)
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)

    acc = accuracy_score(y_test, preds)
    f1m = f1_score(y_test, preds, average="macro")
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f} | F1-macro: {f1m:.4f}")

    rep = classification_report(y_test, preds, digits=4)
    print("\nClassification report:\n", rep)

    report_file = ARTIFACT_DIR / f"classification_report_{name.replace(' ','_').lower()}.txt"
    header = f"{name}\nAccuracy: {acc:.6f}\nF1-macro: {f1m:.6f}\n\n"
    with open(report_file, "w", encoding="utf-8") as f:
        f.write(header)
        f.write(rep)
    return acc, f1m

def _embed_and_score(name, kv_model, train_tokens, test_tokens, y_train, y_test):
    """Average-pool both splits with ``kv_model``, fit/evaluate, and return ``(name, acc, f1m)``."""
    size = kv_model.vector_size
    Xtr = build_doc_vectors(train_tokens, kv_model, size)
    Xte = build_doc_vectors(test_tokens, kv_model, size)
    acc, f1m = train_and_eval(name, Xtr, Xte, y_train, y_test)
    return name, acc, f1m


def part2_pipeline(kv_pretrained, embed_on_train_only=False):
    """Compare four embedding sources on IMDB sentiment with logistic regression.

    Loads and tokenizes the dataset, makes a stratified 80/20 split, then
    scores average-pooled document vectors from (1) the pretrained vectors,
    (2) a skip-gram Word2Vec, (3) a CBOW Word2Vec and (4) a FastText model
    trained here. Prints a comparison table and writes
    ``imdb_wordvectors_scores.csv`` and ``imdb_wordvectors_summary.json``
    under ``ARTIFACT_DIR``.

    Args:
        kv_pretrained: Pretrained vectors exposing ``vector_size``,
            ``key_to_index`` and ``get_vector``.
        embed_on_train_only: When False (default, the original behaviour) the
            custom embedding models are trained on every review, test split
            included. When True they see only the training split, so no test
            text influences the features.

    Returns:
        None. Results are printed and written to disk.
    """
    df = load_imdb()
    df = text_eda(df)

    X_train, X_test, y_train, y_test = train_test_split(
        df["tokens"].values, df["label"].values, test_size=0.2, stratify=df["label"].values, random_state=RANDOM_STATE
    )
    X_train_tokens, X_test_tokens = list(X_train), list(X_test)

    results = []

    # (1) Pretrained vectors, average-pooled
    print("\n[info] Vectorizing with pretrained embeddings...")
    results.append(_embed_and_score("PretrainedW2V_LogReg", kv_pretrained,
                                    X_train_tokens, X_test_tokens, y_train, y_test))

    # Corpus the custom embedding models are trained on.
    sentences = X_train_tokens if embed_on_train_only else list(df["tokens"].values)

    # (2) Skip-gram and (3) CBOW Word2Vec differ only in the `sg` flag.
    # NOTE: with workers > 1 gensim training is not bit-for-bit reproducible
    # even with a fixed seed, so scores can vary slightly between runs.
    for method, sg_flag, label in (("SkipGramW2V_LogReg", 1, "Skip-gram"),
                                   ("CBOWW2V_LogReg", 0, "CBOW")):
        print(f"\n[info] Training custom {label} Word2Vec...")
        w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=2,
                       workers=4, sg=sg_flag, epochs=5, seed=RANDOM_STATE)
        results.append(_embed_and_score(method, w2v.wv,
                                        X_train_tokens, X_test_tokens, y_train, y_test))

    # (4) FastText: gensim 4.x takes corpus_iterable=, older releases sentences=
    print("\n[info] Training custom FastText...")
    ft = FastText(vector_size=100, window=5, min_count=2, workers=4, sg=1, seed=RANDOM_STATE)
    try:
        ft.build_vocab(corpus_iterable=sentences)
        ft.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=5)
    except TypeError:
        ft.build_vocab(sentences=sentences)
        ft.train(sentences=sentences, total_examples=len(sentences), epochs=5)
    results.append(_embed_and_score("FastText_LogReg", ft.wv,
                                    X_train_tokens, X_test_tokens, y_train, y_test))

    # Final table, best macro-F1 first
    res_df = pd.DataFrame(results, columns=["Method", "Accuracy", "F1_macro"]).sort_values("F1_macro", ascending=False)
    print("\n=== Final Comparison ===")
    print(res_df.to_string(index=False))
    res_df.to_csv(ARTIFACT_DIR / "imdb_wordvectors_scores.csv", index=False)
    with open(ARTIFACT_DIR / "imdb_wordvectors_summary.json", "w", encoding="utf-8") as f:
        json.dump({"results": [{"method": m, "accuracy": float(a), "f1_macro": float(f1)} for m, a, f1 in results]}, f, indent=2)
    print(f"[info] Saved scores and summary under: {ARTIFACT_DIR}")

# ----------------------------- run & capture full stdout -----------------------------
# Re-use the vectors a previous cell left in `kv`; otherwise try the large
# Google News model first and fall back to the smaller GloVe model.
kv = globals().get("kv", None)
if kv is None:
    try:
        print("[info] Loading 'word2vec-google-news-300' (large, may be cached)...")
        kv = api.load("word2vec-google-news-300")
    except Exception as e:
        # Show why the first choice failed before switching models.
        print("[warn] Could not load 'word2vec-google-news-300':", repr(e))
        print("[warn] Falling back to 'glove-wiki-gigaword-100' (smaller)")
        kv = api.load("glove-wiki-gigaword-100")

buf = io.StringIO()
log_path = ARTIFACT_DIR / "part2_results.txt"
try:
    # catch_warnings restores the previous filters on exit, so warnings are
    # muted only for the pipeline run and not for the rest of the kernel session.
    with warnings.catch_warnings(), redirect_stdout(buf):
        warnings.simplefilter("ignore")
        part2_pipeline(kv_pretrained=kv)
finally:
    # Runs on failure too, so partial output is still shown and saved.
    print(buf.getvalue())
    with open(log_path, "w", encoding="utf-8") as f:
        f.write(buf.getvalue())
    print(f"\n[info] Saved full untruncated Part 2 log to: {log_path.resolve()}")



[info] Using kagglehub to fetch IMDB dataset...
[info] kagglehub path: C:\Users\jaypr\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1
[info] Loading dataset: C:\Users\jaypr\.cache\kagglehub\datasets\lakshmi25npathi\imdb-dataset-of-50k-movie-reviews\versions\1\IMDB Dataset.csv
[info] Dataset size: 50000; positives=25000; negatives=25000

[info] Running EDA...

Token length summary:
count    50000.000000
mean       119.615600
std         90.371551
min          3.000000
25%         64.000000
50%         89.000000
75%        145.000000
max       1435.000000
[info] Saved histogram: artifacts\imdb_length_hist.png

Top tokens (positive):
                film  42102
               movie  37850
                 one  27318
                like  17710
                good  15025
               great  12964
               story  12935
                time  12750
                well  12724
                 see  12274
                also  10793
             