In [None]:
import re
import numpy as np
from collections import Counter, defaultdict
from sklearn.decomposition import TruncatedSVD
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu

# ------------------------
# 1. Load MC4 Hindi Corpus
# ------------------------
def load_hindi_corpus_mc4(limit=50000, lang="hi"):
    """Stream documents from the mC4 corpus and join them into one string.

    Args:
        limit: Maximum number of streamed examples to read. Examples whose
            text is missing or blank still count toward this limit.
        lang: Configuration name passed to ``load_dataset`` for
            ``allenai/c4``. Defaults to ``"hi"`` so that only the Hindi
            subset is streamed; the ``"multilingual"`` configuration
            streams every mC4 language, not Hindi alone.

    Returns:
        The stripped, non-empty document texts joined with newlines.
    """
    ds = load_dataset("allenai/c4", lang, split="train", streaming=True)
    corpus = []
    for i, ex in enumerate(ds):
        # Check the bound before consuming the example: testing it after the
        # append let limit + 1 examples through.
        if i >= limit:
            break
        text = (ex.get("text") or "").strip()
        if text:
            corpus.append(text)
    return "\n".join(corpus)

# Load the corpus once at module level; the tokenization code further down
# reads `raw_text` directly.
raw_text = load_hindi_corpus_mc4(limit=50000)
# len() of a str counts characters (code points), not bytes or tokens.
print(f"Loaded raw text length: {len(raw_text)} characters")

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/59024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/386 [00:00<?, ?it/s]

Loaded raw text length: 129594854 characters


In [None]:
# --------------------
# 2. Custom Tokenizers
# --------------------
def sentence_tokenizer(text):
    """Split *text* into sentences at runs of danda, '?', '!' or '.'.

    Every piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are discarded. The delimiters themselves are not
    kept. Newlines are not treated as sentence boundaries here.
    """
    sentences = []
    for piece in re.split(r'[।?!\.]+', text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def word_tokenizer(sentence):
    """Split *sentence* into Devanagari words and punctuation tokens.

    Words are maximal runs of Devanagari characters. The danda (U+0964) and
    double danda (U+0965) sit inside the Devanagari block U+0900-U+097F, so
    they are excluded from the word class; otherwise "है।" would come back
    as a single token and the punctuation branch could never match a danda
    that follows a word. Characters outside both classes (Latin letters,
    ASCII digits, whitespace, ...) are skipped.

    Returns:
        A list of word and punctuation tokens in order of appearance.
    """
    return re.findall(r'[\u0900-\u0963\u0966-\u097F]+|[।॥.,!?]', sentence)

# --------------------------------------
# 3. Reference Tokenizers for Evaluation
# --------------------------------------
def reference_sent_tokenizer(text):
    """Reference sentence splitter used for evaluation.

    Breaks *text* at runs of newline, danda, '?', '!' or '.', strips each
    piece, and drops the pieces that end up empty.
    """
    pieces = re.split(r'[\n।?!\.]+', text)
    # filter(None, ...) discards the empty strings left after stripping.
    return list(filter(None, map(str.strip, pieces)))

def reference_word_tokenizer(sentence):
    """Reference word tokenizer used for evaluation.

    Returns maximal runs of Devanagari characters as words and each of
    the punctuation marks danda, double danda, '.', ',', '!' and '?' as
    its own token. The danda (U+0964) and double danda (U+0965) belong to
    the Devanagari block U+0900-U+097F, so they are left out of the word
    class; with the full block range a trailing danda was absorbed into
    the preceding word instead of being emitted as punctuation.

    Returns:
        A list of word and punctuation tokens in order of appearance.
    """
    return re.findall(r'[\u0900-\u0963\u0966-\u097F]+|[।॥.,!?]', sentence)

# ------------------------
# 4. Tokenizer Evaluation
# ------------------------
def evaluate_tokens(pred_tokens, ref_tokens, level="Word"):
    """Print and return set-based precision, recall and F1.

    Both token sequences are collapsed to sets first, so duplicates and
    ordering play no part in the scores.

    Args:
        pred_tokens: Predicted tokens (any iterable of hashable items).
        ref_tokens: Reference tokens (any iterable of hashable items).
        level: Label inserted into the printed report line.

    Returns:
        A ``(precision, recall, f1)`` tuple. A score whose denominator
        would be zero is reported as 0.
    """
    predicted = set(pred_tokens)
    reference = set(ref_tokens)
    overlap = len(predicted.intersection(reference))

    precision = 0
    if predicted:
        precision = overlap / len(predicted)

    recall = 0
    if reference:
        recall = overlap / len(reference)

    if (precision + recall) > 0:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0

    print(f"\n[{level} Evaluation] Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}")
    return precision, recall, f1

def evaluate_bleu(pred_tokens, ref_tokens):
    """Print and return the unigram BLEU score of a token sequence.

    Calls ``sentence_bleu`` with *ref_tokens* as the single reference and
    all of the weight placed on 1-grams.

    Returns:
        The score produced by ``sentence_bleu``.
    """
    unigram_only = (1, 0, 0, 0)
    references = [ref_tokens]
    score = sentence_bleu(references, pred_tokens, weights=unigram_only)
    print(f"[BLEU] 1-gram score={score:.4f}")
    return score

# -------------------
# 5. Tokenize Corpus
# ------------------

# Custom pipeline: sentence split, then word-tokenize each sentence.
sentences = sentence_tokenizer(raw_text)
tokenized = list(map(word_tokenizer, sentences))

# Reference pipeline over the same raw text.
ref_sentences = reference_sent_tokenizer(raw_text)
ref_tokenized = list(map(reference_word_tokenizer, ref_sentences))

# Sentence-level agreement; evaluate_tokens turns its inputs into sets, so
# the lists are handed over as they are.
evaluate_tokens(sentences, ref_sentences, level="Sentence")

# Word-level agreement on the flattened tokens of the first 50 sentences
# from each pipeline.
pred_words = [w for sent in tokenized[:50] for w in sent]
ref_words = [w for sent in ref_tokenized[:50] for w in sent]
evaluate_tokens(pred_words, ref_words, level="Word")
evaluate_bleu(pred_words, ref_words)

# -----------------------------------
# 6. Build Co-occurrence & Embeddings
# -----------------------------------
def build_cooccurrence(corpus, window=2, min_count=2, top_n=10000, force_include=None):
    """Count symmetric-window co-occurrences over a tokenized corpus.

    Args:
        corpus: Sequence of sentences, each a sequence of tokens. It is
            traversed twice (frequency pass, then counting pass), so a
            one-shot iterator will not work.
        window: Number of positions on each side of a word that count as
            its context.
        min_count: Minimum corpus frequency for a word to enter the
            vocabulary.
        top_n: Maximum number of words kept, before ``force_include`` is
            applied. The most frequent words are kept.
        force_include: Optional iterable of words appended to the
            vocabulary even when they fail the frequency filters.

    Returns:
        ``(cooc, w2i, i2w)`` where ``cooc`` maps ``(word_idx, context_idx)``
        to a float count, ``w2i`` maps word to index and ``i2w`` is the
        inverse mapping.
    """
    freq = Counter(w for sent in corpus for w in sent)
    # most_common() sorts by descending count, so the top_n cut keeps the
    # most frequent words. Slicing freq.items() directly kept whichever
    # words happened to appear first in the corpus.
    vocab = [w for w, c in freq.most_common() if c >= min_count][:top_n]

    # --- Force include target words ---
    if force_include:
        known = set(vocab)  # O(1) membership instead of scanning the list
        for w in force_include:
            if w not in known:
                known.add(w)
                vocab.append(w)

    w2i = {w: i for i, w in enumerate(vocab)}
    i2w = dict(enumerate(vocab))

    cooc = defaultdict(float)
    for sent in corpus:
        length = len(sent)
        for i, w in enumerate(sent):
            wi = w2i.get(w)
            if wi is None:
                continue
            for j in range(max(0, i - window), min(length, i + window + 1)):
                if i == j:
                    continue
                ci = w2i.get(sent[j])
                if ci is not None:
                    cooc[(wi, ci)] += 1.0
    return cooc, w2i, i2w


def cooc_to_sppmi(cooc, vocab_size, k=5):
    """Turn co-occurrence counts into a dense shifted positive PMI matrix.

    Args:
        cooc: Mapping from ``(row_idx, col_idx)`` to a co-occurrence count.
        vocab_size: Side length of the square output matrix.
        k: Shift constant; ``log2(k)`` is subtracted from each PMI value
            before negative results are clipped to zero.

    Returns:
        A ``(vocab_size, vocab_size)`` float32 array. Cells with no entry
        in *cooc* stay at zero.
    """
    grand_total = sum(cooc.values())

    # Marginal counts per row index and per column index.
    row_totals = defaultdict(float)
    col_totals = defaultdict(float)
    for (r, c), count in cooc.items():
        row_totals[r] += count
        col_totals[c] += count

    shift = np.log2(k)
    matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    for (r, c), count in cooc.items():
        joint = count/grand_total
        p_row = row_totals[r]/grand_total
        p_col = col_totals[c]/grand_total
        # The small constant is added to the ratio before taking the log.
        pmi = np.log2(joint/(p_row*p_col)+1e-8)
        matrix[r, c] = max(pmi - shift, 0)
    return matrix

def train_embeddings(corpus, dim=100, window=2, min_count=1, top_n=50000, k=5, force_include=None):
    """Build word embeddings by factorizing the corpus SPPMI matrix.

    Steps: co-occurrence counts via ``build_cooccurrence``, SPPMI weighting
    via ``cooc_to_sppmi``, then ``TruncatedSVD`` with ``dim`` components
    (10 iterations, fixed random seed 42). The vocabulary size is printed
    along the way.

    Returns:
        ``(W, w2i, i2w)`` where ``W`` is the output of
        ``TruncatedSVD.fit_transform`` on the SPPMI matrix and ``w2i`` /
        ``i2w`` are the vocabulary mappings from ``build_cooccurrence``.
    """
    cooc, w2i, i2w = build_cooccurrence(
        corpus, window, min_count, top_n, force_include
    )
    print(f"Vocabulary size: {len(w2i)}")

    sppmi = cooc_to_sppmi(cooc, len(w2i), k)
    reducer = TruncatedSVD(n_components=dim, n_iter=10, random_state=42)
    embeddings = reducer.fit_transform(sppmi)
    return embeddings, w2i, i2w

# Words that must be present in the vocabulary regardless of the frequency
# filters; they are passed to train_embeddings as force_include.
targets = [
    "भारत",
    "दिल्ली",
    "अमरीका",
    "पुरुष",
    "महिला",
    "श्री",
    "सरकार",
    "मंत्री",
    "देश",
]
W, w2i, i2w = train_embeddings(
    tokenized,
    dim=100,
    min_count=1,
    top_n=50000,
    force_include=targets,
)


# ------------------------
# 7. Similarity & Analogy
# ------------------------
def most_similar(word, W, w2i, i2w, topn=5):
    """Return the *topn* nearest neighbours of *word* by cosine similarity.

    Args:
        word: Query word.
        W: 2-D embedding array indexed by vocabulary index along axis 0.
        w2i: Mapping from word to row index in *W*.
        i2w: Mapping from row index to word.
        topn: Maximum number of neighbours to return.

    Returns:
        A list of ``(neighbour, similarity)`` pairs in descending order of
        similarity, never containing *word* itself. Empty when *word* is
        not in the vocabulary.
    """
    if word not in w2i:
        return []
    query = w2i[word]
    v = W[query]
    sims = W.dot(v) / (np.linalg.norm(W, axis=1) * np.linalg.norm(v) + 1e-9)
    # Drop the query by index rather than assuming it ranks first: when
    # rows tie (duplicate or all-zero vectors) the query can land anywhere
    # in the ordering, and skipping position 0 would return it as its own
    # neighbour while discarding a real one.
    ranked = [i for i in np.argsort(-sims) if i != query][:topn]
    return [(i2w[i], float(sims[i])) for i in ranked]

def analogy(a, b, c, W, w2i, i2w, topn=5):
    """Answer the analogy "a is to b as c is to ?" by vector arithmetic.

    The target vector is ``W[b] - W[a] + W[c]``; every vocabulary row is
    scored by cosine similarity against it and the three query words are
    left out of the answer.

    Returns:
        Up to *topn* ``(word, similarity)`` pairs in descending order of
        similarity, or an empty list when any query word is missing from
        *w2i*.
    """
    query_words = (a, b, c)
    for term in query_words:
        if term not in w2i:
            return []

    ia, ib, ic = (w2i[term] for term in query_words)
    target = W[ib] - W[ia] + W[ic]
    row_norms = np.linalg.norm(W, axis=1)
    scores = W.dot(target) / (row_norms * np.linalg.norm(target) + 1e-9)

    skip = {ia, ib, ic}
    candidates = []
    for row in np.argsort(-scores):
        if row in skip:
            continue
        candidates.append((i2w[row], float(scores[row])))
    return candidates[:topn]

# ------------------------
# 8. Dynamic Vocabulary & Analogy Tests
# ------------------------
# Flatten the tokenized corpus and count every token; punctuation tokens
# produced by word_tokenizer are counted along with words.
all_words = [w for sent in tokenized for w in sent]
freq = Counter(all_words)
# The 50 most frequent tokens, most frequent first.
top_words = [w for w, c in freq.most_common(50)]

print("\nNearest Neighbors for Top Words:")
# Only the first 10 of the 50 are printed. most_similar returns an empty
# list for any token that is missing from the embedding vocabulary.
for w in top_words[:10]:
    print(f"{w} → {most_similar(w,W,w2i,i2w)}")


[Sentence Evaluation] Precision=0.8252, Recall=0.6018, F1=0.6960

[Word Evaluation] Precision=1.0000, Recall=1.0000, F1=1.0000
[BLEU] 1-gram score=0.6571
Vocabulary size: 637

Nearest Neighbors for Top Words:
, → [('आपको', 0.0), ('करे', 0.0), ('सम्पर्क', 0.0), ('डलवाने', 0.0), ('पे', 0.0)]
है → [('कि', 0.28539201617240906), ('लेता', 0.270214319229126), ('जाता', 0.26591479778289795), ('संवेदनशील', 0.2569257915019989), ('पढ़ा', 0.2545469403266907)]
के → [('लिए', 0.5192482471466064), ('अकर्मक', 0.3166256844997406), ('आंतरिक', 0.3020084500312805), ('शब्द', 0.2934802174568176), ('साथ', 0.2833489775657654)]
का → [('संग्रह', 0.3760721683502197), ('भावों', 0.3529353141784668), ('भण्डार', 0.3354138731956482), ('स्वाध्याय', 0.3071749806404114), ('विशिष्टजनों', 0.3035500943660736)]
और → [('दत्त', 0.35848096013069153), ('पुरी', 0.3386397659778595), ('कपिला', 0.3316999077796936), ('संकुचित', 0.32386764883995056), ('उमा', 0.32314223051071167)]
में → [('पूर्ण', 0.47781601548194885), ('मुझे', 0.342160