In [10]:
import numpy as np
import math
from sklearn.model_selection import train_test_split
from collections import Counter
from scipy.sparse import csr_matrix, save_npz

In [11]:
# Read the tokenized corpus: one sentence per line, surrounding whitespace
# trimmed, lines that are empty after trimming dropped.
with open("../lab4/telugu_tokenized_sentences.txt", "r", encoding="utf-8") as f:
    sentences = list(filter(None, map(str.strip, f)))

# 80/10/10 split: hold out 20% first, then cut that hold-out in half to get
# the validation and test sets. The fixed random_state keeps both splits
# reproducible across runs.
train_sentences, temp_sentences = train_test_split(
    sentences, test_size=0.2, random_state=42
)
val_sentences, test_sentences = train_test_split(
    temp_sentences, test_size=0.5, random_state=42
)

print(f"Train: {len(train_sentences)}, Val: {len(val_sentences)}, Test: {len(test_sentences)}")

Train: 1449936, Val: 181242, Test: 181243


In [12]:
def tokenize(s):
    """Split a sentence into tokens on runs of whitespace.

    Leading and trailing whitespace is ignored, so an empty or all-whitespace
    string yields an empty list.
    """
    return s.split()

# The vocabulary is every distinct token seen in the training split only, so a
# token that occurs solely in val/test has no entry in vocab_index.
vocab = sorted({tok for sent in train_sentences for tok in tokenize(sent)})
# Sorting first gives every token a deterministic column index.
vocab_index = {w: i for i, w in enumerate(vocab)}
V = len(vocab)
# "\u2705" is the check-mark symbol. It is written as an escape because the
# literal character had been stored as mis-decoded bytes (mojibake).
print(f"\u2705 Vocabulary size: {V}")

✅ Vocabulary size: 728479


In [13]:
def compute_idf(sentences, vocab_index):
    """Compute smoothed inverse document frequencies, one per vocabulary entry.

    Every sentence is treated as one document. A token contributes at most
    once per sentence, and tokens missing from ``vocab_index`` are skipped.

    Args:
        sentences: Sequence of sentence strings (must support ``len``).
        vocab_index: Mapping from token to its position in the result.

    Returns:
        Float array of length ``len(vocab_index)`` holding
        ``log((N + 1) / (df + 1)) + 1``, where ``N`` is the number of
        sentences and ``df`` is the number of sentences containing the token.
    """
    n_docs = len(sentences)
    doc_freq = np.zeros(len(vocab_index))
    for sentence in sentences:
        # set() collapses repeats so each sentence counts a token only once.
        for token in set(tokenize(sentence)):
            column = vocab_index.get(token)
            if column is None:
                continue  # out-of-vocabulary token
            doc_freq[column] += 1
    smoothed_ratio = (n_docs + 1) / (doc_freq + 1)
    return np.log(smoothed_ratio) + 1

# IDF weights are estimated from the training split only.
idf = compute_idf(train_sentences, vocab_index)
# "\u2705" is the check-mark symbol. It is written as an escape because the
# literal character had been stored as mis-decoded bytes (mojibake).
print("\u2705 IDF computed")

✅ IDF computed


In [None]:
def build_sparse_tfidf(sentences, vocab_index, idf, log_every=50000):
    """Build a sparse TF-IDF matrix with one row per sentence.

    Each in-vocabulary token of a sentence contributes
    ``(count / sentence_length) * idf[column]``, where ``sentence_length``
    counts every token of the sentence, including out-of-vocabulary ones.
    Out-of-vocabulary tokens get no entry, so a sentence with no known tokens
    (or no tokens at all) produces an all-zero row.

    Args:
        sentences: Sequence of sentence strings (must support ``len``).
        vocab_index: Mapping from token to column index.
        idf: Per-column IDF weights, indexable by column index.
        log_every: Print a progress line after every ``log_every`` sentences
            (a positive integer); ``None`` or ``0`` disables progress output.

    Returns:
        ``scipy.sparse.csr_matrix`` of shape
        ``(len(sentences), len(vocab_index))``.
    """
    rows, cols, values = [], [], []
    for row, sentence in enumerate(sentences):
        # Tokenize once and reuse the result for both the counts and the length.
        tokens = tokenize(sentence)
        n_tokens = len(tokens)
        for token, count in Counter(tokens).items():
            col = vocab_index.get(token)
            if col is None:
                continue  # out-of-vocabulary token: no column to write to
            rows.append(row)
            cols.append(col)
            values.append(count / n_tokens * idf[col])
        if log_every and (row + 1) % log_every == 0:
            print(f"Processed {row + 1} sentences...")
    return csr_matrix((values, (rows, cols)), shape=(len(sentences), len(vocab_index)))

# All three splits are vectorized with the same vocabulary and IDF weights,
# so their matrices share one column layout.
X_train = build_sparse_tfidf(train_sentences, vocab_index, idf)
X_val   = build_sparse_tfidf(val_sentences, vocab_index, idf)
X_test  = build_sparse_tfidf(test_sentences, vocab_index, idf)

# "\u2705" is the check-mark symbol. It is written as an escape because the
# literal character had been stored as mis-decoded bytes (mojibake).
print("\u2705 Sparse TF-IDF built!")
print(f"Train matrix: {X_train.shape}, nnz={X_train.nnz}")
print(f"Val matrix:   {X_val.shape}, nnz={X_val.nnz}")
print(f"Test matrix:  {X_test.shape}, nnz={X_test.nnz}")

Processed 50000 sentences...
Processed 100000 sentences...
Processed 150000 sentences...
Processed 200000 sentences...
Processed 250000 sentences...
Processed 300000 sentences...
Processed 350000 sentences...
Processed 400000 sentences...
Processed 450000 sentences...
Processed 500000 sentences...
Processed 550000 sentences...
Processed 600000 sentences...
Processed 650000 sentences...
Processed 700000 sentences...
Processed 750000 sentences...
Processed 800000 sentences...
Processed 850000 sentences...
Processed 900000 sentences...
Processed 950000 sentences...
Processed 1000000 sentences...
Processed 1050000 sentences...
Processed 1100000 sentences...
Processed 1150000 sentences...
Processed 1200000 sentences...
Processed 1250000 sentences...
Processed 1300000 sentences...
Processed 1350000 sentences...
Processed 1400000 sentences...
Processed 50000 sentences...
Processed 100000 sentences...
Processed 150000 sentences...
Processed 50000 sentences...
Processed 100000 sentences...
Proc

In [15]:
# Find Nearest Neighbor for each sentence (within same set)
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
def find_nearest_neighbors(X, batch_size=5000):
    """For every row of ``X``, find the most cosine-similar *other* row of ``X``.

    Rows are processed in batches: each batch is compared against the whole
    matrix, which produces a ``(batch, n)`` similarity block per step. Lower
    ``batch_size`` to reduce the size of that block.

    Args:
        X: 2-D matrix with ``n`` rows, sliceable by row and accepted by
            ``cosine_similarity``.
        batch_size: Number of query rows per similarity block (default 5000).

    Returns:
        Tuple ``(nearest_indices, nearest_scores)``: an int array and a float
        array, both of length ``n``. Entry ``i`` holds the row index and the
        cosine similarity of the best match for row ``i``, never ``i`` itself
        unless ``X`` has a single row, in which case the result is index 0
        with the masking score -1. Ties go to the lowest row index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return zeros.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n = X.shape[0]
    nearest_indices = np.zeros(n, dtype=int)
    nearest_scores = np.zeros(n)

    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        sim = cosine_similarity(X[start:end], X)  # (end - start, n)
        # Row r of this block is global row start + r, so its self-similarity
        # sits on the diagonal of columns start:end. Mask it with -1 so a row
        # is never reported as its own nearest neighbour.
        np.fill_diagonal(sim[:, start:end], -1)
        nearest_indices[start:end] = np.argmax(sim, axis=1)
        nearest_scores[start:end] = np.max(sim, axis=1)
        print(f"Processed {end}/{n} sentences")

    return nearest_indices, nearest_scores

In [None]:
# Validation set: each validation sentence is matched against the other
# validation sentences only.
# "\U0001F539" (small blue diamond) and "\u2705" (check mark) are written as
# escapes because the literal characters had been stored as mis-decoded bytes
# (mojibake).
print("\n\U0001F539 Finding nearest neighbors for Validation set...")
val_nn_idx, val_nn_score = find_nearest_neighbors(X_val)
print("\u2705 Validation nearest neighbors found.")


🔹 Finding nearest neighbors for Validation set...
Processed 5000/181242 sentences
