In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

def load_and_split_data(fname, col_idx, val_sz=0.1, tst_sz=0.1):
    """
    Read sentences from one Parquet file and split them into train/val/test.

    Args:
        fname: Path of the Parquet file to read.
        col_idx: Positional index of the column holding the sentences.
        val_sz: Fraction of all sentences assigned to the validation split.
        tst_sz: Fraction of all sentences assigned to the test split.

    Returns:
        A ``(train, val, test)`` tuple of lists. If the file is missing or
        ``col_idx`` is out of bounds, an error banner is printed and
        ``(None, None, None)`` is returned instead.
    """
    def _abort(*messages):
        # Print the messages framed by a horizontal rule and signal failure.
        rule = "---" * 20
        print(rule)
        for message in messages:
            print(message)
        print(rule)
        return None, None, None

    try:
        frame = pd.read_parquet(fname)
    except FileNotFoundError:
        return _abort(
            f"ЁЯЪи ERROR: Main data file not found: '{fname}'",
            "Please update the 'TOKENIZED_FILE' variable.",
        )

    try:
        # Select the requested column by position; null rows are discarded.
        sents = frame.iloc[:, col_idx].dropna().tolist()
    except IndexError:
        return _abort(
            f"ЁЯЪи ERROR: Column index {col_idx} is out of bounds.",
            "Please update the 'TEXT_COLUMN_INDEX' variable.",
        )

    # First cut: training set versus everything held out.
    train_fraction = 1.0 - (val_sz + tst_sz)
    train_part, held_out = train_test_split(
        sents, train_size=train_fraction, random_state=42
    )

    # Second cut: divide the held-out portion between validation and test,
    # expressing the test size relative to the held-out portion.
    test_fraction_of_held_out = tst_sz / (val_sz + tst_sz)
    val_part, test_part = train_test_split(
        held_out, test_size=test_fraction_of_held_out, random_state=42
    )

    print("Data loaded and split successfully:")
    print(f"Total Sentences: {len(sents)}")
    print(f"Training set:   {len(train_part)} sentences")
    print(f"Validation set: {len(val_part)} sentences")
    print(f"Test set:       {len(test_part)} sentences")

    return train_part, val_part, test_part

def calc_pmi(w1, w2, u_p, b_p):
    """
    Compute PMI(w1, w2) = log2( P(w2 | w1) / P(w2) ).

    Args:
        w1: Context (first) word of the bigram.
        w2: Second word of the bigram.
        u_p: Mapping from word to its unigram probability.
        b_p: Mapping from ``(w1, w2)`` to the probability looked up as
            P(w2 | w1).

    Returns:
        The PMI in bits, or ``-inf`` when either probability is absent
        from its mapping or equal to zero.
    """
    conditional = b_p.get((w1, w2))
    marginal = u_p.get(w2)

    # A missing entry means the PMI is undefined for this pair.
    if conditional is None or marginal is None:
        return -np.inf

    # A zero in either position would make the log ratio undefined.
    if marginal == 0 or conditional == 0:
        return -np.inf

    return math.log2(conditional / marginal)

def get_pmi_for_set(sents, u_p, b_p):
    """
    Score every distinct adjacent word pair found in ``sents``.

    Args:
        sents: Iterable of sentence strings; tokens are obtained by
            splitting on whitespace.
        u_p: Unigram probability mapping, passed through to ``calc_pmi``.
        b_p: Bigram probability mapping, passed through to ``calc_pmi``.

    Returns:
        Dict mapping each ``(w1, w2)`` pair to the value returned by
        ``calc_pmi`` for it. Each pair is scored once, on first sight.
    """
    scores = {}
    for sentence in sents:
        tokens = sentence.split()
        # Pairing the tokens with themselves shifted by one yields the
        # adjacent bigrams; sentences with fewer than two tokens yield none.
        for pair in zip(tokens, tokens[1:]):
            if pair in scores:
                continue
            first, second = pair
            scores[pair] = calc_pmi(first, second, u_p, b_p)
    return scores

def find_nn(X_data, sents):
    """
    Find, for every row of ``X_data``, its nearest rows within ``X_data``.

    A brute-force cosine-distance search is fitted on ``X_data`` and then
    queried with the same data, asking for two neighbors per row. A short
    sample (up to five rows) is printed for inspection.

    Args:
        X_data: Feature matrix accepted by ``NearestNeighbors`` (main()
            passes TF-IDF matrices).
        sents: Sentences aligned row-for-row with ``X_data``; used only
            for the printed sample.

    Returns:
        ``(idx, d)``: the neighbor indices and the matching cosine
        distances exactly as returned by ``kneighbors`` (two columns each).
    """
    nn = NearestNeighbors(n_neighbors=2, metric='cosine', algorithm='brute')
    nn.fit(X_data)

    d, idx = nn.kneighbors(X_data)

    print(f"\nFound nearest neighbors. Sample results:")

    for i in range(min(5, len(sents))):
        # Querying with the fitted data usually puts the row itself in
        # column 0, but that is not guaranteed: when several rows tie at the
        # same distance (e.g. duplicate sentences) another row may come
        # first and the row itself second. Take the first column that is
        # NOT the query row so a sentence is never shown as its own neighbor.
        col = 1 if idx[i][0] == i else 0
        nn_idx = idx[i][col]
        nn_dist = d[i][col]

        print("-" * 25)
        print(f"Original (idx {i}): {sents[i]}")
        print(f"Neighbor (idx {nn_idx}): {sents[nn_idx]}")
        print(f"Cosine Distance: {nn_dist:.4f}")

    return idx, d

def main():
    """
    Run the three-step pipeline: PMI scores, TF-IDF vectors, nearest neighbors.

    Steps:
        1. Load sentences from a Parquet file and split them 80/10/10.
        2. Load unigram/bigram probability CSVs and compute PMI for every
           unique bigram in the validation and test sets.
        3. Fit TF-IDF on the training sentences, transform all three splits,
           and look up nearest neighbors within the validation set and
           within the test set.

    The unigram CSV must provide 'Word' and 'Probability' columns; the
    bigram CSV must provide 'Context', 'Word' and 'Probability' columns.
    Returns None; all results are reported via print.
    """
    # --- ACTION REQUIRED: Update these variables ---

    # 1. Your Parquet file with all sentences
    TOKENIZED_FILE = 'tokenized_gujarati_sentences.parquet'

    # 2. The *index* of the column with sentences (0 for the first column)
    TEXT_COLUMN_INDEX = 0

    # 3. Your unigram and bigram probability files
    UNIGRAM_CSV = 'unigram_probs.csv'
    BIGRAM_CSV = 'bigram_probs.csv'

    # 4. Define split sizes (80% train, 10% val, 10% test)
    VAL_SIZE = 0.10
    TEST_SIZE = 0.10
    # --- ------------------------------------------ ---

    # --- Load and Split Data ---
    print("--- Loading and Splitting Data ---")
    trn_s, val_s, tst_s = load_and_split_data(
        TOKENIZED_FILE, TEXT_COLUMN_INDEX, VAL_SIZE, TEST_SIZE
    )

    # load_and_split_data returns None on failure; an empty training set is
    # treated the same way because nothing downstream can be fitted on it.
    if not trn_s:
        print("ЁЯЪи Stopping execution due to data loading error.")
        return

    print("\n--- Loading Probability Files ---")
    try:
        u_df = pd.read_csv(UNIGRAM_CSV)
        b_df = pd.read_csv(BIGRAM_CSV)
    except FileNotFoundError as e:
        # The attribute is `filename` (lowercase); `fileName` does not exist
        # and would raise AttributeError from inside this handler.
        print(f"ЁЯЪи ERROR: Probability file not found: {e.filename}")
        print(f"Please make sure '{UNIGRAM_CSV}' and '{BIGRAM_CSV}' are present.")
        return

    # --- Task 1: PMI Scores ---
    print("\n--- 1. Starting Task 1: PMI Scores ---")

    # Rows with a missing key column cannot be used as dictionary keys.
    u_df = u_df.dropna(subset=['Word'])
    b_df = b_df.dropna(subset=['Context', 'Word'])

    # word -> probability, and (context, word) -> probability.
    u_p = u_df.set_index('Word')['Probability'].to_dict()
    b_p = b_df.set_index(['Context', 'Word'])['Probability'].to_dict()

    print(f"Loaded {len(u_p)} unigram and {len(b_p)} bigram probabilities.")

    val_pmi = get_pmi_for_set(val_s, u_p, b_p)
    test_pmi = get_pmi_for_set(tst_s, u_p, b_p)

    print(f"Calculated PMI for {len(val_pmi)} unique bigrams in validation set.")
    print(f"Calculated PMI for {len(test_pmi)} unique bigrams in test set.")

    print("\n--- Finished Task 1 ---")

    # --- Task 2: TF-IDF Vectorization ---
    print("\n--- 2. Starting Task 2: TF-IDF Vectorization ---")

    # Whitespace tokenization with case preserved; token_pattern=None because
    # a custom tokenizer is supplied and the default pattern would be unused.
    vec = TfidfVectorizer(
        tokenizer=lambda x: x.split(),
        lowercase=False,
        token_pattern=None
    )

    # The vocabulary and IDF weights come from the training split only;
    # validation and test are merely transformed with them.
    print(f"Fitting TF-IDF on {len(trn_s)} training sentences...")
    X_trn = vec.fit_transform(trn_s)

    print(f"Transforming {len(val_s)} validation sentences...")
    X_val = vec.transform(val_s)

    print(f"Transforming {len(tst_s)} testing sentences...")
    X_tst = vec.transform(tst_s)

    print("\nTF-IDF Matrix Shapes:")
    print(f"Train: {X_trn.shape}")
    print(f"Val:   {X_val.shape}")
    print(f"Test:  {X_tst.shape}")
    print(f"Vocabulary size (from train): {len(vec.vocabulary_)}")
    print("--- Finished Task 2 ---")

    # --- Task 3: Nearest Neighbors ---
    print("\n--- 3. Starting Task 3: Nearest Neighbors ---")

    print("--- Finding neighbors for Validation Set (within val) ---")
    val_nn_idxs, val_nn_dists = find_nn(X_val, val_s)

    print("\n--- Finding neighbors for Test Set (within test) ---")
    test_nn_idxs, test_nn_dists = find_nn(X_tst, tst_s)

    print("\n--- Finished Task 3 ---")
    print("\nAssignment Complete.")

# Run the pipeline only when this file is executed as a script, so the
# definitions above are not triggered by a plain import.
if __name__ == "__main__":
    main()

--- Loading and Splitting Data ---
Data loaded and split successfully:
Total Sentences: 100000
Training set:   80000 sentences
Validation set: 10000 sentences
Test set:       10000 sentences

--- Loading Probability Files ---

--- 1. Starting Task 1: PMI Scores ---
Loaded 138132 unigram and 822206 bigram probabilities.
Calculated PMI for 106764 unique bigrams in validation set.
Calculated PMI for 106090 unique bigrams in test set.

--- Finished Task 1 ---

--- 2. Starting Task 2: TF-IDF Vectorization ---
Fitting TF-IDF on 80000 training sentences...
Transforming 10000 validation sentences...
Transforming 10000 testing sentences...

TF-IDF Matrix Shapes:
Train: (80000, 120665)
Val:   (10000, 120665)
Test:  (10000, 120665)
Vocabulary size (from train): 120665
--- Finished Task 2 ---

--- 3. Starting Task 3: Nearest Neighbors ---
--- Finding neighbors for Validation Set (within val) ---

Found nearest neighbors. Sample results:
-------------------------
Original (idx 0): ркЖ ркЖркзрлБркир