In [1]:
import pandas as pd
import numpy as np

In [None]:
unigram_path = "../lab4/unigram_model.csv"
bigram_path = "../lab4/bigram_model.csv"

unigrams = pd.read_csv(unigram_path)
bigrams = pd.read_csv(bigram_path)

print("✅ Files loaded successfully!")
print(f"Unigrams: {len(unigrams)} entries")
print(f"Bigrams:  {len(bigrams)} entries")

✅ Files loaded successfully!
Unigrams: 832991 entries
Bigrams:  7658547 entries


In [3]:
uni_prob = dict(zip(unigrams['unigram'], unigrams['probability']))

In [4]:
def compute_pmi(row):
    """
    Compute PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
    """
    try:
        w1, w2 = row['bigram'].split()
        p_bigram = row['probability']
        p_w1 = uni_prob.get(w1, 1e-10)  # small epsilon to avoid division by zero
        p_w2 = uni_prob.get(w2, 1e-10)
        pmi = np.log2(p_bigram / (p_w1 * p_w2))
        return pmi
    except Exception as e:
        print(f"⚠️ Error processing bigram '{row['bigram']}': {e}")
        return np.nan

# Apply PMI computation
bigrams['PMI'] = bigrams.apply(compute_pmi, axis=1)

In [5]:
# output_path = "bigram_pmi_scores.csv"
# bigrams.to_csv(output_path, index=False)

# print("\n✅ PMI computation completed!")
# print(f"Results saved to '{output_path}'")

In [6]:
print("\nTop 10 bigrams with highest PMI:")
display(bigrams.sort_values(by='PMI', ascending=False).head(10))

print("\nBottom 10 bigrams with lowest PMI:")
display(bigrams.sort_values(by='PMI', ascending=True).head(10))


Top 10 bigrams with highest PMI:


Unnamed: 0,bigram,count,prefix,probability,PMI
1682951,"('టున్నారే', 'తప్ప')",1,టున్నారే,1.0,66.438562
1682957,"('పాడాలన్నారు', '.')",5,పాడాలన్నారు,1.0,66.438562
1682960,"('కొళ్లగొడుతున్నారని', 'అన్నారు')",1,కొళ్లగొడుతున్నారని,1.0,66.438562
7658546,"('జర్మనీపైకూడా', 'బలంగా')",1,జర్మనీపైకూడా,1.0,66.438562
4292235,"('ఇంటిపేర్ల', '’')",1,ఇంటిపేర్ల,1.0,66.438562
4292233,"('శ్రధ్ధాంజలి', 'ప్రకటనలు')",1,శ్రధ్ధాంజలి,1.0,66.438562
4292219,"('సమయమిస్తున్నట్లు', 'తెలిపారు')",1,సమయమిస్తున్నట్లు,1.0,66.438562
4292196,"('వాదులెవరూ', 'ఇలాంటి')",1,వాదులెవరూ,1.0,66.438562
972782,"('వికటించడమో', 'జరుగుతుంది')",1,వికటించడమో,1.0,66.438562
972771,"('ఆవేశపు', 'మోతాదు')",1,ఆవేశపు,1.0,66.438562



Bottom 10 bigrams with lowest PMI:


Unnamed: 0,bigram,count,prefix,probability,PMI
3954915,"('.', '313.65')",1,.,4.90199e-07,45.478433
2742423,"('.', 'ప్యాంటు')",1,.,4.90199e-07,45.478433
3380502,"('.', 'కాయలుకాండం')",1,.,4.90199e-07,45.478433
2742412,"('.', 'సాయికృష్ణను')",1,.,4.90199e-07,45.478433
1822013,"('.', 'ఈవిషయం')",1,.,4.90199e-07,45.478433
3985412,"('.', 'రోజుల్లో')",1,.,4.90199e-07,45.478433
1822001,"('.', 'వరకూ')",1,.,4.90199e-07,45.478433
6960246,"('.', 'కోనసీమ')",1,.,4.90199e-07,45.478433
5170614,"('.', 'ఏజ్')",1,.,4.90199e-07,45.478433
3380482,"('.', 'బిజి')",1,.,4.90199e-07,45.478433
