In [1]:
import pandas as pd 

In [2]:
df=pd.read_csv('C:\\Sahil\\NLP\\hindi_reconstructed_sentences2.csv')

In [3]:
df.shape

(10000000, 1)

In [4]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True) 

In [5]:
df.head()

Unnamed: 0,reconstructed_sentence
0,उसे तत्काल ट्रेन के शौचालय में बंद कर ...
1,ऐसे ही एक कार्यक्रम का एक वीडियो सोशल ...
2,आरबीजीडी की टीम ने टास जीत कर पहले खेल...
3,इससे आम आदमी की गाढ़ी कमाई को फर्जी और ...
4,विधि शिक्षा एवं विधि सहायता केन्द्र सेंट...


In [6]:
df.shape

(10000000, 1)

In [7]:
df_val = df.iloc[:1000]
df_test= df.iloc[1000:2000]
df_train = df.iloc[2000:1000000]


In [8]:
df_test.shape, df_val.shape, df_train.shape

((1000, 1), (1000, 1), (998000, 1))

In [9]:
import pickle
import random
import math
from collections import Counter, defaultdict
from typing import List, Dict, Tuple

In [10]:
counts ={}

for sen in df_train['reconstructed_sentence']:
    words = sen.split()
    for word in words:
        counts[word] = counts.get(word, 0) + 1

In [11]:
U=len(counts)

In [12]:
counts_freq={}
for k,v in counts.items():
    counts_freq[v]=counts_freq.get(v,0)+1

In [13]:
len(counts_freq)

2524

In [14]:
N = sum(counts.values())   

In [15]:
print(N,U)

18947241 324962


In [16]:
N1 = counts_freq.get(1, 0) 
print(N1)

188797


In [17]:
mass_unseen = N1 / N 

In [18]:
sorted_counts_freq = dict(sorted(counts_freq.items()))

In [19]:
max_c = max(sorted_counts_freq.keys())
print(max_c)

780209


In [20]:
c_star_table={}
for c in range(1, max_c + 1):
    Nc = counts_freq.get(c, 0)
    Ncp1 = counts_freq.get(c + 1, 0)
    if Nc == 0:
        continue
    c_star = (c + 1) * (Ncp1 / Nc) if Ncp1 > 0 else float(c)
    c_star_table[c] = c_star


In [21]:
V_total = U + 100_000 
num_unseen = V_total - U

In [22]:
total_cstar = sum(c_star_table[counts[token]] for token in counts)
scale = 1 - mass_unseen
token_prob = {token: c_star_table[counts[token]] * scale / total_cstar for token in counts}

In [23]:
P_unseen_each = mass_unseen / num_unseen

In [24]:
for c in range(0,101):
    print(c,counts_freq.get(c,0),c_star_table.get(c,0))

0 0 0
1 188797 0.42804705583245495
2 40407 1.3676590689731976
3 18421 2.3772867922479777
4 10948 3.459535988308367
5 7575 4.37940594059406
6 5529 5.352866702839573
7 4228 6.554399243140965
8 3464 7.2826212471131635
9 2803 8.544416696396718
10 2395 9.26847599164927
11 2018 10.495540138751238
12 1765 11.762606232294617
13 1597 12.325610519724483
14 1406 13.879800853485065
15 1301 13.823212913143736
16 1124 15.351423487544483
17 1015 16.68768472906404
18 941 17.970244420828905
19 890 18.674157303370787
20 831 17.613718411552348
21 697 20.35868005738881
22 645 22.85736434108527
23 641 22.27769110764431
24 595 23.991596638655462
25 571 25.453590192644484
26 559 24.923076923076923
27 516 25.77519379844961
28 475 28.08421052631579
29 460 30.58695652173913
30 469 27.100213219616208
31 410 28.01951219512195
32 359 34.654596100278546
33 377 32.19628647214854
34 357 32.450980392156865
35 331 37.1963746223565
36 342 33.97076023391813
37 314 40.05732484076434
38 331 31.22356495468278
39 265 46.9433

In [25]:
epsilon = 1e-12
def sentence_logprob(df_subset):
    log_probs = []
    for sen in df_subset['reconstructed_sentence']:
        log_prob = 0.0
        for word in sen.split():
            p = token_prob.get(word, P_unseen_each)
            p = max(p, epsilon)
            log_prob += math.log(p)
        log_probs.append(log_prob)
    return log_probs

log_probs_test = sentence_logprob(df_test)
log_probs_val = sentence_logprob(df_val)

print(log_probs_test[:5])
print(log_probs_val[:5])

[-70.13599080682383, -117.81502305816342, -99.40931339232277, -104.62659912038113, -408.5971798670203]
[-159.06877261042916, -84.91946583428575, -157.7340406531229, -140.56175642168213, -299.3386033725079]


In [26]:
def build_ngrams(sentences, n):
    ngram_counts = defaultdict(int)
    for sent in sentences:
        tokens = sent.split()
        tokens = ['<s>']*(n-1) + tokens + ['</s>']
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i+n])
            ngram_counts[ngram] += 1
    return ngram_counts


In [27]:
bigrams = build_ngrams(df_train['reconstructed_sentence'], 2)

trigrams = build_ngrams(df_train['reconstructed_sentence'], 3)

quadrigrams = build_ngrams(df_train['reconstructed_sentence'], 4)

In [118]:
bigrams

defaultdict(int,
            {('<s>', 'सुसाइड'): 26,
             ('सुसाइड', 'करते'): 1,
             ('करते', 'पुरूष'): 1,
             ('पुरूष', ','): 11,
             (',', 'हमरो'): 1,
             ('हमरो', 'नहीं'): 1,
             ('नहीं', 'यह'): 63,
             ('यह', 'कार्य'): 124,
             ('कार्य', '।'): 14,
             ('।', '।'): 377,
             ('।', '</s>'): 773477,
             ('<s>', 'सवा'): 14,
             ('सवा', 'मिनट'): 2,
             ('मिनट', 'के'): 372,
             ('के', 'इस'): 2957,
             ('इस', 'वीडियो'): 753,
             ('वीडियो', 'ने'): 28,
             ('ने', 'पिछले'): 724,
             ('पिछले', 'दिनों'): 816,
             ('दिनों', 'सोशल'): 55,
             ('सोशल', 'मीडिया'): 3851,
             ('मीडिया', 'पर'): 2666,
             ('पर', 'कई'): 680,
             ('कई', 'लोगों'): 712,
             ('लोगों', 'का'): 2397,
             ('का', 'ध्यान'): 784,
             ('ध्यान', 'अपनी'): 52,
             ('अपनी', 'तरफ'): 119,
             

In [28]:
def good_turing_probs(ngram_counts, V, n, V_extra=100_000):
    counts_freq = defaultdict(int)
    for c in ngram_counts.values():
        counts_freq[c] += 1
    
    N = sum(ngram_counts.values())     
    N1 = counts_freq.get(1, 0)         
    mass_unseen = N1 / N               
  
    max_c = max(counts_freq.keys())
    c_star_table = {}
    for c in range(1, max_c + 1):
        Nc = counts_freq.get(c, 0)
        Ncp1 = counts_freq.get(c+1, 0)
        if Nc == 0:
            continue
        c_star = (c+1)*(Ncp1/Nc) if Ncp1>0 else float(c)
        c_star_table[c] = c_star
    

    total_cstar = sum(c_star_table.get(count, count) for count in ngram_counts.values())
    scale = 1 - mass_unseen
    token_prob = {ng: c_star_table.get(count, count) * scale / total_cstar
                  for ng, count in ngram_counts.items()}
    
    
    U = len(ngram_counts)  
    V_total = V + V_extra
    num_unseen = (V_total ** n) - U
    P_unseen_each = mass_unseen / num_unseen
    
    return token_prob, P_unseen_each

In [29]:
def sentence_logprob(sentences, n, token_prob, P_unseen_each, epsilon=1e-12):
    log_probs = []
    for sen in sentences:
        tokens = sen.split()
        log_prob = 0.0
        for i in range(len(tokens)-n+1):
            ngram = tuple(tokens[i:i+n])
            p = token_prob.get(ngram, P_unseen_each)
            log_prob += math.log(max(p, epsilon))
        log_probs.append(log_prob)
    return log_probs


In [30]:
train_vocab = set(word for sen in df_train['reconstructed_sentence'] for word in sen.split())
V = len(train_vocab)


In [31]:
bi_prob, bi_unseen = good_turing_probs(bigrams, V, 2)
tri_prob, tri_unseen = good_turing_probs(trigrams, V, 3)
quad_prob, quad_unseen = good_turing_probs(quadrigrams, V, 4)

In [35]:
log_probs_bigrams  = sentence_logprob(df_test['reconstructed_sentence'], 2, bi_prob, bi_unseen)
log_probs_trigrams = sentence_logprob(df_test['reconstructed_sentence'], 3, tri_prob, tri_unseen)
log_probs_quadrigrams = sentence_logprob(df_test['reconstructed_sentence'], 4, quad_prob, quad_unseen)
print(log_probs_quadrigrams[:5])
print(log_probs_trigrams[:5])
print(log_probs_bigrams[:5])

[-153.9087859967321, -303.941232275214, -293.21054775763486, -266.0713098665141, -1268.5421622559568]
[-148.75652670091927, -309.0680785014838, -231.66573186003606, -233.41787028082024, -1009.4093737709453]
[-132.04303060246627, -228.41266154465856, -159.16290119693576, -171.02332976559572, -696.9609308607262]


In [44]:
unigram =build_ngrams(df_train['reconstructed_sentence'], 1)

In [45]:
uni_prob,uni_unseen=good_turing_probs(unigram,V,1)

In [46]:
log_probs_unigram=sentence_logprob(df_test['reconstructed_sentence'],1,uni_prob,uni_unseen)

In [47]:
df_test["Prob_unigram"]=log_probs_unigram

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prob_unigram"]=log_probs_unigram


In [None]:
df_test["Prob_bigram"] = log_probs_bigrams
df_test["Prob_trigram"] = log_probs_trigrams
df_test["Prob_quadrigram"] = log_probs_quadrigrams



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prob_bigram"] = log_probs_bigrams
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prob_trigram"] = log_probs_trigrams
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["Prob_quadrigram"] = log_probs_quadrigrams


In [48]:
df_test.to_csv("test_with_probs.csv", index=False)

In [37]:
log_probs_bigrams  = sentence_logprob(df_val['reconstructed_sentence'], 2, bi_prob, bi_unseen)
log_probs_trigrams = sentence_logprob(df_val['reconstructed_sentence'], 3, tri_prob, tri_unseen)
log_probs_quadrigrams = sentence_logprob(df_val['reconstructed_sentence'], 4, quad_prob, quad_unseen)
print(log_probs_quadrigrams[:5])
print(log_probs_trigrams[:5])
print(log_probs_bigrams[:5])

[-490.7296943335773, -189.14388594514193, -379.7719009996225, -403.0879968730264, -772.168817821761]
[-421.6821193366938, -151.53353073685884, -332.75966343743175, -358.0166068969892, -719.6531961018237]
[-298.4892828687863, -126.127735474958, -252.1466599594539, -240.66308581555438, -557.4934188440463]


In [52]:
log_probs_unigram=sentence_logprob(df_val['reconstructed_sentence'], 1, uni_prob, uni_unseen)

In [53]:
df_val["Prob_uni"]=log_probs_unigram

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val["Prob_uni"]=log_probs_unigram


In [38]:
df_val["Prob_bigram"] = log_probs_bigrams
df_val["Prob_trigram"] = log_probs_trigrams
df_val["Prob_quadrigram"] = log_probs_quadrigrams

df_val.to_csv("val_with_probs.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val["Prob_bigram"] = log_probs_bigrams
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val["Prob_trigram"] = log_probs_trigrams
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val["Prob_quadrigram"] = log_probs_quadrigrams


In [54]:
df_val.to_csv("val_with_probs.csv", index=False)

In [55]:
df_val.head()

Unnamed: 0,reconstructed_sentence,Prob_bigram,Prob_trigram,Prob_quadrigram,Prob_uni
0,उसे तत्काल ट्रेन के शौचालय में बंद कर ...,-298.489283,-421.682119,-490.729694,-160.24809
1,ऐसे ही एक कार्यक्रम का एक वीडियो सोशल ...,-126.127735,-151.533531,-189.143886,-85.637311
2,आरबीजीडी की टीम ने टास जीत कर पहले खेल...,-252.14666,-332.759663,-379.771901,-158.759582
3,इससे आम आदमी की गाढ़ी कमाई को फर्जी और ...,-240.663086,-358.016607,-403.087997,-141.535975
4,विधि शिक्षा एवं विधि सहायता केन्द्र सेंट...,-557.493419,-719.653196,-772.168818,-301.235767


In [50]:

df_test_prob=pd.read_csv('test_with_probs.csv')

In [51]:
df_test_prob.head()

Unnamed: 0,reconstructed_sentence,Prob_bigram,Prob_trigram,Prob_quadrigram,Prob_unigram
0,करीब दो माह पूर्व विक्की राजगढ़ आया था ।,-132.043031,-148.756527,-153.908786,-70.597463
1,आपको सही जवाब का ऑप्शन टाइप कर ये एसएम...,-228.412662,-309.068079,-303.941232,-118.532916
2,वहीं रात में ही मुठभेड़ के दौरान उसका ...,-159.162901,-231.665732,-293.210548,-100.127159
3,"येलो टी बनाने के लिए , आपको थोड़ा पानी ...",-171.02333,-233.41787,-266.07131,-105.395719
4,"सुबह 10 बजे से 11 बजे तक नामाकंन , 11 बजे...",-696.960931,-1009.409374,-1268.542162,-411.468562


In [42]:
df_val_prob=pd.read_csv('val_with_probs.csv')

In [43]:
df_val_prob.head()

Unnamed: 0,reconstructed_sentence,Prob_bigram,Prob_trigram,Prob_quadrigram
0,उसे तत्काल ट्रेन के शौचालय में बंद कर ...,-298.489283,-421.682119,-490.729694
1,ऐसे ही एक कार्यक्रम का एक वीडियो सोशल ...,-126.127735,-151.533531,-189.143886
2,आरबीजीडी की टीम ने टास जीत कर पहले खेल...,-252.14666,-332.759663,-379.771901
3,इससे आम आदमी की गाढ़ी कमाई को फर्जी और ...,-240.663086,-358.016607,-403.087997
4,विधि शिक्षा एवं विधि सहायता केन्द्र सेंट...,-557.493419,-719.653196,-772.168818
