In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import re

In [2]:
#Loading the data
df=pd.read_csv("rt_reviews_utf8.csv")
df.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [3]:
#Containing only alphabets
def do_reg(row):
    st=re.split("[^a-zA-Z]",row)
    final_st=[i for i in st if len(i)!=0]
    return " ".join(final_st)

In [4]:
df['Review']=df['Review'].apply(do_reg)

In [5]:
df.head()

Unnamed: 0,Freshness,Review
0,fresh,Manakamana doesn t answer any questions yet ma...
1,fresh,Wilfully offensive and powered by a chest thum...
2,rotten,It would be difficult to imagine material more...
3,rotten,Despite the gusto its star brings to the role ...
4,rotten,If there was a good idea at the core of this f...


In [6]:
df.groupby('Freshness').describe()

Unnamed: 0_level_0,Review,Review,Review,Review
Unnamed: 0_level_1,count,unique,top,freq
Freshness,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
fresh,240000,188091,Parental Content Review,57
rotten,240000,151567,full review at Movies for the Masses,114


In [7]:
#function to convert freshness to numeric 
def freshness_coversion(x):
    if x=="fresh":
        return 1
    else:
        return 0


In [8]:
df['fresh']=df['Freshness'].apply(freshness_coversion)
df.head()

Unnamed: 0,Freshness,Review,fresh
0,fresh,Manakamana doesn t answer any questions yet ma...,1
1,fresh,Wilfully offensive and powered by a chest thum...,1
2,rotten,It would be difficult to imagine material more...,0
3,rotten,Despite the gusto its star brings to the role ...,0
4,rotten,If there was a good idea at the core of this f...,0


In [9]:
#a.dividing the dataset as train, development and test.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
train = int(len(df) * 0.8)
dev = int(len(df) * 0.1)
test = len(df) - train - dev
train_df = df[:train]
dev_df = df[train:train+dev]
test_df = df[-test:]

In [10]:
print(train_df.shape)
print(dev_df.shape)
print(test_df.shape)

(384000, 3)
(48000, 3)
(48000, 3)


In [11]:
train_df.groupby('Freshness').describe()


Unnamed: 0_level_0,fresh,fresh,fresh,fresh,fresh,fresh,fresh,fresh
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Freshness,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
fresh,191966.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
rotten,192034.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
train_df.head()

Unnamed: 0,Freshness,Review,fresh
0,fresh,Guardians of the Galaxy is first class Grade A...,1
1,rotten,For a while Life Aquatic gets mileage out of i...,0
2,rotten,Director Ken Scott stresses the movie s dude c...,0
3,fresh,More a snapshot then a full blown insight into...,1
4,fresh,The immigrant experience takes on a blacker th...,1


In [13]:
#index starts from 0 for test_df
test_df_reset=test_df.reset_index(drop=True)
test_df_reset

Unnamed: 0,Freshness,Review,fresh
0,fresh,Sure this is slight but it s deftly done fluff...,1
1,rotten,It s a grim familiar place to be in an action ...,0
2,rotten,This speaks to Winchester s overall failure of...,0
3,rotten,full review in Greek,0
4,rotten,Every audience member with an intellect bigger...,0
...,...,...,...
47995,fresh,At its best with Soviets Americans and Raimus ...,1
47996,rotten,Just Friends is a dumb teen comedy,0
47997,rotten,Fairly successful at faking some pretty cool s...,0
47998,fresh,The pacing misses a few beats and the satire n...,1


In [14]:
#b. Build a vocabulary as list.
def vocab_list(train_df):
    word_freq = {}
    for i in range(0,len(train_df['Review'].values)):
        text=train_df['Review'][i]
        words = text.lower().split()
        # Count the frequency of each word
        
        for word in words:
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1
    #vocabulary list and omitting less frequency
    voc=[]
    for word, freq in word_freq.items():
        if freq > 5:
            voc.append(word)
    return voc,word_freq
    
    

In [15]:
#vocabulary list and word frequency dictionary
voc,word_freq=vocab_list(train_df)
print(voc)
#vocabulary list for testing data
voc_test,word_freq_test=vocab_list(test_df_reset)
print(voc_test)





In [16]:
# Create a reverse index dictionary for the vocabulary list
reverse_index = {word: index for index, word in enumerate(voc)}
reverse_index


{'guardians': 0,
 'of': 1,
 'the': 2,
 'galaxy': 3,
 'is': 4,
 'first': 5,
 'class': 6,
 'grade': 7,
 'a': 8,
 'space': 9,
 'adventure': 10,
 'comedy': 11,
 'for': 12,
 'while': 13,
 'life': 14,
 'aquatic': 15,
 'gets': 16,
 'mileage': 17,
 'out': 18,
 'its': 19,
 'quirkiness': 20,
 'and': 21,
 'promise': 22,
 'that': 23,
 'real': 24,
 'plot': 25,
 'will': 26,
 'kick': 27,
 'in': 28,
 'but': 29,
 'it': 30,
 's': 31,
 'not': 32,
 'to': 33,
 'be': 34,
 'most': 35,
 'film': 36,
 'second': 37,
 'half': 38,
 'feels': 39,
 'like': 40,
 'anderson': 41,
 'his': 42,
 'characters': 43,
 'are': 44,
 'treading': 45,
 'water': 46,
 'director': 47,
 'ken': 48,
 'scott': 49,
 'stresses': 50,
 'movie': 51,
 'dude': 52,
 'centric': 53,
 'outlook': 54,
 'playing': 55,
 'up': 56,
 'anxieties': 57,
 'impending': 58,
 'fatherhood': 59,
 'neglects': 60,
 'any': 61,
 'sort': 62,
 'maternal': 63,
 'perspective': 64,
 'which': 65,
 'dumbs': 66,
 'down': 67,
 'an': 68,
 'already': 69,
 'moronic': 70,
 'premise'

In [17]:
# C. Calculate the following probability
def prob(train_df,word):
    word_n=0
    for i in range(0,len(train_df['Review'].values)):
        text=train_df['Review'][i].lower()
        if word in text:
            word_n+=1
    prob=word_n/len(train_df['Review'].values)
    return prob
            

In [18]:
#probability of occurence of "the"
print("Probability of occurence of 'the'",prob(train_df,"the"))

Probability of occurence of 'the' 0.7126354166666666


In [19]:
#Conditional probability based on the sentiment
def cond_prob(train_df,word,condition):
    cnt=0
    cnt_condition=0#count of documnets based on the condition
    condition=condition.lower()
    if condition=="positive":
        condition=1
    else:
        condition=0
    for i in range(0,len(train_df['Review'].values)):
        text=train_df['Review'][i].lower()
        if word in text and df['fresh'][i]==condition:
            cnt+=1
        if df['fresh'][i]==condition:
            cnt_condition+=1
            
    cond_prob=cnt/cnt_condition
    
    return cond_prob
        
        
        

In [20]:
# P[“the” | Positive]
print("Conditional probability of occcurence 'the'",cond_prob(train_df,"the","Positive"))

Conditional probability of occcurence 'the' 0.7058906264651031


In [29]:


#Calculating Probability for each word in the vocabulary list

def word_p(df,voc):
    word_prob_dic={}
    for word in tqdm(voc):
        word_cnt = (df['Review'].str.lower().str.contains(word)).sum()

        prob=word_cnt/len(df['Review'].values)
        word_prob_dic[word]=prob
    return word_prob_dic

In [None]:
#Calculating Probability for each word in the vocabulary list
word_prob=word_p(train_df,voc)
word_prob


 21%|███████▋                            | 7262/34088 [46:09<3:05:07,  2.42it/s]

In [None]:
#Calculating Conditional Probability for each word in the vocabulary list while going according to the categories

def cond_p(df,voc,smt):
    categories=list(set(df['Freshness'].values))
    #cat_num=df.groupby(by='Freshness').count()['Review'].to_dict()
    cat_num={}
    for i in range(0,len(df['Review'].values)):
        if df['Freshness'][i] in cat_num:
            cat_num[df['Freshness'][i]]+=1
        else:
            cat_num[df['Freshness'][i]]=1
            
    prob_word={}
    
    for cat in tqdm(categories):
        prob_word[cat]={}
        for word in list(voc):
            word_cnt = df[df['Freshness'] == cat]['Review'].str.lower().str.contains(word).sum()

            value=(word_cnt+smt)/(cat_num[cat]+(smt*len(cat_num)))
            prob_word[cat][word]=value
    return prob_word

In [None]:
cond_prob_train=cond_p(train_df,voc,0)
cond_prob_train


In [None]:
#Calculating prior probabilities of each category
def calc_cp(df):
    cat_num={}
    for i in range(0,len(df['Review'].values)):
        if df['Freshness'][i] in cat_num:
            cat_num[df['Freshness'][i]]+=1
        else:
            cat_num[df['Freshness'][i]]=1
    for key,val in cat_num.items():
        cat_num[key]=(val/df.shape[0])
    return cat_num
            
    

In [None]:
prior_prob=calc_cp(train_df)
print(prior_prob)

In [None]:
#predict function helps in calculating accuracy
def predict (df,cond_prob,word_prob,prior_prob):
    data_test=[]
    labels_test=list(df['Review'].values)
    labels_pred=[]
    review=df['Review'].values
    for rev in review:
        r=rev.split(" ")
        data_test.append(r)
        
    for data in data_test:
        pred_cls={}
        for key,val in cond_prob.items():
            num=1
            d=1
            for word in data:
                if word in val.keys():
                    num=num*val[word]
                if word in word_prob.keys():
                    d=d*word_prob[word]
            n=num * prior_prob[key]
            pred_cls[key]=n/d
        labels_pred.append(max(pred_cls,key=pred_cls.get))
    cnt=0
    for val1,val2 in zip(labels_test,labels_pred):
        if val1==val2:
            cnt+=1
    
    return cnt/len(labels_test)
    

In [None]:
#training accuracy
print("Training Accuracy:",predict(train_df,cond_prob,word_prob,prior_prob))


In [None]:
#5 fold cross validation
def cross_valid(df,smt):
    #print("hi")
    kfold=5
    n=len(df)/5
    accuracy=[]
    for i in range(kfold):
        #print("hello")
        test=df.iloc[int(i*n):int((i+1)*n)]
        #print("here")
        df1=df.iloc[:int(i*n),:]
        df2=df.iloc[int(n*(i+1)):,:]
        train=pd.concat([df1,df2])
        train=train.reset_index(drop=True)
        voc,word_freq=vocab_list(train)
        word_prob=word_p(train,voc)
        cond_prob=cond_p(train,voc,smt)
        prior_prob=calc_cp(train)
        
        accu=predict(test,cond_prob,word_prob,prior_prob)
        
        accuracy.append(accu)
    mean_accuracy=sum(accuracy)/len(accuracy)
    
    return accuracy,mean_accuracy,cond_prob,word_prob,prior_prob
        
    
    

In [None]:
#dev dataset reset the index to start from 0
dev_df_reset=dev_df.reset_index(drop=True)
dev_df
# d. Calculate accuracy using dev dataset
accuracy,mean_accuracy,cond_prob_dev,word_prob_dev,prior_prob_dev=cross_valid(dev_df_reset,0)
print(accuracy)
print(mean_accuracy)

In [None]:
#e. Do following experiments
 
# ■ Compare the effect of Smoothing

accuracy,mean_accuracy,cond_prob_dev,word_prob_dev,prior_prob_dev=cross_valid(dev_df_reset,0.0001)

print("After smoothing Accuracy is:",accuracy)
print("After smoothing Mean Accuracy is:",mean_accuracy)

In [28]:
# ■ Derive Top 10 words that predicts each class

print("Top 10 words that predicts each class")
l=df['Freshness'].unique()
for i in l:
    print(i)
    print(list(dict(sorted(cond_prob_dev[i].items(),key=lambda item:item[1],reverse=True)).keys()[:10]))
    


Top 10 words that predicts each class
fresh
rotten


In [None]:
#f. Using the test dataset with different parameters
cond_prob=cond_p(train_df,voc,0.0001)
print("Final Accuracy for test data",predict(test_df_reset,cond_prob,word_prob,prior_prob))
cond_prob=cond_p(train_df,voc,0.001)
print("Final Accuracy for test data",predict(test_df_reset,cond_prob,word_prob,prior_prob))
cond_prob=cond_p(train_df,voc,0,0.01)
print("Final Accuracy for test data",predict(test_df_reset,cond_prob,word_prob,prior_prob))
