In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.util import ngrams
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [None]:
# Load the review corpus from CSV, decoding the file as Latin-1.
corpus = pd.read_csv("imdb_tr.csv",encoding='latin-1')

In [None]:
# Sanity-check the size of the loaded frame as (rows, columns).
corpus.shape  

(25000, 3)

In [None]:
# Preview the first rows of the corpus. The previous `corpus.describe` (no parentheses)
# never called the method; it only displayed the bound method's repr, which embeds a
# dump of the whole frame.
corpus.head(10)

<bound method NDFrame.describe of        row_Number                                               text  polarity
0            2148  first think another Disney movie, might good, ...         1
1           23577  Put aside Dr. House repeat missed, Desperate H...         0
2            1319  big fan Stephen King's work, film made even gr...         1
3           13358  watched horrid thing TV. Needless say one movi...         0
4            9495  truly enjoyed film. acting terrific plot. Jeff...         1
5            2154  memory "The Last Hunt" stuck since saw 1956 13...         1
6           19880  Shakespeare fan, appreciate Ken Branagh done b...         0
7            2073  privilege watching Scarface big screen beautif...         1
8           12001  real classic. shipload sailors trying get town...         1
9            9373  Serials short subjects originally shown theate...         1
10          19704  strange sex comedy there`s little comedy whole...         0
11          20033 

In [None]:
def unigram_POS_tokens(text):
    """Tokenize `text` and pair every token with its part-of-speech tag.

    Args:
        text: Raw document string.

    Returns:
        List of the (token, tag) pairs produced by `nltk.pos_tag`.
    """
    return list(nltk.pos_tag(nltk.word_tokenize(text)))

In [None]:
def unigram_adjectives(text):
    """Return the tokens of `text` that are tagged as adjectives.

    Args:
        text: Raw document string.

    Returns:
        List of tokens whose tag is 'JJ', 'JJR' or 'JJS', in order of appearance.
    """
    adjective_tags = ('JJ', 'JJS', 'JJR')
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return [word for word, pos in tagged_tokens if pos in adjective_tags]

In [None]:
def unigrams_frequency_vectorization(text):
    """Encode documents as unigram count (frequency) vectors.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document, holding the number of
        occurrences of each unigram. Its shape is printed as a side effect.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1), binary=False)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() analyses the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
def unigrams_presence_vectorization(text):
    """Encode documents as binary unigram presence vectors.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document; `binary=True` makes every
        non-zero count 1. Its shape is printed as a side effect.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() analyses the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
def bigrams_presence_vectorization(text):
    """Encode documents as binary bigram presence vectors.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document over bigram features only
        (`ngram_range=(2, 2)`); `binary=True` makes every non-zero count 1.
        Its shape is printed as a side effect.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), binary=True)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() analyses the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
def unigrams_and_bigrams_presence_vectorization(text):
    """Encode documents as binary presence vectors over unigrams and bigrams.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document over both unigram and bigram
        features (`ngram_range=(1, 2)`); `binary=True` makes every non-zero count 1.
        Its shape is printed as a side effect.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 2), binary=True)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() analyses the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
def POStagged_unigrams_presence_vectorization(text):
    """Encode documents as binary presence vectors over POS-tagged unigrams.

    Each feature is a (token, tag) pair produced by `unigram_POS_tokens`.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document; `binary=True` makes every
        non-zero count 1. Its shape is printed as a side effect.
    """
    # NOTE(review): CountVectorizer lowercases each document before calling the custom
    # tokenizer by default, so the tagger sees lowercased text - confirm this is intended.
    vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True, tokenizer=unigram_POS_tokens)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() runs the POS tagger over the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
def POStagged_ajectives_presence_vectorization(text):
    """Encode documents as binary presence vectors over their adjectives only.

    Tokens are selected by `unigram_adjectives`, which keeps those tagged as adjectives.

    Args:
        text: Iterable of raw document strings.

    Returns:
        Document-term matrix with one row per document; `binary=True` makes every
        non-zero count 1. Its shape is printed as a side effect.
    """
    # NOTE(review): CountVectorizer lowercases each document before calling the custom
    # tokenizer by default, so the tagger sees lowercased text - confirm this is intended.
    vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True, tokenizer=unigram_adjectives)
    # Build the vocabulary and encode the documents in one pass; calling fit() and then
    # transform() runs the POS tagger over the whole corpus twice.
    vector = vectorizer.fit_transform(text)
    print('shape: ', vector.shape)
    return vector

In [None]:
# Classifiers compared on every feature set below.
multinomialNaiveBayes = clf = MultinomialNB()
logReg = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial', max_iter=500)
# Build the SVM from the imported SVC class. The previous `svm = svm.SVC(...)` rebound the
# name of the imported `svm` module to the estimator, so running this cell a second time
# failed with AttributeError. The variable keeps the name `svm` because later cells use it.
svm = SVC(kernel='linear', C=1)

In [None]:
# Encode the polarity labels as integer class ids.
encoder = LabelEncoder()
target = encoder.fit_transform(corpus['polarity'])

# Unigram count features for every document in the corpus.
data = unigrams_frequency_vectorization(corpus['text'])

shape:  (25000, 75532)


In [None]:
# 3-fold cross-validated accuracy of Multinomial Naive Bayes on the current `data`.
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

Accuracy for NB: 0.820 (+/- 0.009)


In [None]:
# 3-fold cross-validated accuracy of the linear SVM on the current `data`.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print(
    "Accuracy for SVM: %0.3f (+/- %0.3f)"
    % (scoresSVM.mean(), 2 * scoresSVM.std())
)

Accuracy for SVM: 0.857 (+/- 0.008)


In [None]:
# Results row for the unigram-frequency features. MaxEnt is recorded as NaN here.
unigrams_frequency_acc = {
    'Features:': 'unigrams(freq.)',
    'NB': [scoresNB.mean()],
    'MaxEnt': [np.nan],
    'SVM': [scoresSVM.mean()],
}

In [None]:
# Wrap the unigram-frequency results row in a one-row DataFrame.
unigrams_frequency_accDF=pd.DataFrame(unigrams_frequency_acc)

In [None]:
# Start the results table from the unigram-frequency row, with a fixed column order.
result_columns = ['Features:', 'NB', 'MaxEnt', 'SVM']
accuracies = pd.DataFrame(unigrams_frequency_accDF, columns=result_columns)

In [None]:
# Switch the feature matrix to binary unigram presence.
data=unigrams_presence_vectorization(corpus['text'])  

shape:  (25000, 75532)


In [None]:
# 3-fold cross-validated accuracy of Multinomial Naive Bayes on the current `data`.
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

Accuracy for NB: 0.858 (+/- 0.005)


In [None]:
# 3-fold cross-validated accuracy of logistic regression (MaxEnt) on the current `data`.
scoresMaxEnt = cross_val_score(logReg, data, target, cv=3)
print(
    "Accuracy for MaxEnt: %0.3f (+/- %0.3f)"
    % (scoresMaxEnt.mean(), 2 * scoresMaxEnt.std())
)

Accuracy for MaxEnt: 0.872 (+/- 0.005)


In [None]:
# 3-fold cross-validated accuracy of the linear SVM on the current `data`.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print(
    "Accuracy for SVM: %0.3f (+/- %0.3f)"
    % (scoresSVM.mean(), 2 * scoresSVM.std())
)

Accuracy for SVM: 0.853 (+/- 0.005)


In [None]:
# Results row for the unigram-presence features.
unigrams_presence_acc = {
    'Features:': 'unigrams(pres.)',
    'NB': [scoresNB.mean()],
    'MaxEnt': [scoresMaxEnt.mean()],
    'SVM': [scoresSVM.mean()],
}

In [None]:
unigrams_presence_accDF = pd.DataFrame(unigrams_presence_acc)
# Add the row to the results table. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`. Any row with the same feature label is dropped first, so re-running this
# cell replaces the row instead of duplicating it.
is_stale_row = accuracies['Features:'].isin(unigrams_presence_accDF['Features:'])
accuracies = pd.concat([accuracies[~is_stale_row], unigrams_presence_accDF], ignore_index=True)

In [None]:
# Switch the feature matrix to binary presence of unigrams and bigrams combined.
data=unigrams_and_bigrams_presence_vectorization(corpus['text'])

shape:  (25000, 1908608)


In [None]:
# 3-fold cross-validated accuracy of Multinomial Naive Bayes on the current `data`.
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

Accuracy for NB: 0.883 (+/- 0.005)


In [None]:
# 3-fold cross-validated accuracy of logistic regression (MaxEnt) on the current `data`.
scoresMaxEnt = cross_val_score(logReg, data, target, cv=3)
print(
    "Accuracy for MaxEnt: %0.3f (+/- %0.3f)"
    % (scoresMaxEnt.mean(), 2 * scoresMaxEnt.std())
)

Accuracy for MaxEnt: 0.887 (+/- 0.009)


In [None]:
# 3-fold cross-validated accuracy of the linear SVM on the current `data`.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print(
    "Accuracy for SVM: %0.3f (+/- %0.3f)"
    % (scoresSVM.mean(), 2 * scoresSVM.std())
)

Accuracy for SVM: 0.883 (+/- 0.006)


In [None]:
# Results row for the combined unigram + bigram presence features.
unigrams_and_bigrams_presence_acc = {
    'Features:': 'unigrams and bigrams(pres.) ',
    'NB': [scoresNB.mean()],
    'MaxEnt': [scoresMaxEnt.mean()],
    'SVM': [scoresSVM.mean()],
}

In [None]:
# Wrap the unigram + bigram results row in a one-row DataFrame.
unigrams_and_bigrams_presence_accDF=pd.DataFrame(unigrams_and_bigrams_presence_acc)

In [None]:
# Add the row to the results table. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`. Any row with the same feature label is dropped first, so re-running this
# cell replaces the row instead of duplicating it.
is_stale_row = accuracies['Features:'].isin(unigrams_and_bigrams_presence_accDF['Features:'])
accuracies = pd.concat(
    [accuracies[~is_stale_row], unigrams_and_bigrams_presence_accDF],
    ignore_index=True,
)

In [None]:
# Switch the feature matrix to binary bigram presence and score Multinomial Naive Bayes
# with 3-fold cross-validation.
data = bigrams_presence_vectorization(corpus['text'])
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

shape:  (25000, 1833076)
Accuracy for NB: 0.870 (+/- 0.005)


In [None]:
# 3-fold cross-validated accuracy of logistic regression (MaxEnt) on the current `data`.
scoresMaxEnt = cross_val_score(logReg, data, target, cv=3)
print(
    "Accuracy for MaxEnt: %0.3f (+/- %0.3f)"
    % (scoresMaxEnt.mean(), 2 * scoresMaxEnt.std())
)

# Same evaluation for the linear SVM.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print(
    "Accuracy for SVM: %0.3f (+/- %0.3f)"
    % (scoresSVM.mean(), 2 * scoresSVM.std())
)

Accuracy for MaxEnt: 0.849 (+/- 0.004)
Accuracy for SVM: 0.849 (+/- 0.006)


In [None]:
# Results row for the bigram-presence features.
bigrams_presence_acc = {
    'Features:': 'bigrams(pres.)',
    'NB': [scoresNB.mean()],
    'MaxEnt': [scoresMaxEnt.mean()],
    'SVM': [scoresSVM.mean()],
}
bigrams_presence_accDF = pd.DataFrame(bigrams_presence_acc)
# Add the row to the results table. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`. Any row with the same feature label is dropped first, so re-running this
# cell replaces the row instead of duplicating it.
is_stale_row = accuracies['Features:'].isin(bigrams_presence_accDF['Features:'])
accuracies = pd.concat([accuracies[~is_stale_row], bigrams_presence_accDF], ignore_index=True)

In [None]:
# Switch the feature matrix to binary presence of POS-tagged unigrams and score
# Multinomial Naive Bayes with 3-fold cross-validation.
data = POStagged_unigrams_presence_vectorization(corpus['text'])
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

shape:  (25000, 191910)
Accuracy for NB: 0.857 (+/- 0.005)


In [None]:
# 3-fold cross-validated accuracy of logistic regression (MaxEnt) on the current `data`.
scoresMaxEnt = cross_val_score(logReg, data, target, cv=3)
print(
    "Accuracy for MaxEnt: %0.3f (+/- %0.3f)"
    % (scoresMaxEnt.mean(), 2 * scoresMaxEnt.std())
)

# Same evaluation for the linear SVM.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print(
    "Accuracy for SVM: %0.3f (+/- %0.3f)"
    % (scoresSVM.mean(), 2 * scoresSVM.std())
)

Accuracy for MaxEnt: 0.874 (+/- 0.009)
Accuracy for SVM: 0.861 (+/- 0.010)


In [None]:
# Results row for the POS-tagged unigram presence features.
POStagged_unigrams_presence_acc = {
    'Features:': 'unigrams+POS',
    'NB': [scoresNB.mean()],
    'MaxEnt': [scoresMaxEnt.mean()],
    'SVM': [scoresSVM.mean()],
}
POStagged_unigrams_presence_accDF = pd.DataFrame(POStagged_unigrams_presence_acc)
# Add the row to the results table. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`. Any row with the same feature label is dropped first, so re-running this
# cell replaces the row instead of duplicating it.
is_stale_row = accuracies['Features:'].isin(POStagged_unigrams_presence_accDF['Features:'])
accuracies = pd.concat(
    [accuracies[~is_stale_row], POStagged_unigrams_presence_accDF],
    ignore_index=True,
)

In [None]:
# Switch the feature matrix to binary presence of adjectives only.
data = POStagged_ajectives_presence_vectorization(corpus['text'])

# 3-fold cross-validated accuracy of Multinomial Naive Bayes on the adjective features.
scoresNB = cross_val_score(multinomialNaiveBayes, data, target, cv=3)
print(
    "Accuracy for NB: %0.3f (+/- %0.3f)"
    % (scoresNB.mean(), 2 * scoresNB.std())
)

# Same evaluation for logistic regression (MaxEnt).
scoresMaxEnt = cross_val_score(logReg, data, target, cv=3)
print(
    "Accuracy for MaxEnt: %0.3f (+/- %0.3f)"
    % (scoresMaxEnt.mean(), 2 * scoresMaxEnt.std())
)

shape:  (25000, 49228)
Accuracy for NB: 0.820 (+/- 0.009)
Accuracy for MaxEnt: 0.804 (+/- 0.003)


In [None]:
# 3-fold cross-validated accuracy of the linear SVM on the adjective features.
scoresSVM = cross_val_score(svm, data, target, cv=3)
print("Accuracy for SVM: %0.3f (+/- %0.3f)" % (scoresSVM.mean(), scoresSVM.std() * 2))

# Results row for the adjective-only presence features.
POStagged_ajectives_presence_acc = {
    'Features:': 'adjectives',
    'NB': [scoresNB.mean()],
    'MaxEnt': [scoresMaxEnt.mean()],
    'SVM': [scoresSVM.mean()],
}
POStagged_ajectives_presence_accDF = pd.DataFrame(POStagged_ajectives_presence_acc)
# Add the row to the results table. `DataFrame.append` was removed in pandas 2.0, so use
# `pd.concat`. Any row with the same feature label is dropped first, so re-running this
# cell replaces the row instead of duplicating it.
is_stale_row = accuracies['Features:'].isin(POStagged_ajectives_presence_accDF['Features:'])
accuracies = pd.concat(
    [accuracies[~is_stale_row], POStagged_ajectives_presence_accDF],
    ignore_index=True,
)

Accuracy for SVM: 0.786 (+/- 0.003)


In [None]:
# Display the results table itself. The previous `accuracies.describe` (no parentheses)
# never called the method; it only showed the bound method's repr with the frame embedded.
accuracies

<bound method NDFrame.describe of                       Features:   MaxEnt        NB      SVM
0               unigrams(freq.)      NaN  0.856081  0.85652
1               unigrams(pres.)  0.87168  0.858080  0.85304
2  unigrams and bigrams(pres.)   0.88716  0.882520  0.88308
3                bigrams(pres.)  0.84916  0.870280  0.84856
4                bigrams(pres.)  0.84916  0.856960  0.84856
5                bigrams(pres.)  0.84916  0.856960  0.84856
6                  unigrams+POS  0.87416  0.856960  0.86072
7                    adjectives  0.80412  0.819720  0.78644>