# Sentiment Analysis : analysis of IMDB review

### Data set description
IMDB dataset :
* IMDB dataset consists of 15000 reviews in training set, 10000 reviews in validation set, and 25000 reviews in test set. This is a 2 class problem with class 1 being positive sentiment and class 0 being negative sentiment.

Preprocess of the data:
* Excluded all punctuation and converted everything into lower case
* Made a vocabulary of 10000 most common words from training data and discarded anything not found within the vocabulary.
* constructed bag of words, as in converted text into number that works as features.

Classifer used:
* As a baseline, report the performance of the random classiffier (a classiffier which classiffies a review into an uniformly random class). And then have done comparative studies using Naive Bayes, Decision Trees, and Linear SVM 

In [1]:
import pandas as pd
import numpy as np
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-train.txt'
loc = 'IMDB-train.txt'
IMDB_train = pd.read_table(loc,header=None,names=['review','label'])

In [2]:
labels = IMDB_train.label.unique()
labels.sort()

In [3]:
frq_vect = [] # frequency vector; counts the number of time lables got repeated
for itr in range ( 0,len(labels) ):
    frq_vect = np.append (frq_vect, (sum(IMDB_train.label==labels[itr])) ) 
    
No_examples = len(IMDB_train.label)
class_prob = frq_vect/No_examples # probability of perticular class occurance

### Using Random Classifier computing F1 score for IMDB train,test,valid data prediction 

In [4]:
def getRandomPred (labels,No_examples,class_prob,true_labels):
    # prediticng class with random classifier and getting F1 score
    pred_labels = np.random.choice(labels, No_examples, p=class_prob)
    from sklearn.metrics import f1_score,accuracy_score

    IMDB_f1 = f1_score(true_labels,pred_labels,average='micro')
    IMDB_acc = accuracy_score(true_labels,pred_labels)

    print('F1 score',IMDB_f1)
    print('Accuracy',IMDB_acc)
    return (IMDB_f1,IMDB_acc)

#### F1 score for IMDB training data

In [5]:
No_examples = len(IMDB_train.label)
true_labels = np.asarray(IMDB_train.label)
f1_train,train_acc = getRandomPred (labels,No_examples,class_prob,true_labels)

F1 score 0.5068666666666667
Accuracy 0.5068666666666667


#### F1 score for IMDBvalidaion data

In [6]:
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-valid.txt'
loc = 'IMDB-valid.txt'
IMDB_valid = pd.read_table(loc,header=None,names=['review','label'])
# for IMDB training data
true_labels = np.asarray(IMDB_valid.label)
No_examples = len(IMDB_valid.label)
f1_valid,valid_acc = getRandomPred (labels,No_examples,class_prob,true_labels)

F1 score 0.5038
Accuracy 0.5038


#### F1 score for IMDB testing data

In [7]:
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-test.txt'
loc = 'IMDB-test.txt'
IMDB_test = pd.read_table(loc,header=None,names=['review','label'])
# for IMDB training data
true_labels = np.asarray(IMDB_test.label)
No_examples = len(IMDB_test.label)
f1_test,test_acc = getRandomPred (labels,No_examples,class_prob,true_labels)

F1 score 0.50136
Accuracy 0.50136


In [None]:
###############################################################################################################################



###############################################################################################################################

# Analysis of Classification IMDB data using Binary bag-of-words

In [16]:
import re
import pandas as pd
import numpy as np

In [17]:
def TextPreprocess(text):   
    text = text.str.lower()
    text = text.str.replace('[^\w\s]','')    
    text = text.str.replace('[0-9]','')
    text = text.str.replace('_','')
    return text

In [18]:
vocab = pd.read_csv('IMDB-vocab.txt',header=None,names = ['word','id','frequency'])
#vocabtotext = vocab.word.str.cat(sep=' ')

In [19]:
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-train.txt'
loc = 'IMDB-train.txt'
IMDB_train = pd.read_table(loc,header=None,names=['review','label'])
IMDB_train.review = TextPreprocess(IMDB_train.review)

In [20]:
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-test.txt'
loc = 'IMDB-test.txt'
IMDB_test = pd.read_table(loc,header=None,names=['review','label'])
IMDB_test.review = TextPreprocess(IMDB_test.review)

In [21]:
#loc = 'D:/University materials/Winter 2018/Applied ML/winter 2018/Assignments/assignment 3/hwk3_datasets/IMDB-valid.txt'
loc = 'IMDB-valid.txt'
IMDB_valid = pd.read_table(loc,header=None,names=['review','label'])
IMDB_valid.review = TextPreprocess(IMDB_valid.review)

In [22]:
def doc_counts(doc): 
    #myDictionary = collections.OrderedDict()
    import collections
    myDictionary = {}  
    myFile = doc
    field = myFile.split()
    frequency=collections.Counter(field) #for bag of frequency
    
    #for line in range(len(field)):
        #myDictionary[line] = [field[line] , line]
        #myDictionary[field[line]] =  line
    return frequency

In [23]:
def generate_sparse_matrix(texts, vocab):
    
    from scipy.sparse import csr_matrix
    """ Generate a sparse matrix from the given texts, using doc_counts function """
    D = len(texts)
    V = len(vocab.word)
        
    mat_bag_data = []
    mat_freq_data = []
    mat_indptr = [0]
    mat_indices = []

    for i,doc in enumerate(texts):
      #  counts,frequency = doc_counts(doc) # counts basically nested list contains words
                                            # frequency hold the number repeatations
        frequency = doc_counts(doc) 
        
        #N = len(counts)  # idk why N requires 
        used = 0
        for word,count in frequency.items():
            if vocab.loc[vocab.word== word].empty:
                # if the word is missing in vocab we skip it
                continue                        
            else:
                index = vocab[vocab.word == word].iloc[0].id - 1  
                # -1 cause our id starts from 1 but in matrix first indice is 0, so we get 10001 example matrix as we skipped 0
                # so -1 so that
                #print(word)
                mat_indices.append(index)
                mat_bag_data.append(1)
                mat_freq_data.append(frequency[word])
                used += 1
        mat_indptr.append(mat_indptr[-1] + used)
        
    mat_bag = csr_matrix((mat_bag_data, mat_indices, mat_indptr), (D,V+1), dtype='int')
    mat_freq = csr_matrix((mat_freq_data, mat_indices, mat_indptr), (D,V+1), dtype='int') 
    # mat_freq has not normalized, normalize it while using it in classifier 
    #mat[:,0] = 1
    
        
    return mat_bag,mat_freq

In [None]:
#############################################################################################################################
'''
One sprase matrix constructed save it and call the data when its needed as computation is time consuming
'''

############################################################################################################################

### computing sparse matrix of training data

In [24]:
'''
%lsmagic
%time mat_bag,mat_freq = generate_sparse_matrix(IMDB_train.review, vocab)
'''

Wall time: 1h 21min 37s


In [25]:
'''
import scipy.sparse
scipy.sparse.save_npz('IMDB_train_bag_mat.npz', mat_bag)
scipy.sparse.save_npz('IMDB_train_freq_mat.npz', mat_freq)
'''

### computing sparse matrix of testing data
### computing sparse matrix of validation data

In [26]:
'''
%lsmagic
%time test_mat_bag,test_mat_freq = generate_sparse_matrix(IMDB_test.review, vocab)
%time valid_mat_bag,valid_mat_freq = generate_sparse_matrix(IMDB_valid.review, vocab)
'''

Wall time: 7h 44min 18s
Wall time: 56min 46s


In [28]:
'''
scipy.sparse.save_npz('IMDB_test_bag_mat.npz', test_mat_bag)
scipy.sparse.save_npz('IMDB_test_freq_mat.npz', test_mat_freq)

scipy.sparse.save_npz('IMDB_valid_bag_mat.npz', valid_mat_bag)
scipy.sparse.save_npz('IMDB_valid_freq_mat.npz', valid_mat_freq)
'''

In [None]:
##########################################################################################################################



##########################################################################################################################

### Analysis if Classification Efficiency using bag-of-words

In [29]:
import scipy.sparse
train_mat = scipy.sparse.load_npz('IMDB_train_bag_mat.npz')
test_mat = scipy.sparse.load_npz('IMDB_test_bag_mat.npz')
valid_mat = scipy.sparse.load_npz('IMDB_valid_bag_mat.npz')


In [None]:
#train_set = IMDB_train.iloc[np.random.choice(len(IMDB_train.review),20)]       # randomly picking dataset to train
#validation_set = IMDB_valid.iloc[np.random.choice(len(IMDB_valid.review),20)]      # randomly picking dataset to classify

In [30]:
train_y_true = IMDB_train.label
valid_y_true = IMDB_valid.label
test_y_true = IMDB_test.label

### Applying Naive bayes classifier

In [31]:
# computeing accuracy and F1 score for predicting "training data"
# input sparse matrix data that needs to be predicted
def getClassifierEff (Data,true_y,clf):
    from sklearn.metrics import f1_score,accuracy_score
    y_pred = clf.predict(Data)
    acc = accuracy_score(true_y,y_pred)
    f1 = f1_score(true_y,y_pred,average='micro')
    return f1,acc

In [32]:
# training classifier with "training data"
from sklearn.naive_bayes import BernoulliNB

In [37]:
Hyp = [0, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,1]
IMDB_valid_f1 = []
IMDB_valid_acc = []
for itr in range(len(Hyp)):
    naive_clf = BernoulliNB(alpha=Hyp[itr]).fit(train_mat, train_y_true)
    f1 , acc = getClassifierEff (valid_mat,valid_y_true,naive_clf)
    IMDB_valid_f1.append(f1)
    IMDB_valid_acc.append(acc)
   # print('IMDB_valid_f1',IMDB_valid_f1)

  'setting alpha = %.1e' % _ALPHA_MIN)


In [38]:
IMDB_valid_f1

[0.841,
 0.841,
 0.8412,
 0.8413000000000002,
 0.8416,
 0.8415,
 0.8416999999999999,
 0.8421,
 0.8422999999999999,
 0.8428,
 0.8433,
 0.8424000000000001]

In [39]:
optim_alpha = Hyp[np.argmax(IMDB_valid_f1)]
print('for hyper parameter alpha =',Hyp[np.argmax(IMDB_valid_f1)],'we get max F1 score')

for hyper parameter alpha = 0.1 we get max F1 score


In [40]:
naive_clf = BernoulliNB( alpha=optim_alpha ).fit(train_mat, train_y_true)

train_f1 , train_acc = getClassifierEff (train_mat,train_y_true,naive_clf)
test_f1 , test_acc = getClassifierEff (test_mat,test_y_true,naive_clf)

print('IMDB_train_f1',train_f1)
print('IMDB_test_f1',test_f1)


IMDB_train_f1 0.8707333333333334
IMDB_test_f1 0.8318399999999999


### Applying Decision Tree

In [41]:
from sklearn import tree

class sklearn.tree.DecisionTreeClassifier(criterion=’gini’, splitter=’best’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)

In [42]:
crit = ['gini','entropy'] #criterion 
split = ['best','random'] #splitter


In [55]:
IMDB_valid_f1 = []
IMDB_valid_acc = []
for itr1 in range(len(crit)):
    for itr2 in range(len(split)):
        tree_clf = tree.DecisionTreeClassifier(criterion = crit[itr1], splitter = split[itr2]).fit(train_mat, train_y_true)
        f1,acc = getClassifierEff (valid_mat,valid_y_true,tree_clf)
        IMDB_valid_f1.append(f1)
        IMDB_valid_acc.append(acc)
        
        print('IMDB_valid_f1',f1,'when we use criterion',crit[itr1],'and splitter',split[itr2])

IMDB_valid_f1 0.6894 when we use criterion gini and splitter best
IMDB_valid_f1 0.6979 when we use criterion gini and splitter random
IMDB_valid_f1 0.6947 when we use criterion entropy and splitter best
IMDB_valid_f1 0.6966 when we use criterion entropy and splitter random


In [56]:
max(IMDB_valid_f1)

0.6979

In [62]:
tree_clf = tree.DecisionTreeClassifier(criterion = 'gini', splitter = 'random').fit(train_mat, train_y_true)
IMDB_train_f1,IMDB_train_acc = getClassifierEff (train_mat,train_y_true,tree_clf)
IMDB_test_f1,IMDB_test_acc = getClassifierEff (test_mat,test_y_true,tree_clf)

print('IMDB_train_f1',IMDB_train_f1)
print('IMDB_test_f1',IMDB_test_f1)

IMDB_train_f1 1.0
IMDB_test_f1 0.69776


### Applying Linear SVC

In [45]:
from sklearn.svm import LinearSVC
linear_clf = LinearSVC().fit(train_mat, train_y_true)

### for combination of penalty='l1','l2' , loss='squred_hinge',dual=False

In [48]:
pen = ['l1','l2']
los = ['squared_hinge'] # gives error for hinge
dul = [False] # gives error for dual = true. 
tolerance = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,1 ]
C_param = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]


In [49]:
IMDB_valid_f1 = []
IMDB_valid_acc = []
for itr1 in range(len(pen)):

    linear_clf = LinearSVC(penalty=pen[itr1] , dual=False).fit(train_mat, train_y_true)
    f1 , acc = getClassifierEff (valid_mat,valid_y_true,linear_clf)
    IMDB_valid_f1.append(f1)
    IMDB_valid_acc.append(acc)
            
    print('IMDB_valid_f1',f1,'when we use penalty',pen[itr1])

IMDB_valid_f1 0.8454 when we use penalty l1
IMDB_valid_f1 0.8428 when we use penalty l2


In [50]:
IMDB_valid_f1 = []
IMDB_valid_acc = []
for itr1 in range(len(pen)):
    for itr2 in range(len(tolerance)):
        for itr3 in range(len(C_param)):
            linear_clf = LinearSVC(penalty=pen[itr1],tol=tolerance[itr2],C=C_param[itr3],dual=False).fit(train_mat, train_y_true)
            f1 , acc = getClassifierEff (valid_mat,valid_y_true,linear_clf)
            IMDB_valid_f1.append(f1)
            IMDB_valid_acc.append(acc)
            
            print('IMDB_valid_f1',f1,'when we use penalty',pen[itr1],'tolerence',tolerance[itr2],'C',C_param[itr3])

IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-10
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-09
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-08
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-07
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-06
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 1e-05
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-10 C 0.0001
IMDB_valid_f1 0.7042 when we use penalty l1 tolerence 1e-10 C 0.001
IMDB_valid_f1 0.8344 when we use penalty l1 tolerence 1e-10 C 0.01
IMDB_valid_f1 0.869 when we use penalty l1 tolerence 1e-10 C 0.1
IMDB_valid_f1 0.8455 when we use penalty l1 tolerence 1e-10 C 1
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-09 C 1e-10
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-09 C 1e-09
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-09 C 1e-08
IMDB_valid_f1 0.5 when we use penalty l1 tolerence 1e-09 C 1e-07
IMDB_valid_f1 0.5 wh

IMDB_valid_f1 0.7100000000000001 when we use penalty l2 tolerence 1e-10 C 1e-06
IMDB_valid_f1 0.7907 when we use penalty l2 tolerence 1e-10 C 1e-05
IMDB_valid_f1 0.8315 when we use penalty l2 tolerence 1e-10 C 0.0001
IMDB_valid_f1 0.8673 when we use penalty l2 tolerence 1e-10 C 0.001
IMDB_valid_f1 0.8741 when we use penalty l2 tolerence 1e-10 C 0.01
IMDB_valid_f1 0.8571000000000001 when we use penalty l2 tolerence 1e-10 C 0.1
IMDB_valid_f1 0.8427 when we use penalty l2 tolerence 1e-10 C 1
IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-09 C 1e-10
IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-09 C 1e-09
IMDB_valid_f1 0.6 when we use penalty l2 tolerence 1e-09 C 1e-08
IMDB_valid_f1 0.614 when we use penalty l2 tolerence 1e-09 C 1e-07
IMDB_valid_f1 0.7100000000000001 when we use penalty l2 tolerence 1e-09 C 1e-06
IMDB_valid_f1 0.7907 when we use penalty l2 tolerence 1e-09 C 1e-05
IMDB_valid_f1 0.8315 when we use penalty l2 tolerence 1e-09 C 0.0001
IMDB_valid_f1 0.8673 

In [52]:
linear_clf = LinearSVC(penalty='l2',tol=0.01,C=0.01,dual=False).fit(train_mat, train_y_true)
IMDB_train_f1,IMDB_train_acc = getClassifierEff (train_mat,train_y_true,linear_clf)
IMDB_test_f1,IMDB_test_acc = getClassifierEff (test_mat,test_y_true,linear_clf)
IMDB_valid_f1,IMDB_valid_acc = getClassifierEff (valid_mat,valid_y_true,linear_clf)
print('IMDB_train_f1',IMDB_train_f1)
print('IMDB_test_f1',IMDB_test_f1)
print('IMDB_valid_f1',IMDB_valid_f1)

IMDB_train_f1 0.9630666666666666
IMDB_test_f1 0.86896
IMDB_valid_f1 0.8744


### for combination of penalty='l2' , loss='hinge',dual=True

In [57]:
pen = ['l2']
los = ['hinge'] # gives error for hinge
dul = [True] # gives error for dual = true. 
tolerance = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,1]
C_param = [1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1,1]

In [59]:
IMDB_valid_f1 = []
IMDB_valid_acc = []
for itr1 in range(len(pen)):
    for itr2 in range(len(tolerance)):
        for itr3 in range(len(C_param)):
            linear_clf = LinearSVC(penalty=pen[itr1],tol=tolerance[itr2],C=C_param[itr3],dual=False).fit(train_mat, train_y_true)
            f1 , acc = getClassifierEff (valid_mat,valid_y_true,linear_clf)
            IMDB_valid_f1.append(f1)
            IMDB_valid_acc.append(acc)
            
            print('IMDB_valid_f1',f1,'when we use penalty',pen[itr1],'tolerence',tolerance[itr2],'C',C_param[itr3])

IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-10 C 1e-10
IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-10 C 1e-09
IMDB_valid_f1 0.6 when we use penalty l2 tolerence 1e-10 C 1e-08
IMDB_valid_f1 0.614 when we use penalty l2 tolerence 1e-10 C 1e-07
IMDB_valid_f1 0.7100000000000001 when we use penalty l2 tolerence 1e-10 C 1e-06
IMDB_valid_f1 0.7907 when we use penalty l2 tolerence 1e-10 C 1e-05
IMDB_valid_f1 0.8315 when we use penalty l2 tolerence 1e-10 C 0.0001
IMDB_valid_f1 0.8673 when we use penalty l2 tolerence 1e-10 C 0.001
IMDB_valid_f1 0.8741 when we use penalty l2 tolerence 1e-10 C 0.01
IMDB_valid_f1 0.8571000000000001 when we use penalty l2 tolerence 1e-10 C 0.1
IMDB_valid_f1 0.8427 when we use penalty l2 tolerence 1e-10 C 1
IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-09 C 1e-10
IMDB_valid_f1 0.5979 when we use penalty l2 tolerence 1e-09 C 1e-09
IMDB_valid_f1 0.6 when we use penalty l2 tolerence 1e-09 C 1e-08
IMDB_valid_f1 0.614 when we use penal

In [61]:
linear_clf = LinearSVC(penalty='l2',tol=0.01,C=0.01,loss='hinge',dual=True).fit(train_mat, train_y_true)
IMDB_train_f1,IMDB_train_acc = getClassifierEff (train_mat,train_y_true,linear_clf)
IMDB_test_f1,IMDB_test_acc = getClassifierEff (test_mat,test_y_true,linear_clf)
IMDB_valid_f1,IMDB_valid_acc = getClassifierEff (valid_mat,valid_y_true,linear_clf)
print('IMDB_train_f1',IMDB_train_f1)
print('IMDB_test_f1',IMDB_test_f1)
print('IMDB_valid_f1',IMDB_valid_f1)

IMDB_train_f1 0.9286666666666666
IMDB_test_f1 0.87072
IMDB_valid_f1 0.8736000000000002
