In [1]:
import os
import nltk
import random
from sklearn.metrics import confusion_matrix
import pandas as pd

## Reading the text file

In [2]:
with open("positive.txt", "r", encoding='latin-1') as f:
    pos_docs = f.read()
    pos_docs = pos_docs.split('\n')

with open("negative.txt", "r", encoding='latin-1') as f:
    neg_docs = f.read()
    neg_docs = neg_docs.split('\n')

In [3]:
print(len(pos_docs))
print(len(neg_docs))

5332
5332


## Combining documents

In [4]:
combined_docs = pos_docs + neg_docs
labels = ['POS']*len(pos_docs) + ['NEG']*len(pos_docs)

combined_df = pd.DataFrame({"Review": combined_docs, "Sentiment": labels})
#combined_df = combined_df.sample(frac=1)
combined_df = combined_df.sample(frac=1)


combined_df.head(10)

Unnamed: 0,Review,Sentiment
7367,for anyone who grew up on disney's 1950 treasu...,NEG
8634,"sadly , 'garth' hasn't progressed as nicely as...",NEG
29,it helps that lil bow wow . . . tones down his...,POS
7152,. . . the film suffers from a lack of humor (...,NEG
5621,bad company . bad movie . just plain bad .,NEG
4191,miller comes at film with bracing intelligence...,POS
8853,. . . the good and different idea [of middle-...,NEG
10279,"while hoffman's performance is great , the sub...",NEG
6342,"the jokes are sophomoric , stereotypes are spr...",NEG
5368,"please , someone , stop eric schaeffer before ...",NEG


## Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

y = combined_df['Sentiment'].tolist()
X = combined_df.loc[:,'Review'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1224)

## TF-IDF Vectorization

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tokenizer = TfidfVectorizer(ngram_range = (1,3), stop_words=None,min_df=10)
X_train_tf = tokenizer.fit_transform(X_train).toarray()
X_test_tf = tokenizer.transform(X_test).toarray()

print(X_train_tf.shape)
print(X_test_tf.shape)

(8531, 3204)
(2133, 3204)


## Bringing down the dimentionality using SVD (optional)

In [7]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2000, n_iter=10, random_state=42)
X_train_tf = svd.fit_transform(X_train_tf)
print(svd.explained_variance_ratio_.sum())
X_test_tf = svd.transform(X_test_tf)

0.925589315213874


## Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

NBclassifier = GaussianNB()
NBclassifier.fit(X_train_tf, y_train)

## Predictions
train_nb_preds = NBclassifier.predict(X_train_tf)
test_nb_preds = NBclassifier.predict(X_test_tf)

print("Train Accuracy",NBclassifier.score(X_train_tf,y_train))
print(confusion_matrix(y_train,train_nb_preds))
print(classification_report(y_train,train_nb_preds))

print("Test Accuracy",NBclassifier.score(X_test_tf,y_test))
print(confusion_matrix(y_test,test_nb_preds))
print(classification_report(y_test,test_nb_preds))

Train Accuracy 0.6948775055679287
[[3005 1253]
 [1350 2923]]
              precision    recall  f1-score   support

         NEG       0.69      0.71      0.70      4258
         POS       0.70      0.68      0.69      4273

   micro avg       0.69      0.69      0.69      8531
   macro avg       0.69      0.69      0.69      8531
weighted avg       0.69      0.69      0.69      8531

Test Accuracy 0.6324425691514299
[[825 249]
 [535 524]]
              precision    recall  f1-score   support

         NEG       0.61      0.77      0.68      1074
         POS       0.68      0.49      0.57      1059

   micro avg       0.63      0.63      0.63      2133
   macro avg       0.64      0.63      0.62      2133
weighted avg       0.64      0.63      0.63      2133



## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(X_train_tf,y_train)

## Predictions
train_logit_preds = logit.predict(X_train_tf)
test_logit_preds = logit.predict(X_test_tf)

print("Train Accuracy",logit.score(X_train_tf,y_train))
print(confusion_matrix(y_train,train_logit_preds))
print(classification_report(y_train,train_logit_preds))

print("Test Accuracy",logit.score(X_test_tf,y_test))
print(confusion_matrix(y_test,test_logit_preds))
print(classification_report(y_test,test_logit_preds))



Train Accuracy 0.8262806236080178
[[3594  711]
 [ 771 3455]]
              precision    recall  f1-score   support

         NEG       0.82      0.83      0.83      4305
         POS       0.83      0.82      0.82      4226

   micro avg       0.83      0.83      0.83      8531
   macro avg       0.83      0.83      0.83      8531
weighted avg       0.83      0.83      0.83      8531

Test Accuracy 0.7501172058134083
[[784 243]
 [290 816]]
              precision    recall  f1-score   support

         NEG       0.73      0.76      0.75      1027
         POS       0.77      0.74      0.75      1106

   micro avg       0.75      0.75      0.75      2133
   macro avg       0.75      0.75      0.75      2133
weighted avg       0.75      0.75      0.75      2133



## Random Forest with parameter tuning

In [42]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer
score_metric = make_scorer(f1_score, pos_label='POS')
## n_jobs = -1 uses all cores of processor
## max_features is the maximum number of attributes to select for each tree
rfc_grid = RandomForestClassifier(n_jobs=-1, max_features='sqrt', class_weight='balanced')
 
# Use a grid over parameters of interest
## n_estimators is the number of trees in the forest
## max_depth is how deep each tree can be
## min_sample_leaf is the minimum samples required in each leaf node for the root node to split
## "A node will only be split if in each of it's leaf nodes there should be min_sample_leaf"

param_grid = {"n_estimators" : [10, 25, 50, 75, 100],
           "max_depth" : [10, 12, 14, 16, 18, 20],
           "min_samples_leaf" : [5, 10, 15, 20]}
 
rfc_cv_grid = RandomizedSearchCV(estimator = rfc_grid, param_distributions = param_grid, cv = 3, n_iter=20, scoring=score_metric, n_jobs=8)
rfc_cv_grid.fit(X_train_tf, y_train)
rfc_cv_grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=16, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [43]:
rfc_train_pred = rfc_cv_grid.best_estimator_.predict(X_train_tf)
rfc_test_pred = rfc_cv_grid.best_estimator_.predict(X_test_tf)

print("Train")
print(accuracy_score(y_train,rfc_train_pred))
print(confusion_matrix(y_train,rfc_train_pred))
print(classification_report(y_train,rfc_train_pred))

print("OOB Score", rfc_50.oob_score_)

print("Test")
print(accuracy_score(y_test,rfc_test_pred))
print(confusion_matrix(y_test,rfc_test_pred))
print(classification_report(y_test,rfc_test_pred))

Train
0.7015590200445434
[[3057 1225]
 [1321 2928]]
              precision    recall  f1-score   support

         NEG       0.70      0.71      0.71      4282
         POS       0.71      0.69      0.70      4249

   micro avg       0.70      0.70      0.70      8531
   macro avg       0.70      0.70      0.70      8531
weighted avg       0.70      0.70      0.70      8531

OOB Score 0.5596061423045364
Test
0.6451007969995312
[[671 379]
 [378 705]]
              precision    recall  f1-score   support

         NEG       0.64      0.64      0.64      1050
         POS       0.65      0.65      0.65      1083

   micro avg       0.65      0.65      0.65      2133
   macro avg       0.65      0.65      0.65      2133
weighted avg       0.65      0.65      0.65      2133



## KNN

In [24]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train,y_train)
# print("Train Accuracy",knn.score(X_train,y_train))
# print("Test Accuracy",knn.score(X_test,y_test))

## SVM

In [None]:
from sklearn.svm import SVC
svm_model = SVC(C=.1, gamma=1, kernel='rbf', cache_size=5900, )
svm_model.fit(X_train_tf, y_train)

train_svm_preds = svm_model.predict(X_train_tf)
test_svm_preds = svm_model.predict(X_test_tf)

print("Train Accuracy",svm_model.score(X_train_tf,y_train))
print(confusion_matrix(y_train,train_svm_preds))
print(classification_report(y_train,train_svm_preds))

print("Test Accuracy",svm_model.score(X_test_tf,y_test))
print(confusion_matrix(y_test,test_svm_preds))
print(classification_report(y_test,test_svm_preds))

In [28]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

BagClassifier = BaggingClassifier(SVC(C=50,gamma=1,kernel='rbf',cache_size=1900),n_estimators=12,max_samples=0.20,n_jobs=-1)
BagClassifier.fit(X_train_tf,y_train)
print("Train Accuracy",BagClassifier.score(X_train_tf,y_train))
print("Test Accuracy",BagClassifier.score(X_test_tf,y_test))

Train Accuracy 0.8110420818192474
Test Accuracy 0.7238631036099391
