In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import sys
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv('CleanIMDB_train.csv') # Windows single-byte character encoding
df.head()

Unnamed: 0,review,sentiment
0,steve carell stars person relate to sort of da...,1
1,found enjoyable muppets movie felt light heart...,1
2,altogether bad start program slap face real la...,0
3,saw film tonight nyc landmark sunshine did not...,1
4,much like japanese movies one did not cut it m...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
review       5000 non-null object
sentiment    5000 non-null int64
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [4]:
print('Total dataset length: %d' %(len(df)))

Total dataset length: 5000


In [5]:
# Split the data to train and test
X = df.iloc[:, 0].values
y = df.iloc[:, 1].values

In [6]:
cvec = CountVectorizer()
# vectorize the data set
X_matrix = cvec.fit_transform(X)
print('Train data count vector total number of features: %d' %(len(cvec.get_feature_names())))
X_matrix.shape
X_matrix_ = X_matrix.toarray()

Train data count vector total number of features: 37996


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_matrix_,y,test_size=0.20, random_state=42)
y_test.shape

(1000,)

In [8]:
print('Total training entries length: %d' %(len(X_train)))
print('Negative training entries account for percent: %f' %(Counter(y_train)[0]/len(y_train)*100))
print('Positive training entries account for percent: %f' %(Counter(y_train)[0]/len(y_train)*100))
print('Total test entries length: %d' %(len(y_test)))
print('Negative test entries account for percent: %f' %(Counter(y_test)[0]/len(y_test)*100))
print('Positive test entries account for percent: %f' %(Counter(y_test)[1]/len(y_test)*100))

Total training entries length: 4000
Negative training entries account for percent: 51.125000
Positive training entries account for percent: 51.125000
Total test entries length: 1000
Negative test entries account for percent: 51.900000
Positive test entries account for percent: 48.100000


In [9]:
# Split the train to train and validation - Holdout Method
X_train_, X_val,y_train_,y_val = train_test_split(X_train,y_train,test_size=0.20, random_state=42)

In [10]:
print("Total training entries after validation split: %d" %(len(X_train_)))
print('Negative train entries after val split account for percent: %f' %(Counter(y_train_)[0]/len(y_train_)*100))
print('Positive train entries after val split account for percent: %f' %(Counter(y_train_)[1]/len(y_train_)*100))
print("Total validation entries after train split: %d" %(len(X_val)))
print('Negative validation entries after val split account for percent: %f' %(Counter(y_val)[0]/len(y_val)*100))
print('Positive validation entries after val split account for percent: %f' %(Counter(y_val)[1]/len(y_val)*100))

Total training entries after validation split: 3200
Negative train entries after val split account for percent: 51.156250
Positive train entries after val split account for percent: 48.843750
Total validation entries after train split: 800
Negative validation entries after val split account for percent: 51.000000
Positive validation entries after val split account for percent: 49.000000


In [26]:
# Baseline
log_model = LogisticRegression(C=1.0, random_state=None, solver='liblinear', penalty='l2')
# Fit the train dataset with LR
log_model = log_model.fit(X=X_train_, y=y_train_)

In [27]:
y_pred = log_model.predict(X_val)
print('Accuracy validation dataset logistics regression: %f' % accuracy_score(y_val, y_pred))

Accuracy validation dataset logistics regression: 0.843750


In [13]:
confusion_matrix(y_val, y_pred)

array([[344,  64],
       [ 61, 331]], dtype=int64)

In [14]:
print(classification_report(y_val, y_pred))

             precision    recall  f1-score   support

          0       0.85      0.84      0.85       408
          1       0.84      0.84      0.84       392

avg / total       0.84      0.84      0.84       800



In [18]:
svm_model = SVC(C=100.0, kernel='rbf', random_state=None, gamma='auto')
svm_model = svm_model.fit(X=X_train_, y=y_train_)

In [19]:
y_pred = svm_model.predict(X_val)
print('Accuracy validation dataset SVM: %f' % accuracy_score(y_val, y_pred))

Accuracy validation dataset SVM: 0.841250


In [35]:
confusion_matrix(y_val, y_pred)

array([[408,   0],
       [ 23, 369]], dtype=int64)

In [36]:
print(classification_report(y_val, y_pred))

             precision    recall  f1-score   support

          0       0.95      1.00      0.97       408
          1       1.00      0.94      0.97       392

avg / total       0.97      0.97      0.97       800



In [30]:
# Naive Bayes
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_val)

In [31]:
print('Accuracy validation dataset Naive Bayes: %f' % accuracy_score(y_val, y_pred))

Accuracy validation dataset Naive Bayes: 0.971250


In [32]:
confusion_matrix(y_val, y_pred)

array([[408,   0],
       [ 23, 369]], dtype=int64)

In [34]:
print(classification_report(y_val, y_pred))

             precision    recall  f1-score   support

          0       0.95      1.00      0.97       408
          1       1.00      0.94      0.97       392

avg / total       0.97      0.97      0.97       800

