# Machine learning for sentiment classification on movie reviews


In [None]:
# load common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load and prepare data

In [None]:
# preprocessing: prepare data
import os

col_names = ['content', 'label']


def _load_reviews(directory, label):
    """Read every file in *directory* into a DataFrame of labelled reviews.

    Parameters
    ----------
    directory : str
        Folder whose files each hold the raw text of one review.
    label : int
        Class label stored in the 'label' column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'content' and 'label', indexed from 1 in file-name order.
    """
    contents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded train/test split) reproducible
    for fname in sorted(os.listdir(directory)):
        # the context manager closes the file even if read() raises
        with open(os.path.join(directory, fname), 'r') as handle:
            contents.append(handle.read())
    # build the frame in one go: DataFrame.append() was removed in pandas 2.0
    # and copied the whole frame on every call
    return pd.DataFrame(
        {'content': contents, 'label': [int(label)] * len(contents)},
        columns=col_names,
        index=range(1, len(contents) + 1),
    )


# positive samples are labelled 1, negative samples -1
pos = _load_reviews('./dataset1/pos/', 1)
neg = _load_reviews('./dataset1/neg/', -1)

print('done')
print('number of positive samples: {} '.format(len(pos)))
print('number of negative samples: {} '.format(len(neg)))

In [None]:
# preview the first 10 rows of the positive samples
pos.head(10)

In [None]:
# concat positive and negative samples
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
# NOTE: the original index labels are kept, so labels from pos and neg overlap
reviews = pd.concat([pos, neg])
print(reviews.head(10))
print(reviews.tail(10))

In [None]:
# store the labels as integers in a dedicated column, then display that column
reviews["label_num"] = reviews["label"].astype(int)
reviews["label_num"]

In [None]:
# X holds the review texts (items), y the integer labels
X, y = reviews["content"], reviews["label_num"]

In [None]:
# randomly hold out 25% of the samples for testing (seeded so the split is repeatable)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
print('number of train samples: {} '.format(len(X_train)))
print('number of test samples: {} '.format(len(X_test)))
# peek at the first ten items and labels of each split
print([X_train.head(10), y_train.head(10)])
print([X_test.head(10), y_test.head(10)])

## Representing text as numerical data

In [None]:
# import and instantiate CountVectorizer (with the default parameters)
# the same vect instance is fitted on X_train and then used to transform both splits
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [None]:
# learn the "vocabulary" of the training data (occurs in-place)
# only X_train is passed, so the test reviews do not contribute to the vocabulary
vect.fit(X_train)

In [None]:
# examine the fitted vocabulary
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on releases that
# predate it. tolist() keeps `vocabulary` a plain list of str as before.
if hasattr(vect, 'get_feature_names_out'):
    vocabulary = vect.get_feature_names_out().tolist()
else:
    vocabulary = vect.get_feature_names()
print('number of words in the vocabulary: {} '.format(len(vocabulary)))
vocabulary

In [None]:
# transform training data into a "document-term matrix"
# (the bare name on the last line makes the notebook display the result)
X_train_dtm = vect.transform(X_train)
X_train_dtm

In [None]:
# examine the content of the sparse matrix by printing it
print(X_train_dtm)

In [None]:
# examine the vocabulary and document-term matrix together (X_train_dtm.toarray() converts sparse matrix to a dense matrix)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old name on releases that predate it
pd.DataFrame(
    X_train_dtm.toarray(),
    columns=(vect.get_feature_names_out()
             if hasattr(vect, 'get_feature_names_out')
             else vect.get_feature_names()),
)

In [None]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# vect is not re-fitted here: only transform() is called on X_test
X_test_dtm = vect.transform(X_test)
X_test_dtm

## Class prediction with Multinomial Naive Bayes

In [None]:
# import and instantiate a Multinomial Naive Bayes model (default parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
# NOTE: %time is IPython-only syntax; this line is not valid in a plain Python script
%time nb.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm with the Naive Bayes model
# NOTE: y_pred_class is reassigned by the later logistic regression and SVM cells
y_pred_class = nb.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy of class predictions
# (y_pred_class holds the Naive Bayes predictions when the cells are run in order)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# display the confusion matrix
# (arguments are passed as true labels first, predictions second)
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# false positives: test reviews whose true label is -1 but that were predicted as 1
X_test[(y_test == -1) & (y_pred_class == 1)]

In [None]:
# false negatives: test reviews whose true label is 1 but that were predicted as -1
X_test[(y_test == 1) & (y_pred_class == -1)]

In [None]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
# keeps only column 1 of predict_proba; presumably the probability of label 1 — confirm with nb.classes_
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
# (scored from the predicted probabilities, not from the hard class predictions)
metrics.roc_auc_score(y_test, y_pred_prob)

## Class prediction with logistic regression

In [None]:
# import and instantiate a logistic regression model (default parameters)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# train the model using X_train_dtm
# NOTE: %time is IPython-only syntax; this line is not valid in a plain Python script
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm with the logistic regression model
# NOTE: this overwrites the y_pred_class produced by the Naive Bayes model
y_pred_class = logreg.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy
# (y_pred_class holds the logistic regression predictions when the cells are run in order)
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# display the confusion matrix
# (arguments are passed as true labels first, predictions second)
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
# keeps only column 1 of predict_proba; presumably the probability of label 1 — confirm with logreg.classes_
# NOTE: this overwrites the y_pred_prob computed for the Naive Bayes model
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
# (scored from the predicted probabilities, not from the hard class predictions)
metrics.roc_auc_score(y_test, y_pred_prob)

## Class prediction with SVM

In [None]:
# import, instantiate and train a SVM model without probability estimation
# (linear kernel, other parameters left at their defaults)
# NOTE: %time is IPython-only syntax; that line is not valid in a plain Python script
from sklearn.svm import SVC
clf = SVC(kernel='linear')
%time clf.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm with the SVM model
# NOTE: this overwrites the y_pred_class produced by the logistic regression model
y_pred_class = clf.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy
# (y_pred_class holds the SVM predictions when the cells are run in order)
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# display the confusion matrix
# (arguments are passed as true labels first, predictions second)
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# instantiate and train a SVM model with probability estimation
# (SVC is imported in an earlier cell; clf replaces the model trained without probabilities)
# NOTE: %time is IPython-only syntax; that line is not valid in a plain Python script
clf = SVC(kernel='linear', probability=True)
%time clf.fit(X_train_dtm, y_train)

In [None]:
# calculate predicted probabilities for X_test_dtm
# keeps only column 1 of predict_proba; presumably the probability of label 1 — confirm with clf.classes_
# NOTE: this overwrites the y_pred_prob computed for the logistic regression model
y_pred_prob = clf.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
# (scored from the predicted probabilities, not from the hard class predictions)
metrics.roc_auc_score(y_test, y_pred_prob)

### Find the best SVM parameters with grid search

In [None]:
# 1st step: coarse grid search over powers of ten
from sklearn.model_selection import GridSearchCV

# 12 values each: C from 1e-2 to 1e9, gamma from 1e-8 to 1e3
C_range = np.logspace(-2, 9, 12)
gamma_range = np.logspace(-8, 3, 12)
param_grid = {'gamma': gamma_range, 'C': C_range}
param_grid

In [None]:
# run the grid search (3-fold cross-validation on the training set)
# gamma only affects the 'rbf', 'poly' and 'sigmoid' kernels: with
# kernel='linear' the 12 gamma values were ignored, so every C was refitted
# 12 times for identical scores. Search with the RBF kernel, which is the
# kernel a C/gamma grid actually tunes.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=3, return_train_score=True)
grid.fit(X_train_dtm, y_train)

In [None]:
# report the best parameter combination and its score
print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
# arrange the mean test scores as a len(C_range) x len(gamma_range) table
# (assumes rows follow C and columns follow gamma — TODO confirm against GridSearchCV's parameter ordering)
grid.cv_results_['mean_test_score'].reshape(len(C_range),len(gamma_range))

In [None]:
# 2nd step: precise grid search
