# Machine learning for sentiment classification on movie reviews


In [1]:
# load common librairies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load and prepare data

In [2]:
# preprocessing: prepare data
col_names = ['content', 'label']
pos = pd.DataFrame(columns = col_names)
neg = pd.DataFrame(columns = col_names)

import os
# add positive samples to the DataFrame structure
i=1
for fend in os.listdir('./dataset1/pos/'):
    #data = pd.read_csv('./dataset1/pos/'+fend, sep = None, header = None)
    file = open('./dataset1/pos/'+fend, 'r')
    data = file.read()
    #print(data)
    file.close()
    pos = pos.append(pd.DataFrame({'content':[data], 'label':int(1)}, index=[i]))
    i+=1
# add negative samples to the DataFrame structure
i=1
for fend in os.listdir('./dataset1/neg/'):
    #data = pd.read_csv('./dataset1/neg/'+fend, sep = None, header = None)
    file = open('./dataset1/neg/'+fend, 'r')
    data = file.read()
    file.close()
    neg = neg.append(pd.DataFrame({'content':[data],'label':int(-1)}, index=[i]))
    i+=1

print('done')
print('number of positive samples: {} '.format(len(pos)))
print('number of negative samples: {} '.format(len(neg)))

done
number of positive samples: 1000 
number of negative samples: 1000 


In [None]:
pos.head(10)

In [3]:
# concat positive and negative samples
reviews = pos.append(neg)
print(reviews.head(10))
print(reviews.tail(10))

                                              content label
1   it's wednesday , march 27 , and a murder inves...     1
2   it was with great trepidation that i approache...     1
3   in 1977 , something never though possible happ...     1
4   sometimes you just have to tip your hat to a f...     1
5   the trailers and the beginning of the move sum...     1
6   earlier this year , the movie holy man opened ...     1
7   luckily , some people got starship troopers . ...     1
8   it's been a good long while since we had a goo...     1
9   the start of this movie reminded me of parts f...     1
10  star wars : episode i - the phantom menace rev...     1
                                                content label
991   the " disney stick-to-what-you-do-best " rule ...    -1
992   the only two really good things that i can say...    -1
993   1 . he doesn't have a hard-to-decipher accent ...    -1
994   filmmakers will use all manner of tricks to fl...    -1
995   ugh . that about sums th

In [4]:
# convert label to a numerical variable
#reviews["label_num"] = reviews.label.map({"1":int(1), "-1":int(0)})
reviews["label_num"] = reviews['label'].astype(int)
reviews.label_num

1       1
2       1
3       1
4       1
5       1
6       1
7       1
8       1
9       1
10      1
11      1
12      1
13      1
14      1
15      1
16      1
17      1
18      1
19      1
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
30      1
       ..
971    -1
972    -1
973    -1
974    -1
975    -1
976    -1
977    -1
978    -1
979    -1
980    -1
981    -1
982    -1
983    -1
984    -1
985    -1
986    -1
987    -1
988    -1
989    -1
990    -1
991    -1
992    -1
993    -1
994    -1
995    -1
996    -1
997    -1
998    -1
999    -1
1000   -1
Name: label_num, Length: 2000, dtype: int64

In [5]:
# define X (items) and y (labels)
X = reviews.content
y = reviews.label_num

In [6]:
# split randomly X and y into train and test sets (NB: always uses the same seed)
from sklearn.model_selection import train_test_split
#from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.25)
print('number of train samples: {} '.format(len(X_train)))
print('number of test samples: {} '.format(len(X_test)))
print([X_train.head(10), y_train.head(10)])
print([X_test.head(10), y_test.head(10)])

number of train samples: 1500 
number of test samples: 500 
[651    after bloody clashes and independence won , lu...
105    alexander dumas' the three musketeers is one o...
562    no , i did not read the novel by thomas hardy ...
644    billy bob thornton , who had a sudden rise to ...
442    george little ( jonathan lipnicki ) wants a li...
629    fantastically over hyped , godzila finally lum...
997    the kids in the hall are an acquired taste . \...
681    titantic , writer and director james cameron's...
813    synopsis : captain picard and the crew of the ...
505    i think maybe it's time for the batman series ...
Name: content, dtype: object, 651    1
105   -1
562    1
644    1
442    1
629   -1
997   -1
681    1
813    1
505   -1
Name: label_num, dtype: int64]
[675    modern audiences are more likely to be familia...
700    this movie about two dysfunctional families ne...
283    in 1989 , director edward zwick began his care...
316    when critics attack seemingly well-inte

## Representing text as numerical data

In [7]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [8]:
# learn the "vocabulary" of the training data (occurs in-place)
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [9]:
# examine the fitted vocabulary
vocabulary = vect.get_feature_names()
print('number of words in the vocabulary: {} '.format(len(vocabulary)))
vocabulary

number of words in the vocabulary: 35604 


['00',
 '000',
 '007',
 '00s',
 '03',
 '04',
 '05',
 '05425',
 '10',
 '100',
 '1000',
 '10000',
 '100m',
 '101',
 '102',
 '103',
 '105',
 '106',
 '107',
 '108',
 '10b',
 '10s',
 '10th',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '117',
 '118',
 '11th',
 '12',
 '121',
 '123',
 '125',
 '126',
 '127',
 '1272',
 '128',
 '129',
 '1298',
 '12th',
 '13',
 '130',
 '1305',
 '132',
 '133',
 '135',
 '137',
 '138',
 '139',
 '13th',
 '14',
 '140',
 '1400',
 '143',
 '144',
 '14th',
 '15',
 '150',
 '1500s',
 '150th',
 '152',
 '1521',
 '153',
 '155',
 '1554',
 '157',
 '1583',
 '1590',
 '15th',
 '16',
 '160',
 '1600',
 '1600s',
 '161',
 '165',
 '167',
 '1692',
 '16mm',
 '16th',
 '16x9',
 '17',
 '170',
 '1700s',
 '1709',
 '172',
 '175',
 '1773',
 '1791',
 '1792',
 '1793',
 '1794',
 '1799',
 '17th',
 '18',
 '180',
 '1800s',
 '1812',
 '1830s',
 '1839',
 '1847',
 '1862',
 '1869',
 '1871',
 '1885',
 '1888',
 '189',
 '1896',
 '1898',
 '18th',
 '19',
 '1900',
 '1900s',
 '1903',
 '1908',
 '1912',


In [10]:
# transform training data into a "document-term matrix'
X_train_dtm = vect.transform(X_train)
X_train_dtm

<1500x35604 sparse matrix of type '<class 'numpy.int64'>'
	with 502716 stored elements in Compressed Sparse Row format>

In [11]:
# examine the content of the sparse matrix
print(X_train_dtm)

  (0, 346)	1
  (0, 596)	1
  (0, 626)	1
  (0, 839)	1
  (0, 1090)	1
  (0, 1401)	1
  (0, 1445)	1
  (0, 1482)	1
  (0, 1545)	1
  (0, 1586)	10
  (0, 1953)	1
  (0, 2022)	1
  (0, 2115)	1
  (0, 2244)	1
  (0, 2279)	1
  (0, 2792)	1
  (0, 2981)	1
  (0, 3182)	1
  (0, 3183)	1
  (0, 3690)	1
  (0, 3866)	1
  (0, 4468)	1
  (0, 4646)	1
  (0, 4681)	3
  (0, 5115)	1
  :	:
  (1499, 31662)	14
  (1499, 31730)	1
  (1499, 31768)	7
  (1499, 31963)	1
  (1499, 32055)	4
  (1499, 32144)	2
  (1499, 32245)	1
  (1499, 32683)	1
  (1499, 33323)	1
  (1499, 33888)	2
  (1499, 33971)	1
  (1499, 34060)	1
  (1499, 34091)	1
  (1499, 34419)	1
  (1499, 34431)	1
  (1499, 34463)	1
  (1499, 34473)	1
  (1499, 34503)	1
  (1499, 34733)	1
  (1499, 34815)	2
  (1499, 34913)	1
  (1499, 35029)	2
  (1499, 35202)	1
  (1499, 35446)	2
  (1499, 35501)	1


In [12]:
# examine the vocabulary and document-term matrix together (X_train_dtm.toarray() converts sparse matrix to a dense matrix)
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,007,00s,03,04,05,05425,10,100,...,zucker,zuehlke,zuko,zukovsky,zulu,zurg,zweibel,zwick,zwigoff,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<500x35604 sparse matrix of type '<class 'numpy.int64'>'
	with 159659 stored elements in Compressed Sparse Row format>

## Class prediction with Multinomial Naive Bayes

In [14]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [15]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

CPU times: user 11.5 ms, sys: 3 ms, total: 14.5 ms
Wall time: 13.6 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# print reviews for the false positives
X_test[(y_pred_class==1) & (y_test==-1)]

In [None]:
# print reviews for the false negatives
X_test[(y_pred_class==-1) & (y_test==1)]

In [None]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

## Class prediction with logistic regression

In [None]:
# import and instantiate a logistic regression model 
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

## Class prediction with SVM

In [None]:
# import, instantiate and train a SVM model without probability estimation
from sklearn.svm import SVC
clf = SVC(kernel='linear')
%time clf.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = clf.predict(X_test_dtm)

### Performance evaluation

In [None]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# import, instantiate and train a SVM model with probability estimation
clf = SVC(kernel='linear', probability=True)
%time clf.fit(X_train_dtm, y_train)

In [None]:
# calculate predicted probabilities for X_test_dtm
y_pred_prob = clf.predict_proba(X_test_dtm)[:, 1]

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

### Find the best SVM parameters with grid search

In [None]:
# 1st step: large logarithmic grid search
from sklearn.model_selection import GridSearchCV
C_range = np.logspace(0, 10, 6)
gamma_range = np.logspace(-9, -1, 5)
param_grid = dict(gamma=gamma_range, C=C_range)
param_grid

In [None]:
# operate grid search with default RBF kernel
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=3, return_train_score=True)
grid.fit(X_train_dtm, y_train)

In [None]:
# print results
print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),len(gamma_range))
scores

In [None]:
# draw heatmap
plt.figure(figsize=(8, 6))
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.title('Validation accuracy')
plt.show()

In [None]:
# 2nd step: precise logarithmic grid search on selected range
C_range = np.logspace(4, 8, 5)
gamma_range = np.logspace(-8, -4, 5)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid)
param_grid

In [None]:
# operate grid search with default RBF kernel
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=3, return_train_score=True)
grid.fit(X_train_dtm, y_train)

In [None]:
# print results
print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))
scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),len(gamma_range))
scores

In [None]:
# draw heatmap
plt.figure(figsize=(8, 6))
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.title('Validation accuracy')
plt.show()

## Class prediction using bidirectional LSTM

In [None]:
print(y_train.values.reshape(1, y_train.shape[0], 1))

In [None]:
print(X_train_dtm.toarray().reshape(1, X_train_dtm.shape[0], X_train_dtm.shape[1]))

In [31]:
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional

#on fixe la seed utilisée par LSTM()
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


y_tr = [y_train.values.reshape(1, y_train.shape[0], 1).tolist()]
y_ts = [y_test.values.reshape(1, y_test.shape[0], 1).tolist()]

X_tr = X_train_dtm.toarray()
X_ts = X_test_dtm.toarray()

X_tr = np.reshape(X_tr, X_tr.shape + (1,))
X_ts = np.reshape(X_ts, X_ts.shape + (1,))

print(X_tr)
print(y_tr)

#https://stackoverflow.com/questions/44273249/in-keras-what-exactly-am-i-configuring-when-i-create-a-stateful-lstm-layer-wi

#dans un terminal, nous pouvons ensuite lancer tensorboard de cette façon:
#    tensorboard --logdir path_to_current_dir/Graph 

#on crée le tensorflow callback pour la visualisation dans le terminal avec tensorboard
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=X_tr.shape[1:], merge_mode='concat'))
model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Entrainement
for epoch in range(250):
    model.fit(X_tr, y_tr, epochs=1, batch_size=1, verbose=2, callbacks=[tbCallBack])
    
#Accuracy
score = model.evaluate(X_ts, y_ts, batch_size=1)
print(score)

[[[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]

 [[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]

 [[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]

 ...

 [[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]

 [[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]

 [[0]
  [0]
  [0]
  ...
  [0]
  [0]
  [0]]]
[[[1], [-1], [1], [1], [1], [-1], [-1], [1], [1], [-1], [-1], [1], [1], [-1], [-1], [1], [-1], [-1], [1], [1], [1], [1], [1], [-1], [-1], [1], [-1], [1], [1], [1], [-1], [-1], [1], [1], [1], [1], [1], [1], [-1], [-1], [-1], [1], [-1], [1], [-1], [1], [-1], [1], [-1], [-1], [-1], [-1], [1], [-1], [-1], [-1], [-1], [-1], [-1], [-1], [1], [-1], [-1], [-1], [-1], [-1], [-1], [1], [-1], [-1], [1], [-1], [-1], [-1], [1], [-1], [1], [-1], [-1], [-1], [-1], [1], [1], [1], [1], [-1], [-1], [1], [-1], [-1], [-1], [1], [1], [1], [-1], [-1], [-1], [1], [1], [-1], [-1], [1], [-1], [-1], [-1], [-1], [1], [-1], [-1], [1], [-1], [-1], [1], [1], [1], [-1], [-1], [-1], [-1], [1], [-1], [1], [1], [1], [1], [1], [1], [-1], [1], [1], [-1], [

ValueError: Error when checking target: expected time_distributed_9 to have 3 dimensions, but got array with shape (1500, 1)