In [1]:
# import necessary packages
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # module for missing value visualization
from scipy import stats # implement box-cox transformation
from math import ceil
from sklearn.utils import shuffle # shuffling the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB # for sentiment analysis benchmark model
from sklearn.model_selection import cross_val_score # cross validation score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from keras.utils import np_utils # encode categorical variable
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping  


# Pretty display for notebooks
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


<a id="step1"></a>
#### step1: import dataset `part6_dataset.pickle` as `part7_dataset`

In [2]:
part7_dataset = pd.read_pickle("part6_dataset.pickle")

<a id="step2"></a>
#### step2: shuffle and sampling 50% of the dataset
1. Use the first 50% of dataset as training and validation dataset for sentiment analysis.

In [3]:
# separate target variable out - review_sentiment
target_variable = part7_dataset.review_sentiment
target_variable = target_variable.astype("category")

# just sample 50% of the whole dataset - use train_test_split() to achieve same result
X_first50, X_remaining50, y_first50, y_remaining50 = train_test_split(part7_dataset, target_variable,
                                                                      test_size = 0.5, stratify = target_variable)

<a id="step3"></a>
#### step3: create necessary class and self-defined-fun for Sentiment Analysis

In [4]:
# create a class for lemmatizer
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

<a id="step4"></a>
#### step4: create a bag of words solely for Sentiment Analysis
1. setting the ngram can be up to bi-grams.
2. later I will use MultinomialNB model, so it's better to use `CountVectorizer` instead of `TfidfVectorizer`.

In [5]:
# build up a bag of words for Sentiment Analysis
n_features = 5000

sentiment_count_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                                             max_df=0.5, min_df=2, # word fequency less than 50% and shows at least in 2 doc
                                             max_features=n_features,
                                             stop_words="english",
                                             ngram_range=(1,2))

# fit and transform data
sentiment_count = sentiment_count_vectorizer.fit_transform(X_first50["combined_review"])
sentiment_count = sentiment_count.toarray() # transform from sparse to dense matrix

<a id="step5"></a>
#### step5: build up the benchmark model (naive bayes) for sentiment analysis

In [6]:
# create the benchmark model
naive_bayes = MultinomialNB()
naive_bayes.fit(sentiment_count, y_first50)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

<a id="step6"></a>
#### step6: evaluate the naive bayes model performance based on cross-validation 

In [7]:
# evaluate the model performance based on cross-validation
mean_accuracy = cross_val_score(estimator=naive_bayes, X=sentiment_count, y=y_first50, cv=5, scoring="accuracy").mean()
print("mean accuracy of corss-validation: {}".format(round(mean_accuracy,2)))

# calculate the f1 score
mean_f1 = cross_val_score(estimator=naive_bayes, X=sentiment_count, y=y_first50, cv=5, scoring="f1").mean()
print("mean f1 of corss-validation: {}".format(round(mean_f1,2)))

# calculate the recall
mean_recall = cross_val_score(estimator=naive_bayes, X=sentiment_count, y=y_first50, cv=5, scoring="recall").mean()
print("mean recall of corss-validation: {}".format(round(mean_recall,2)))

mean accuracy of corss-validation: 0.86
mean f1 of corss-validation: 0.92
mean recall of corss-validation: 0.88


<a id="step7"></a>
#### step7: general output of the model prediction and confusion matrix on the training dataset (nb model)

In [8]:
# create the prediction from naive_bayes model
prediction = naive_bayes.predict(sentiment_count)

# print the confusion matrix
print("confusion matrix from naive bayes model")
print(confusion_matrix(y_first50, prediction))
print()

# print the overview of performance metrics
print("the overview of performance metrics")
print(classification_report(y_first50, prediction))

confusion matrix from naive bayes model
[[ 18331   7837]
 [ 26677 203127]]
()
the overview of performance metrics
             precision    recall  f1-score   support

          0       0.41      0.70      0.52     26168
          1       0.96      0.88      0.92    229804

avg / total       0.91      0.87      0.88    255972



<a id="step8"></a>
#### step8: evaluate navie bayes model on the remaining 50% dataset.

In [9]:
# transform the remaining 50% dataset
test_sentiment_count = sentiment_count_vectorizer.transform(X_remaining50["combined_review"])
test_sentiment_count = test_sentiment_count.toarray()

In [14]:
# evaluate the performance on the remaining dataset
test_prediction = naive_bayes.predict(test_sentiment_count)

# print the confusion matrix
print("confusion matrix from naive bayes model (in remaining test dataset)")
print(confusion_matrix(y_remaining50, test_prediction))
print()

# print the overview of performance metrics
print("the overview of performance metrics (in remaining test dataset)")
print(classification_report(y_remaining50, test_prediction))

confusion matrix from naive bayes model (in remaining test dataset)
[[ 18233   7936]
 [ 26784 203019]]
()
the overview of performance metrics (in remaining test dataset)
             precision    recall  f1-score   support

          0       0.41      0.70      0.51     26169
          1       0.96      0.88      0.92    229803

avg / total       0.91      0.86      0.88    255972



<a id="step9"></a>
#### step9: the comparison model - multi-layer perceptron model

In [15]:
# encode the target variable
target_variable = np_utils.to_categorical(y_first50, num_classes=2) # training
test_target_varialbe = np_utils.to_categorical(y_remaining50, num_classes=2) # test

In [16]:
# Building the model architecture
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.summary()

# Compiling the model using categorical_crossentropy loss, and rmsprop optimizer.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               2560512   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_3 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 66        
Total params: 2,710,434
Trainable params: 2,710,434
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Running and evaluating the model

checkpointer = ModelCheckpoint(filepath='sentiment.model.best.hdf5', 
                               verbose=1, save_best_only=True)

earlystop = EarlyStopping(patience=2)

hist = model.fit(sentiment_count, target_variable,
          batch_size=50,
          epochs=20,
          validation_split=0.25,
          callbacks=[checkpointer, earlystop],
          verbose=2,
          shuffle=True)

Train on 191979 samples, validate on 63993 samples
Epoch 1/20
Epoch 00000: val_loss improved from inf to 0.24348, saving model to sentiment.model.best.hdf5
36s - loss: 0.2307 - acc: 0.9145 - val_loss: 0.2435 - val_acc: 0.9145
Epoch 2/20
Epoch 00001: val_loss improved from 0.24348 to 0.22029, saving model to sentiment.model.best.hdf5
34s - loss: 0.2192 - acc: 0.9198 - val_loss: 0.2203 - val_acc: 0.9170
Epoch 3/20
Epoch 00002: val_loss did not improve
33s - loss: 0.2155 - acc: 0.9213 - val_loss: 0.2255 - val_acc: 0.9165
Epoch 4/20
Epoch 00003: val_loss did not improve
34s - loss: 0.2133 - acc: 0.9225 - val_loss: 0.2414 - val_acc: 0.9122
Epoch 5/20
Epoch 00004: val_loss did not improve
34s - loss: 0.2108 - acc: 0.9243 - val_loss: 0.2279 - val_acc: 0.9160


<a id="step10"></a>
#### step10: general output of the model prediction and confusion matrix on the training dataset (mlp model)

In [18]:
# create the prediction from multi-layer perceptron model
prediction_mlp = model.predict(sentiment_count)
prediction_mlp = prediction_mlp.argmax(axis=1)

# print the confusion matrix
print("confusion matrix from multi-layer perceptron model")
print(confusion_matrix(y_first50, prediction_mlp))
print()

# print the overview of performance metrics
print("the overview of performance metrics")
print(classification_report(y_first50, prediction_mlp))

confusion matrix from multi-layer perceptron model
[[  8745  17423]
 [  2067 227737]]
()
the overview of performance metrics
             precision    recall  f1-score   support

          0       0.81      0.33      0.47     26168
          1       0.93      0.99      0.96    229804

avg / total       0.92      0.92      0.91    255972



<a id="step11"></a>
#### step11: evaluate multi-layer perceptron model on the remaining 50% dataset.

In [19]:
# evaluate the performance on the remaining dataset
test_prediction_mlp = model.predict(test_sentiment_count)
test_prediction_mlp = test_prediction_mlp.argmax(axis=1)

# print the confusion matrix
print("confusion matrix from multi-layer perceptron model (in remaining test dataset)")
print(confusion_matrix(y_remaining50, test_prediction_mlp))
print()

# print the overview of performance metrics
print("the overview of performance metrics (in remaining test dataset)")
print(classification_report(y_remaining50, test_prediction_mlp))

confusion matrix from multi-layer perceptron model (in remaining test dataset)
[[  7790  18379]
 [  2683 227120]]
()
the overview of performance metrics (in remaining test dataset)
             precision    recall  f1-score   support

          0       0.74      0.30      0.43     26169
          1       0.93      0.99      0.96    229803

avg / total       0.91      0.92      0.90    255972



#### step12: export the remaining 50% dataset for the next modeling process
1. Overall speaking, multi-layer perceptron model has higher f1 score, but when look at the performance on each class, especially in this case of an unbalanced dataset, naive bayes model has a better performance of recall and f1 on the small class (that is the 0 group). Hence, I will use naive bayes as the final model.  

In [21]:
# add in the predicted review sentiment from naive bayes model 
X_remaining50["bayes_predict_review_sentiment"] =  test_prediction
X_remaining50.to_pickle("X_remaining50.pickle")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
