In [None]:
# Steps required to carry out the sentiment analysis:
# First use train_test_split to extract 50% of the data (remember to stratify on the binary text-sentiment label).
# Use CountVectorizer rather than TfidfVectorizer for the bag of words, because MultinomialNB is meant for count data.
# This time all of that data is training data (validation happens inside cross-validation); no grid search is run, so the validation score is the model's performance.
# The first model is MultinomialNB; it has a tunable alpha parameter, but we do not tune it here.
# The second model is a multi-layer perceptron.
# Compare the two on F1 score and accuracy; sklearn has dedicated functions for both metrics - see the first project assignment (Boston house pricing).
# Use the MLP model to predict the text sentiment of the remaining 50% of the data and add that variable to the dataset.

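* [step0](#step0): import necessary packages
* [step1](#step1): import dataset `part6_dataset.pickle` as `part7_dataset`
* [step2](#step2): shuffle and sample 50% of the dataset
* [step3](#step3): create necessary `class` and `self-defined-fun` for Sentiment Analysis
* [step4](#step4): create a bag of words solely for Sentiment Analysis
* [step5](#step5): build up the benchmark model (naive bayes) for sentiment analysis
* [step6](#step6): evaluate the naive bayes model performance based on cross-validation
* [step7](#step7): generate the model prediction and confusion matrix on the training dataset (nb model)
* [step8](#step8): evaluate the naive bayes model on the remaining 50% dataset
* [step9](#step9): the comparison model - multi-layer perceptron model
* [step10](#step10): generate the model prediction and confusion matrix on the training dataset (mlp model)
* [step11](#step11): evaluate the multi-layer perceptron model on the remaining 50% dataset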

In [53]:
# import necessary packages
import numpy as np
import pandas as pd
from IPython.display import display # Allows the use of display() for DataFrames
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # module for missing value visualization
from scipy import stats # implement box-cox transformation
from math import ceil
from sklearn.utils import shuffle # shuffling the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.naive_bayes import MultinomialNB # for sentiment analysis benchmark model
from sklearn.model_selection import cross_val_score # cross validation score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from keras.utils import np_utils # encode categorical variable
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import ModelCheckpoint  


# Pretty display for notebooks
%matplotlib inline

<a id="step1"></a>
#### step1: import dataset `part6_dataset.pickle` as `part7_dataset`

In [2]:
# Load the pickled dataset (presumably the output of the part-6 notebook - confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
part7_dataset = pd.read_pickle("part6_dataset.pickle")

<a id="step2"></a>
#### step2: shuffle and sample 50% of the dataset
1. Use the first 50% of the dataset as the training and validation data for sentiment analysis.

In [3]:
# Separate the target variable (review_sentiment) and store it as a categorical
# so the class labels are explicit.
target_variable = part7_dataset.review_sentiment
target_variable = target_variable.astype("category")

# Split the dataset 50/50 with train_test_split(), stratifying on the sentiment label so
# both halves keep the same class balance. The fixed random_state makes the split - and
# every score computed from it further down - reproducible across kernel restarts.
# NOTE: the X frames still contain the review_sentiment column; only "combined_review"
# is fed to the vectorizer later, so the label does not leak into the features.
X_first50, X_remaining50, y_first50, y_remaining50 = train_test_split(
    part7_dataset, target_variable,
    test_size=0.5, stratify=target_variable, random_state=42)

<a id="step3"></a>
#### step3: create necessary class and self-defined-fun for Sentiment Analysis

In [4]:
# tokenizer class that lemmatizes every token of a document
class LemmaTokenizer(object):
    """Callable that splits a document into tokens and lemmatizes each of them."""

    def __init__(self):
        # a single WordNet lemmatizer instance, reused for every document
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list, in token order."""
        return list(map(self.wnl.lemmatize, word_tokenize(doc)))

<a id="step4"></a>
#### step4: create a bag of words solely for Sentiment Analysis
1. Set `ngram_range` so that n-grams up to bi-grams are used.
2. The MultinomialNB model used later works on term counts, so `CountVectorizer` is used instead of `TfidfVectorizer`.

In [65]:
# Bag of words for Sentiment Analysis: raw term counts over uni-grams and bi-grams.
n_features = 10000  # upper bound on the vocabulary size

vectorizer_options = dict(
    tokenizer=LemmaTokenizer(),  # lemmatize every token
    stop_words="english",
    ngram_range=(1, 2),
    max_df=0.5,                  # ignore terms found in more than 50% of the documents
    min_df=2,                    # ignore terms found in fewer than 2 documents
    max_features=n_features,
)
sentiment_count_vectorizer = CountVectorizer(**vectorizer_options)

# learn the vocabulary on the first 50% and turn the sparse count matrix into a dense array
sentiment_count = sentiment_count_vectorizer.fit_transform(X_first50["combined_review"]).toarray()

<a id="step5"></a>
#### step5: build up the benchmark model (naive bayes) for sentiment analysis

In [15]:
# Benchmark model: multinomial naive Bayes built with its default arguments.
# fit() returns the fitted estimator, so construction and training are chained.
naive_bayes = MultinomialNB().fit(sentiment_count, y_first50)
naive_bayes  # display the fitted estimator as the cell output

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

<a id="step6"></a>
#### step6: evaluate the naive bayes model performance based on cross-validation 

In [37]:
# 5-fold cross-validated evaluation of the naive Bayes benchmark on three metrics.
# Each entry pairs a scorer name with the exact message printed for its mean score.
cv_mean_scores = {}
for scorer, message in (("accuracy", "mean accuracy of corss-validation: {}"),
                        ("f1", "mean f1 of corss-validation: {}"),
                        ("recall", "mean recall of corss-validation: {}")):
    fold_scores = cross_val_score(estimator=naive_bayes, X=sentiment_count, y=y_first50,
                                  cv=5, scoring=scorer)
    cv_mean_scores[scorer] = fold_scores.mean()
    print(message.format(round(cv_mean_scores[scorer], 2)))

# keep the individual result names defined for any later cell that uses them
mean_accuracy = cv_mean_scores["accuracy"]
mean_f1 = cv_mean_scores["f1"]
mean_recall = cv_mean_scores["recall"]

mean accuracy of corss-validation: 0.86
mean f1 of corss-validation: 0.92
mean recall of corss-validation: 0.88


<a id="step7"></a>
#### step7: generate the model prediction and confusion matrix on the training dataset (nb model)

In [40]:
# Predict the training half with the naive Bayes model.
prediction = naive_bayes.predict(sentiment_count)

# compute both summaries first, then print each one under its heading
nb_train_confusion = confusion_matrix(y_first50, prediction)
nb_train_report = classification_report(y_first50, prediction)

# confusion matrix, followed by an empty separator line
print("confusion matrix from naive bayes model", nb_train_confusion, "", sep="\n")

# per-class precision / recall / f1 overview
print("the overview of performance metrics", nb_train_report, sep="\n")

confusion matrix from naive bayes model
[[ 18927   7241]
 [ 27379 202425]]

the overview of performance metrics
             precision    recall  f1-score   support

          0       0.41      0.72      0.52     26168
          1       0.97      0.88      0.92    229804

avg / total       0.91      0.86      0.88    255972



<a id="step8"></a>
#### step8: evaluate the naive bayes model on the remaining 50% dataset.


In [46]:
# Vectorize the remaining 50% with the vocabulary already fitted on the first half
# (transform only, no re-fit), then convert the sparse result to a dense array.
test_sentiment_count = sentiment_count_vectorizer.transform(X_remaining50["combined_review"]).toarray()

In [48]:
# Predict the held-out half with the naive Bayes model.
test_prediction = naive_bayes.predict(test_sentiment_count)

# compute both summaries first, then print each one under its heading
nb_test_confusion = confusion_matrix(y_remaining50, test_prediction)
nb_test_report = classification_report(y_remaining50, test_prediction)

# confusion matrix, followed by an empty separator line
print("confusion matrix from naive bayes model (in remaining test dataset)", nb_test_confusion, "", sep="\n")

# per-class precision / recall / f1 overview
print("the overview of performance metrics (in remaining test dataset)", nb_test_report, sep="\n")


confusion matrix from naive bayes model (in remaining test dataset)
[[ 18720   7449]
 [ 27604 202199]]

the overview of performance metrics (in remaining test dataset)
             precision    recall  f1-score   support

          0       0.40      0.72      0.52     26169
          1       0.96      0.88      0.92    229803

avg / total       0.91      0.86      0.88    255972



<a id="step9"></a>
#### step9: the comparison model - multi-layer perceptron model

In [50]:
# One-hot encode the sentiment labels into two columns for the softmax network:
# first the training half, then the held-out (test) half.
target_variable, test_target_varialbe = (
    np_utils.to_categorical(labels, num_classes=2)
    for labels in (y_first50, y_remaining50)
)


In [71]:
# Building the model architecture: one hidden ReLU layer with dropout, then a
# 2-unit softmax output (one unit per sentiment class).
# The input width is read from the training matrix instead of being hard-coded to 10000:
# CountVectorizer's max_features is only an upper bound, so the fitted vocabulary
# (and therefore the number of columns) can be smaller.
n_inputs = sentiment_count.shape[1]

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(n_inputs,)))
model.add(Dropout(0.7))  # drop 70% of the hidden activations during training
model.add(Dense(2, activation='softmax'))
model.summary()

# Compiling the model using categorical_crossentropy loss, and rmsprop optimizer.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 256)               2560256   
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 514       
Total params: 2,560,770
Trainable params: 2,560,770
Non-trainable params: 0
_________________________________________________________________


In [72]:
# Train the network; the checkpoint callback writes the model to disk only when
# the validation loss improves on the best value seen so far.
checkpointer = ModelCheckpoint(filepath='sentiment.model.best.hdf5',
                               verbose=1,
                               save_best_only=True)

fit_options = dict(
    batch_size=50,
    epochs=20,
    validation_split=0.25,  # fraction of the training half held out for validation
    callbacks=[checkpointer],
    verbose=2,
    shuffle=True,
)
hist = model.fit(sentiment_count, target_variable, **fit_options)

Train on 191979 samples, validate on 63993 samples
Epoch 1/20
Epoch 00000: val_loss improved from inf to 0.23597, saving model to sentiment.model.best.hdf5
323s - loss: 0.2554 - acc: 0.9103 - val_loss: 0.2360 - val_acc: 0.9164
Epoch 2/20


KeyboardInterrupt: 

<a id="step10"></a>
#### step10: generate the model prediction and confusion matrix on the training dataset (mlp model)

In [None]:
# Create the prediction from the multi-layer perceptron model.
# model.predict() returns one softmax probability per class (an array with 2 columns),
# while confusion_matrix / classification_report need class labels. The arg-max over
# the class axis gives the 0/1 label, because the targets were one-hot encoded with
# column index == label.
# NOTE(review): `model` holds the weights from the last epoch that ran, which is not
# necessarily the best checkpoint saved in sentiment.model.best.hdf5 - confirm which
# weights should be evaluated.
prediction_mlp_proba = model.predict(sentiment_count)
prediction_mlp = prediction_mlp_proba.argmax(axis=1)

# print the confusion matrix
print("confusion matrix from multi-layer perceptron model")
print(confusion_matrix(y_first50, prediction_mlp))
print()

# print the overview of performance metrics
print("the overview of performance metrics")
print(classification_report(y_first50, prediction_mlp))

<a id="step11"></a>
#### step11: evaluate multi-layer perceptron model on the remaining 50% dataset.

In [None]:
# Evaluate the multi-layer perceptron on the remaining (held-out) dataset.
# The prediction must come from `model` (the MLP), not from `naive_bayes`; otherwise
# this cell just repeats the naive Bayes numbers from step 8.
# model.predict() returns per-class softmax probabilities, so the arg-max over the
# class axis converts them to the 0/1 labels the sklearn metrics expect.
# A separate name is used so the naive Bayes `test_prediction` is not overwritten.
test_prediction_mlp = model.predict(test_sentiment_count).argmax(axis=1)

# print the confusion matrix
print("confusion matrix from multi-layer perceptron model (in remaining test dataset)")
print(confusion_matrix(y_remaining50, test_prediction_mlp))
print()

# print the overview of performance metrics
print("the overview of performance metrics (in remaining test dataset)")
print(classification_report(y_remaining50, test_prediction_mlp))