# HW4: Movie Review Sentiment Analysis
*Boris Evstratov*

## Task:
1. Choose and argue your measure of a test's accuracy;
2. Build data processing and classification pipeline; Please compare word-embeddings vs classical methods;
3. Tune your model.

### 0. Importing packages

In [86]:
import pandas as pd
import numpy as np
from numpy import zeros
from keras.models import Sequential
from keras.layers import Bidirectional,CuDNNLSTM,CuDNNGRU,Dense,Dropout,Embedding,LocallyConnected1D
from keras.layers import Conv1D,GlobalAveragePooling1D,MaxPooling1D,GlobalMaxPooling1D,Flatten
from keras.layers.recurrent import LSTM
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical,np_utils
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import sklearn.metrics as sklm
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer,PorterStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 1. Data Management

#### 1.1 Data import and preprocessing

In [0]:
# Convert CSVs to Panda's dataframes
features_train = ['pid','sid','p','s']
features_test = ['pid','sid','p']
train = pd.read_csv('train.tsv', names=features_train, sep="\t", header=0)
test = pd.read_csv('test.tsv', names=features_test, sep="\t", header=0)
sub = pd.read_csv('sampleSubmission.csv', sep=",")

In [7]:
train.tail()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
80884,80885,4169,less pimps,2.0
80885,80886,4169,pimps,2.0
80886,80887,4169,ho 's,2.0
80887,80888,4169,ho,2.0
80888,80889,4170,The,


In [28]:
print(train.s.value_counts())

2    79582
3    32927
1    27273
4     9206
0     7072
Name: s, dtype: int64


Values correspond to the following sentimentals:
```
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive


```



In [70]:
# Stemming and Lower-casing
X = train.p
Y = train.s

ps = PorterStemmer()
l2 = []
review = []
s2 = ''
for row in X:
    for words in nltk.word_tokenize(row):
            l2.append(words.lower())
            l2.append(' ')
    s2 = ''.join(l2)
    review.append(s2)
    s2 = ''
    l2 = []
X = review
print(X[:1])

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . ']


#### 1.2 Train, Test, Validation Split

In [30]:
X_train, X_inter, Y_train, Y_inter = train_test_split(X, Y,test_size=0.3,random_state=1)
X_val, X_test, Y_val, Y_test = train_test_split(X_inter, Y_inter,test_size=0.5,random_state=1)
print('X_train:',len(X_train))
print('X_val:',len(X_val))
print('X_test:',len(X_test))

X_train: 109242
X_val: 23409
X_test: 23409


In [87]:
# Getting unique words
all_words = ' '.join(X_train)
all_words = word_tokenize(all_words)
dist = FreqDist(all_words)
num_unique_word = len(dist)
num_unique_word

16506

In [88]:
# Max length of a review
r_len = []
for text in X_train:
    word = word_tokenize(text)
    l = len(word)
    r_len.append(l)
    
max_review_len = np.max(r_len)
max_review_len

53

### 2. Embeddings Model

#### 2.1 Glove Embeddings

In [0]:
# Fitting training text on tokenizer for indexing
max_sentence=len(max(X,key=len))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [33]:
# Load up embeddings library
embeddings_index = dict()
f = open('glove.6B.100d.txt')
embeddings_index = {}
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.array(values[1:],dtype = 'float32')
    embeddings_index[word]= coefs
f.close()
print('Loaded',len(embeddings_index),'word vectors.')


Loaded 400001 word vectors.


In [0]:
# Creating embedded matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [37]:
# Padding and conversion of the text into the sequencies
max_sentence=len(max(X,key=len))

encoded_docs = tokenizer.texts_to_sequences(X_train)
train_x = pad_sequences(encoded_docs, maxlen=max_sentence, padding='post')
print(train_x[0])    

encoded_docs=0
encoded_docs = tokenizer.texts_to_sequences(X_val)
val_x = pad_sequences(encoded_docs, maxlen=max_sentence, padding='post')
print(val_x[1])

encoded_docs=0
encoded_docs = tokenizer.texts_to_sequences(X_test)
test_x = pad_sequences(encoded_docs, maxlen=max_sentence, padding='post')
print(test_x[1])

encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
dummy_y_train = np_utils.to_categorical(encoded_Y_train)
print(dummy_y_train[:3])

encoded_Y_val = encoder.transform(Y_val)

# One-hot encoding
dummy_y_val = np_utils.to_categorical(encoded_Y_val)
vocab_size = len(tokenizer.word_index) + 1

[5751    4 2084   78 7002   10    3    1  364  996    9  101 1752   91
    7  177 4902   49   31  199 2801   43    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

#### 2.2 CNN creation

In [79]:
# Building a multilayered Sequential model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sentence, trainable=True, weights=[embedding_matrix] ))
model.add(LocallyConnected1D(128, 2,strides=1,padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(256, activation='relu'))          
model.add(Dense(5, activation='softmax'))
print(model.summary())
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 284, 100)          1526100   
_________________________________________________________________
locally_connected1d_3 (Local (None, 283, 128)          7281024   
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 1285      
Total params: 8,841,433
Trainable params: 8,841,433
Non-trainable params: 0
_________________________________________________________________
None


#### 2.3 Model training

In [42]:
model.fit(train_x, dummy_y_train,  validation_data=(val_x, dummy_y_val), epochs=4,batch_size=128,verbose=1)

Train on 109242 samples, validate on 23409 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3e2f5b0550>

#### 2.4 Model evaluation and metrics

In [71]:
# Calculate main metrics
predictions = model.predict(test_x)
pred=[]
for idx,val in enumerate(predictions):
    pred.append(np.argmax(val))

print('Overall model Accuracy:  %0.2f' % sklm.accuracy_score(Y_test, pred))
print()

metrics = sklm.precision_recall_fscore_support(Y_test, pred)
cols_name = ('0','1','2','3','4')
rows_name = ('Precision','Recall','F1-score','Support')
print(pd.DataFrame(metrics,rows_name,cols_name))

Overall model Accuracy:  0.64

                     0            1             2            3            4
Precision     0.457317     0.522884      0.744098     0.540774     0.537538
Recall        0.287632     0.510449      0.772712     0.577564     0.409924
F1-score      0.353149     0.516592      0.758135     0.558564     0.465136
Support    1043.000000  4163.000000  11910.000000  4983.000000  1310.000000


### 3. LSTM Model

#### 3.1 Building the model

In [98]:
model1 = Sequential()
model1.add(Embedding(num_unique_word,100,mask_zero=True))
model1.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
model1.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
model1.add(Dense(num_classes,activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 100)         1650600   
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 64)          42240     
_________________________________________________________________
lstm_6 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 165       
Total params: 1,705,421
Trainable params: 1,705,421
Non-trainable params: 0
_________________________________________________________________


#### 3.2 Training the model

In [99]:
model1.fit(train_x, dummy_y_train, validation_data=(val_x, dummy_y_val),epochs=4, batch_size=128, verbose=1)

Train on 109242 samples, validate on 23409 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f3e2f074f28>

#### 3.3 Model evaluation and metrics

In [100]:
# Calculate main metrics
predictions = model1.predict(test_x)
pred=[]
for idx,val in enumerate(predictions):
    pred.append(np.argmax(val))

print('Overall model Accuracy:  %0.2f' % sklm.accuracy_score(Y_test, pred))
print()

metrics = sklm.precision_recall_fscore_support(Y_test, pred)
cols_name = ('0','1','2','3','4')
rows_name = ('Precision','Recall','F1-score','Support')
print(pd.DataFrame(metrics,rows_name,cols_name))

Overall model Accuracy:  0.65

                     0            1             2            3            4
Precision     0.501433     0.518886      0.768761     0.552910     0.598240
Recall        0.335570     0.590680      0.750042     0.629139     0.311450
F1-score      0.402068     0.552460      0.759286     0.588567     0.409639
Support    1043.000000  4163.000000  11910.000000  4983.000000  1310.000000


### 4. Conclusion
We see that CNN that used Embeddings has overall same values of the main metrics