In [3]:
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [4]:
# Directory that holds train.tsv and test.tsv. Kept in one place so the
# notebook can be pointed at another copy of the data by editing one line.
DATA_DIR = '/home/ubuntu/Downloads/sentiment-analysis-on-movie-reviews'

# Both files are tab-separated and carry their column names in the first row.
train = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t', header=0)
test = pd.read_csv(DATA_DIR + '/test.tsv', sep='\t', header=0)

In [5]:
# Sanity check: (rows, columns) of the train and test frames.
train.shape, test.shape

((156060, 4), (66292, 3))

In [6]:
# Preview the first training rows (PhraseId, SentenceId, Phrase, Sentiment).
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
# Preview the first test rows; the test frame has no Sentiment column.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [8]:
# Pull the phrase text of both splits and the training labels out of the
# data frames as NumPy arrays.
raw_docs_train = train.Phrase.values
raw_docs_test = test.Phrase.values
sentiment_train = train.Sentiment.values

# Number of distinct sentiment labels present in the training data.
num_labels = np.unique(sentiment_train).size

In [9]:

# List the distinct sentiment labels found in the training data.
np.unique(sentiment_train)

array([0, 1, 2, 3, 4])

Preprocessing

In [10]:
# Build the stop-word collection from NLTK's English list. It is stored as a
# set because the filtering loops test membership once per token.
stop_words = set(stopwords.words('english'))
print (stop_words)

{'how', 'll', 'doing', "you'll", 'couldn', 'have', 'under', 'her', 'but', 'hadn', 'yours', 'again', 'so', 'will', 'at', 'mustn', 'most', 'he', 'after', 'some', 'should', 'out', "hadn't", 'that', 'having', 'i', 'yourself', "couldn't", 'is', 'off', 'are', "don't", "you're", 'his', 'weren', "you've", 'because', 'between', 'to', "weren't", 'ain', "shouldn't", 'we', 'than', "won't", "aren't", 'no', 'why', 'through', 'won', 'needn', 'by', 'other', 'not', 'don', "she's", 'whom', 'more', 'in', "needn't", 'himself', 'these', 'about', 'any', 'nor', 'am', 'this', 'does', 'from', 'me', 'over', 'ourselves', "wasn't", 'or', 'was', 'for', 'ours', 'being', 'before', 'has', 'herself', 'such', "hasn't", 'an', 'below', 'our', 'do', 'where', 'further', 'did', 'with', 't', 'while', "doesn't", 'just', 'she', 'their', 'itself', 'both', 'only', 'doesn', 'shouldn', 's', 'you', 'it', 'ma', "wouldn't", 'now', 'own', "didn't", 'aren', 'haven', 'm', 'each', 'were', 'then', 'hasn', 'isn', 'didn', "you'd", 'who', 'i

In [11]:
# Add common punctuation tokens to the same set so they are dropped by the
# same "not in stop_words" filter as the stop words.
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
print (stop_words)

{'how', 'll', 'doing', "you'll", 'couldn', 'have', 'under', 'her', 'but', 'hadn', 'yours', 'again', 'so', 'will', 'at', 'mustn', 'most', '}', 'he', 'after', 'some', 'should', 'out', "hadn't", 'that', '(', 'having', 'i', 'yourself', "couldn't", 'is', 'off', 'are', "don't", "you're", 'his', 'weren', "you've", 'because', 'between', 'to', "weren't", ',', 'ain', "shouldn't", 'we', 'than', "won't", "aren't", 'no', 'why', 'through', 'won', 'needn', 'by', 'other', 'not', '[', 'don', "she's", 'whom', 'more', 'in', "needn't", 'himself', 'these', 'about', '"', 'any', 'nor', 'am', 'this', 'does', 'from', 'me', 'over', 'ourselves', "wasn't", 'or', 'was', 'for', 'ours', 'being', 'before', 'has', 'herself', 'such', "hasn't", 'an', 'below', '.', 'our', 'do', 'where', 'further', ']', 'did', 'with', 't', 'while', "doesn't", 'just', 'she', 'their', 'itself', 'both', 'only', 'doesn', 'shouldn', 's', 'you', 'it', 'ma', "wouldn't", 'now', 'own', "didn't", 'aren', 'haven', 'm', 'each', 'were', 'then', 'hasn'

In [12]:
# English Snowball stemmer, applied to every token that survives filtering.
stemmer = SnowballStemmer('english')

In [14]:
# Tokenise, filter and stem every training phrase.
# Note: the stop-word test is case-sensitive, so a capitalised stop word such
# as 'A' is kept by the filter (the stemmer then lower-cases it to 'a').
print("pre-processing train docs...")
processed_docs_train = []
for index, doc in enumerate(raw_docs_train):
    tokens = word_tokenize(doc)
    filtered = [tok for tok in tokens if tok not in stop_words]
    stemmed = [stemmer.stem(tok) for tok in filtered]
    processed_docs_train.append(stemmed)

    # For the first phrase only, show the result of each pipeline stage,
    # each preceded by a blank separator.
    if index == 0:
        for stage in (doc, tokens, filtered, stemmed):
            print('\n')
            print(stage)

pre-processing train docs...


A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .


['A', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']


['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story']


['a', 'seri', 'escapad', 'demonstr', 'adag', 'good', 'goos', 'also', 'good', 'gander', 'occasion', 'amus', 'none', 'amount', 'much', 'stori']


In [17]:
# Apply the same tokenise -> drop stop words -> stem pipeline to every test
# phrase, producing one list of stems per phrase.
print("pre-processing test docs...")
processed_docs_test = [
    [stemmer.stem(tok) for tok in word_tokenize(doc) if tok not in stop_words]
    for doc in raw_docs_test
]

pre-processing test docs...


In [18]:
# Combine the train and test token lists so that the dictionary built from
# them covers every token occurring in either split.
# Plain list concatenation is used instead of np.concatenate: the documents
# have different lengths, and NumPy cannot turn such ragged nested lists into
# a regular array (recent NumPy versions raise ValueError for it).
processed_docs_all = processed_docs_train + processed_docs_test

In [23]:
# Assign an integer id to every distinct stemmed token (train and test).
dictionary = corpora.Dictionary(processed_docs_all)

# Vocabulary size: one entry per token in the token -> id mapping.
dictionary_size = len(dictionary.token2id)
print("dictionary size: ", dictionary_size)

dictionary size:  13758


In [24]:
# Spot-check the id -> token lookup for two ids.
dictionary[0], dictionary[14]

('a', 'stori')

In [26]:
# Replace every stemmed training token by its dictionary id and record the
# length of each resulting id sequence.
print("converting to token ids...")
word_id_train = []
word_id_len = []
token_to_id = dictionary.token2id
for index, doc in enumerate(processed_docs_train):
    word_ids = [token_to_id[token] for token in doc]
    word_id_train.append(word_ids)
    word_id_len.append(len(word_ids))

    # For the first phrase only, show the tokens, their ids and the state of
    # the two accumulators.
    if index == 0:
        for shown in (doc, word_ids, word_id_train, word_id_len):
            print(shown)

converting to token ids...
['a', 'seri', 'escapad', 'demonstr', 'adag', 'good', 'goos', 'also', 'good', 'gander', 'occasion', 'amus', 'none', 'amount', 'much', 'stori']
[0, 13, 6, 5, 1, 8, 9, 2, 8, 7, 12, 4, 11, 3, 10, 14]
[[0, 13, 6, 5, 1, 8, 9, 2, 8, 7, 12, 4, 11, 3, 10, 14]]
[16]


In [27]:
# Replace every stemmed test token by its dictionary id and add the sequence
# lengths to word_id_len, which already holds one length per training phrase.
word_id_test = []

# The first len(word_id_train) entries of word_id_len are the train lengths.
# Drop anything after them so that re-running this cell does not append the
# test lengths a second time and skew the statistics used for seq_len.
del word_id_len[len(word_id_train):]

for doc in processed_docs_test:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_test.append(word_ids)
    word_id_len.append(len(word_ids))

In [28]:
# Choose the padded sequence length as mean + 2 * standard deviation of the
# id-sequence lengths collected over train and test, rounded to an integer.
len_mean = np.mean(word_id_len)
len_std = np.std(word_id_len)
seq_len = np.round(len_mean + 2 * len_std).astype(int)

for stat in (len_mean, len_std, seq_len):
    print(stat)

4.169919766856156
3.8047838578714424
12


In [29]:
#pad sequences
word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)

In [30]:
# Inspect the padded training matrix (one row of seq_len ids per phrase).
print (word_id_train)

[[    1     8     9 ...     3    10    14]
 [    0     0     0 ...     1     8     9]
 [    0     0     0 ...     0     0    13]
 ...
 [    0     0     0 ...     0 11848 11849]
 [    0     0     0 ...     0     0 11848]
 [    0     0     0 ...     0     0 11849]]


In [31]:
# Inspect the one-hot encoded training labels.
print (y_train_enc)

[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [32]:
#LSTM
print ("fitting LSTM ...")
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

fitting LSTM ...


In [33]:
# Train on the padded id sequences and one-hot labels for 3 epochs with
# batches of 256 phrases; verbose=1 prints progress per epoch.
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f366d607ac8>

In [34]:
# Predict a sentiment class for every test phrase.
# Sequential.predict_classes no longer exists in current Keras releases.
# Taking the argmax over the softmax output yields the same class indices and
# works on both old and new versions.
test_pred = np.argmax(model.predict(word_id_test), axis=-1)


In [35]:
# Display the predicted class indices for the test phrases.
test_pred

array([2, 2, 2, ..., 1, 1, 1])

In [39]:
#make a submission
test['Sentiment'] = test_pred.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test.to_csv('./final_lstm_cnn.csv', columns=header, index=False, header=True)