In [9]:
import numpy as np
import pandas as pd

# Neural Netwok imports
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding,LSTM,Bidirectional
from keras.layers import Conv1D ,MaxPooling1D
from keras.utils import np_utils

#text Processing import
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

## 1- Load Data

In [3]:
train_data = pd.read_csv('train.tsv', sep='\t', header=0)
test_data  = pd.read_csv('test.tsv' , sep='\t', header=0)
phrase_train = train_data['Phrase'].values
sentiments_data = train_data['Sentiment'].values
num_labels = len(np.unique(sentiments_data))
phrase_test  = test_data ['Phrase'].values

## 2 - Text pre-processing

In [4]:
# generate stop word
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
stemmer = SnowballStemmer('english')

In [5]:
# tokonize phrases "train,test"
train_data_clean = []
test_data_clean = []
# stem, tokenize sentiment train data
for sentiment in phrase_train:
    token = word_tokenize(sentiment)
    sentiment_words = [stemmer.stem(word) for word in token if word not in stop_words]
    train_data_clean.append(sentiment_words)
# stem, tokenize sentiment test data
for sentiment in phrase_test:
    token = word_tokenize(sentiment)
    sentiment_words = [stemmer.stem(word) for word in token if word not in stop_words]
    test_data_clean.append(sentiment_words)

In [6]:
# create vocab for all words (test,train)
vocab = np.concatenate((train_data_clean,test_data_clean),axis = 0)
vocab_dic = corpora.Dictionary(vocab)

x_train = []
x_train_seq_len = []
x_test = []

for sent in train_data_clean:
    word_ids = [vocab_dic.token2id[word] for word in sent]
    x_train.append(word_ids)
    x_train_seq_len.append(len(word_ids))
    
for sent in test_data_clean:
    word_ids = [vocab_dic.token2id[word] for word in sent]
    x_test.append(word_ids)
    x_train_seq_len.append(len(word_ids))

In [7]:
# get the average sequence length
seq_len = np.round((np.mean(x_train_seq_len) + 2*np.std(x_train_seq_len))).astype(int)
# pad each sequence "train,test"
x_train = sequence.pad_sequences(np.array(x_train), maxlen=seq_len)
x_test = sequence.pad_sequences(np.array(x_test), maxlen=seq_len)
# convert sentiment to categorical
y_train = np_utils.to_categorical(sentiments_data, num_labels)

## 3- Build Network

In [19]:
print 'Build model LSTM...'
model_lstm = Sequential()
model_lstm.add(Embedding(len(vocab_dic.keys()), 128, input_length=seq_len))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(num_labels))
model_lstm.add(Activation('softmax'))

model_lstm.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

Build model LSTM...


In [20]:
print 'Build model Bidirectional LSTM...'
model_bi = Sequential()
model_bi.add(Embedding(len(vocab_dic.keys()), 128, input_length=seq_len))
model_bi.add(Bidirectional(LSTM(64)))
model_bi.add(Dropout(0.5))
model_bi.add(Dense(num_labels))
model_bi.add(Activation('softmax'))

model_bi.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

Build model Bidirectional LSTM...


In [21]:
print 'Build model LSTM + CNN...'
model_lstm_cnn = Sequential()
model_lstm_cnn.add(Embedding(len(vocab_dic.keys()), 128, input_length=seq_len))
model_lstm_cnn.add(Dropout(0.25))
model_lstm_cnn.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model_lstm_cnn.add(MaxPooling1D(pool_size=4))
model_lstm_cnn.add(LSTM(128))
model_lstm_cnn.add(Dense(num_labels))
model_lstm_cnn.add(Activation('softmax'))

model_lstm_cnn.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

Build model LSTM + CNN...


## 4 - Train Network

In [22]:
batch_size = 256
num_epoch = 15

In [23]:
print 'Train LSTM...'
model_lstm.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=num_epoch)

Train LSTM...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4fa3853f10>

In [24]:
print 'Train Bidirectional...'
model_bi.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=num_epoch)

Train Bidirectional...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4fa0127c10>

In [25]:
print 'Train Bidirectional...'
model_lstm_cnn.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=num_epoch)

Train Bidirectional...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f4f2aa09bd0>

## 5 - Predict Sentiment

In [33]:
y_test_lstm = model_lstm.predict(x_test)
y_result_lstm = model_lstm.predict_classes(x_test)

In [34]:
y_test_bi = model_lstm.predict(x_test)
y_result_bi = model_lstm.predict_classes(x_test)

In [35]:
y_test_lstm_cnn = model_lstm_cnn.predict(x_test)
y_result_lstm_cnn = model_lstm.predict_classes(x_test)

## 6 - Evaluate Model

In [36]:
score, acc = model_lstm.evaluate(x_test, y_test_lstm, batch_size=batch_size)
print "{} {}".format(score,acc)

0.618767441686 1.0


In [43]:
score, acc = model_bi.evaluate(x_test, y_test_bi, batch_size=batch_size)
print "{} {}".format(score,acc)

0.815549035388 0.812420805021


In [38]:
score, acc = model_lstm_cnn.evaluate(x_test, y_test_lstm_cnn, batch_size=batch_size)
print "{} {}".format(score,acc)

0.900319858494 1.0


## 7 - Make Submission

In [47]:
test_data  = pd.read_csv('test.tsv' , sep='\t', header=0)
test_data['Sentiment'] = y_result_lstm.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test_data.to_csv('lstm_sentiment.csv', columns=header, index=False, header=True)

In [48]:
test_data  = pd.read_csv('test.tsv' , sep='\t', header=0)
test_data['Sentiment'] = y_result_bi.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test_data.to_csv('Bi_sentiment.csv', columns=header, index=False, header=True)

In [49]:
test_data  = pd.read_csv('test.tsv' , sep='\t', header=0)
test_data['Sentiment'] = y_result_lstm_cnn.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test_data.to_csv('lstmCNN_sentiment.csv', columns=header, index=False, header=True)

In [52]:
test_data  = pd.read_csv('test.tsv' , sep='\t', header=0)
test_data.to_csv('test.csv', columns=header, index=False, header=True)