In [1]:
# Keras
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import adam
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
import keras

# Regular Expression
import re

# NLTK
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

# EDA
from string import punctuation
import pandas as pd
import numpy as np

In [62]:
# Data Load
# The training file is tab-separated.  The separator is written as the "\t"
# escape because a literal tab inside the quotes is invisible and is easily
# turned into spaces by an editor or formatter.
df = pd.read_csv("../data/train.tsv", sep="\t")
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [63]:
# Number of rows in the training frame.
df.shape[0]

156060

In [64]:
# Missing values per column.
df.isna().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [65]:
# How many phrases carry each sentiment label.
df.Sentiment.value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [68]:
# Preprocessing
# Lower-case every phrase, then strip everything that is not an ASCII letter
# or whitespace.
# The class is [^a-z\s] on purpose: the range A-z also covers the six
# characters that sit between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those
# would survive the filter.  The pattern is a raw string so that \s reaches
# the regex engine instead of being parsed as an (invalid) string escape.
df['Phrase'] = df['Phrase'].str.lower()
df['Phrase'] = df['Phrase'].apply(lambda phrase: re.sub(r'[^a-z\s]', '', phrase))

In [141]:
# WordNet lemmatizer shared by every clean_text() call.
lemma = WordNetLemmatizer()


def clean_text(text):
    """Lower-case, tokenize and lemmatize every document in ``text``.

    Parameters
    ----------
    text : iterable
        The documents to clean.  Each item is converted with ``str()``
        before processing, so non-string items are accepted.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.  Each string is
        the document's lemmatized tokens joined by single spaces.
    """
    # Iterate over the items directly rather than by position, so any
    # iterable works and no positional indexing of the input is needed.
    return [
        ' '.join(lemma.lemmatize(token) for token in word_tokenize(str(document).lower()))
        for document in text
    ]

In [113]:
#df['Phrase'] = df['Phrase'].map(lambda x : x if len(x.split(" ")) > 1 else None)

In [142]:
# Store the lemmatized version of every phrase next to the raw text.
df['clean_text'] = clean_text(df.Phrase.values)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review,clean_text
63145,63146,3192,with flashes of warmth and gentle humor,3,with flash of warmth and gentle humor,with flash of warmth and gentle humor
140829,140830,7639,norton has to recite bland police procedural ...,1,norton ha to recite bland police procedural de...,norton ha to recite bland police procedural de...
52522,52523,2600,the good is very very good the rest runs fro...,2,the good is very very good the rest run from m...,the good is very very good the rest run from m...
9703,9704,403,a sweet little girl,3,a sweet little girl,a sweet little girl
61646,61647,3115,melancholia,2,melancholia,melancholia


In [116]:
# Total Words
# Distinct space-separated tokens across the whole cleaned corpus.
aa = list(set(' '.join(df['clean_text'].tolist()).split(" ")))
len(aa)

In [111]:
from sklearn.utils import shuffle

In [143]:
# Shuffle the rows with a fixed seed so the row order, and everything that
# depends on it further down, is the same on every run.
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review,clean_text
131481,131482,7084,short in explaining the music and its roots,2,short in explaining the music and it root,short in explaining the music and it root
21386,21387,955,of college football games,2,of college football game,of college football game
17261,17262,749,issues,2,issue,issue
52002,52003,2564,jackass the movie,2,jackass the movie,jackass the movie
99742,99743,5233,misogyny and unprovoked violence,1,misogyny and unprovoked violence,misogyny and unprovoked violence


In [118]:
# Tokenizer
# Fit on the cleaned training text first, then size the vocabulary from the
# fitted index.  Keras numbers words from 1 and keeps 0 for padding, so
# len(word_index) + 1 covers every index the tokenizer can emit.  A count
# taken from a plain str.split() does not follow the tokenizer's own
# filtering and can be off in either direction.
tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(df['clean_text'].values)
vocabulary_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(df['clean_text'].values)
# No maxlen: every sequence is padded to the longest one in the training set.
data = pad_sequences(sequences)

In [119]:
from keras.utils.np_utils import to_categorical

In [120]:
# Encoder
# `encoder` stays bound to the fitted LabelEncoder (so class indices can be
# mapped back to the original labels); the encoded array gets its own name
# instead of overwriting it.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['Sentiment'])
# One-hot encode the integer class indices for categorical cross-entropy.
target = to_categorical(encoded_labels)

In [121]:
# Shapes of the padded inputs and the one-hot targets.
(data.shape, target.shape)

((156060, 48), (156060, 5))

In [122]:
from keras.backend import zeros

In [123]:
# Read the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line of the file is parsed as a word followed by its coefficients.
embeddings_index = dict()
# The context manager closes the file even if parsing a line fails.
with open('../glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [124]:
# Initial embedding weights: row i receives the GloVe vector of the word
# whose tokenizer index is i; words without a GloVe entry keep an all-zero row.
embedding_matrix = np.zeros(shape=(vocabulary_size, 100))
for word, index in tokenizer.word_index.items():
    # Stop at the first index that no longer fits into the matrix.
    if index >= vocabulary_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [107]:
from keras.layers import GRU, Dropout

In [128]:
# Model
# Two stacked bidirectional LSTM layers on top of the pre-trained embeddings,
# followed by dropout and a softmax over the sentiment classes.
embed_dim = embedding_matrix.shape[1]  # embedding width follows the pre-trained matrix
lstm_out = 128                         # units per LSTM direction

model = Sequential()
model.add(Embedding(vocabulary_size, embed_dim, input_length = data.shape[1], weights = [embedding_matrix], trainable=True))
model.add(SpatialDropout1D(0.2))
# The first recurrent layer returns the full sequence so the second can consume it.
model.add(Bidirectional(LSTM(lstm_out, return_sequences=True)))
model.add(Bidirectional(LSTM(lstm_out)))
model.add(Dropout(0.5))
# One output unit per one-hot target column.
model.add(Dense(target.shape[1], activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 48, 100)           1487600   
_________________________________________________________________
spatial_dropout1d_11 (Spatia (None, 48, 100)           0         
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 48, 256)           234496    
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 256)               394240    
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 5)                 1285      
Total params: 2,117,621
Trainable params: 2,117,621
Non-trainable params: 0
_________________________________________________________________


In [129]:
# Callback that ends training early, watching val_loss with a patience of 5 epochs.
early_stopping_filter = EarlyStopping(
    patience=5,
    monitor='val_loss',
)

In [130]:
# Train for up to 20 epochs, holding out 20% of the rows for validation;
# the early-stopping callback may end the run sooner.
model.fit(
    data,
    target,
    batch_size=256,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping_filter],
)

Train on 124848 samples, validate on 31212 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.callbacks.History at 0x47bc742438>

# Testing

In [131]:
# Load the tab-separated test set.  The separator is written as the "\t"
# escape because a literal tab inside the quotes is invisible and is easily
# turned into spaces by an editor or formatter.
testdf = pd.read_csv("../data/test.tsv", sep="\t")
testdf.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [133]:
# Clean the test phrases: lower-case, strip everything that is not an ASCII
# letter or whitespace, then lemmatize.
# The class is [^a-z\s] on purpose: the range A-z also covers the six
# characters between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).  The raw string keeps
# \s from being parsed as an (invalid) string escape.
testdf['Phrase'] = testdf['Phrase'].str.lower()
testdf['Phrase'] = testdf['Phrase'].apply(lambda phrase: re.sub(r'[^a-z\s]', '', phrase))
testdf['clean_test'] = clean_text(testdf.Phrase.values)

In [134]:
# Encode the test phrases with the tokenizer that was fitted on the training
# text, then pad to the width of the training matrix (instead of a hard-coded
# number) so the test input always matches the model's input length.
test_sequences = tokenizer.texts_to_sequences(testdf['clean_test'].values)
test_data = pad_sequences(test_sequences, maxlen=data.shape[1])

In [136]:
# Predicted class = index of the largest softmax probability per row.
# Sequential.predict_classes() is not available in current Keras/TensorFlow
# releases; an argmax over predict() yields the same labels for a
# multi-class softmax output.
y_pred = np.argmax(model.predict(test_data, verbose=1), axis=-1)



In [139]:
# Submission frame: the test PhraseId column plus the predicted Sentiment.
subdf = testdf[['PhraseId']].assign(Sentiment=y_pred)

In [140]:
# Peek at the first five submission rows.
subdf.head(5)

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [144]:
# Write the submission as CSV without the DataFrame index column.
subdf.to_csv(path_or_buf="../data/sampleSubmission.csv", index=False)