In [1]:
# Keras
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import adam
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
import keras

# Regular Expression
import re

# NLTK
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer

# EDA
from string import punctuation
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
# Data Load
df = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/train.tsv", sep="	")
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
len(df)

156060

In [4]:
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

In [5]:
df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [6]:
# Preprocessing
df['Phrase'] = df['Phrase'].apply(lambda x: x.lower())
df['Phrase'] = df['Phrase'].apply((lambda x: re.sub('[^A-z\s]','',x)))

In [7]:
lemma=WordNetLemmatizer()
def clean_text(text):
    text_corpus=[]
    for i in range(0,len(text)):
        review = str(text[i])
        review = [lemma.lemmatize(w) for w in word_tokenize(str(review))]
        review = ' '.join(review)
        text_corpus.append(review)
    return text_corpus

In [8]:
#df['Phrase'] = df['Phrase'].map(lambda x : x if len(x.split(" ")) > 1 else None)

In [9]:
df['clean_text'] = clean_text(df['Phrase'].values)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_text
0,1,1,a series of escapades demonstrating the adage ...,1,a series of escapade demonstrating the adage t...
1,2,1,a series of escapades demonstrating the adage ...,2,a series of escapade demonstrating the adage t...
2,3,1,a series,2,a series
3,4,1,a,2,a
4,5,1,series,2,series


In [10]:
# Total Words
aa = ' '.join(list(df['clean_text']))
aa = list(set(aa.split(" ")))
len(aa)

14876

In [11]:
from sklearn.utils import shuffle

In [12]:
df = shuffle(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_text
72812,72813,3718,one of the most unpleasant things the studio h...,0,one of the most unpleasant thing the studio ha...
96920,96921,5066,weave,2,weave
68291,68292,3466,do nt get williams usual tear and a smile ju...,1,do nt get williams usual tear and a smile just...
73471,73472,3755,is rrb a fascinating character and deserves a...,0,is rrb a fascinating character and deserves a ...
102250,102251,5380,of its development,2,of it development


In [13]:
# Tokenizer
vocabulary_size = len(aa)
tokenizer = Tokenizer(num_words=vocabulary_size, split=' ')
tokenizer.fit_on_texts(df['clean_text'].values)
sequences = tokenizer.texts_to_sequences(df['clean_text'].values)
data = pad_sequences(sequences)#, maxlen=45)

In [14]:
from keras.utils.np_utils import to_categorical

In [15]:
# Encoder
encoder = LabelEncoder()
encoder = encoder.fit_transform(df['Sentiment'])
target = to_categorical(encoder)

In [16]:
data.shape, target.shape

((156060, 48), (156060, 5))

In [17]:
from keras.backend import zeros

In [18]:
embeddings_index = dict()
f = open('../input/glove6b300dtxt/glove.6B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [19]:
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [20]:
from keras.layers import GRU, Dropout

In [21]:
# Model
model = Sequential()
model.add(Embedding(vocabulary_size, 300, input_length = data.shape[1], weights = [embedding_matrix], trainable=True))
model.add(SpatialDropout1D(0.2))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Bidirectional(GRU(256)))
model.add(Dropout(0.5))
model.add(Dense(target.shape[1], activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 48, 300)           4462800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 48, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 48, 512)           1140736   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1181184   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 2565      
Total params: 6,787,285
Trainable params: 6,787,285
Non-trainable params: 0
_________________________________________________________________


In [22]:
early_stopping_filter = EarlyStopping(monitor='val_loss', patience=2)

In [23]:
#model.fit(data, target, validation_split=0.1, epochs=4, callbacks=[early_stopping_filter], batch_size=256)
model.fit(data, target, epochs=4, callbacks=[early_stopping_filter], batch_size=256)

Epoch 1/4
Epoch 2/4




Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fd38aa6ca58>

# Testing

In [24]:
testdf = pd.read_csv("../input/movie-review-sentiment-analysis-kernels-only/test.tsv", sep="	")
testdf.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [25]:
testdf['Phrase'] = testdf['Phrase'].apply(lambda x: x.lower())
testdf['Phrase'] = testdf['Phrase'].apply((lambda x: re.sub('[^A-z\s]','',x)))
testdf['clean_test'] = clean_text(testdf['Phrase'].values)

In [26]:
test_sequences = tokenizer.texts_to_sequences(testdf['clean_test'].values)
test_data = pad_sequences(test_sequences, maxlen=data.shape[1])

In [27]:
y_pred = model.predict_classes(test_data, verbose=1)



In [28]:
submissiondf = pd.DataFrame({'PhraseId': testdf['PhraseId'], 'Sentiment': y_pred})
submissiondf.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,2
4,156065,3


In [29]:
submissiondf.to_csv("sampleSubmission.csv", index=False)