In [1]:
import numpy as np
import pandas as pd
import re

%matplotlib inline

import sys
import os

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Masking
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, GlobalMaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Sequential, Model

from Attention_layer import  Attention_layer

Using TensorFlow backend.


In [2]:
# Display the installed Keras version so readers know which API generation
# this notebook was run against (the recorded output is '1.2.2').
import keras
keras.__version__

'1.2.2'

In [3]:
# Load the labelled reviews; the path is relative to the notebook's working directory.
train = pd.read_csv('../input/train.csv')

# Character count of each description, computed by applying the built-in `len` element-wise.
train['len'] = train['Description'].apply(len)

In [4]:
# Tunable settings shared by the preprocessing and model cells.
# Width every encoded review is padded to; also the model's input length.
MAX_SEQUENCE_LENGTH = 1000
# Passed to the Tokenizer as `nb_words`.
MAX_NB_WORDS = 20000
# Width of each word vector; must match the GloVe file that is loaded (100d).
EMBEDDING_DIM = 100
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.2

In [5]:
# Parse the pre-trained GloVe vectors into a {token: float32 vector} lookup.
# Each non-empty line is split on whitespace: the first field is the token,
# the remaining fields are its vector components.
GLOVE_DIR = "../input"
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps non-ASCII tokens readable regardless of the
# platform's default locale.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [6]:
# Report how many tokens were read into the embedding lookup.
n_word_vectors = len(embeddings_index)
print('Total %s word vectors.' % n_word_vectors)

Total 400000 word vectors.


In [7]:
# Binary label: 0 when the response is exactly 'not happy', 1 for anything else.
is_unhappy = train['Is_Response'] == 'not happy'
train['target'] = (~is_unhappy).astype('int64')
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response,target
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy,0
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy,0
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy,0
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy,1
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy,0


In [8]:
%%time
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train['Description'])
sequences = tokenizer.texts_to_sequences(train['Description'])
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

CPU times: user 9.08 s, sys: 288 ms, total: 9.37 s
Wall time: 9.29 s


In [9]:
# One-hot encode the 0/1 targets so they line up with the two-unit softmax
# output used by the models below.
target_values = train['target'].values
labels = to_categorical(target_values)

In [10]:
# Shuffle samples and labels with the same permutation so rows stay aligned.
# A dedicated, seeded generator makes the train/validation split that follows
# reproducible on a fresh kernel (Restart & Run All) and leaves NumPy's global
# random state untouched.
SHUFFLE_SEED = 42
indices = np.random.RandomState(SHUFFLE_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [11]:
# Hold out the last VALIDATION_SPLIT fraction of the (already shuffled) rows.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice with an explicit positive boundary: with negative slicing a validation
# size of 0 would give `data[:-0]` (empty training set) and `data[-0:]`
# (everything in validation), silently inverting the split.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [12]:
# Build the embedding weight matrix: one row per tokenizer index (plus row 0).
# Every row starts as uniform random noise, so words that have no GloVe vector
# keep a random vector (they are NOT zeroed); known words are overwritten with
# their pre-trained vector.
n_rows = len(word_index) + 1
embedding_matrix = np.random.random((n_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained
print ('Length of embedding_matrix:', embedding_matrix.shape[0])

Length of embedding_matrix: 50579


In [13]:
# Embedding layer initialised from the pre-built matrix and frozen
# (trainable=False), so its weights are not updated during training.
# The vocabulary size uses the same row count as embedding_matrix.
vocab_size = len(word_index) + 1
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            trainable=False)

# Per-class sample counts for each split. Summing the one-hot labels over
# axis 0 yields [count of class 0, count of class 1], i.e. negative
# ('not happy') first and positive second, so the header states that order.
print('Training and validation set number of negative and positive reviews')
print (y_train.sum(axis=0))
print (y_val.sum(axis=0))

Traing and validation set number of positive and negative reviews
[  9959.  21187.]
[ 2452.  5334.]


In [19]:
## MLP
# Baseline: a dense projection applied to every token embedding, max-pooled
# over the sequence axis, then dropout and a two-way softmax classifier.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

token_features = Dense(100, activation='relu')(embedded_sequences)
pooled = GlobalMaxPooling1D()(token_features)
pooled = Dropout(0.2)(pooled)
class_probs = Dense(2, activation='softmax')(pooled)

model = Model(sequence_input, class_probs)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 1000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1000, 100)     5057900     input_4[0][0]                    
____________________________________________________________________________________________________
dense_7 (Dense)                  (None, 1000, 100)     10100       embedding_1[3][0]                
____________________________________________________________________________________________________
globalmaxpooling1d_2 (GlobalMaxP (None, 100)           0           dense_7[0][0]                    
___________________________________________________________________________________________

## LSTM
# Bidirectional LSTM that returns the full output sequence, followed by a tanh
# dense layer; the result is flattened and fed to a two-way softmax.
# Note: this rebinds `model`, replacing any model defined earlier.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

bilstm_out = Bidirectional(LSTM(100, return_sequences=True))(embedded_sequences)
step_features = Dense(100, activation='tanh')(bilstm_out)
flat_features = Flatten()(step_features)
class_probs = Dense(2, activation='softmax')(flat_features)

model = Model(sequence_input, class_probs)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

## LSTM with attention
# Bidirectional LSTM whose output sequence is passed through the project-local
# Attention_layer, then a tanh dense layer and a two-way softmax.
# NOTE(review): Attention_layer presumably collapses the time axis into a
# single vector per sample -- confirm against Attention_layer.py.
# Note: this rebinds `model`, replacing any model defined earlier.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

bilstm_out = Bidirectional(LSTM(100, return_sequences=True))(embedded_sequences)
attended = Attention_layer()(bilstm_out)
hidden = Dense(100, activation='tanh')(attended)
class_probs = Dense(2, activation='softmax')(hidden)

model = Model(sequence_input, class_probs)
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

model.summary()

In [20]:
# Train whichever `model` is currently bound (the most recently executed model
# cell), evaluating on the held-out validation split after every epoch.
model.fit(x_train, y_train,
          batch_size=200,
          nb_epoch=10,
          validation_data=(x_val, y_val))

Train on 31146 samples, validate on 7786 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f447cda4a58>