In [1]:
import pandas as pd
train = pd.read_csv(r"train\labeledTrainData.tsv",sep='\t')
test = pd.read_csv(r"test\testData.tsv",sep='\t')

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
MAX_NB_WORDS=20000
tokenizer = Tokenizer(MAX_NB_WORDS)

In [4]:
import numpy as np
texts = np.concatenate([train.review.values,test.review.values])

In [5]:
tokenizer.fit_on_texts(texts=texts)

In [6]:
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 124252 unique tokens.


In [65]:
MAX_SEQUENCE_LENGTH = 500
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [68]:
train_x = data[:25000]
train_l = train.sentiment.as_matrix()

In [69]:
test_x = data[25000:]

In [70]:
idx = np.random.choice(25000,2500)

In [71]:
x_valid = train_x[idx]
l_valid = train_l[idx]

In [72]:
allind = np.arange(len(train_x))
idx2 = np.setdiff1d(allind,idx)
x_train = train_x[idx2]
l_train = train_l[idx2]

In [73]:
import os
embeddings_index = {}
GLOVE_DIR = r'..\Glove\glove.6B'
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    #print(values)
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [74]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [75]:
from keras.models import Sequential
from keras.layers import LSTM,Embedding,Dense,Dropout

# create the model
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(LSTM(300,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          12425300  
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               481200    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 301       
Total params: 12,906,801
Trainable params: 481,501
Non-trainable params: 12,425,300
_________________________________________________________________
None


In [76]:
model.fit(batch_size=128,epochs=3,validation_data=(x_valid,l_valid),x=x_train,y=l_train)

Train on 22626 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x25226ddf748>

In [77]:
model.weights

[<tf.Variable 'lstm_3/kernel:0' shape=(100, 1200) dtype=float32_ref>,
 <tf.Variable 'lstm_3/recurrent_kernel:0' shape=(300, 1200) dtype=float32_ref>,
 <tf.Variable 'lstm_3/bias:0' shape=(1200,) dtype=float32_ref>,
 <tf.Variable 'dense_3/kernel:0' shape=(300, 1) dtype=float32_ref>,
 <tf.Variable 'dense_3/bias:0' shape=(1,) dtype=float32_ref>,
 <tf.Variable 'embedding_3/embeddings:0' shape=(124253, 100) dtype=float32_ref>]

In [78]:
np.max(list(map(len,sequences)))

2322

In [79]:
predicted = model.predict(test_x,batch_size=128)
ans=np.rint(predicted).astype(np.int32)
import datetime
now = datetime.datetime.now()
df = pd.DataFrame()
df['id'] = test.id
df['sentiment'] = ans
df.to_csv('submit'+str(now.hour)+str(now.minute)+'.csv',index=False)

In [67]:
data

array([[   0,    0,    0, ...,   21,    1, 1598],
       [   0,    0,    0, ...,   26,   92, 5686],
       [   0,    0,    0, ..., 1305,    4, 5344],
       ..., 
       [   0,    0,    0, ...,  289,   64,  566],
       [   0,    0,    0, ...,   22,  112, 9811],
       [   0,    0,    0, ...,    5,   12,   27]])