In [2]:
import pandas as pd

In [3]:
import os

# Load the labeled training reviews and the unlabeled test reviews (both are
# tab-separated). The paths are built with os.path.join so the notebook runs
# on any OS; the previous hard-coded backslashes only resolved on Windows.
train = pd.read_csv(os.path.join("train", "labeledTrainData.tsv"), sep='\t')
test = pd.read_csv(os.path.join("test", "testData.tsv"), sep='\t')

In [4]:
# Preview the first rows of the training frame (id, sentiment, review).
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
# Vocabulary size limit handed to the Tokenizer constructor.
MAX_NB_WORDS = 20000

In [7]:
# Build the Keras tokenizer, capping the vocabulary at MAX_NB_WORDS.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [9]:
import numpy as np

# One array holding every review: training reviews first, then test reviews.
# Later cells split the padded matrix back apart in this same order.
texts = np.concatenate((train["review"].values, test["review"].values))

In [11]:
# Sanity check: number of reviews in the combined train + test array.
texts.shape

(50000,)

In [12]:
# Learn the word -> index vocabulary from all reviews (train and test).
tokenizer.fit_on_texts(texts)

In [13]:
# Convert every review into a list of integer word indices using the fitted
# tokenizer.
sequences = tokenizer.texts_to_sequences(texts)

In [14]:
# Keep the tokenizer's word -> index mapping; it is reused later to build the
# embedding matrix.
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print('Found %s unique tokens.' % n_unique_tokens)

Found 124252 unique tokens.


In [20]:
# Fixed length of every model input: each index sequence is brought to this
# many entries by pad_sequences.
MAX_SEQUENCE_LENGTH = 1000
data = pad_sequences(sequences=sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
# Display the padded index matrix (numpy abbreviates the repr).
data

array([[   0,    0,    0, ...,   21,    1, 1598],
       [   0,    0,    0, ...,   26,   92, 5686],
       [   0,    0,    0, ..., 1305,    4, 5344],
       ..., 
       [   0,    0,    0, ...,  289,   64,  566],
       [   0,    0,    0, ...,   22,  112, 9811],
       [   0,    0,    0, ...,    5,   12,   27]])

In [22]:
# Sanity check: (number of reviews, MAX_SEQUENCE_LENGTH).
data.shape

(50000, 1000)

In [79]:
# `data` was built from the training reviews followed by the test reviews, so
# the first len(train) rows are the training inputs. Taking the split point
# from the frame replaces the hard-coded 25000 and stays correct if the
# dataset size changes.
n_train = len(train)
train_x = data[:n_train]
# Indicator columns, one per distinct sentiment value (labels cast to str).
train_l = train.sentiment.astype(str).str.get_dummies()

In [51]:
# Rows after the training block belong to the test reviews. The split point is
# len(train) rather than a hard-coded 25000, so it follows the dataset size.
test_x = data[len(train):]

In [52]:
# Spot check: the integer index the tokenizer assigned to the word 'beauty'.
tokenizer.word_index['beauty']

918

In [53]:
# Show the raw text of the first test review.
test.review.iloc[0]

"Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."

In [57]:
# Keep the test-set ids; they become the 'id' column of the submission file.
test_ids = test["id"]

In [62]:
import os

# Parse the pre-trained GloVe file into {word: float32 vector}.
# Each line is "<word> <v1> <v2> ... <vN>" separated by whitespace.
embeddings_index = {}
# os.path.join keeps the location portable (same result as before on Windows).
GLOVE_DIR = os.path.join('..', 'Glove', 'glove.6B')
# The context manager closes the file even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [64]:
# Width of each embedding vector; matches the 100d GloVe file loaded earlier.
EMBEDDING_DIM = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# There is one extra row because the matrix is indexed directly by the values
# in word_index, and the rows start out as zeros.
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word: leave its row all-zeros.
        continue
    embedding_matrix[row] = vector

In [65]:
from keras.layers import Embedding

# Embedding layer initialised from the GloVe matrix. trainable=False keeps the
# pre-trained vectors fixed while the model is fitted.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [107]:
# Training labels as a numpy array. `.values` replaces Series.as_matrix(),
# which is deprecated and no longer exists in pandas >= 1.0; it is also the
# accessor this notebook already uses for the review text.
y_train = train.sentiment.values

In [109]:
from keras.layers import Input,Dense,Flatten,GlobalMaxPooling1D
from keras.models import Model
from keras.layers.convolutional import Conv1D,MaxPooling1D

# 1D CNN over the frozen GloVe embeddings:
#   token ids -> embedding -> 3 x (Conv1D, pooling) -> Dense -> sigmoid.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# Global max pooling over whatever time steps remain. This replaces
# MaxPooling1D(35) + Flatten(): 35 was exactly the number of steps left for
# MAX_SEQUENCE_LENGTH = 1000, so the output is the same (batch, 128) tensor,
# but the pooling size is no longer a magic number tied to that length.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# Single sigmoid unit: one score in (0, 1) per review for the binary label.
preds = Dense(1, activation='sigmoid')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(train_x, y_train,
          epochs=2, batch_size=128)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x246705cbc18>

In [110]:
# Score the test reviews in batches of 128; the model's output layer is a
# single sigmoid unit, so each row is one score per review.
predicted = model.predict(test_x,batch_size=128)

In [115]:
# Round each predicted score to the nearest integer and store it as int32,
# giving the 0/1 labels written to the submission file.
ans = np.rint(predicted).astype(np.int32)

In [117]:
import datetime

# Write the submission file: one row per test review with its id and the
# predicted 0/1 sentiment.
now = datetime.datetime.now()
df = pd.DataFrame(
    {
        'id': test_ids,
        # ans has one column per row; flatten it so the column is assigned a
        # plain 1-D array.
        'sentiment': ans.ravel(),
    },
    columns=['id', 'sentiment'],
)
# Zero-padded HHMM keeps file names unambiguous and sortable. Concatenating
# str(hour) + str(minute) mapped e.g. 01:23 and 12:03 to the same name.
df.to_csv('submit' + now.strftime('%H%M') + '.csv', index=False)

In [116]:
# Display the rounded integer predictions.
ans

array([[1],
       [0],
       [0],
       ..., 
       [0],
       [1],
       [1]])