In [4]:
from __future__ import print_function

import pandas as pd

from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout
from keras.layers import LSTM, GlobalAveragePooling1D
from keras.datasets import imdb
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
# Load the train and test splits from CSV files in the working directory
# (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [6]:
# All documents from both splits, train rows first, as one flat list.
texts = list(train_df['text']) + list(test_df['text'])
len(texts)

4649

In [7]:
# Split every document on whitespace and let gensim build the token mapping.
tokenized_docs = [doc.split() for doc in texts]
dct = Dictionary(documents=tokenized_docs)

In [8]:
def doc2idx(text):
    """Convert a whitespace-separated string into dictionary indices.

    The string is split on whitespace and the resulting tokens are looked up
    in the module-level gensim dictionary ``dct``.
    """
    tokens = text.split()
    return dct.doc2idx(tokens)

In [9]:
def yes_no_int(text):
    """Return 1 when ``text`` equals the string "Yes", otherwise 0.

    The comparison is exact (case-sensitive); any other value maps to 0.
    """
    return 1 if text == "Yes" else 0

In [10]:
# Derive an 'idx' column (token indices of the 'text' column) on both splits.
for frame in (train_df, test_df):
    frame['idx'] = frame['text'].apply(doc2idx)

In [11]:
# Encode the "Yes"/other labels as 1/0 on both splits.
# The dtype guard keeps this cell safe to re-run: pushing already-encoded
# integers through yes_no_int again would turn every label into 0, because
# 1 != "Yes".
for frame in (train_df, test_df):
    if not pd.api.types.is_numeric_dtype(frame['labels']):
        frame['labels'] = frame['labels'].apply(yes_no_int)

In [12]:
# Longest index sequence in the test split.
max(len(seq) for seq in test_df['idx'])

120

In [13]:
# Longest index sequence in the training split.
max(len(seq) for seq in train_df['idx'])

216

In [14]:
# Number of entries in the dictionary.
vocab_size = len(dct.keys())
vocab_size

6892

In [12]:
# pad_sequences returns ONE 2-D array of shape (n_rows, 216). A single
# DataFrame column can only hold 1-D data, so assigning that array directly
# either fails or does not keep the full rows (pandas-version dependent).
# The padded rows are therefore written back as a list holding one
# fixed-length sequence per row. Consumers that need a dense matrix have to
# stack the rows again.
# 216 is the longest training sequence; 6892 equals the dictionary size and is
# used as the padding id (presumably one past the largest real id - TODO confirm).
# Re-running the cell is harmless: rows that are already 216 long are unchanged.
train_df['idx'] = pad_sequences(train_df['idx'], maxlen=216, padding='post', value=6892).tolist()
test_df['idx'] = pad_sequences(test_df['idx'], maxlen=216, padding='post', value=6892).tolist()

In [None]:
# Keras needs dense 2-D integer arrays of shape (n_samples, 216), not a pandas
# column of per-row sequences, so the model inputs are built explicitly here.
# Padding rows that are already 216 long leaves them unchanged, and a column
# that does not contain sequences fails loudly instead of training on garbage.
x_train_idx = pad_sequences(train_df['idx'], maxlen=216, padding='post', value=6892)
x_test_idx = pad_sequences(test_df['idx'], maxlen=216, padding='post', value=6892)
y_train = train_df['labels'].values
y_test = test_df['labels'].values

model = Sequential()
# 6893 embedding rows: the 6892 dictionary entries plus the padding id 6892.
model.add(Embedding(6893, 1024))
model.add(LSTM(1024, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: binary Yes/No prediction.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Checkpointer: the best model so far is saved into the "weights.hdf5" file.
# NOTE(review): the test split doubles as validation data, so checkpoint
# selection is driven by test-set loss - confirm this is intended.
checkpointer = ModelCheckpoint(filepath="weights.hdf5", verbose=1, save_best_only=True)

print('Training...')
model.fit(x_train_idx, y_train,
          batch_size=64,
          epochs=20,
          validation_data=(x_test_idx, y_test),
          callbacks=[checkpointer])

Training...
Train on 3657 samples, validate on 992 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.52960, saving model to weights.hdf5
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.52960
Epoch 3/20

KeyboardInterrupt: 

In [16]:
# model.evaluate needs a dense (n_samples, 216) integer array rather than a
# pandas column of sequences; padding here keeps the cell independent of how
# the 'idx' column is stored (rows already 216 long are left unchanged).
x_test_idx = pad_sequences(test_df['idx'], maxlen=216, padding='post', value=6892)

# NOTE(review): this scores the in-memory model, not the best checkpoint
# written to "weights.hdf5" - confirm which one should be reported.
score, acc = model.evaluate(x_test_idx, test_df['labels'].values, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.5901284525471349
Test accuracy: 0.7399193548387096


### Try 2: TF-IDF features fed to an LSTM

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Terms that occur in fewer than this many documents are left out of the vocabulary.
MIN_DOC_FREQ = 4
vectorizer = TfidfVectorizer(min_df=MIN_DOC_FREQ)

In [15]:
# Every document from both splits, train rows first.
all_texts = list(train_df['text']) + list(test_df['text'])

In [16]:
# Fit the TF-IDF vectorizer on the combined train + test texts.
# NOTE(review): fitting on the test texts lets test-set statistics shape the
# features - confirm this is intended.
vectorizer.fit(all_texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=4,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
# Vectorize each split with the fitted TF-IDF model and densify the result.
x_train, x_test = (
    vectorizer.transform(frame['text']).toarray()
    for frame in (train_df, test_df)
)

In [18]:
# Add a trailing axis of length 1: (n_samples, n_terms) -> (n_samples, n_terms, 1),
# the 3-D layout the LSTM input expects.
# An explicit reshape is used instead of indexing with None so the cell is
# idempotent: re-running it on an already 3-D array leaves the shape as is
# rather than appending yet another axis.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

In [19]:
# Sanity check of the test tensor shape; three axes are expected once the
# trailing axis has been added.
x_test.shape

(992, 1782, 1)

In [None]:
# One LSTM layer over the TF-IDF tensor followed by a single sigmoid unit for
# the binary prediction.
model = Sequential([
    LSTM(units=1782, input_shape=x_train.shape[1:]),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Checkpoint callback: only the best model so far is written to "weights.hdf5".
checkpointer = ModelCheckpoint(filepath="weights.hdf5", save_best_only=True, verbose=1)

print('Training...')
fit_options = dict(
    batch_size=8,
    epochs=20,
    validation_data=(x_test, test_df['labels']),
    callbacks=[checkpointer],
)
model.fit(x_train, train_df['labels'], **fit_options)

Training...
Train on 3657 samples, validate on 992 samples
Epoch 1/20
   8/3657 [..............................] - ETA: 10:06:42 - loss: 0.6931 - acc: 0.7500

In [90]:
# Score the current model on the test split and report loss and accuracy.
score, acc = model.evaluate(x_test, test_df['labels'], batch_size=32)
for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)


Test score: 10.976460549139222
Test accuracy: 0.31149193548387094


### Try 3