### Preprocessing

In [1]:
import pickle

# Load the label list from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so these files must come from a trusted source.
with open('embedding_labels.pkl', 'rb') as lf:
    labels = pickle.load(lf)

# Load the posts from a second pickle file.
# presumably posts[i] is the text that labels[i] belongs to — TODO confirm
with open('embedding_posts.pkl', 'rb') as pf:
    posts = pickle.load(pf)

In [2]:
# Display how many labels were loaded.
len(labels)

388065

In [3]:
# Tokenizing data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# Hyper-parameters shared by the preprocessing and model cells.
max_features = 10000  # first argument (input_dim) of the Embedding layer
max_len = 50  # length every token sequence is padded to by pad_sequences
batch_size = 32  # NOTE(review): the model.fit call in this notebook passes batch_size=256, not this value
train_samples = 329885 # About 85% of the 388065 samples; rows past this index form the test set
max_words = 10000  # passed to Tokenizer as num_words; currently equal to max_features

Using TensorFlow backend.


In [4]:
# Fit the vocabulary on the raw posts; num_words caps what the tokenizer
# emits when converting texts to sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(posts)

# word_index holds every token seen during fitting, so its size can be far
# larger than max_words.
word_index = tokenizer.word_index

# Turn each post into its list of integer token ids.
sequences = tokenizer.texts_to_sequences(posts)

print('Found %s unique tokens' % (len(word_index),))

Found 116166 unique tokens


In [5]:
# Bring every token-id sequence to exactly max_len entries.
data = sequence.pad_sequences(sequences, maxlen=max_len)

# Binary target: the leading character of each label selects the class id.
label_dictionary = {'I': 0, 'E': 1}

# NOTE: this rebinds `labels` to a list of ints, so running the cell a second
# time fails (ints cannot be sliced) unless the labels are reloaded first.
labels = [label_dictionary[raw_label[:1]] for raw_label in labels]

In [6]:
import numpy as np
# Convert the list of integer class ids to a NumPy array so it can be
# index-shuffled and sliced alongside `data`.
labels = np.asarray(labels)

In [7]:
# Sanity check: both arrays should report the same number of rows.
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (388065, 50)
Shape of label tensor: (388065,)


In [8]:
# Shuffle posts and labels with one shared permutation so rows stay aligned.
# A dedicated, seeded generator makes the train/test split reproducible across
# kernel restarts without touching NumPy's global random state.
# NOTE: re-running this cell permutes the already-shuffled arrays again.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
indices = shuffle_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [9]:
# Rows before the cut-off index train the model; everything from the cut-off
# onwards is held out as the test set.
x_train, x_test = data[:train_samples], data[train_samples:]
y_train, y_test = labels[:train_samples], labels[train_samples:]

In [10]:
from collections import Counter
# Class balance over the full label array (train and test rows together).
Counter(labels)

Counter({1: 90733, 0: 297332})

In [11]:
# Per-class loss weights keyed by encoded class id (0 = 'I', 1 = 'E').
# The label counts are 297332 for class 0 and 90733 for class 1, a ratio of
# about 3.28; presumably 3.27 was chosen to offset that imbalance.
class_weights = {0: 1.0, 1: 3.27}

In [None]:
from keras import models
from keras import layers
from keras.regularizers import l1_l2
from keras.layers import Embedding, Dropout, LSTM, Dense

# Width of each learned word vector. Embedding's second positional argument is
# its output dimension, which is unrelated to the padded sequence length, so it
# gets its own name instead of reusing max_len (both happen to be 50 today).
embedding_dim = 50

# Binary classifier: token ids -> embeddings -> two stacked LSTMs -> sigmoid.
model = models.Sequential()
model.add(Embedding(max_features, embedding_dim))
# return_sequences=True hands the full sequence of hidden states to the next LSTM.
model.add(LSTM(16, return_sequences=True))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

# Disabled: loading fixed pre-trained embedding weights.
# NOTE(review): embedding_matrix is not defined in the visible cells — confirm
# where it comes from before re-enabling.
# model.layers[0].set_weights([embedding_matrix])
# model.layers[0].trainable = False

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# class_weight re-weights the loss per class; validation_split holds out 15%
# of the training rows for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=40,
    batch_size=256,
    class_weight=class_weights,
    validation_split=0.15,
)

Train on 280402 samples, validate on 49483 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40