In [1]:
import pickle as pkl
import numpy as np

In [2]:
input_dim = 300

In [3]:
with open('Train_lemmas.txt', 'r', encoding='utf-8') as t:
    train_lem = t.read().split('\n')
t.close()

In [4]:
with open('Val_lemmas.txt', 'r', encoding='utf-8') as t:
    val_lem = t.read().split('\n')
t.close()

In [5]:
with open('Test_lemmas.txt', 'r', encoding='utf-8') as t:
    test_lem = t.read().split('\n')
t.close()

In [6]:
train_lem_count = len(set(' '.join(train_lem).split()))
train_lem_count

121484

In [7]:
sum([len(a.split()) for a in train_lem])/len(train_lem)

38.99672146077137

In [8]:
with open('Train_twenty.pkl', 'rb') as tr:
    X_train_20 = pkl.load(tr)
tr.close()

In [9]:
with open('Val_twenty.pkl', 'rb') as val:
    X_val_20 = pkl.load(val)
val.close()

In [10]:
with open('Test_twenty.pkl', 'rb') as test:
    X_test_20 = pkl.load(test)
test.close()

In [11]:
with open('Train_labels.txt', 'r') as l:
    train_labels = l.read().split('\n')
l.close()
len(train_labels)

87844

In [12]:
with open('Val_labels.txt', 'r') as v:
    val_labels = v.read().split('\n')
v.close()
len(val_labels)

1000

In [13]:
with open('Test_labels.txt', 'r') as t:
    test_labels = t.read().split('\n')
t.close()
len(test_labels)

1000

In [14]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
enc.fit(train_labels)

LabelEncoder()

In [15]:
y_train_labels = enc.transform(train_labels)
y_train_labels.shape

(87844,)

In [16]:
y_val_labels = enc.transform(val_labels)
y_val_labels.shape

(1000,)

In [17]:
y_test_labels = enc.transform(test_labels)
y_test_labels.shape

(1000,)

In [19]:
import keras
from keras import Model, metrics
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Flatten, LSTM, Bidirectional, Input, Concatenate, Conv1D, MaxPooling1D
from keras.layers.advanced_activations import LeakyReLU
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
y_train = to_categorical(y_train_labels)
y_train.shape

(87844, 10)

In [21]:
y_val = to_categorical(y_val_labels)
y_val.shape

(1000, 10)

In [22]:
y_test = to_categorical(y_test_labels)
y_test.shape

(1000, 10)

In [23]:
num_words = 10000
#num_words = train_lem_count

In [24]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_lem)

In [25]:
X_train_300 = tokenizer.texts_to_sequences(train_lem)
X_val_300 = tokenizer.texts_to_sequences(val_lem)
X_test_300 = tokenizer.texts_to_sequences(test_lem)

In [26]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
embedding_size = 300

In [27]:
maxlen = 100

X_train_300 = pad_sequences(X_train_300, padding='post', maxlen=maxlen)
X_val_300 = pad_sequences(X_val_300, padding='post', maxlen=maxlen)
X_test_300 = pad_sequences(X_test_300, padding='post', maxlen=maxlen)

In [28]:
from sklearn.utils.class_weight import compute_class_weight

In [29]:
# Instantiate the label encoder
le = LabelEncoder()

# Fit the label encoder to our label series
le.fit(list(y_train_labels))

# Create integer based labels Series
y_integers = le.transform(list(y_train_labels))

In [30]:
class_weights = compute_class_weight('balanced', np.unique(y_train_labels), y_train_labels)
class_weights_dict = dict(zip(le.transform(list(le.classes_)), class_weights))

In [31]:
# Parameters
batch_size = 512
epochs = 100
num_classes = 10

In [32]:
inputA = Input(shape=(maxlen,))
inputB = Input(shape=(20,))

In [33]:
emb = Embedding(output_dim=embedding_size, input_dim=num_words, input_length=maxlen, trainable = True)(inputA)
x = Bidirectional(LSTM(200))(emb)
x = Dropout(0.2)(x)
x = Model(inputs = inputA, outputs = x)

In [34]:
y = Dense(20, activation="relu")(inputB)
y = Dense(10, activation="relu")(y)
y = Model(inputs=inputB, outputs = y)

In [35]:
combined = Concatenate()([x.output, y.output])

In [36]:
z = Dense(200, activation='relu')(combined)
z = Dense(100, activation='relu')(combined)
z = Dense(num_classes, activation='softmax')(z)

In [37]:
model = Model(inputs =[x.input, y.input], outputs = z)

In [38]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=[metrics.categorical_accuracy])
print(model.summary())

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     3000000     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 400)          801600      embedding_1[0][0]                
____________________________________________________________________________________________

In [39]:
earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta = 0.0001, patience=5, verbose=1, mode='auto')
callbacks_list = [earlystop]

In [40]:
model.fit([X_train_300, X_train_20], y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list,
          validation_data=([X_val_300, X_val_20], y_val), class_weight = class_weights_dict, verbose=1)

Train on 87844 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping


<keras.callbacks.History at 0x7f7defcf47f0>

In [41]:
scores = model.evaluate([X_val_300, X_val_20], y_val, verbose=0)
print('Val loss:', scores[0])
print('Val accuracy:', scores[1])

Val loss: 0.9091554753780365
Val accuracy: 0.845


In [42]:
y_pred = model.predict([X_val_300, X_val_20])
y_pred_classes = [[1 if c == max(a) else 0 for c in a] for a in y_pred]
y_pred_labels = [a.index(1) for a in y_pred_classes]
confusion_matrix(y_pred_labels, y_val_labels)
#y_val_labels

array([[74,  0,  0,  3,  2,  0,  2,  3,  1,  2],
       [ 0, 78,  3,  1,  0,  0,  1,  0,  2,  2],
       [ 7,  6, 96,  3,  1,  0,  0,  2,  3,  2],
       [ 6,  2,  0, 86,  0,  2,  3,  2,  0,  0],
       [ 1,  3,  0,  2, 91,  0,  0,  2, 16,  6],
       [ 1,  2,  0,  1,  0, 94,  0,  0,  2,  1],
       [ 6,  2,  1,  2,  1,  3, 92,  0,  4,  3],
       [ 3,  2,  0,  1,  1,  0,  1, 90,  2,  0],
       [ 0,  2,  0,  1,  0,  0,  0,  0, 66,  6],
       [ 2,  3,  0,  0,  4,  1,  1,  1,  4, 78]])

In [43]:
test_scores = model.evaluate([X_test_300, X_test_20], y_test, verbose=0)
print('Test loss:', test_scores[0])
print('Test accuracy:', test_scores[1])

Test loss: 0.9949663322893902
Test accuracy: 0.837


In [44]:
y_pred = model.predict([X_test_300, X_test_20])
y_pred_classes = [[1 if c == max(a) else 0 for c in a] for a in y_pred]
y_pred_labels = [a.index(1) for a in y_pred_classes]
confusion_matrix(y_pred_labels, y_test_labels)

array([[ 83,   5,   3,   0,   1,   0,   1,   2,   0,   0],
       [  1,  75,   1,   0,   0,   0,   0,   1,   5,   2],
       [  3,   4,  88,   7,   1,   0,   2,   1,   4,   1],
       [  1,   4,   2,  81,   0,   0,   0,   4,   0,   1],
       [  2,   2,   0,   0,  95,   0,   0,   2,  16,   3],
       [  1,   0,   0,   2,   0, 100,   4,   1,   2,   0],
       [  4,   6,   2,   7,   0,   0,  91,   2,   4,   2],
       [  4,   2,   3,   2,   1,   0,   1,  85,   6,   4],
       [  0,   2,   0,   0,   2,   0,   1,   0,  56,   4],
       [  1,   0,   1,   1,   0,   0,   0,   2,   7,  83]])

In [45]:
print(classification_report(y_pred_labels, y_test_labels))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85        95
           1       0.75      0.88      0.81        85
           2       0.88      0.79      0.83       111
           3       0.81      0.87      0.84        93
           4       0.95      0.79      0.86       120
           5       1.00      0.91      0.95       110
           6       0.91      0.77      0.83       118
           7       0.85      0.79      0.82       108
           8       0.56      0.86      0.68        65
           9       0.83      0.87      0.85        95

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.83      1000
weighted avg       0.85      0.84      0.84      1000



In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot cnn.png', show_shapes=True, show_layer_names=True)