In [46]:
import pickle as pkl
import numpy as np

In [47]:
input_dim = 300

In [48]:
with open('Train_lemmas.txt', 'r', encoding='utf-8') as t:
    train_lem = t.read().split('\n')
t.close()

In [49]:
with open('Val_lemmas.txt', 'r', encoding='utf-8') as t:
    val_lem = t.read().split('\n')
t.close()

In [50]:
with open('Test_lemmas.txt', 'r', encoding='utf-8') as t:
    test_lem = t.read().split('\n')
t.close()

In [51]:
train_lem_count = len(set(' '.join(train_lem).split()))
train_lem_count

121484

In [52]:
sum([len(a.split()) for a in train_lem])/len(train_lem)

38.99672146077137

In [53]:
with open('Train_twenty.pkl', 'rb') as tr:
    X_train_20 = pkl.load(tr)
tr.close()

In [54]:
with open('Val_twenty.pkl', 'rb') as val:
    X_val_20 = pkl.load(val)
val.close()

In [55]:
with open('Test_twenty.pkl', 'rb') as test:
    X_test_20 = pkl.load(test)
test.close()

In [56]:
with open('Train_labels.txt', 'r') as l:
    train_labels = l.read().split('\n')
l.close()
len(train_labels)

87844

In [57]:
with open('Val_labels.txt', 'r') as v:
    val_labels = v.read().split('\n')
v.close()
len(val_labels)

1000

In [58]:
with open('Test_labels.txt', 'r') as t:
    test_labels = t.read().split('\n')
t.close()
len(test_labels)

1000

In [59]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
enc.fit(train_labels)

LabelEncoder()

In [60]:
y_train_labels = enc.transform(train_labels)
y_train_labels.shape

(87844,)

In [61]:
y_val_labels = enc.transform(val_labels)
y_val_labels.shape

(1000,)

In [62]:
y_test_labels = enc.transform(test_labels)
y_test_labels.shape

(1000,)

In [63]:
import keras
from keras import Model, metrics
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Flatten, LSTM, Bidirectional, Input, Concatenate, Conv1D, MaxPooling1D
from keras.layers.advanced_activations import LeakyReLU
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [64]:
y_train = to_categorical(y_train_labels)
y_train.shape

(87844, 10)

In [65]:
y_val = to_categorical(y_val_labels)
y_val.shape

(1000, 10)

In [66]:
y_test = to_categorical(y_test_labels)
y_test.shape

(1000, 10)

In [67]:
num_words = 10000
#num_words = train_lem_count

In [68]:
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_lem)

In [69]:
X_train_300 = tokenizer.texts_to_sequences(train_lem)
X_val_300 = tokenizer.texts_to_sequences(val_lem)
X_test_300 = tokenizer.texts_to_sequences(test_lem)

In [70]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
embedding_size = 300

In [71]:
maxlen = 100

X_train_300 = pad_sequences(X_train_300, padding='post', maxlen=maxlen)
X_val_300 = pad_sequences(X_val_300, padding='post', maxlen=maxlen)
X_test_300 = pad_sequences(X_test_300, padding='post', maxlen=maxlen)

In [72]:
from sklearn.utils.class_weight import compute_class_weight

In [73]:
# Instantiate the label encoder
le = LabelEncoder()

# Fit the label encoder to our label series
le.fit(list(y_train_labels))

# Create integer based labels Series
y_integers = le.transform(list(y_train_labels))

In [74]:
class_weights = compute_class_weight('balanced', np.unique(y_train_labels), y_train_labels)
class_weights_dict = dict(zip(le.transform(list(le.classes_)), class_weights))

In [75]:
# Parameters
batch_size = 512
epochs = 100
num_classes = 10

In [76]:
inputA = Input(shape=(maxlen,))
inputB = Input(shape=(20,))

In [78]:
emb = Embedding(output_dim=embedding_size, input_dim=num_words, input_length=maxlen, trainable = True)(inputA)
x = Conv1D(26, 10, activation='relu')(emb)
x = MaxPooling1D(2)(x)
x = LSTM(200)(x)
x = Dropout(0.2)(x)
x = Model(inputs = inputA, outputs = x)

In [79]:
y = Dense(20, activation="relu")(inputB)
y = Dense(10, activation="relu")(y)
y = Model(inputs=inputB, outputs = y)

In [80]:
combined = Concatenate()([x.output, y.output])

In [81]:
z = Dense(200, activation='relu')(combined)
z = Dense(100, activation='relu')(combined)
z = Dense(num_classes, activation='softmax')(z)

In [82]:
model = Model(inputs =[x.input, y.input], outputs = z)

In [83]:
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=[metrics.categorical_accuracy])
print(model.summary())

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 100, 300)     3000000     input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 91, 26)       78026       embedding_6[0][0]                
__________________________________________________________________________________________________
max_pooling1d_2 (MaxPooling1D)  (None, 45, 26)       0           conv1d_4[0][0]                   
____________________________________________________________________________________________

In [84]:
earlystop = EarlyStopping(monitor='val_categorical_accuracy', min_delta = 0.0001, patience=5, verbose=1, mode='auto')
callbacks_list = [earlystop]

In [85]:
model.fit([X_train_300, X_train_20], y_train, batch_size=batch_size, epochs=epochs, callbacks=callbacks_list,
          validation_data=([X_val_300, X_val_20], y_val), class_weight = class_weights_dict, verbose=1)

Train on 87844 samples, validate on 1000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


<keras.callbacks.History at 0x7fdfcc5a2dd8>

In [86]:
scores = model.evaluate([X_val_300, X_val_20], y_val, verbose=0)
print('Val loss:', scores[0])
print('Val accuracy:', scores[1])

Val loss: 0.9982561945915223
Val accuracy: 0.801


In [87]:
y_pred = model.predict([X_val_300, X_val_20])
y_pred_classes = [[1 if c == max(a) else 0 for c in a] for a in y_pred]
y_pred_labels = [a.index(1) for a in y_pred_classes]
confusion_matrix(y_pred_labels, y_val_labels)
#y_val_labels

array([[67,  1,  1,  3,  3,  0,  2,  2,  4,  1],
       [ 0, 76,  3,  1,  1,  1,  1,  0,  3,  5],
       [10, 15, 90,  5,  1,  1,  2,  4,  3,  2],
       [10,  1,  2, 83,  1,  0,  2,  3,  7,  0],
       [ 1,  0,  0,  1, 87,  0,  0,  0,  4,  2],
       [ 3,  0,  0,  1,  1, 90,  1,  2,  2,  3],
       [ 3,  1,  0,  2,  0,  5, 85,  3,  2,  1],
       [ 5,  1,  2,  4,  3,  2,  6, 86,  7,  2],
       [ 0,  3,  2,  0,  0,  0,  0,  0, 58,  5],
       [ 1,  2,  0,  0,  3,  1,  1,  0, 10, 79]])

In [88]:
test_scores = model.evaluate([X_test_300, X_test_20], y_test, verbose=0)
print('Test loss:', test_scores[0])
print('Test accuracy:', test_scores[1])

Test loss: 1.0426510825157165
Test accuracy: 0.799


In [89]:
y_pred = model.predict([X_test_300, X_test_20])
y_pred_classes = [[1 if c == max(a) else 0 for c in a] for a in y_pred]
y_pred_labels = [a.index(1) for a in y_pred_classes]
confusion_matrix(y_pred_labels, y_test_labels)

array([[78,  4,  2,  4,  1,  0,  3,  4,  4,  2],
       [ 1, 66,  0,  3,  0,  0,  1,  0, 10,  7],
       [ 4, 11, 91,  2,  2,  1,  2,  3,  3,  1],
       [ 3,  8,  1, 78,  2,  1,  0,  3,  8,  3],
       [ 1,  0,  0,  1, 88,  0,  0,  0,  9,  1],
       [ 0,  0,  0,  2,  0, 93,  1,  0,  0,  0],
       [ 6,  0,  1,  4,  2,  3, 87,  2,  0,  1],
       [ 7,  4,  5,  6,  4,  1,  6, 85,  5,  0],
       [ 0,  4,  0,  0,  1,  0,  0,  2, 52,  4],
       [ 0,  3,  0,  0,  0,  1,  0,  1,  9, 81]])

In [90]:
print(classification_report(y_pred_labels, y_test_labels))

              precision    recall  f1-score   support

           0       0.78      0.76      0.77       102
           1       0.66      0.75      0.70        88
           2       0.91      0.76      0.83       120
           3       0.78      0.73      0.75       107
           4       0.88      0.88      0.88       100
           5       0.93      0.97      0.95        96
           6       0.87      0.82      0.84       106
           7       0.85      0.69      0.76       123
           8       0.52      0.83      0.64        63
           9       0.81      0.85      0.83        95

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.81      0.80      0.80      1000



In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot cnn.png', show_shapes=True, show_layer_names=True)