In [1]:
import numpy as np
from numpy import load
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='theano' # Using theano as backend instead of tensorflow
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Input
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
import matplotlib.pyplot as plt
from keras.models import load_model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

plt.switch_backend('agg')
%matplotlib inline

Using Theano backend.


In [2]:
path_to_dataset = 'ILDC_multi/ILDC_multi.csv'

# path to transformer generated chunk embeddings eg. XLNet etc.
path_to_transformer_chunk_embeddings_train = 'emb/XLNet_train.npy'
path_to_transformer_chunk_embeddings_dev = 'emb/XLNet_dev.npy'
path_to_transformer_chunk_embeddings_test = 'emb/XLNet_test.npy'


In [3]:
dataset = pd.read_csv(path_to_dataset)

In [4]:
x_train0 = load(path_to_transformer_chunk_embeddings_train, allow_pickle = True)
x_dev0 = load(path_to_transformer_chunk_embeddings_dev, allow_pickle= True)
x_test0 = load(path_to_transformer_chunk_embeddings_test, allow_pickle= True)

In [5]:
# loading the corresponding label for each case in dataset
dev = dataset.loc[dataset['split'] == 'dev']
train = dataset.loc[dataset['split'] == 'train']
test = dataset.loc[dataset['split'] == 'test']

y_train0 = []
for i in range(train.shape[0]):
    y_train0.append(train.loc[i,'label'])

y_dev0 = []
for i in range(dev.shape[0]):
    y_dev0.append(dev.loc[i+32305,'label'])

y_test0 = []
for i in range(test.shape[0]):
    y_test0.append(test.loc[i+33299,'label'])

In [29]:
print(len(y_train0))
print(dev.shape)
1008*32+31

5082
(994, 4)


32287

In [6]:
# Setting the maximum sequnce length and embedding dimension
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 768

In [7]:
# padding the vectors to maximum sequence length
for i in range(x_train0.shape[0]):
    padding_vector = np.zeros((MAX_SEQUENCE_LENGTH - x_train0[i].shape[0], EMBEDDING_DIM))
    x_train0[i] = np.concatenate((x_train0[i],padding_vector), axis = 0)
    
for i in range(x_dev0.shape[0]):
    padding_vector = np.zeros((MAX_SEQUENCE_LENGTH - x_dev0[i].shape[0], EMBEDDING_DIM))
    x_dev0[i] = np.concatenate((x_dev0[i],padding_vector),axis = 0)
    
for i in range(x_test0.shape[0]):
    padding_vector = np.zeros((MAX_SEQUENCE_LENGTH - x_test0[i].shape[0], EMBEDDING_DIM))
    x_test0[i] = np.concatenate((x_test0[i],padding_vector), axis = 0)

In [8]:
# Using Input layer to convert into required tensor shape
text_input = Input(shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), dtype='float32', name='text')
# Using 3 Conv1D layers followed by max pooling layers
l_cov1= Conv1D(256, 2, activation='relu')(text_input)
l_pool1 = MaxPooling1D(2)(l_cov1)
l_cov2 = Conv1D(128, 2, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(2)(l_cov2)
l_cov3 = Conv1D(128, 2, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(4)(l_cov3)  # global max pooling
# Using the flatten layer to convert into 1D tensor
l_flat = Flatten()(l_pool3)
# passing the output embeddings through 2 dense layers
l_dense = Dense(128, activation='relu')(l_flat)
l_dense1 = Dense(32, activation='relu')(l_dense)
# Using sigmoid classifier
preds = Dense(1, activation='sigmoid')(l_dense1)

model = Model(text_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()

Simplified convolutional neural network
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            (None, 25, 768)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 24, 256)           393472    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 11, 128)           65664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 5, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 4, 128)            32896     
_________________________________________________________________
max_pooling1d_3 (Ma

In [12]:
num_sequences = len(x_train0)
batch_size = 32
batches_per_epoch =  int(num_sequences/batch_size)
num_features= 768
def train_generator(): # function to generate batches of corresponding batch size
    x_list= x_train0
    y_list =  y_train0
    # Generate batches
    while True:
        for b in range(batches_per_epoch):
            longest_index = (b + 1) * batch_size - 1
            timesteps = len(max(x_train0[:(b + 1) * batch_size][-batch_size:], key=len))
            x_train = np.full((batch_size, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size,  1))
            for i in range(batch_size):
                li = b * batch_size + i
                x_train[i, 0:len(x_list[li]), :] = x_list[li]
                y_train[i] = y_list[li]
            yield x_train, y_train

In [9]:
num_sequences_val = len(x_dev0)
batch_size_val = 32
batches_per_epoch_val = int(num_sequences_val/batch_size_val)
num_features= 768
def val_generator(): # function to generate batches of corresponding batch size
    x_list= x_dev0
    y_list =  y_dev0
    # Generate batches
    while True:
        for b in range(batches_per_epoch_val):
            longest_index = (b + 1) * batch_size_val - 1
            timesteps = len(max(x_dev0[:(b + 1) * batch_size_val][-batch_size_val:], key=len))
            x_train = np.full((batch_size_val, timesteps, num_features), 0)
            y_train = np.zeros((batch_size_val,  1))
            for i in range(batch_size_val):
                li = b * batch_size_val + i
                x_train[i, 0:len(x_list[li]), :] = x_list[li]
                y_train[i] = y_list[li]
            yield x_train, y_train

In [10]:
num_features= 768
def test_generator(): # function to generate batches of corresponding batch size
    x_list= x_test0
    y_list =  y_test0
    # Generate batches
    while True:
        for b in range(batches_per_epoch_test):
            if(b == batches_per_epoch_test-1): # An extra if else statement just to manage the last batch as it's size might not be equal to batch size 
                longest_index = num_sequences_test - 1
                timesteps = len(max(x_test0[:longest_index + 1][-batch_size_test:], key=len))
                x_train = np.full((longest_index - b*batch_size_test, timesteps, num_features), -99.)
                y_train = np.zeros((longest_index - b*batch_size_test,  1))
                for i in range(longest_index - b*batch_size_test):
                    li = b * batch_size_test + i
                    x_train[i, 0:len(x_list[li]), :] = x_list[li]
                    y_train[i] = y_list[li]
            else:
                longest_index = (b + 1) * batch_size_test - 1
                timesteps = len(max(x_test0[:(b + 1) * batch_size_test][-batch_size_test:], key=len))
                x_train = np.full((batch_size_test, timesteps, num_features), -99.)
                y_train = np.zeros((batch_size_test,  1))
                for i in range(batch_size_test):
                    li = b * batch_size_test + i
                    x_train[i, 0:len(x_list[li]), :] = x_list[li]
                    y_train[i] = y_list[li]
            yield x_train, y_train

In [13]:
# Setting the callback and training the model
call_reduce = ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=2, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

model.fit_generator(train_generator(), steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_generator(), validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

Epoch 1/10
Epoch 2/10
Epoch 3/10

Epoch 00003: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.
Epoch 4/10
Epoch 5/10

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0009025000152178108.
Epoch 6/10
Epoch 7/10

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0008573750033974647.
Epoch 8/10
Epoch 9/10

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0008145062311086804.
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fa793b70390>

In [14]:
num_sequences_test = len(x_test0)
batch_size_test = 32
batches_per_epoch_test = int(num_sequences_test/batch_size_test) + 1
num_features= 768
# evaluating on the test data
model.evaluate_generator(test_generator(), steps= batches_per_epoch_test)

[0.5710874199867249, 0.7077836394309998]

In [15]:
# defining a function which calculates various metrics such as micro and macro precision, accuracy and f1
def metrics_calculator(preds, test_labels):
    cm = confusion_matrix(test_labels, preds)
    TP = []
    FP = []
    FN = []
    for i in range(0,2):
        summ = 0
        for j in range(0,2):
            if(i!=j):
                summ=summ+cm[i][j]

        FN.append(summ)
    for i in range(0,2):
        summ = 0
        for j in range(0,2):
            if(i!=j):
                summ=summ+cm[j][i]

        FP.append(summ)
    for i in range(0,2):
        TP.append(cm[i][i])
    precision = []
    recall = []
    for i in range(0,2):
        precision.append(TP[i]/(TP[i] + FP[i]))
        recall.append(TP[i]/(TP[i] + FN[i]))

    macro_precision = sum(precision)/2
    macro_recall = sum(recall)/2
    micro_precision = sum(TP)/(sum(TP) + sum(FP))
    micro_recall = sum(TP)/(sum(TP) + sum(FN))
    micro_f1 = (2*micro_precision*micro_recall)/(micro_precision + micro_recall)
    macro_f1 = (2*macro_precision*macro_recall)/(macro_precision + macro_recall)
    return macro_precision, macro_recall, macro_f1, micro_precision, micro_recall, micro_f1

In [16]:
# getting the predicted labels on the test data
preds = model.predict_generator(test_generator(), steps= batches_per_epoch_test)
y_pred = preds > 0.5

# Calculating all metrics on test data predicted label
print(metrics_calculator(y_pred, y_test0[:-1]))

(0.7262697561976874, 0.7072125383992829, 0.7166144706204457, 0.7077836411609498, 0.7077836411609498, 0.7077836411609497)


In [17]:
# getting the predicted labels on the dev data
preds = model.predict_generator(val_generator(), steps= batches_per_epoch_val)
y_pred_dev = preds > 0.5

# Calculating all metrics on dev data predicted label
print(metrics_calculator(y_pred_dev, y_dev0[:-2]))

(0.6873489121676067, 0.6814516129032258, 0.6843875586614856, 0.6814516129032258, 0.6814516129032258, 0.6814516129032258)
