In [1]:
import pandas as pd
import numpy as np
from __future__ import absolute_import
import keras
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.metrics import classification_report
from keras import backend as K
from keras.models import Model
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Flatten
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from sklearn import preprocessing
import functools
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score , recall_score , precision_score

Using TensorFlow backend.


In [2]:
# Working directory for the notebook; later cells open their CSV and GloVe
# files by relative name, so they are resolved against this directory.
# Set the LIAR_PLUS_DIR environment variable to point at your own checkout;
# when it is unset the original author's local path is used unchanged.
DATA_DIR = os.environ.get("LIAR_PLUS_DIR", "/home/embibe/Personal/ML/NUS/LIAR-PLUS-master")
os.chdir(DATA_DIR)

The following function thresholds a binary model's predicted scores, prints the confusion matrix and the classification report, and returns the thresholded labels.

In [3]:
def classification_and_confusion_report_binary(actual_label,predicted_label,threshold=0.5):
    """Threshold predicted scores, print the evaluation reports, return the hard labels.

    Parameters
    ----------
    actual_label : array-like
        Ground-truth binary labels.
    predicted_label : array-like
        Predicted scores; any array-like is accepted because it is converted
        with ``np.asarray`` before the comparison.
    threshold : float, optional
        Scores strictly greater than this value become 1, all others 0.
        Defaults to 0.5.

    Returns
    -------
    numpy.ndarray
        Array of 0/1 labels with the same shape as ``predicted_label``.
    """
    # Convert first so that plain Python lists can be compared element-wise.
    scores = np.asarray(predicted_label)
    hard_labels = np.where(scores > threshold, 1, 0)

    report = classification_report(actual_label, hard_labels)
    cm = confusion_matrix(actual_label, hard_labels)
    print(cm)
    print(report)
    return hard_labels

The siamese network produces one feature vector for the statement and one for the justification, and we “merge” these two LSTM outputs using the MaLSTM (Manhattan LSTM) similarity function. The function is e^(-x), where x is the Manhattan (L1) distance between the two feature vectors. Because x is never negative, the similarity always lies between 0 and 1: it is 1 when the two vectors are identical and approaches 0 as they move apart.

In [4]:
def exponent_neg_manhattan_distance(left, right):
    """Return ``exp(-d)`` where ``d`` is the L1 distance between the inputs.

    The absolute element-wise difference of ``left`` and ``right`` is summed
    along axis 1 (keeping that axis), negated and exponentiated, all with
    Keras backend ops.
    """
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

Reading all the preprocessed files.

In [5]:
# Read the train / validation / test splits (in that order) from the
# current working directory.
train, val, test = map(
    pd.read_csv,
    ("train_preprocessed.csv", "val_preprocessed.csv", "test_preprocessed.csv"),
)

Extracting the features and labels from the loaded data.  

In [6]:
# Model inputs: every column except the two target columns.
x_train = train.drop(columns=['label_multiclass', 'label_binary'])
x_val = val.drop(columns=['label_multiclass', 'label_binary'])
x_test = test.drop(columns=['label_multiclass', 'label_binary'])

# Multiclass targets, one Series per split.
y_train_multiclass = train['label_multiclass']
y_val_multiclass = val['label_multiclass']
y_test_multiclass = test['label_multiclass']

# Binary targets, one Series per split.
y_train_binary = train['label_binary']
y_val_binary = val['label_binary']
y_test_binary = test['label_binary']

In [7]:
# (rows, columns) of each input frame, in train / val / test order.
tuple(split.shape for split in (x_train, x_val, x_test))

((10240, 205), (1284, 205), (1267, 205))

Finding the approximate length of the justification and statement column for padding

In [8]:
# Whitespace-token counts of the first training row: (justification, statement).
tuple(len(x_train[column][0].split()) for column in ('justification', 'statement'))

(40, 11)

(40+11)*10240=522240 --->> approx words

Statements are roughly 0 to 20 words long and justifications roughly 0 to 60 words long. Note that the cell below sets both MAX_SEQUENCE_LENGTH_1 (statement) and MAX_SEQUENCE_LENGTH_2 (justification) to 60, so both inputs are padded to 60 tokens. Each word is represented by a 50-dimensional GloVe embedding. All the important variables are initialized in the following cell.
**Num of epochs has been set to 20 due to resource constraints. Increasing epochs shall improve learning further.**

In [9]:
# --- Vocabulary / embedding configuration ---
EMBEDDING_DIM = 50        # matches the 50-d GloVe vectors loaded later
nb_words = 25900          # the embedding matrix gets nb_words + 1 rows

# --- Sequence lengths (statement, justification) ---
MAX_SEQUENCE_LENGTH_1 = MAX_SEQUENCE_LENGTH_2 = 60

# --- Training configuration ---
num_epoch = 20
DROPOUT = 0.1

Tokenizer is initialized and is fitted on the train set and then the train and validation data is converted to sequences based on the learnt tokenizer.

In [10]:
# Learn a single vocabulary from both text columns of the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(
    np.concatenate([
        x_train['statement'].astype(str),
        x_train['justification'].astype(str),
    ])
)
word_index = tokenizer.word_index

# q1 = statement, q2 = justification.
sequences_train_q1 = tokenizer.texts_to_sequences(x_train['statement'].astype(str))
sequences_train_q2 = tokenizer.texts_to_sequences(x_train['justification'].astype(str))

sequences_val_q1 = tokenizer.texts_to_sequences(x_val['statement'].astype(str))
sequences_val_q2 = tokenizer.texts_to_sequences(x_val['justification'].astype(str))

All the sequences are padded (or truncated) to the maximum lengths defined above: MAX_SEQUENCE_LENGTH_1 for statements and MAX_SEQUENCE_LENGTH_2 for justifications.

In [11]:
# Statements (q1) are padded to MAX_SEQUENCE_LENGTH_1.
sequences_train_q1_padded = pad_sequences(
    sequences_train_q1, maxlen=MAX_SEQUENCE_LENGTH_1)
sequences_val_q1_padded = pad_sequences(
    sequences_val_q1, maxlen=MAX_SEQUENCE_LENGTH_1)

# Justifications (q2) are padded to MAX_SEQUENCE_LENGTH_2.
sequences_train_q2_padded = pad_sequences(
    sequences_train_q2, maxlen=MAX_SEQUENCE_LENGTH_2)
sequences_val_q2_padded = pad_sequences(
    sequences_val_q2, maxlen=MAX_SEQUENCE_LENGTH_2)

## WORD EMBEDDING

The word embedding matrix is prepared using the 50-dimensional GloVe file. A dictionary with words as keys and embedding vectors as values is built from the GloVe file; each word in our tokenizer vocabulary that also appears in GloVe gets its vector stored in the embedding matrix at the row given by its tokenizer index. Words that are not found in GloVe keep an all-zero row. The matrix has size (nb_words + 1) × EMBEDDING_DIM.

In [12]:
# Row i will hold the GloVe vector of the word whose tokenizer index is i.
# Rows that are never assigned below stay all-zero.
word_embedding_matrix = np.zeros((nb_words+1, EMBEDDING_DIM))

# word -> float32 vector, parsed from the GloVe text file
# (each line is "<word> <v1> <v2> ...").
embeddings_index = {}

# The context manager closes the file even if a line fails to parse.
with open('glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

for word, i in word_index.items():
    # The matrix has nb_words + 1 rows, so indices 0..nb_words are valid.
    # Only skip indices beyond that; the previous `>=` check left row
    # nb_words allocated but never filled.
    if i > nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [13]:
# Non-text features: the input frames without the two raw text columns.
train_features = x_train.drop(columns=['statement', 'justification'])
val_features = x_val.drop(columns=['statement', 'justification'])

In [14]:
# (rows, columns) of the non-text feature frames, train then val.
tuple(frame.shape for frame in (train_features, val_features))

((10240, 203), (1284, 203))

As the labels are multiclass they are converted to one hot encoding.

In [15]:
# One-hot encode the multiclass labels.
# - `columns=` only applies to DataFrame input, so it is dropped for these Series.
# - `prefix` is passed as a plain string; a one-element list would be
#   stringified into the column names.
train_y_onehot = pd.get_dummies(y_train_multiclass, prefix='label_multiclass')

# Encode val/test separately, then align them to the training columns so every
# split has the same classes in the same order. A class absent from a split
# becomes an all-zero column; a class unseen in training is dropped.
onehot_columns = train_y_onehot.columns
val_y_onehot = pd.get_dummies(y_val_multiclass, prefix='label_multiclass').reindex(
    columns=onehot_columns, fill_value=0)
test_y_onehot = pd.get_dummies(y_test_multiclass, prefix='label_multiclass').reindex(
    columns=onehot_columns, fill_value=0)

The network's inputs are the two padded word-index sequences (statement and justification) and the other features calculated in the data_preprocessing step. The embedding weights are initialized with the word embedding matrix obtained from GloVe. A bidirectional LSTM encodes each embedded sequence, and merge_mode='sum' is used to combine the outputs of the forward and backward RNNs.

Since this is a siamese network, both sides share the same bidirectional LSTM (same weights and same parameters), which produces a separate feature vector for each of the two inputs being compared (the statement and the justification). A Lambda layer computes the similarity defined by the exponent_neg_manhattan_distance function, and that value is concatenated with the two feature vectors and our other features. Further Dense layers and Dropouts are added to give the final output layers with 1 and 6 nodes respectively, i.e. is_true_2 and is_true_6.

## BINARY AND MULTICLASS CLASSIFICATION

In [16]:
# ---- Text inputs: padded word-index sequences ----
statement = Input(shape=(MAX_SEQUENCE_LENGTH_1,))
justification = Input(shape=(MAX_SEQUENCE_LENGTH_2,))

# ---- Embeddings: one layer per input, both initialised from the GloVe matrix
# and left trainable ----
q1 = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[word_embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH_1,
    trainable=True,
)(statement)

q2 = Embedding(
    input_dim=nb_words + 1,
    output_dim=EMBEDDING_DIM,
    weights=[word_embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH_2,
    trainable=True,
)(justification)

# ---- Siamese encoder: the same Bidirectional LSTM instance is applied to
# both embedded sequences, so its weights are shared ----
shared_lstm = Bidirectional(LSTM(30), merge_mode="sum")

q1 = shared_lstm(q1)
q2 = shared_lstm(q2)

# ---- Similarity between the two encodings: exp(-L1 distance), one value per sample ----
malstm_distance = Lambda(
    function=lambda tensors: exponent_neg_manhattan_distance(tensors[0], tensors[1]),
    output_shape=lambda shapes: (shapes[0][0], 1),
)([q1, q2])

# ---- Extra non-text features ----
features = Input(shape=(train_features.shape[1],))

# ---- Shared dense trunk: four identical Dense -> Dropout -> BatchNorm stages ----
merged = concatenate([q1, q2, malstm_distance, features])
for _stage in range(4):
    merged = Dense(200, activation='relu')(merged)
    merged = Dropout(DROPOUT)(merged)
    merged = BatchNormalization()(merged)

# ---- Output heads on top of the shared trunk ----
is_true_6 = Dense(6, activation='softmax')(merged)
is_true_2 = Dense(1, activation='sigmoid')(merged)

# ---- Two models that share every layer except the output head ----
lstm_model_multiclass = Model(inputs=[statement, justification, features], outputs=[is_true_6])
lstm_model_multiclass.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

lstm_model_binary = Model(inputs=[statement, justification, features], outputs=[is_true_2])
lstm_model_binary.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

W0809 20:36:55.695868 140287669671744 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0809 20:36:55.744004 140287669671744 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0809 20:36:55.753084 140287669671744 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0809 20:36:55.785442 140287669671744 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0809 20:36:55.787733 1402876696

In [17]:
# Print the layer-by-layer summary (output shapes, parameter counts,
# connections) of the multiclass model.
lstm_model_multiclass.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 60, 50)       1295050     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 50)       1295050     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectio

In [18]:
# Print the layer-by-layer summary (output shapes, parameter counts,
# connections) of the binary model.
lstm_model_binary.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 60, 50)       1295050     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 50)       1295050     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectio

The multiclass model is fitted on the padded training sequences and the extra features, with the one-hot labels as the target for the is_true_6 output. The available validation set is passed as validation data during training.

In [19]:
# Train the multiclass head. `epochs` is the current name of the keyword;
# the old `nb_epoch` spelling is deprecated in Keras 2 and rejected by later
# releases.
lstm_model_multiclass.fit(
    [sequences_train_q1_padded, sequences_train_q2_padded, train_features],
    train_y_onehot,
    batch_size=64,
    epochs=num_epoch,
    validation_data=(
        [sequences_val_q1_padded, sequences_val_q2_padded, val_features],
        val_y_onehot,
    ),
)

  """Entry point for launching an IPython kernel.


Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f96f1bde110>

Tokenizer is applied to test data and padding is done afterwards.

In [20]:
# Statement (q1): encode with the already-fitted tokenizer, then pad.
sequences_test_data_q1 = tokenizer.texts_to_sequences(x_test['statement'].astype(str))
sequences_test_data_q1_padded = pad_sequences(
    sequences_test_data_q1, maxlen=MAX_SEQUENCE_LENGTH_1)

# Justification (q2): same treatment with its own maximum length.
sequences_test_data_q2 = tokenizer.texts_to_sequences(x_test['justification'].astype(str))
sequences_test_data_q2_padded = pad_sequences(
    sequences_test_data_q2, maxlen=MAX_SEQUENCE_LENGTH_2)

Since the statement and justification are fed to the model as padded sequences, the raw text columns are dropped from the test feature frame.

In [21]:
# Non-text features of the test split (raw text columns removed).
test_features = x_test.drop(columns=['statement', 'justification'])

In [22]:
# Class scores for every test row; inputs are given in the same order as the
# model's inputs (statement, justification, features).
y_pred_multiclass = lstm_model_multiclass.predict(
    [
        sequences_test_data_q1_padded,
        sequences_test_data_q2_padded,
        test_features,
    ]
)

In [23]:
# Display the predicted score array (one row per test sample, one column per class).
y_pred_multiclass

array([[7.17154920e-01, 5.72092421e-02, 2.59098131e-02, 6.92518777e-04,
        1.98353916e-01, 6.79565128e-04],
       [9.99786794e-01, 2.45780557e-05, 5.46478276e-08, 1.21944453e-07,
        1.11018403e-06, 1.87421567e-04],
       [4.42616552e-01, 1.71677917e-01, 8.81721731e-03, 1.57764554e-03,
        3.74687433e-01, 6.23150496e-04],
       ...,
       [9.99961019e-01, 1.04740445e-06, 4.32311751e-08, 2.72947864e-09,
        1.61755764e-07, 3.78213408e-05],
       [8.36522598e-03, 8.75938237e-01, 5.95911220e-02, 3.92761634e-04,
        5.53054065e-02, 4.07304004e-04],
       [3.16324830e-01, 6.80857480e-01, 1.19067903e-03, 1.35656708e-04,
        1.03779945e-04, 1.38755504e-03]], dtype=float32)

### **The class having maximum score in prediction is taken to be the predicted class.**

In [24]:
# Column index of the highest score in each row = predicted class.
indices_multiclass = np.argmax(y_pred_multiclass, axis=1)

In [25]:
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped
# (harmless for accuracy, but inconsistent with the other metric calls).
# Formatting the message as one string prints the same text under Python 2
# and 3, instead of a tuple repr under Python 2.
print("accuracy: {}".format(accuracy_score(y_test_multiclass, indices_multiclass)))

('accuracy', 0.40015785319652725)


In [26]:
# Rows are true classes, columns are predicted classes.
conf_mat = confusion_matrix(y_true=y_test_multiclass, y_pred=indices_multiclass)
print(conf_mat)

[[ 84  32  70  10  14   2]
 [ 36 104  73  16  12   8]
 [ 34  41 143  30  11   6]
 [ 23  30  85  87   4  12]
 [ 10   9  17   3  50   3]
 [ 25  33  70  33   8  39]]


In [34]:
# Per-class precision / recall / F1 for the multiclass predictions.
report_multiclass = classification_report(
    y_true=y_test_multiclass,
    y_pred=indices_multiclass,
)
print(report_multiclass)

              precision    recall  f1-score   support

           0       0.40      0.40      0.40       212
           1       0.42      0.42      0.42       249
           2       0.31      0.54      0.40       265
           3       0.49      0.36      0.41       241
           4       0.51      0.54      0.52        92
           5       0.56      0.19      0.28       208

   micro avg       0.40      0.40      0.40      1267
   macro avg       0.45      0.41      0.40      1267
weighted avg       0.43      0.40      0.39      1267



The binary model is fitted on the padded training sequences and the extra features, with the binary labels as the target for the is_true_2 output. The available validation set is passed as validation data during training.

In [27]:
# Train the binary head. `epochs` is the current name of the keyword;
# the old `nb_epoch` spelling is deprecated in Keras 2 and rejected by later
# releases.
lstm_model_binary.fit(
    [sequences_train_q1_padded, sequences_train_q2_padded, train_features],
    y_train_binary,
    batch_size=64,
    epochs=num_epoch,
    validation_data=(
        [sequences_val_q1_padded, sequences_val_q2_padded, val_features],
        y_val_binary,
    ),
)

  """Entry point for launching an IPython kernel.


Train on 10240 samples, validate on 1284 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f969e7d4d10>

In [28]:
# Persist both trained models as HDF5 files in the current working directory.
lstm_model_multiclass.save('multiclass_model_siamese.h5')
lstm_model_binary.save('binary_model_siamese.h5')

In [29]:
# Sigmoid scores for every test row; inputs are given in the same order as
# the model's inputs (statement, justification, features).
y_pred_binary = lstm_model_binary.predict(
    [
        sequences_test_data_q1_padded,
        sequences_test_data_q2_padded,
        test_features,
    ]
)

The threshold is taken as 0.5: predictions greater than 0.5 are assigned label 1, and all others are assigned label 0.

In [30]:
# Threshold the binary scores, print the confusion matrix and report, and
# keep the resulting 0/1 labels.
predicted_label = classification_and_confusion_report_binary(
    actual_label=y_test_binary,
    predicted_label=y_pred_binary,
    threshold=0.5,
)

[[378 175]
 [216 498]]
              precision    recall  f1-score   support

           0       0.64      0.68      0.66       553
           1       0.74      0.70      0.72       714

   micro avg       0.69      0.69      0.69      1267
   macro avg       0.69      0.69      0.69      1267
weighted avg       0.69      0.69      0.69      1267



In [31]:
# accuracy_score takes (y_true, y_pred); the arguments were previously swapped
# (harmless for accuracy, but inconsistent with the other metric calls).
# Formatting the message as one string prints the same text under Python 2
# and 3, instead of a tuple repr under Python 2.
print("accuracy: {}".format(accuracy_score(y_test_binary, predicted_label)))

('accuracy', 0.691397000789266)


In [35]:
# Per-class precision / recall / F1 for the thresholded binary predictions.
report_binary = classification_report(
    y_true=y_test_binary,
    y_pred=predicted_label,
)
print(report_binary)

              precision    recall  f1-score   support

           0       0.64      0.68      0.66       553
           1       0.74      0.70      0.72       714

   micro avg       0.69      0.69      0.69      1267
   macro avg       0.69      0.69      0.69      1267
weighted avg       0.69      0.69      0.69      1267



In [32]:
# Write the installed package versions reported by `pip freeze` to
# requirements_Siamese.txt in the current working directory.
!pip freeze > requirements_Siamese.txt