In [2]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

2.3.0


In [3]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation
from keras.layers import add

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.engine.topology import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [5]:
#################### Importing FNC Datasets ####################
# Train set
train_df = pd.read_csv('../Datasets/train_fnc.csv')
print(train_df.columns)
le = LabelEncoder()
train_df['Stance'] = le.fit_transform(train_df['Stance'])
train_df.head()

# Test set
test_df = pd.read_csv('../Datasets/test_fnc.csv')
print(test_df.columns)
test_df['Stance'] = le.transform(test_df['Stance'])
test_df.head()

Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1'], dtype='object')
Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1'], dtype='object')


Unnamed: 0,Headline,Body ID,Stance,Body,Novelty_Labels,Emotion_1
0,Ferguson riots: Pregnant woman loses eye after...,2008,3,A RESPECTED senior French police officer inves...,neutral,anger
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,3,Dave Morin's social networking company Path is...,neutral,fear
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,3,A bereaved Afghan mother took revenge on the T...,contradiction,joy
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,3,Hewlett-Packard is officially splitting in two...,entailment,joy
4,Argentina's President Adopts Boy to End Werewo...,37,3,An airline passenger headed to Dallas was remo...,contradiction,turst


In [6]:
# Getting BERT Embeddings (only for agreed and disagree class)
pre_bert_fnc = np.load("../Datasets/pre_bert_fnc.npy")
hyp_bert_fnc = np.load("../Datasets/hyp_bert_fnc.npy")
print('Premise', pre_bert_fnc.shape)
print('Hypothesis', hyp_bert_fnc.shape)

Premise (4518, 768)
Hypothesis (4518, 768)


In [7]:
# Bert embeddings for test
pre_bert_fnc_test = np.load("../Datasets/pre_bert_test_fnc.npy")
hyp_bert_fnc_test = np.load("../Datasets/hyp_bert_test_fnc.npy")
print('Premise', pre_bert_fnc_test.shape)
print('Hypothesis', hyp_bert_fnc_test.shape)

Premise (2600, 768)
Hypothesis (2600, 768)


In [8]:
train_lst_1 = train_df['Body'].tolist()
print(len(train_lst_1))
train_lst_1[:5]
train_lst_2 = train_df['Headline'].tolist()
print(len(train_lst_2))
uq_tr_1 = list(set(train_lst_1))
uq_tr_2 = list(set(train_lst_2))
print(len(uq_tr_1))
print(len(uq_tr_2))
train_merged = uq_tr_1 + uq_tr_2
print('Train Length is', len(train_merged))
train_merged[:5]
test_lst_1 = test_df['Body'].tolist()
test_lst_2 = test_df['Headline'].tolist()
uq_ts_1 = list(set(test_lst_1))
uq_ts_2 = list(set(test_lst_2))
test_merged = uq_ts_1 + uq_ts_2
print('Test merged', len(test_merged))
total_dataset = train_merged + test_merged
print('Dataset length is', len(total_dataset))

49972
49972
1669
1648
Train Length is 3317
Test merged 1794
Dataset length is 5111


In [9]:
# Defining the tokenizer
def get_tokenizer(vocabulary_size):
  print('Training tokenizer...')
  tokenizer = Tokenizer(num_words= vocabulary_size)
  tweet_text = []
  print('Read {} Sentences'.format(len(total_dataset)))
  tokenizer.fit_on_texts(total_dataset)
  return tokenizer

In [10]:
# For getting the embedding matrix
def get_embeddings():
  print('Generating embeddings matrix...')
  embeddings_file = 'glove.6B.300d.txt'
  embeddings_index = dict()
  with open(embeddings_file, 'r', encoding="utf-8") as infile:
    for line in infile:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      embeddings_index[word] = vector
	# create a weight matrix for words in training docs
  vocabulary_size = len(embeddings_index)
  embeddinds_size = list(embeddings_index.values())[0].shape[0]
  print('Vocabulary = {}, embeddings = {}'.format(vocabulary_size, embeddinds_size))
  tokenizer = get_tokenizer(vocabulary_size)
  embedding_matrix = np.zeros((vocabulary_size, embeddinds_size))
  considered = 0
  total = len(tokenizer.word_index.items())
  for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
      print(word, index)
      continue
    else:
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        considered += 1
  print('Considered ', considered, 'Left ', total - considered)			
  return embedding_matrix, tokenizer

In [11]:
def get_data(tokenizer, MAX_LENGTH, input_df):
  print('Loading data')
  X1, X2, Y = [], [], []
  X1 = input_df['Body'].tolist()
  X2 = input_df['Headline'].tolist()
  Y = input_df['Stance'].tolist()
  
  assert len(X1) == len(X2) == len(Y)
  sequences_1 = tokenizer.texts_to_sequences(X1)
  sequences_2 = tokenizer.texts_to_sequences(X2)
  X1 = pad_sequences(sequences_1, maxlen=MAX_LENGTH)
  X2 = pad_sequences(sequences_2, maxlen=MAX_LENGTH)
  Y = np.array(Y)
  return X1, X2, Y

In [12]:
embedding_matrix, tokenizer = get_embeddings()

Generating embeddings matrix...
Vocabulary = 400000, embeddings = 300
Training tokenizer...
Read 5111 Sentences
Considered  26192 Left  11274


In [13]:
MAX_LENGTH = 100
# read ml data
X1, X2, Y = get_data(tokenizer, MAX_LENGTH, train_df)

Loading data


In [14]:
X1_test, X2_test, Y_test = get_data(tokenizer, MAX_LENGTH, test_df)

Loading data


In [15]:
print(Y.shape)
print(type(X1))
X1.shape

(49972,)
<class 'numpy.ndarray'>


(49972, 100)

In [18]:
# Removing the unrelated samples from both train and test
result = np.where(train_df['Stance'] == 2)[0]
result_1 = np.where(train_df['Stance'] == 3)[0]
print(result.shape, result_1.shape)
result_comb = np.concatenate((result, result_1))
print(result_comb.shape)
reduced_X1 = np.delete(X1, result_comb, axis=0)
reduced_X2 = np.delete(X2, result_comb, axis=0)
print('Train shape', reduced_X1.shape)
reduced_train_labels = np.delete(train_df['Stance'].values, result_comb)
print('Train labels', reduced_train_labels)
result_test = np.where(test_df['Stance'] == 2)[0]
result_test_1 = np.where(test_df['Stance'] == 3)[0]
result_test_comb = np.concatenate((result_test, result_test_1))
reduced_X1_test = np.delete(X1_test, result_test_comb, axis=0)
reduced_X2_test = np.delete(X2_test, result_test_comb, axis=0)
print('Test shape', reduced_X1_test.shape)
reduced_test_labels = np.delete(test_df['Stance'].values, result_test_comb)
print('Test labels', reduced_test_labels)

(8909,) (36545,)
(45454,)
Train shape (4518, 100)
Train labels [0 1 0 ... 0 1 0]
Test shape (2600, 100)
Test labels [0 0 0 ... 1 1 0]


In [19]:
embeddings_file = 'glove.6B.300d.txt'
embeddings_index = dict()
with open(embeddings_file, 'r', encoding="utf-8") as infile:
  for line in infile:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], "float32")
    embeddings_index[word] = vector

In [20]:
y_train = keras.utils.to_categorical(reduced_train_labels)
print(y_train)
y_test = keras.utils.to_categorical(reduced_test_labels)
print(y_test)

[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [22]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527
x1_train, x1_val, \
x2_train, x2_val, \
pre_train_bert, pre_val_bert, \
hyp_train_bert, hyp_val_bert, \
y_train, y_val = \
    train_test_split(
        reduced_X1, reduced_X2, pre_bert_fnc, hyp_bert_fnc, y_train, 
        test_size=VALIDATION_RATIO, 
        random_state=RANDOM_STATE
)

In [24]:
print("Training Set")
print("-" * 10)
print(f"x1_train: {x1_train.shape}")
print(f"x2_train: {x2_train.shape}")
print(f"y_train : {y_train.shape}")
print(f"Bert premise : {pre_train_bert.shape}")

print("-" * 10)
print(f"x1_val:   {x1_val.shape}")
print(f"x2_val:   {x2_val.shape}")
print(f"y_val :   {y_val.shape}")
print(f"Val_Bert_premise : {pre_val_bert.shape}")
print("-" * 10)
print("Test Set")

Training Set
----------
x1_train: (4066, 100)
x2_train: (4066, 100)
y_train_cs : (4066, 2)
Bert premise : (4066, 768)
----------
x1_val:   (452, 100)
x2_val:   (452, 100)
y_val_cs :   (452, 2)
Val_Bert_premise : (452, 768)
----------
Test Set


In [25]:
NUM_CLASSES = 2

MAX_SEQUENCE_LENGTH = 100

NUM_LSTM_UNITS = 150

MAX_NUM_WORDS = embedding_matrix.shape[0]

NUM_EMBEDDING_DIM = embedding_matrix.shape[1]

In [26]:
# BERT + Normal Grand Model

NUM_LSTM_UNITS = 150

top_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
bm_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')

embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded_wd = embedding_layer(
    top_input_wd)
bm_embedded_wd = embedding_layer(
    bm_input_wd)

source_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_wd = source_lstm_wd(top_embedded_wd)
bm_source_wd = source_lstm_wd(bm_embedded_wd)

source_comb_wd = concatenate(
    [top_source_wd, bm_source_wd],
    axis=-1
    )
lstm_ops_wd = shared_lstm_wd(source_comb_wd)   # 300D vector


top_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bm_input_bt = Input(
    shape=(768, ), 
    dtype='float32')


top_embedded_bt = Reshape((1, 768, ))(top_input_bt)
bm_embedded_bt = Reshape((1, 768, ))(bm_input_bt)

source_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_bt = source_lstm_bt(top_embedded_bt)
bm_source_bt = source_lstm_bt(bm_embedded_bt)

source_comb_bt = concatenate(
    [top_source_bt, bm_source_bt],
    axis=-1
    )
lstm_ops_bt = shared_lstm_bt(source_comb_bt)  #300D vector

# Bert and Normal Combination

comb_features_fnc = concatenate(
    [lstm_ops_wd+lstm_ops_bt, lstm_ops_wd-lstm_ops_bt, lstm_ops_wd*lstm_ops_bt],
    axis=-1
    )

pre_fnc = Dense(
    units=64, 
    activation='tanh',
    name = 'pre_fnc')(comb_features_fnc)

dense_fnc =  Dense(
    units=NUM_CLASSES, 
    activation='softmax',
    name = 'fnc')

predictions_fnc = dense_fnc(pre_fnc)

model = Model(
    inputs=[top_input_wd, bm_input_wd, top_input_bt, bm_input_bt], 
    outputs=[predictions_fnc])

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 768)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 768)]        0                                            
_______________________________________________________________________________________

In [27]:
from keras.optimizers import Adam
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [28]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced'
                                               ,np.unique(reduced_train_labels)
                                               ,reduced_train_labels)
class_weight_dict = dict(enumerate(class_weight))

In [29]:
BATCH_SIZE = 256
NUM_EPOCHS = 10
stop = [EarlyStopping(monitor='val_loss', patience=0.001)]
history = model.fit(x=[x1_train, x2_train, pre_train_bert, hyp_train_bert],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val, pre_val_bert, hyp_val_bert], 
                      y_val
                    ),
                    shuffle=True,
                    callbacks=stop,
                    class_weight = class_weight_dict
          )

Epoch 1/10
Epoch 2/10


In [32]:
from sklearn import metrics
from sklearn.metrics import classification_report
predictions = model.predict(
    [reduced_X1_test, reduced_X2_test, pre_bert_fnc_test, hyp_bert_fnc_test])

In [33]:
y_pred = [idx for idx in np.argmax(predictions, axis=1)]
print('Accuracy is')
print(metrics.accuracy_score(reduced_test_labels, y_pred)*100)
print(classification_report(reduced_test_labels, y_pred, target_names = ['agreed', 'disagreed']))

Accuracy is
68.15384615384616
              precision    recall  f1-score   support

      agreed       0.74      0.86      0.80      1903
   disagreed       0.33      0.19      0.24       697

    accuracy                           0.68      2600
   macro avg       0.54      0.53      0.52      2600
weighted avg       0.63      0.68      0.65      2600



In [37]:
# Model 2
NUM_LSTM_UNITS = 128

print('Getting Text CNN model...')
filter_sizes = [2, 3, 5]
num_filters = 128	#Hyperparameters 32,64,128; 0.2,0.3,0.4
drop = 0.4

top_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
bm_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')

embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded = embedding_layer(
    top_input)
bm_embedded = embedding_layer(
    bm_input)
reshape = Reshape((MAX_SEQUENCE_LENGTH, NUM_EMBEDDING_DIM, 1))(top_embedded)
reshape_1 = Reshape((MAX_SEQUENCE_LENGTH, NUM_EMBEDDING_DIM, 1))(bm_embedded)
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], NUM_EMBEDDING_DIM),  padding='valid', kernel_initializer='normal',  activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], NUM_EMBEDDING_DIM),  padding='valid', kernel_initializer='normal',  activation='relu')(reshape_1)
maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)

top_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bm_input_bt = Input(
    shape=(768, ), 
    dtype='float32')


top_embedded_bt = Reshape((1, 768, ))(top_input_bt)
bm_embedded_bt = Reshape((1, 768, ))(bm_input_bt)

source_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_bt = source_lstm_bt(top_embedded_bt)
bm_source_bt = source_lstm_bt(bm_embedded_bt)

source_comb_bt = concatenate(
    [top_source_bt, bm_source_bt],
    axis=-1
    )
lstm_ops_bt = shared_lstm_bt(source_comb_bt)  #256D vector

comb_features_fnc = concatenate(
    [dropout+lstm_ops_bt, dropout-lstm_ops_bt, dropout*lstm_ops_bt],
    axis=-1
    )

predictions = Dense(units=NUM_CLASSES, activation='sigmoid')(comb_features_fnc)

model = Model(
    inputs=[top_input, bm_input, top_input_bt, bm_input_bt], 
    outputs=predictions)
model.summary()

Getting Text CNN model...
Model: "functional_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 100, 300)     120000000   input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
reshape_10 (Reshape)            (None, 100, 300, 1)  0       

In [38]:
from keras.optimizers import Adam
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [39]:
BATCH_SIZE = 256
NUM_EPOCHS = 10
stop = [EarlyStopping(monitor='val_loss', patience=0.001)]
history = model.fit(x=[x1_train, x2_train, pre_train_bert, hyp_train_bert],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val, pre_val_bert, hyp_val_bert], 
                      y_val
                    ),
                    shuffle=True,
                    callbacks=stop,
                    class_weight = class_weight_dict
          )

Epoch 1/10
Epoch 2/10
Epoch 3/10


In [40]:
from sklearn import metrics
from sklearn.metrics import classification_report
predictions = model.predict(
    [reduced_X1_test, reduced_X2_test, pre_bert_fnc_test, hyp_bert_fnc_test])

In [41]:
y_pred = [idx for idx in np.argmax(predictions, axis=1)]
print('Accuracy is')
print(metrics.accuracy_score(reduced_test_labels, y_pred)*100)
print(classification_report(reduced_test_labels, y_pred, target_names = ['agreed', 'disagreed']))

Accuracy is
72.5
              precision    recall  f1-score   support

      agreed       0.76      0.92      0.83      1903
   disagreed       0.47      0.19      0.27       697

    accuracy                           0.73      2600
   macro avg       0.61      0.55      0.55      2600
weighted avg       0.68      0.72      0.68      2600

