In [None]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

In [None]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation, Add

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.engine.topology import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [None]:
#################### Importing CS Datasets ####################
# Load the train and test splits from CSV. Later cells read the columns
# 'text', 'stance', 'Novelty_Quora' and 'com_femotion' from both frames.
# Train set
train_df = pd.read_csv('CS_Data//cstance_train.csv')
print(train_df.columns)
# NOTE(review): this expression sits mid-cell, so its value is discarded; a
# notebook only renders the last expression of a cell.
train_df.head()

# Test set
test_df = pd.read_csv('CS_Data//cstance_test_new.csv')
print(test_df.columns)
test_df.head()

In [None]:
# Pre-computed feature arrays for the training rows; they are split alongside
# X1/X2 further down and fed to the model's 768-wide inputs.
# NOTE(review): presumably BERT sentence embeddings of premise/hypothesis (going by the names) - confirm.
pre_bert_cs = np.load("CS_Data//pre_bert_cs.npy")
hyp_bert_cs = np.load("CS_Data//hyp_bert_cs.npy")
print('Premise', pre_bert_cs.shape)
print('Hypothesis', hyp_bert_cs.shape)

In [None]:
# Test-set counterparts of the pre-computed feature arrays; passed to
# model.predict together with the tokenised test inputs.
pre_bert_cs_test = np.load("CS_Data//pre_bert_test_cs.npy")
hyp_bert_cs_test = np.load("CS_Data//hyp_bert_test_cs.npy")
print('Premise', pre_bert_cs_test.shape)
print('Hypothesis', hyp_bert_cs_test.shape)

In [None]:
# Build the corpus the tokenizer is fitted on: unique train texts, the fixed
# premise sentence, then unique test texts.
premise = ["chloroquine hydroxychloroquine are cure for the novel coronavirus"]

train_lst_1 = train_df['text'].tolist()
print(len(train_lst_1))

# dict.fromkeys removes duplicates while keeping first-seen order.
# list(set(...)) produced an ordering that changes between kernel sessions
# (string hashing is randomised per process), so the corpus order - and
# anything sensitive to it - was not reproducible.
uq_tr_1 = list(dict.fromkeys(train_lst_1))
print(len(uq_tr_1))

train_merged = uq_tr_1 + premise
print('Train Length is', len(train_merged))

test_lst_1 = test_df['text'].tolist()
uq_ts_1 = list(dict.fromkeys(test_lst_1))
test_merged = uq_ts_1
print('Test merged', len(test_merged))

# NOTE(review): the test texts are part of the corpus the tokenizer is fitted on.
total_dataset = train_merged + test_merged
print('Dataset length is', len(total_dataset))

In [None]:
# Defining the tokenizer
def get_tokenizer(vocabulary_size, texts=None):
  """Fit a Keras ``Tokenizer`` on a list of sentences.

  Args:
    vocabulary_size: passed to ``Tokenizer(num_words=...)``.
    texts: sentences to fit on. When omitted, the module-level
      ``total_dataset`` list is used (the original behaviour).

  Returns:
    The fitted ``Tokenizer``.
  """
  if texts is None:
    # Fall back to the notebook-level corpus so existing calls keep working.
    texts = total_dataset
  print('Training tokenizer...')
  tokenizer = Tokenizer(num_words=vocabulary_size)
  print('Read {} Sentences'.format(len(texts)))
  tokenizer.fit_on_texts(texts)
  return tokenizer

In [None]:
# For getting the embedding matrix
def get_embeddings(embeddings_file='../resouces/glove.6B.300d.txt'):
  """Load word vectors from a text file and build the embedding matrix.

  Each non-empty line of the file is split on whitespace: the first field is
  the word, the remaining fields are parsed as a float32 vector.

  Args:
    embeddings_file: path of the vector file. Defaults to the path the
      notebook originally hard-coded.

  Returns:
    A tuple ``(embedding_matrix, tokenizer, embeddings_index)`` where
    ``embedding_matrix`` has one row per entry of the vector file (rows are
    indexed by tokenizer word index and stay zero for words without a vector),
    ``tokenizer`` is the tokenizer returned by ``get_tokenizer`` and
    ``embeddings_index`` maps word -> vector.

  Raises:
    ValueError: if the file contains no vectors.
  """
  print('Generating embeddings matrix...')
  embeddings_index = dict()
  with open(embeddings_file, 'r', encoding="utf-8") as infile:
    for line in infile:
      values = line.split()
      if not values:
        # Blank line: nothing to parse.
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], "float32")
  if not embeddings_index:
    raise ValueError('No embeddings found in {}'.format(embeddings_file))

  # create a weight matrix for words in training docs
  vocabulary_size = len(embeddings_index)
  # Peek at a single vector instead of copying every vector into a list.
  embeddings_size = next(iter(embeddings_index.values())).shape[0]
  print('Vocabulary = {}, embeddings = {}'.format(vocabulary_size, embeddings_size))

  tokenizer = get_tokenizer(vocabulary_size)
  embedding_matrix = np.zeros((vocabulary_size, embeddings_size))
  considered = 0
  total = len(tokenizer.word_index)
  for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
      # Index would fall outside the matrix; report and skip it.
      print(word, index)
      continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector
      considered += 1
  print('Considered ', considered, 'Left ', total - considered)
  return embedding_matrix, tokenizer, embeddings_index

In [None]:
def get_data(tokenizer, MAX_LENGTH, input_df):
  """Turn a dataframe into padded token sequences and label arrays.

  Args:
    tokenizer: fitted tokenizer providing ``texts_to_sequences``.
    MAX_LENGTH: ``maxlen`` passed to ``pad_sequences`` for both inputs.
    input_df: frame with the columns 'text', 'stance', 'Novelty_Quora'
      and 'com_femotion'.

  Returns:
    ``(X1, X2, new_Y, Y_nv, Y_em)``: padded ids of the fixed premise repeated
    once per row, padded ids of the 'text' column, the 'stance' values shifted
    down by one, and the 'Novelty_Quora' and 'com_femotion' columns as arrays.
  """
  print('Loading data')
  premise = "chloroquine hydroxychloroquine are cure for the novel coronavirus"

  X2 = input_df['text'].tolist()
  # Every row is paired with the same premise sentence.
  X1 = [premise] * len(X2)

  Y = input_df['stance'].tolist()
  # Shift labels down by one (presumably from 1-based to 0-based classes - confirm).
  new_Y = [(ele-1) for ele in Y]
  Y_nv = input_df['Novelty_Quora']
  Y_em = input_df['com_femotion']

  # The original wrote this comparison as a bare expression, so it never checked anything.
  assert len(X1) == len(X2) == len(new_Y) == len(Y_nv) == len(Y_em)

  sequences_1 = tokenizer.texts_to_sequences(X1)
  sequences_2 = tokenizer.texts_to_sequences(X2)
  X1 = pad_sequences(sequences_1, maxlen=MAX_LENGTH)
  X2 = pad_sequences(sequences_2, maxlen=MAX_LENGTH)
  new_Y = np.array(new_Y)
  Y_nv = np.stack(Y_nv)
  Y_em = np.stack(Y_em)
  return X1, X2, new_Y, Y_nv, Y_em

In [None]:
# Read the word-vector file, fit the tokenizer and build the embedding matrix
# (see get_embeddings); all three results are reused by later cells.
embedding_matrix, tokenizer, embeddings_index = get_embeddings()

In [None]:
# maxlen handed to pad_sequences inside get_data for both premise and text.
MAX_LENGTH = 40
# read ml data
X1, X2, Y_cs, Y_nv, Y_em = get_data(tokenizer, MAX_LENGTH, train_df)

In [None]:
# Same preprocessing as the training frame, applied to the test frame.
X1_test, X2_test, Y_cs_test, Y_nv_test, Y_em_test = get_data(tokenizer, MAX_LENGTH, test_df)

In [None]:
# Scaffold labels
def _sum_word_vectors(words):
    """Add the embedding vectors of `words` left to right and return the sum."""
    vectors = [embeddings_index[word] for word in words]
    total = vectors[0]
    for vector in vectors[1:]:
        total = total + vector
    return total


# Single-word anchors for the novelty labels.
novel = embeddings_index['original']
duplicate = embeddings_index['duplicate']

# Each emotion anchor is the sum of four emotion-word vectors.
emotion_true = _sum_word_vectors(('anticipation', 'sadness', 'joy', 'trust'))
emotion_false = _sum_word_vectors(('anger', 'fear', 'disgust', 'surprise'))

In [None]:
# Novelty Bias
zero_vector = np.zeros((300,))


def _novelty_bias_rows(frame):
    """Return one bias vector per row of `frame`, chosen by its 'Novelty_Quora' value.

    0 -> `novel`, 1 -> `duplicate`, anything else -> `zero_vector`.
    """
    rows = []
    # Iterate the column directly; iterrows() built a full Series per row just
    # to read a single value from it.
    for flag in frame['Novelty_Quora']:
        if flag == 0:
            rows.append(novel)
        elif flag == 1:
            rows.append(duplicate)
        else:
            rows.append(zero_vector)
    return rows


# One helper for both splits replaces the two copy-pasted loops.
train_bias_nv = np.stack(_novelty_bias_rows(train_df))
test_bias_nv = np.stack(_novelty_bias_rows(test_df))
print('Train bias', train_bias_nv.shape)
print('Test bias', test_bias_nv.shape)

In [None]:
# Emotion Labels
zero_vector = np.zeros((300,))


def _emotion_bias_rows(frame):
    """Return one bias vector per row of `frame` from 'com_femotion' and 'stance'.

    com_femotion == 0 with stance == 1 -> `emotion_true`;
    com_femotion == 1 with stance == 2 -> `emotion_false`;
    every other combination -> `zero_vector`.
    """
    rows = []
    # Positional iteration; the original used .loc[i] over range(len(frame)),
    # which is only equivalent (or valid at all) with a default 0..n-1 index.
    for hyp, stance in zip(frame['com_femotion'], frame['stance']):
        if hyp == 0 and stance == 1:
            rows.append(emotion_true)
        elif hyp == 1 and stance == 2:
            rows.append(emotion_false)
        else:
            rows.append(zero_vector)
    return rows


# NOTE(review): the bias is derived from the gold 'stance' column for the test
# frame as well, and the resulting test bias is later passed to model.predict
# as an input - confirm this use of the target label is intended.
train_bias_em = np.stack(_emotion_bias_rows(train_df))
test_bias_em = np.stack(_emotion_bias_rows(test_df))
print('Train bias', train_bias_em.shape)
print('Test bias', test_bias_em.shape)

In [None]:
# Final bias per row = novelty bias + emotion bias (element-wise).
train_bias = train_bias_nv + train_bias_em
test_bias = test_bias_nv + test_bias_em

In [None]:
# One-hot encode every label array, then echo them in the same order:
# train novelty / emotion / stance, followed by the test counterparts.
one_hot = keras.utils.to_categorical

y_train_nv = one_hot(Y_nv)
y_train_em = one_hot(Y_em)
y_train_cs = one_hot(Y_cs)
y_test_nv = one_hot(Y_nv_test)
y_test_em = one_hot(Y_em_test)
y_test_cs = one_hot(Y_cs_test)

for encoded in (y_train_nv, y_train_em, y_train_cs,
                y_test_nv, y_test_em, y_test_cs):
    print(encoded)

In [None]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527

# Hold out a validation share of every training array with one consistent
# shuffle. The label and bias inputs are rebuilt here from arrays this cell
# never overwrites (Y_nv, Y_em, Y_cs, train_bias_nv, train_bias_em).
# The original passed y_train_* and train_bias in and then assigned the split
# result back to the same names, so running the cell a second time tried to
# split already-split arrays against the full-length X1/X2 and failed.
_arrays_to_split = [
    X1, X2,
    pre_bert_cs, hyp_bert_cs,
    keras.utils.to_categorical(Y_nv),
    keras.utils.to_categorical(Y_em),
    keras.utils.to_categorical(Y_cs),
    np.add(train_bias_nv, train_bias_em),
]

(x1_train, x1_val,
 x2_train, x2_val,
 x1_train_bert, x1_val_bert,
 x2_train_bert, x2_val_bert,
 y_train_nv, y_val_nv,
 y_train_em, y_val_em,
 y_train_cs, y_val_cs,
 train_bias, val_bias) = train_test_split(
    *_arrays_to_split,
    test_size=VALIDATION_RATIO,
    random_state=RANDOM_STATE,
)

In [None]:
# Report the shapes of the training and validation arrays produced by the split.
print("Training Set")
print("-" * 10)
for prefix, array in (
    ("x1_train: ", x1_train),
    ("x2_train: ", x2_train),
    ("y_train_cs : ", y_train_cs),
    ("Train_Bias : ", train_bias),
):
    print(f"{prefix}{array.shape}")

print("-" * 10)
for prefix, array in (
    ("x1_val:   ", x1_val),
    ("x2_val:   ", x2_val),
    ("y_val_cs :   ", y_val_cs),
    ("Val_Bias : ", val_bias),
):
    print(f"{prefix}{array.shape}")
print("-" * 10)
print("Test Set")

In [None]:
# Number of units in each softmax output head.
NUM_CLASSES = 2

# Length of the token-id inputs. It has to match the padding length used by
# get_data, so reuse MAX_LENGTH instead of repeating the literal 40.
MAX_SEQUENCE_LENGTH = MAX_LENGTH

# NOTE(review): the model-building cell reassigns NUM_LSTM_UNITS to 150 before
# using it, so this value is never the one the layers are built with.
NUM_LSTM_UNITS = 300

# Embedding layer dimensions are taken from the pre-built matrix.
MAX_NUM_WORDS = embedding_matrix.shape[0]

NUM_EMBEDDING_DIM = embedding_matrix.shape[1]

In [None]:
class Attention(tf.keras.Model):
    """Additive (tanh-scored) attention over a sequence, conditioned on a state vector.

    `units` is the width of the two projection layers; a final 1-unit layer
    turns each projected time step into a scalar score.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)   # projects the sequence features
        self.W2 = tf.keras.layers.Dense(units)   # projects the state vector
        self.V = tf.keras.layers.Dense(1)        # one score per time step

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)``.

        `hidden` gets a length-1 axis inserted at position 1 so that its
        projection broadcasts against the projection of `features` when added.
        The weights are a softmax over axis 1 of the scores; the context
        vector is the weighted sum of `features` over that same axis.
        """
        # Insert an axis at position 1 so the sum below broadcasts over axis 1 of `features`.
        query = tf.expand_dims(hidden, 1)

        # Combine both projections, squash, then reduce to one score per step.
        projected = self.W1(features) + self.W2(query)
        energies = self.V(tf.nn.tanh(projected))

        # Normalise the scores along axis 1.
        attention_weights = tf.nn.softmax(energies, axis=1)

        # Weighted sum of the features over axis 1.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [None]:
# BERT + Normal Grand Model
# Functional-API graph with five inputs (two token-id sequences, two 768-wide
# vectors, one 300-wide bias vector) and three softmax heads ('nv', 'em', 'cs').

# Overrides the NUM_LSTM_UNITS value set in the constants cell.
NUM_LSTM_UNITS = 150

# Token-id inputs: "top" is fed X1 (premise) and "bm" is fed X2 (text) in the fit cell.
top_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
bm_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')

# One embedding layer shared by both token inputs, initialised from
# embedding_matrix and left trainable.
embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM, weights = [embedding_matrix], trainable = True)
top_embedded_wd = embedding_layer(
    top_input_wd)
bm_embedded_wd = embedding_layer(
    bm_input_wd)

# source_lstm_wd is applied to both embedded inputs (shared weights);
# shared_lstm_wd runs on their concatenation and also returns its final states.
source_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3), name="bi_lstm_0")
shared_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, return_state=True, activation='tanh', recurrent_dropout = 0.3), name="bi_lstm_1")
top_source_wd = source_lstm_wd(top_embedded_wd)
bm_source_wd = source_lstm_wd(bm_embedded_wd)

# Join the two encoded sequences along the feature axis.
source_comb_wd = concatenate(
    [top_source_wd, bm_source_wd],
    axis=-1
    )
# For novelty task specific
(lstm_ops_wd, forward_h, forward_c, backward_h, backward_c) = shared_lstm_wd(source_comb_wd)

# Concatenate forward/backward final states. Only state_h is used below;
# state_c is built but not referenced again in this cell.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# Attend over the shared BiLSTM sequence output using state_h; 10 is the
# width of the attention projection layers.
context_vector, attention_weights_nv = Attention(10)(lstm_ops_wd, state_h)

# 768-wide float inputs (fed the pre-computed *_bert arrays in the fit cell)
# and the 300-wide bias input.
top_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bm_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bias_input = Input(
    shape = (300, ),
    dtype = 'float32')

# Reshape each 768-vector into a sequence of length 1 so it can go through an LSTM.
top_embedded_bt = Reshape((1, 768, ))(top_input_bt)
bm_embedded_bt = Reshape((1, 768, ))(bm_input_bt)

# Same two-stage BiLSTM layout as the word branch, with separate layers;
# shared_lstm_bt returns only its final output (no return_sequences).
source_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_bt = source_lstm_bt(top_embedded_bt)
bm_source_bt = source_lstm_bt(bm_embedded_bt)

source_comb_bt = concatenate(
    [top_source_bt, bm_source_bt],
    axis=-1
    )
lstm_ops_bt = shared_lstm_bt(source_comb_bt)  #300D vector

#merged = Add()([top_output, bm_output])
#merged_bd = Add()([lstm_ops, bias_input])

# Bert and Normal Combination
# Features for the 'nv' and 'em' heads: sum, difference and element-wise
# product of the attention context vector and the 768-branch output.
comb_features = concatenate(
    [context_vector+lstm_ops_bt, context_vector-lstm_ops_bt, context_vector*lstm_ops_bt],
    axis=-1
    )

# Features for the 'cs' head: the bias input is added to the sum and
# multiplied into the product.
comb_features_cs = concatenate(
    [context_vector+lstm_ops_bt+bias_input, context_vector*lstm_ops_bt*bias_input],
    axis=-1
    )

# One 64-unit tanh layer per task.
pre_nv = Dense(
    units=64, 
    activation='tanh',
    name = 'pre_nv')(comb_features)

pre_em = Dense(
    units=64, 
    activation='tanh',
    name = 'pre_em')(comb_features)

pre_cs = Dense(
    units=64, 
    activation='tanh',
    name = 'pre_cs')(comb_features_cs)

# Softmax output layers; their names are the keys used for the losses and
# loss weights in the compile cell.
dense_nv =  Dense(
    units=NUM_CLASSES, 
    activation='softmax',
    name = 'nv')

dense_em =  Dense(
    units=NUM_CLASSES, 
    activation='softmax',
    name = 'em')

dense_cs =  Dense(
    units=NUM_CLASSES, 
    activation='softmax',
    name = 'cs')

predictions_nv = dense_nv(pre_nv)
predictions_em = dense_em(pre_em)
predictions_cs = dense_cs(pre_cs)

# Output order (nv, em, cs) is the order of the list model.predict returns.
model = Model(
    inputs=[top_input_wd, bm_input_wd, top_input_bt, bm_input_bt, bias_input], 
    outputs=[predictions_nv, predictions_em, predictions_cs])

model.summary()

In [None]:
from keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Adam with an explicit learning rate and decay.
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)

# Pass the configured optimizer instance. The original built `opt` and then
# compiled with the string 'adam', which silently ignored the settings above.
# Loss weights of 0 for 'nv' and 'em' mean only the 'cs' loss contributes to
# the total loss being optimised.
model.compile(
    optimizer=opt,
    loss={'nv':'categorical_crossentropy', 'em':'categorical_crossentropy', 'cs':'categorical_crossentropy'},
    loss_weights={'nv': 0, 'em':0, 'cs': 1},
    metrics=['accuracy'])

# NOTE(review): this checkpoint callback is created here, but the fit cell only
# passes the early-stopping callback, so no checkpoint file is written by it.
checkpointer = ModelCheckpoint(filepath='multitask_csbias.h5', verbose=1, save_best_only=True)

In [None]:
# MultiTask BERT Model
BATCH_SIZE = 256
NUM_EPOCHS = 50

# `patience` is a number of epochs. The original passed 0.001, which stops at
# the first epoch without improvement - the same effect as patience=1, written
# here as a proper epoch count.
# NOTE(review): 0.001 looks like it was meant for `min_delta`; switch to
# EarlyStopping(monitor='val_loss', min_delta=0.001) if that was the intent.
stop = [EarlyStopping(monitor='val_loss', patience=1)]

# Inputs follow the order of the model's input list; targets follow the order
# of its outputs (nv, em, cs).
history = model.fit(
    x=[x1_train, x2_train, x1_train_bert, x2_train_bert, train_bias],
    y=[y_train_nv, y_train_em, y_train_cs],
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(
        [x1_val, x2_val, x1_val_bert, x2_val_bert, val_bias],
        [y_val_nv, y_val_em, y_val_cs]
    ),
    shuffle=True,
    callbacks=stop,
)

In [None]:
# Result Labels
# Empty frame; the evaluation cells below add one predicted-label column each
# and the last cell writes it to CSV.
res_df = pd.DataFrame()

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Test inputs in the same order as the model's input list:
# premise ids, text ids, the two 768-wide arrays, bias vectors.
test_inputs = [X1_test, X2_test, pre_bert_cs_test, hyp_bert_cs_test, test_bias]
predictions = model.predict(test_inputs)

In [None]:
# Shape of all head outputs stacked together, then of each head separately.
print(np.stack(predictions).shape)
for head_index in range(3):
    print(predictions[head_index].shape)

In [None]:
# Stance head: third model output. Predicted class = arg-max over the class axis.
cs_scores = predictions[2]
y_pred = list(np.argmax(cs_scores, axis=1))
res_df['Fake_News_Labels'] = y_pred

print('CS Accuracy is')
cs_accuracy = metrics.accuracy_score(Y_cs_test, y_pred)
print(cs_accuracy*100)
cs_report = classification_report(Y_cs_test, y_pred, target_names = ['against', 'for'])
print(cs_report)

In [None]:
# calculating normal accuracy and checking
# np.asarray(...).tolist() works whether Y_cs_test is still an array or has
# already been converted to a list, so this cell can be re-run; the original
# `Y_cs_test.tolist()` raised AttributeError on a second run.
# NOTE(review): y_pred is reassigned by the later emotion/novelty cells, so
# this check is only meaningful right after the stance evaluation cell.
Y_cs_test = np.asarray(Y_cs_test).tolist()
assert len(Y_cs_test) == len(y_pred)

# Count agreements per class: both 0 ("against") and both 1 ("for").
a_ctr = sum(1 for pred, gold in zip(y_pred, Y_cs_test) if pred == 0 and gold == 0)
f_ctr = sum(1 for pred, gold in zip(y_pred, Y_cs_test) if pred == 1 and gold == 1)

acc = (a_ctr + f_ctr) / len(y_pred)
print('Accuracy is', acc)

In [None]:
# Emotion head: second model output. Predicted class = arg-max over the class axis.
# NOTE(review): the compile cell gives the 'em' loss a weight of 0, so this
# head's own loss does not contribute to training.
em_scores = predictions[1]
y_pred = list(np.argmax(em_scores, axis=1))
res_df['Emotion_Labels'] = y_pred

print('Emotion Accuracy is')
em_accuracy = metrics.accuracy_score(Y_em_test, y_pred)
print(em_accuracy*100)
em_report = classification_report(Y_em_test, y_pred, target_names = ['true', 'false'])
print(em_report)

In [None]:
# Novelty head: first model output. Predicted class = arg-max over the class axis.
# NOTE(review): the compile cell gives the 'nv' loss a weight of 0, so this
# head's own loss does not contribute to training.
nv_scores = predictions[0]
y_pred = list(np.argmax(nv_scores, axis=1))
res_df['Novelty_Labels'] = y_pred

print('NV Accuracy is')
nv_accuracy = metrics.accuracy_score(Y_nv_test, y_pred)
print(nv_accuracy*100)
nv_report = classification_report(Y_nv_test, y_pred, target_names = ['novel', 'duplicate'])
print(nv_report)

In [None]:
# Saving the labels
# Writes the predicted-label columns collected in res_df to a CSV file in the
# working directory, without the index column.
res_df.to_csv("Attn_CS_MtaskRes.csv", index = False)