In [2]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

2.3.0


In [3]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.engine.topology import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [5]:
#################### Importing ByteDance Datasets ####################
# Train set
train_df = pd.read_csv('../Datasets/cstance_train.csv')
print(train_df.columns)
train_df.head()

# Test set
test_df = pd.read_csv('../Datasets/cstance_test_new.csv')
print(test_df.columns)
test_df.head()

Index(['id', 'text', 'stance', 'Novelty_Quora', 'Emotion_Label',
       'com_femotion'],
      dtype='object')
Index(['id', 'text', 'stance', 'Novelty_Quora', 'Emotion_Label',
       'com_femotion'],
      dtype='object')


Unnamed: 0,id,text,stance,Novelty_Quora,Emotion_Label,com_femotion
0,1.25e+18,what is the reason the cdc is hiding info re h...,2,0,1,1
1,1.25e+18,so you are denying that dr vladimir zelenko is...,2,1,0,1
2,1.25e+18,anyone whos been to a malaria area would in th...,2,1,1,1
3,1.26e+18,many countries preceded us in adopting chloroq...,2,1,1,1
4,1.26e+18,those who proselytize against hydroxychloroqui...,2,1,1,1


In [6]:
pre_bert_cs = np.load("../Datasets/pre_bert_accs.npy")
hyp_bert_cs = np.load("../Datasets/hyp_bert_accs.npy")
print('Premise', pre_bert_cs.shape)
print('Hypothesis', hyp_bert_cs.shape)

Premise (8572, 768)
Hypothesis (8572, 768)


In [7]:
pre_bert_cs_test = np.load("../Datasets/pre_bert_test_accs.npy")
hyp_bert_cs_test = np.load("../Datasets/hyp_bert_test_accs.npy")
print('Premise', pre_bert_cs_test.shape)
print('Hypothesis', hyp_bert_cs_test.shape)

Premise (2494, 768)
Hypothesis (2494, 768)


In [8]:
premise = ["chloroquine hydroxychloroquine are cure for the novel coronavirus"]
train_lst_1 = train_df['text'].tolist()
print(len(train_lst_1))
train_lst_1[:5]
uq_tr_1 = list(set(train_lst_1))
print(len(uq_tr_1))
train_merged = uq_tr_1 + premise
print('Train Length is', len(train_merged))
train_merged[:5]
test_lst_1 = test_df['text'].tolist()
uq_ts_1 = list(set(test_lst_1))
test_merged = uq_ts_1
print('Test merged', len(test_merged))
total_dataset = train_merged + test_merged
print('Dataset length is', len(total_dataset))

8572
7185
Train Length is 7186
Test merged 2366
Dataset length is 9552


In [9]:
# Defining the tokenizer
def get_tokenizer(vocabulary_size):
  print('Training tokenizer...')
  tokenizer = Tokenizer(num_words= vocabulary_size)
  tweet_text = []
  print('Read {} Sentences'.format(len(total_dataset)))
  tokenizer.fit_on_texts(total_dataset)
  return tokenizer

In [10]:
# For getting the embedding matrix
def get_embeddings():
  print('Generating embeddings matrix...')
  embeddings_file = 'glove.6B.300d.txt'
  embeddings_index = dict()
  with open(embeddings_file, 'r', encoding="utf-8") as infile:
    for line in infile:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      embeddings_index[word] = vector
	# create a weight matrix for words in training docs
  vocabulary_size = len(embeddings_index)
  embeddinds_size = list(embeddings_index.values())[0].shape[0]
  print('Vocabulary = {}, embeddings = {}'.format(vocabulary_size, embeddinds_size))
  tokenizer = get_tokenizer(vocabulary_size)
  embedding_matrix = np.zeros((vocabulary_size, embeddinds_size))
  considered = 0
  total = len(tokenizer.word_index.items())
  for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
      print(word, index)
      continue
    else:
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        considered += 1
  print('Considered ', considered, 'Left ', total - considered)			
  return embedding_matrix, tokenizer

In [11]:
def get_data(tokenizer, MAX_LENGTH, input_df):
  print('Loading data')
  X1, X2, Y = [], [], []
  X2 = input_df['text'].tolist()
  length = len(X2)
  premise = "chloroquine hydroxychloroquine are cure for the novel coronavirus"
  X1 = [premise for i in range(length)]
  Y = input_df['stance'].tolist()
  new_Y = [(ele-1) for ele in Y]
  assert len(new_Y) == len(Y)
  
  len(X1) == len(X2) == len(Y)
  sequences_1 = tokenizer.texts_to_sequences(X1)
  sequences_2 = tokenizer.texts_to_sequences(X2)
	# for i, s in enumerate(sequences):
	# 	sequences[i] = sequences[i][-250:]
  X1 = pad_sequences(sequences_1, maxlen=MAX_LENGTH)
  X2 = pad_sequences(sequences_2, maxlen=MAX_LENGTH)
  new_Y = np.array(new_Y)
  return X1, X2, new_Y

In [12]:
embedding_matrix, tokenizer = get_embeddings()

Generating embeddings matrix...
Vocabulary = 400000, embeddings = 300
Training tokenizer...
Read 9552 Sentences
Considered  11908 Left  4553


In [13]:
MAX_LENGTH = 50
# read ml data
X1, X2, Y = get_data(tokenizer, MAX_LENGTH, train_df)

Loading data


In [14]:
X1_test, X2_test, Y_test = get_data(tokenizer, MAX_LENGTH, test_df)

Loading data


In [15]:
encoder = LabelBinarizer()#convertes into one hot form
encoder.fit(Y)
Y_enc = encoder.transform(Y)
Y_enc_test = encoder.transform(Y_test)
print(Y_enc)
print(Y_enc_test)

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]
[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]


In [16]:
y_train = keras.utils.to_categorical(Y)
print(y_train)
y_test = keras.utils.to_categorical(Y_test)
print(y_test)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]
[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]


In [17]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527
x1_train, x1_val, \
x2_train, x2_val, \
pre_train_bert, pre_val_bert, \
hyp_train_bert, hyp_val_bert, \
y_train, y_val = \
    train_test_split(
        X1, X2, pre_bert_cs, hyp_bert_cs,
        y_train,
        test_size=VALIDATION_RATIO, 
        random_state=RANDOM_STATE
)

In [18]:
print("Training Set")
print("-" * 10)
print(f"x1_train: {x1_train.shape}")
print(f"x2_train: {x2_train.shape}")
print(f"y_train : {y_train.shape}")
print(f"Bert premise : {pre_train_bert.shape}")

print("-" * 10)
print(f"x1_val:   {x1_val.shape}")
print(f"x2_val:   {x2_val.shape}")
print(f"y_val :   {y_val.shape}")
print(f"Val_Bert_premise : {pre_val_bert.shape}")
print("-" * 10)
print("Test Set")

Training Set
----------
x1_train: (7714, 50)
x2_train: (7714, 50)
y_train : (7714, 2)
Bert premise : (7714, 768)
----------
x1_val:   (858, 50)
x2_val:   (858, 50)
y_val :   (858, 2)
Val_Bert_premise : (858, 768)
----------
Test Set


In [19]:
NUM_CLASSES = 2

MAX_SEQUENCE_LENGTH = 50

NUM_LSTM_UNITS = 150

MAX_NUM_WORDS = embedding_matrix.shape[0]

NUM_EMBEDDING_DIM = embedding_matrix.shape[1]

In [20]:
# BERT + Normal Grand Model

NUM_LSTM_UNITS = 150

top_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
bm_input_wd = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')

embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded_wd = embedding_layer(
    top_input_wd)
bm_embedded_wd = embedding_layer(
    bm_input_wd)

source_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_wd = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_wd = source_lstm_wd(top_embedded_wd)
bm_source_wd = source_lstm_wd(bm_embedded_wd)

source_comb_wd = concatenate(
    [top_source_wd, bm_source_wd],
    axis=-1
    )
lstm_ops_wd = shared_lstm_wd(source_comb_wd)   # 300D vector


top_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bm_input_bt = Input(
    shape=(768, ), 
    dtype='float32')


top_embedded_bt = Reshape((1, 768, ))(top_input_bt)
bm_embedded_bt = Reshape((1, 768, ))(bm_input_bt)

source_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_bt = source_lstm_bt(top_embedded_bt)
bm_source_bt = source_lstm_bt(bm_embedded_bt)

source_comb_bt = concatenate(
    [top_source_bt, bm_source_bt],
    axis=-1
    )
lstm_ops_bt = shared_lstm_bt(source_comb_bt)  #300D vector

# Bert and Normal Combination

comb_features_cs = concatenate(
    [lstm_ops_wd+lstm_ops_bt, lstm_ops_wd-lstm_ops_bt, lstm_ops_wd*lstm_ops_bt],
    axis=-1
    )

pre_cs = Dense(
    units=64, 
    activation='tanh',
    name = 'pre_fnc')(comb_features_cs)

dense_cs =  Dense(
    units=NUM_CLASSES, 
    activation='softmax',
    name = 'fnc')

predictions_cs = dense_cs(pre_cs)

model = Model(
    inputs=[top_input_wd, bm_input_wd, top_input_bt, bm_input_bt], 
    outputs=[predictions_cs])

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 768)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 768)]        0                                            
_______________________________________________________________________________________

In [21]:
from keras.optimizers import Adam
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [22]:
BATCH_SIZE = 512
NUM_EPOCHS = 50
stop = [EarlyStopping(monitor='val_loss', patience=0.001)]
history = model.fit(x=[x1_train, x2_train, pre_train_bert, hyp_train_bert],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val, pre_val_bert, hyp_val_bert], 
                      y_val
                    ),
                    shuffle=True,
                    callbacks=stop,
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50


In [23]:
from sklearn import metrics
from sklearn.metrics import classification_report
predictions = model.predict(
    [X1_test, X2_test, pre_bert_cs_test, hyp_bert_cs_test])

In [24]:
y_pred = [idx for idx in np.argmax(predictions, axis=1)]
test_labels = test_df['stance'].tolist()
n_test_labels = [(ele-1) for ele in test_labels]
print('Accuracy is')
print(metrics.accuracy_score(n_test_labels, y_pred)*100)
print(classification_report(n_test_labels, y_pred, target_names = ['against', 'for']))

Accuracy is
83.96150761828387
              precision    recall  f1-score   support

     against       0.80      0.80      0.80      1007
         for       0.86      0.87      0.87      1487

    accuracy                           0.84      2494
   macro avg       0.83      0.83      0.83      2494
weighted avg       0.84      0.84      0.84      2494



In [25]:
# Model 2
NUM_LSTM_UNITS = 128

print('Getting Text CNN model...')
filter_sizes = [2, 3, 5]
num_filters = 128	#Hyperparameters 32,64,128; 0.2,0.3,0.4
drop = 0.4

top_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
bm_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')

embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded = embedding_layer(
    top_input)
bm_embedded = embedding_layer(
    bm_input)
reshape = Reshape((MAX_SEQUENCE_LENGTH, NUM_EMBEDDING_DIM, 1))(top_embedded)
reshape_1 = Reshape((MAX_SEQUENCE_LENGTH, NUM_EMBEDDING_DIM, 1))(bm_embedded)
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], NUM_EMBEDDING_DIM),  padding='valid', kernel_initializer='normal',  activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], NUM_EMBEDDING_DIM),  padding='valid', kernel_initializer='normal',  activation='relu')(reshape_1)
maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)

top_input_bt = Input(
    shape=(768, ), 
    dtype='float32')
bm_input_bt = Input(
    shape=(768, ), 
    dtype='float32')


top_embedded_bt = Reshape((1, 768, ))(top_input_bt)
bm_embedded_bt = Reshape((1, 768, ))(bm_input_bt)

source_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, return_sequences=True, recurrent_dropout = 0.3))
shared_lstm_bt = Bidirectional(LSTM(NUM_LSTM_UNITS, activation='tanh', recurrent_dropout = 0.3))
top_source_bt = source_lstm_bt(top_embedded_bt)
bm_source_bt = source_lstm_bt(bm_embedded_bt)

source_comb_bt = concatenate(
    [top_source_bt, bm_source_bt],
    axis=-1
    )
lstm_ops_bt = shared_lstm_bt(source_comb_bt)  #256D vector

comb_features_cs = concatenate(
    [dropout+lstm_ops_bt, dropout-lstm_ops_bt, dropout*lstm_ops_bt],
    axis=-1
    )

predictions = Dense(units=NUM_CLASSES, activation='sigmoid')(comb_features_cs)

model = Model(
    inputs=[top_input, bm_input, top_input_bt, bm_input_bt], 
    outputs=predictions)
model.summary()

Getting Text CNN model...
Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      120000000   input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 50, 300, 1)   0       

In [26]:
from keras.optimizers import Adam
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [27]:
BATCH_SIZE = 512
NUM_EPOCHS = 50
stop = [EarlyStopping(monitor='val_loss', patience=0.001)]
history = model.fit(x=[x1_train, x2_train, pre_train_bert, hyp_train_bert],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val, pre_val_bert, hyp_val_bert], 
                      y_val
                    ),
                    shuffle=True,
                    callbacks=stop,
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [28]:
from sklearn import metrics
from sklearn.metrics import classification_report
predictions = model.predict(
    [X1_test, X2_test, pre_bert_cs_test, hyp_bert_cs_test])

In [29]:
y_pred = [idx for idx in np.argmax(predictions, axis=1)]
test_labels = test_df['stance'].tolist()
n_test_labels = [(ele-1) for ele in test_labels]
print('Accuracy is')
print(metrics.accuracy_score(n_test_labels, y_pred)*100)
print(classification_report(n_test_labels, y_pred, target_names = ['against', 'for']))

Accuracy is
85.44506816359262
              precision    recall  f1-score   support

     against       0.82      0.82      0.82      1007
         for       0.88      0.88      0.88      1487

    accuracy                           0.85      2494
   macro avg       0.85      0.85      0.85      2494
weighted avg       0.85      0.85      0.85      2494

