In [None]:
# Record the TensorFlow version this notebook was executed with.
import tensorflow as tf
print(tf.__version__)

2.3.0


In [None]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation
from keras.layers import add

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.engine.topology import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [None]:
# ---- Load the FNC train and competition-test CSVs ----
# A single LabelEncoder is fitted on the training stances and re-used on the
# test stances, so the integer codes mean the same thing in both frames.
le = LabelEncoder()

train_df = pd.read_csv('../FNC_Dataset/train_fnc_processed.csv')
print(train_df.columns)
train_df = train_df.assign(Stance=le.fit_transform(train_df['Stance']))

test_df = pd.read_csv('../FNC_Dataset/competition_test_fnc_processed.csv')
print(test_df.columns)
test_df = test_df.assign(Stance=le.transform(test_df['Stance']))

# Last expression of the cell: rich preview of the encoded test frame.
test_df.head()

Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1'], dtype='object')
Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1'], dtype='object')


Unnamed: 0,Headline,Body ID,Stance,Body,Novelty_Labels,Emotion_1
0,Ferguson riots: Pregnant woman loses eye after...,2008,3,A RESPECTED senior French police officer inves...,neutral,anger
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,3,Dave Morin's social networking company Path is...,neutral,fear
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,3,A bereaved Afghan mother took revenge on the T...,contradiction,joy
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,3,Hewlett-Packard is officially splitting in two...,entailment,joy
4,Argentina's President Adopts Boy to End Werewo...,37,3,An airline passenger headed to Dallas was remo...,contradiction,turst


In [None]:
# Build the tokenizer corpus: every distinct body and headline of both splits.
#
# De-duplication uses dict.fromkeys (insertion-ordered) rather than set():
# the iteration order of a set of strings changes from one kernel start to the
# next (string hash randomisation), which made `total_dataset` -- and any
# order-sensitive consumer such as `uq_tr_1[4]` -- differ between runs.
train_lst_1 = train_df['Body'].tolist()
print(len(train_lst_1))
train_lst_2 = train_df['Headline'].tolist()
print(len(train_lst_2))
uq_tr_1 = list(dict.fromkeys(train_lst_1))
uq_tr_2 = list(dict.fromkeys(train_lst_2))
print(len(uq_tr_1))
print(len(uq_tr_2))
train_merged = uq_tr_1 + uq_tr_2
print('Train Length is', len(train_merged))

test_lst_1 = test_df['Body'].tolist()
test_lst_2 = test_df['Headline'].tolist()
uq_ts_1 = list(dict.fromkeys(test_lst_1))
uq_ts_2 = list(dict.fromkeys(test_lst_2))
test_merged = uq_ts_1 + uq_ts_2
print('Test merged', len(test_merged))

# Train and test texts are de-duplicated separately, so a text that occurs in
# both splits appears twice in the combined corpus.
total_dataset = train_merged + test_merged
print('Dataset length is', len(total_dataset))

49972
49972
1669
1648
Train Length is 3317
Test merged 1794
Dataset length is 5111


In [None]:
# Eyeball one de-duplicated training body as a sanity check.
print(uq_tr_1[4])

Spotting a spider on the floor may give you a fright, but imagine finding out one was living under your skin for three days.

This is what happened to Australian man Dylan Thomas who went on vacation to Bali with a friend. His friend noticed a strange red scar going up the 21-year-old's stomach. At first, doctors told him it was an insect bite, but they later discovered a spider had burrowed itself into Thomas's abdomen, most likely through his recent appendix scar.

Thomas posted this on his Facebook page: "Well after running tests and putting things inside my stomach they finally found out it was a tropical spider that's been living inside of me for the last three days, managed to get it out luckily. Haven't felt so violated in my life before! Just glad it's all over."

Thomas explains not only was there a painful burning sensation under his skin, but the scar began to get bigger and bubble.

Check out photos from Thomas's experience in the video above. Yikes!


In [None]:
# Defining the tokenizer
def get_tokenizer(vocabulary_size, texts=None):
  """Fit a Keras ``Tokenizer`` on a list of texts.

  Args:
    vocabulary_size: Forwarded to ``Tokenizer(num_words=...)``.
    texts: Optional list of strings to fit on. When omitted, the
      notebook-level ``total_dataset`` is used, so existing
      ``get_tokenizer(n)`` calls behave exactly as before.

  Returns:
    The fitted ``Tokenizer``.
  """
  if texts is None:
    texts = total_dataset
  print('Training tokenizer...')
  tokenizer = Tokenizer(num_words=vocabulary_size)
  print('Read {} Sentences'.format(len(texts)))
  tokenizer.fit_on_texts(texts)
  return tokenizer

In [None]:
# For getting the embedding matrix
def get_embeddings(embeddings_file='glove.6B.300d.txt'):
  """Read a GloVe-style text file and build an embedding matrix for the corpus.

  Each non-blank line of ``embeddings_file`` is split on whitespace; the first
  token is the word and the remaining tokens are parsed as float32 values.

  Args:
    embeddings_file: Path to the embeddings text file. Defaults to the path
      the notebook has always used, so ``get_embeddings()`` is unchanged.

  Returns:
    ``(embedding_matrix, tokenizer)`` where ``embedding_matrix`` has one row
    per entry of the embeddings file (not per corpus word) and one column per
    embedding dimension. Row ``i`` holds the vector of the tokenizer word
    whose index is ``i``; rows for words missing from the file stay zero.

  Raises:
    ValueError: If the file contains no embedding lines.
  """
  print('Generating embeddings matrix...')
  embeddings_index = dict()
  with open(embeddings_file, 'r', encoding="utf-8") as infile:
    for line in infile:
      values = line.split()
      if not values:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], "float32")
  if not embeddings_index:
    raise ValueError('No embeddings found in {!r}'.format(embeddings_file))

  # The matrix is sized by the embeddings file, and the same number is used
  # as the tokenizer's num_words cap.
  vocabulary_size = len(embeddings_index)
  # Peek at one vector for the dimensionality without copying all values.
  embeddings_size = next(iter(embeddings_index.values())).shape[0]
  print('Vocabulary = {}, embeddings = {}'.format(vocabulary_size, embeddings_size))
  tokenizer = get_tokenizer(vocabulary_size)
  embedding_matrix = np.zeros((vocabulary_size, embeddings_size))
  considered = 0
  total = len(tokenizer.word_index)
  for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
      # Index does not fit in the matrix; report the word and skip it.
      print(word, index)
      continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector
      considered += 1
  print('Considered ', considered, 'Left ', total - considered)
  return embedding_matrix, tokenizer

In [None]:
def get_data(tokenizer, MAX_LENGTH, input_df):
  """Turn a stance DataFrame into padded token-id arrays plus labels.

  Args:
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    MAX_LENGTH: ``maxlen`` handed to ``pad_sequences`` for both text columns.
    input_df: DataFrame with ``'Body'``, ``'Headline'`` and ``'Stance'``
      columns.

  Returns:
    ``(X1, X2, Y)``: padded sequences for the bodies, padded sequences for
    the headlines, and the ``'Stance'`` column as a NumPy array.
  """
  print('Loading data')
  bodies = input_df['Body'].tolist()
  headlines = input_df['Headline'].tolist()
  labels = input_df['Stance'].tolist()

  assert len(bodies) == len(headlines) == len(labels)
  X1 = pad_sequences(tokenizer.texts_to_sequences(bodies), maxlen=MAX_LENGTH)
  X2 = pad_sequences(tokenizer.texts_to_sequences(headlines), maxlen=MAX_LENGTH)
  Y = np.array(labels)
  return X1, X2, Y

In [None]:
# Parse the GloVe file, fit the tokenizer on `total_dataset`, and build the
# embedding matrix (progress is printed by get_embeddings / get_tokenizer).
embedding_matrix, tokenizer = get_embeddings()

Generating embeddings matrix...
Vocabulary = 400000, embeddings = 300
Training tokenizer...
Read 5111 Sentences
Considered  26192 Left  11274


In [None]:
MAX_LENGTH = 100  # maxlen passed to pad_sequences for both bodies and headlines
# Tokenise and pad the training split: X1 = bodies, X2 = headlines, Y = encoded stance.
X1, X2, Y = get_data(tokenizer, MAX_LENGTH, train_df)

Loading data


In [None]:
# Same preprocessing for the test split, using the same tokenizer and MAX_LENGTH.
X1_test, X2_test, Y_test = get_data(tokenizer, MAX_LENGTH, test_df)

Loading data


In [None]:
# Sanity check: one label per row, and X1 is a (rows, MAX_LENGTH) NumPy array.
print(Y.shape)
print(type(X1))
X1.shape

(49972,)
<class 'numpy.ndarray'>


(49972, 100)

In [None]:
# Load pre-computed per-example vectors from disk; the printed shapes show one
# 300-d row per train / test example. How these files were produced is not
# shown in this notebook.
train_doc = np.load('train_embed.npy')
test_doc = np.load('test_embed.npy')
print(train_doc.shape)
print(test_doc.shape)

(49972, 300)
(25413, 300)


In [None]:
# Reduce both splits to a binary task by deleting every row whose encoded
# stance is 2 or 3, leaving only codes 0 and 1.
# NOTE(review): with LabelEncoder's sorted classes these are presumably
# 'discuss' and 'unrelated' -- confirm against le.classes_.

# --- train ---
train_stance = train_df['Stance'].values
result, result_1 = (np.where(train_stance == code)[0] for code in (2, 3))
print(result.shape, result_1.shape)
result_comb = np.concatenate((result, result_1))
print(result_comb.shape)
reduced_X1, reduced_X2, reduced_train_doc = (
    np.delete(arr, result_comb, axis=0) for arr in (X1, X2, train_doc)
)
print('Train shape', reduced_X1.shape)
reduced_train_labels = np.delete(train_stance, result_comb)
print('Train labels', reduced_train_labels)

# --- test ---
test_stance = test_df['Stance'].values
result_test, result_test_1 = (np.where(test_stance == code)[0] for code in (2, 3))
result_test_comb = np.concatenate((result_test, result_test_1))
reduced_X1_test, reduced_X2_test, reduced_test_doc = (
    np.delete(arr, result_test_comb, axis=0) for arr in (X1_test, X2_test, test_doc)
)
print('Test shape', reduced_X1_test.shape)
reduced_test_labels = np.delete(test_stance, result_test_comb)
print('Test labels', reduced_test_labels)

(8909,) (36545,)
(45454,)
Train shape (4518, 100)
Train labels [0 1 0 ... 0 1 0]
Test shape (2600, 100)
Test labels [0 0 0 ... 1 1 0]


In [None]:
# Build a notebook-level word -> vector lookup from the GloVe text file.
# This repeats the parsing done inside get_embeddings(), whose index is local
# to that function and therefore not available here.
embeddings_file = 'glove.6B.300d.txt'
embeddings_index = {}
with open(embeddings_file, mode='r', encoding="utf-8") as infile:
  for line in infile:
    values = line.split()
    # First token is the word, the rest are its float32 components.
    word, vector = values[0], np.asarray(values[1:], "float32")
    embeddings_index[word] = vector

In [None]:
# GloVe vectors for the two stance words. Only `disagree` is used by the
# bias-vector cell in this notebook; `agree` is looked up but not referenced there.
agree = embeddings_index['agree']
disagree = embeddings_index['disagree']

In [None]:
def _bias_matrix(labels, split_name):
  """Return one 300-d row per label: zeros for label 0, `disagree` for label 1.

  Raises ValueError on any other label. Silently skipping such rows would
  leave the result shorter than, and misaligned with, the feature arrays.
  """
  labels = np.asarray(labels)
  unexpected = np.setdiff1d(labels, [0, 1])
  if unexpected.size:
    raise ValueError(
        'Unexpected labels in {}: {}'.format(split_name, unexpected.tolist()))
  # Broadcast the (N, 1) mask against the two (300,) vectors -> (N, 300).
  return np.where((labels == 1)[:, None], disagree, zero_vector)


zero_vector = np.zeros((300,))
# Preview only: printing the full label list floods the notebook output.
print(reduced_train_labels[:20].tolist())

# NOTE(review): these vectors are a pure function of the gold label, so using
# them as a model input (especially at test time) leaks the target.
train_bias = _bias_matrix(reduced_train_labels, 'train')
test_bias = _bias_matrix(reduced_test_labels, 'test')
print(train_bias.shape)
print(test_bias.shape)

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 

In [None]:
# One-hot encode the remaining two stance codes for categorical cross-entropy.
# Note: `y_train` is rebound by the train/validation split cell further down.
y_train = keras.utils.to_categorical(reduced_train_labels)
print(y_train)
y_test = keras.utils.to_categorical(reduced_test_labels)
print(y_test)

[[1. 0.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [1. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]


In [None]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527

# Split bodies, headlines, document vectors, bias vectors and one-hot labels
# with one call so that all five stay row-aligned.
# Fix: the second array used to be `reduced_X1` again, which made
# x2_train / x2_val copies of the bodies instead of the headlines.
# Caveat: `y_train` is rebound here, so re-run the one-hot cell before
# re-running this one (otherwise the array lengths no longer match).
(x1_train, x1_val,
 x2_train, x2_val,
 doc_train, doc_val,
 bias_train, bias_val,
 y_train, y_val) = train_test_split(
    reduced_X1, reduced_X2, reduced_train_doc, train_bias, y_train,
    test_size=VALIDATION_RATIO,
    random_state=RANDOM_STATE
)

In [None]:
# Shape report for the train / validation arrays, emitted with a single print
# call (one item per output line).
divider = "-" * 10
print(
    "Training Set",
    divider,
    f"x1_train: {x1_train.shape}",
    f"x2_train: {x2_train.shape}",
    f"y_train : {y_train.shape}",
    divider,
    f"x1_val:   {x1_val.shape}",
    f"x2_val:   {x2_val.shape}",
    f"y_val :   {y_val.shape}",
    divider,
    "Test Set",
    sep="\n",
)

Training Set
----------
x1_train: (4066, 100)
x2_train: (4066, 100)
y_train : (4066, 2)
----------
x1_val:   (452, 100)
x2_val:   (452, 100)
y_val :   (452, 2)
----------
Test Set


In [None]:
# Hyper-parameters for the model cell below.
NUM_CLASSES = 2  # width of the softmax output layer

MAX_SEQUENCE_LENGTH = 100  # model input length; same value as MAX_LENGTH used for padding

NUM_LSTM_UNITS = 128  # NOTE(review): not referenced by the model cell, which hard-codes LSTM(150)

MAX_NUM_WORDS = embedding_matrix.shape[0]  # number of rows in the embedding table

NUM_EMBEDDING_DIM = embedding_matrix.shape[1]  # embedding vector size

In [None]:
# Two-input stance classifier.
#   top_input : token ids of the first text  (fit() feeds the bodies here)
#   bm_input  : token ids of the second text (fit() feeds the headlines here)
# Both inputs share one embedding table and one sequence-returning BiLSTM.
# Earlier experiments with extra document-vector / bias-vector inputs have
# been removed; the model takes exactly these two inputs.
top_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ),
    dtype='int32')
bm_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ),
    dtype='int32')

# Fix: initialise the embedding table from the GloVe matrix built by
# get_embeddings(). Previously `embedding_matrix` only supplied the layer's
# dimensions and the weights themselves were randomly initialised.
# The layer is left trainable (the Keras default), as before.
embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM,
    embeddings_initializer=initializers.Constant(embedding_matrix))
top_embedded = embedding_layer(
    top_input)
bm_embedded = embedding_layer(
    bm_input)

# Shared per-timestep encoder, applied to each text separately.
source_lstm = Bidirectional(LSTM(150, return_sequences=True))
# Second BiLSTM that collapses the joined sequence into a single vector.
shared_lstm = Bidirectional(LSTM(150))
new_top_embed = source_lstm(top_embedded)
new_bm_embed = source_lstm(bm_embedded)

# Join the two encoded sequences along the feature axis, timestep by timestep.
merged = concatenate(
    [new_top_embed, new_bm_embed],
    axis=-1
    )
merged_lstm = shared_lstm(merged)

# Bag-of-embeddings summaries: sum the word vectors over the time axis.
source_rep = K.sum(top_embedded, axis=1)
target_rep = K.sum(bm_embedded, axis=1)
merged_input = concatenate(
    [merged_lstm, source_rep, target_rep],
    axis=-1)

dense = Dense(
    units=NUM_CLASSES,
    activation='softmax')
predictions = dense(merged_input)

model = Model(
    inputs=[top_input, bm_input],
    outputs=predictions)

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     120000000   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 300)     541200      embedding[0][0]       

In [None]:
# Adam with an explicit learning rate and decay.
# Fix: pass the configured optimizer instance to compile(). Previously the
# string 'adam' was passed, so `opt` (and its lr / decay settings) was
# created but never used.
lr = 1e-3
opt = Adam(learning_rate=lr, decay=lr/50)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the two remaining stance codes.
# Arguments are passed by keyword: current scikit-learn releases reject the
# positional form. The result is stored under its own name so it no longer
# shadows the imported `class_weight` module.
stance_classes = np.unique(reduced_train_labels)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=stance_classes,
    y=reduced_train_labels)
# Key each weight by its actual class id (plain Python types) rather than by
# position, so the mapping stays correct even if the ids are not 0..k-1.
class_weight_dict = {
    int(cls): float(weight)
    for cls, weight in zip(stance_classes, class_weights)
}

In [None]:
# Insert a length-1 middle axis into the document vectors: (N, 300) -> (N, 1, 300).
# NOTE(review): none of these arrays is passed to model.fit / model.predict in
# the cells visible here.
doc_train = np.reshape(doc_train, (doc_train.shape[0], 1,300))
doc_val = np.reshape(doc_val, (doc_val.shape[0], 1,300))
doc_test = np.reshape(reduced_test_doc, (reduced_test_doc.shape[0], 1, 300))

In [None]:
print(doc_train.shape)
print(doc_val.shape)
BATCH_SIZE = 256
NUM_EPOCHS = 10
# Fixes:
#  * `patience` is a number of epochs, so it must be an integer. The previous
#    value 0.001 stopped on the first non-improving epoch, which is what
#    patience=1 does.
#    NOTE(review): 0.001 looks like it was meant as `min_delta` -- confirm.
#  * the callback list is now actually handed to fit(); before, `stop` was
#    built but never used, so early stopping never ran.
stop = [EarlyStopping(monitor='val_loss', patience=1)]
history = model.fit(x=[x1_train, x2_train],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val],
                      y_val
                    ),
                    shuffle=True,
                    class_weight=class_weight_dict,
                    callbacks=stop
          )

(4066, 1, 300)
(452, 1, 300)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Fix: the model is defined with exactly two inputs (first text, second text).
# The previous call also passed `test_bias`, which does not match the model's
# inputs and is derived from the test labels, so it must never be a model input.
predictions = model.predict(
    [reduced_X1_test, reduced_X2_test])

In [None]:
# Turn class probabilities into hard labels, then report accuracy (in percent)
# and the per-class precision / recall / F1 table.
y_pred = list(np.argmax(predictions, axis=1))
accuracy_pct = metrics.accuracy_score(reduced_test_labels, y_pred)*100
print('Accuracy is')
print(accuracy_pct)
report = classification_report(
    reduced_test_labels, y_pred, target_names = ['agreed', 'disagreed'])
print(report)

Accuracy is
67.5
              precision    recall  f1-score   support

      agreed       0.74      0.85      0.79      1903
   disagreed       0.33      0.21      0.25       697

    accuracy                           0.68      2600
   macro avg       0.54      0.53      0.52      2600
weighted avg       0.63      0.68      0.65      2600

