In [1]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd

In [2]:
import numpy as np
def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and decoded with ``encoding='bytes'``.

    NOTE(review): unpickling can execute arbitrary code, so this must only
    be pointed at trusted files.
    """
    import pickle
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [3]:
# Load the pickled training novelty vectors (a dict of per-sample arrays)
dct = unpickle('ESIM_data/FNC/fnc_train_novelty.pickle')
# Stack the dict values, in insertion order, into one feature array.
# NOTE(review): later cells pair row i with row i of train_df -- presumably
# the pickle was written in the same order as the CSV; confirm.
novelty_feature = np.stack([*dct.values()])
novelty_feature.shape

(49972, 1, 300)

In [4]:
# Removing the middle dimension: (n_samples, 1, n_features) -> (n_samples, n_features).
# reshape is used instead of the in-place ndarray.resize with a hard-coded
# row count: resize silently truncates or zero-pads when the element count
# does not match, whereas reshape raises. The row count is taken from the
# array itself, so the cell also stays correct if the dataset size changes
# and can be re-run safely.
novelty_feature = novelty_feature.reshape(novelty_feature.shape[0], -1)
novelty_feature.shape

(49972, 300)

In [5]:
# Reading the test set
dct_test = unpickle('ESIM_data/FNC/fnc_test_novelty.pickle')
# Stack the dict values (insertion order) into one array.
test_feature = np.stack(list(dct_test.values()))
# Flatten everything after the sample axis into one feature axis. reshape
# validates the element count, unlike in-place ndarray.resize which would
# silently truncate or zero-pad on a mismatch.
test_feature = test_feature.reshape(test_feature.shape[0], -1)
test_feature.shape

(25413, 300)

In [6]:
# Categorically encode the stance labels of the training set
import pandas as pd
from sklearn.preprocessing import LabelEncoder
train_df = pd.read_csv('../FNC_Dataset/train_fnc_processed.csv')
print(train_df.columns)
# Fit the encoder on the training stances only; the fitted `le` is reused
# for the test split so both share one label -> integer mapping.
# Codes 0 and 1 are later reported as 'agreed' / 'disagreed' and codes 2 and 3
# are dropped -- NOTE(review): confirm the mapping with `le.classes_`.
le = LabelEncoder().fit(train_df['Stance'])
train_df['Stance'] = le.transform(train_df['Stance'])
train_df.head()

Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1'], dtype='object')


Unnamed: 0,Headline,Body ID,Stance,Body,Novelty_Labels,Emotion_1
0,Police find mass graves with at least '15 bodi...,712,3,Danny Boyle is directing the untitled film\r\n...,contradiction,fear
1,Hundreds of Palestinians flee floods in Gaza a...,158,0,Hundreds of Palestinians were evacuated from t...,entailment,sadness
2,"Christian Bale passes on role of Steve Jobs, a...",137,3,30-year-old Moscow resident was hospitalized w...,neutral,sadness
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,3,(Reuters) - A Canadian soldier was shot at the...,contradiction,fear
4,Spider burrowed through tourist's stomach and ...,1923,1,"Fear not arachnophobes, the story of Bunbury's...",neutral,fear


In [7]:
# Load the competition test split
test_df = pd.read_csv('../FNC_Dataset/competition_test_fnc_processed.csv')
print(test_df.columns)
# Encode the stances with the encoder that was fitted on the training set
# (transform only, so an unseen stance value raises instead of being remapped).
test_df = test_df.assign(Stance=le.transform(test_df['Stance']))
test_df.head()

Index(['Headline', 'Body ID', 'Stance', 'Body', 'Novelty_Labels', 'Emotion_1',
       'Novelty_Quora', 'Novelty_Quora_1'],
      dtype='object')


Unnamed: 0,Headline,Body ID,Stance,Body,Novelty_Labels,Emotion_1,Novelty_Quora,Novelty_Quora_1
0,Ferguson riots: Pregnant woman loses eye after...,2008,3,A RESPECTED senior French police officer inves...,neutral,anger,1,1
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,3,Dave Morin's social networking company Path is...,neutral,fear,0,0
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,3,A bereaved Afghan mother took revenge on the T...,contradiction,joy,0,1
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,3,Hewlett-Packard is officially splitting in two...,entailment,joy,1,1
4,Argentina's President Adopts Boy to End Werewo...,37,3,An airline passenger headed to Dallas was remo...,contradiction,turst,0,0


In [8]:
# Build the word -> vector lookup from the pre-trained GloVe text file
embeddings_dict = {}
with open("../resources/glove.6B.200d.txt", 'r', encoding="utf-8") as glove_file:
    for entry in glove_file:
        fields = entry.split()
        # The first whitespace-separated field is the token; the remaining
        # fields are its coefficients, stored as a float32 array.
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

In [9]:
def find_closest_embeddings(embedding, embeddings=None, top_n=None):
    """Rank vocabulary words by Euclidean distance to ``embedding``.

    Parameters
    ----------
    embedding : array-like
        Query vector; must have the same length as the stored vectors.
    embeddings : mapping of word -> vector, optional
        Lookup table to search. Defaults to the notebook-level
        ``embeddings_dict``, which keeps existing one-argument calls working.
    top_n : int, optional
        If given, only the ``top_n`` closest words are returned; by default
        the full ranking is returned.

    Returns
    -------
    list of str
        Words ordered from closest to farthest. Ties keep the table's
        iteration order because ``sorted`` is stable.
    """
    if embeddings is None:
        embeddings = embeddings_dict
    ranked = sorted(
        embeddings,
        key=lambda word: spatial.distance.euclidean(embeddings[word], embedding),
    )
    return ranked if top_n is None else ranked[:top_n]

In [10]:
# Show the GloVe vector of "original", then ranks 1-5 of its neighbour list
# (rank 0 is skipped -- presumably the query word itself at distance 0).
print(
    embeddings_dict["original"],
    find_closest_embeddings(embeddings_dict["original"])[1:6],
    sep="\n",
)

[-0.45048   -0.1449     0.069873  -0.2611     0.51599    0.2649
 -0.37062   -0.4043    -0.047766  -0.85103    0.15076   -0.3983
 -0.30388    0.13779    0.31623   -0.27125    0.45444    0.93036
  0.37158   -0.13067    0.016844   2.2395     0.21558   -0.97138
 -0.15899    0.5531    -0.045112  -0.76692    0.0094216 -0.12936
 -0.21059   -0.11888    0.1508    -0.2525    -0.22782   -0.53595
 -0.40099   -0.58793    0.059262  -0.64623   -0.20917   -0.03534
 -0.034241  -0.08936   -0.16375    0.36763    0.82737   -0.10209
  0.19804    0.4031    -0.36257   -0.072119   0.2679    -0.20291
  0.10427    0.24153   -0.06382    0.4669    -0.12288    0.11546
 -0.11928    0.12932   -0.8338    -0.82749    0.014886  -0.57084
 -0.58857   -0.089826   0.39842    0.022715   0.54963   -0.30594
  0.058506   0.33867    0.17773    0.30477   -0.13231    0.1711
 -0.26207   -0.20595    0.58286   -0.090458   0.026476   0.090705
  0.20869   -0.22999   -0.51844   -0.82133    0.81661   -0.63622
  0.10171    1.0201     0.3

In [11]:
# Show the GloVe vector of "duplicate", ranks 1-5 of its neighbour list
# (rank 0 is skipped -- presumably the query word itself), and the vector's shape.
print(
    embeddings_dict["duplicate"],
    find_closest_embeddings(embeddings_dict["duplicate"])[1:6],
    embeddings_dict["duplicate"].shape,
    sep="\n",
)

[-0.11542    0.62583    0.0031159 -0.27893    0.67254   -0.45319
 -0.16397   -0.32293   -0.26415   -0.25232   -0.52077   -0.088397
 -0.69601   -0.16587    0.12641   -0.60435   -0.45325    0.34664
 -0.37659   -0.44404   -0.54856    0.46296    0.16677    0.53835
 -0.51188   -0.026023   0.64821   -0.6264     0.44719   -0.15889
 -0.2688     0.035679   0.29732   -0.048158  -0.0096131 -0.37165
 -0.67745   -0.5086     0.64688    0.1884     0.19655    0.034364
  0.29706    0.20052   -0.016906   0.0406     0.56526   -0.55097
 -0.10443   -0.22204   -0.03948   -0.90869   -0.14798    0.19678
 -0.15683    0.62182   -0.0029273 -0.4239    -0.063591  -0.12829
 -0.39695    0.21382   -0.10626    0.033134   0.5027    -0.35057
  0.070328  -0.28327    0.29084    0.33744    0.56324   -0.46098
  0.63119    0.21986   -0.73121    0.13872    0.0082016 -0.30085
 -0.25409    0.67545    0.36366    0.54803    0.68694    0.18578
  0.29969   -0.55681   -0.41297   -0.3755     0.3044    -0.43858
  0.36564    0.2197    

In [12]:
# Keep the two anchor word vectors around; they are used later as the
# bias vectors added to the novelty features.
novel, duplicate = embeddings_dict["original"], embeddings_dict["duplicate"]

In [36]:
# Reading Labels info
def build_novelty_bias(df, contradiction_vec, entailment_vec):
    """Build one bias vector per row of ``df``.

    A row gets ``contradiction_vec`` when ``Novelty_Labels == 'contradiction'``
    and ``Stance == 1``, ``entailment_vec`` when ``Novelty_Labels == 'entailment'``
    and ``Stance == 0``, and a zero vector otherwise.

    Returns a float64 array of shape ``(len(df), len(contradiction_vec))``.
    Rows are matched by position, and an empty ``df`` yields an empty array.

    NOTE(review): the bias depends on ``Stance``, which is also the target the
    classifier is trained and evaluated on, and this is applied to the test
    split as well -- this looks like label leakage; confirm it is intended.
    """
    novelty = df['Novelty_Labels']
    stance = df['Stance']
    contradiction_rows = ((novelty == 'contradiction') & (stance == 1)).to_numpy()
    entailment_rows = ((novelty == 'entailment') & (stance == 0)).to_numpy()
    bias = np.zeros((len(df), contradiction_vec.shape[0]))
    # The two masks are mutually exclusive, so assignment order is irrelevant.
    bias[contradiction_rows] = contradiction_vec
    bias[entailment_rows] = entailment_vec
    return bias

# Kept for backward compatibility with cells that may still read it;
# sized from the embedding instead of a hard-coded 200.
zero_vector = np.zeros(novel.shape)
train_bias = build_novelty_bias(train_df, novel, duplicate)
test_bias = build_novelty_bias(test_df, novel, duplicate)
print('Train bias', train_bias.shape)
print('Test bias', test_bias.shape)

Train bias (49972, 200)
Test bias (25413, 200)


In [37]:
# Performing PCA to reduce the dimensions
from sklearn.decomposition import PCA
# 200 components so that the reduced features have the same width as the
# 200-d bias vectors they are added to in the next cell.
# A fixed random_state makes the projection reproducible across
# "Restart & Run All": with the default svd_solver='auto', scikit-learn may
# pick the randomized solver, which is stochastic without a seed.
PCA_RANDOM_STATE = 42
pca = PCA(n_components=200, random_state=PCA_RANDOM_STATE)
# Fit on the training features only, then apply the same projection to test.
train_novelty_feature = pca.fit_transform(novelty_feature)
test_novelty_feature = pca.transform(test_feature)
print('Train shape is', train_novelty_feature.shape)
print('Test shape is', test_novelty_feature.shape)

Train shape is (49972, 200)
Test shape is (25413, 200)


In [38]:
# Shift the PCA-reduced novelty features by the per-row bias vectors
fnc_train_nt_feature = train_novelty_feature + train_bias
fnc_test_nt_feature = test_novelty_feature + test_bias
print('Train', fnc_train_nt_feature.shape)
print('Test', fnc_test_nt_feature.shape)
# Value ranges, as a quick sanity check on the scale of the summed features
print('Max Train value', fnc_train_nt_feature.max())
print('Min Train value', fnc_train_nt_feature.min())
print('Max Test value', fnc_test_nt_feature.max())
print('Min Test value', fnc_test_nt_feature.min())

Train (49972, 200)
Test (25413, 200)
Max Train value 16.037691116333008
Min Train value -11.610048800706863
Max Test value 15.767217636108398
Min Test value -11.282849311828613


In [39]:
# Drop every sample whose encoded stance is 2 or 3 from both train and test,
# so that only stance codes 0 and 1 remain. Features and labels are filtered
# with the same row indices, which keeps them aligned.
train_stance = train_df['Stance'].values
result = np.flatnonzero(train_stance == 2)
result_1 = np.flatnonzero(train_stance == 3)
print(result.shape, result_1.shape)
result_comb = np.concatenate([result, result_1])
print(result_comb.shape)
reduced_fnc_nt_train = np.delete(fnc_train_nt_feature, result_comb, axis=0)
print('Train shape', reduced_fnc_nt_train.shape)
reduced_train_labels = np.delete(train_stance, result_comb)
print('Train labels', reduced_train_labels)

# Same filtering for the test split
test_stance = test_df['Stance'].values
result_test = np.flatnonzero(test_stance == 2)
result_test_1 = np.flatnonzero(test_stance == 3)
result_test_comb = np.concatenate([result_test, result_test_1])
reduced_fnc_nt_test = np.delete(fnc_test_nt_feature, result_test_comb, axis=0)
print('Test shape', reduced_fnc_nt_test.shape)
reduced_test_labels = np.delete(test_stance, result_test_comb)
print('Test labels', reduced_test_labels)

(8909,) (36545,)
(45454,)
Train shape (4518, 200)
Train labels [0 1 0 ... 0 1 0]
Test shape (2600, 200)
Test labels [0 0 0 ... 1 1 0]


In [40]:
from sklearn.utils.class_weight import compute_class_weight
# The class list must describe the labels passed as `y`, i.e. the TRAINING
# labels. It was previously derived from the test labels, which raises (or
# mis-weights) as soon as the two splits do not contain the same classes.
train_classes = np.unique(reduced_train_labels)
class_weight = compute_class_weight(class_weight='balanced', classes=train_classes, y=reduced_train_labels)
# Key each weight by its actual class label rather than by its position, so
# the mapping stays correct even if the labels are not exactly 0..k-1.
class_weight_dict = dict(zip(train_classes.tolist(), class_weight))
# NOTE(review): the LogisticRegression fitted later in this notebook is not
# given `class_weight_dict` -- confirm whether the weights should be used.
print(class_weight_dict)

{0: 0.6141924959216966, 1: 2.6892857142857145}


In [41]:
# Applying simple logistic regression
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import classification_report

LogisticRegression(max_iter=5000)

In [43]:
# Load the train and test embeddings
# Uses the `unpickle` helper and the numpy import from the top of the
# notebook; the duplicate re-definition that used to live in this cell
# silently shadowed the first one and has been removed.
# Considering the new emotion representations
# NOTE(review): these pickles are read from '../Proposed_Model/FNC_Dataset/'
# while the matching CSVs are read from '../Proposed_Model/FNC_Data/' --
# confirm both directories are intended.
train_emotion_dict_pre = unpickle('../Proposed_Model/FNC_Dataset/train_ag_dg_premise_fnc.tsv_k_bal_bin.pickle')
train_emotion_dict_hyp = unpickle('../Proposed_Model/FNC_Dataset/train_ag_dg_hyp_fnc.tsv_k_bal_bin.pickle')
print(len(train_emotion_dict_pre))
test_emotion_dict_pre = unpickle('../Proposed_Model/FNC_Dataset/test_ag_dg_premise_fnc.tsv_k_bal_bin.pickle')
test_emotion_dict_hyp = unpickle('../Proposed_Model/FNC_Dataset/test_ag_dg_hyp_fnc.tsv_k_bal_bin.pickle')

# Stack the dict values (insertion order) into one matrix per split and side.
# Presumably the insertion order matches the row order of the stance CSVs
# used later -- verify against the code that wrote the pickles.
train_em_pre = np.stack(list(train_emotion_dict_pre.values()))
train_em_hyp = np.stack(list(train_emotion_dict_hyp.values()))
test_em_pre  = np.stack(list(test_emotion_dict_pre.values()))
test_em_hyp  = np.stack(list(test_emotion_dict_hyp.values()))

# Premise and hypothesis matrices are added element-wise in the next cell, so
# they must have identical shapes (otherwise numpy could broadcast silently).
assert train_em_pre.shape == train_em_hyp.shape, "train premise/hypothesis shapes differ"
assert test_em_pre.shape == test_em_hyp.shape, "test premise/hypothesis shapes differ"

print('Train premise', train_em_pre.shape)
print('Train hyp', train_em_hyp.shape)
print('Test premise', test_em_pre.shape)
print('Test hyp', test_em_hyp.shape)

4518
Train premise (4518, 768)
Train hyp (4518, 768)
Test premise (2600, 768)
Test hyp (2600, 768)


In [44]:
# Combine the two sides of each pair by element-wise addition of the
# premise and hypothesis emotion representations
train_em = train_em_pre + train_em_hyp
test_em = test_em_pre + test_em_hyp
print('Train', train_em.shape)
print('Test', test_em.shape)
# Value ranges of the summed representations
print('Max Train value', np.max(train_em))
print('Min Train value', np.min(train_em))
print('Max Test value', np.max(test_em))
print('Min Test value', np.min(test_em))

Train (4518, 768)
Test (2600, 768)
Max Train value 1.9978582
Min Train value -1.9980149
Max Test value 1.9978582
Min Test value -1.998014


In [45]:
# Performing PCA to reduce the dimensions
from sklearn.decomposition import PCA
# This cell overwrites `train_em` / `test_em` with their reduced versions, so
# running it twice would silently apply PCA to already-reduced data. Fail
# loudly instead when the inputs no longer have the raw feature width.
if train_em.shape[1] != train_em_pre.shape[1] or test_em.shape[1] != test_em_pre.shape[1]:
    raise RuntimeError(
        "train_em/test_em were already reduced; re-run the cell that sums "
        "the premise and hypothesis representations before this one."
    )
# 200 components to match the width of the 200-d emotion bias vectors added
# later. A fixed random_state keeps the projection reproducible: with the
# default svd_solver='auto', scikit-learn may pick the stochastic randomized
# solver.
pca = PCA(n_components=200, random_state=42)
# Fit on train only, then apply the same projection to test.
train_em = pca.fit_transform(train_em)
test_em = pca.transform(test_em)
print('Train shape is', train_em.shape)
print('Test shape is', test_em.shape)

Train shape is (4518, 200)
Test shape is (2600, 200)


In [46]:
# Word Embeddings
def _sum_word_vectors(words):
    """Add up the GloVe vectors of ``words``, left to right."""
    total = embeddings_dict[words[0]]
    for word in words[1:]:
        total = total + embeddings_dict[word]
    return total

# One anchor vector per emotion group: the first group is used as the "true"
# emotion bias, the second as the "fake" one.
emotion_true = _sum_word_vectors(['anticipation', 'sadness', 'joy', 'trust'])
emotion_false = _sum_word_vectors(['anger', 'fear', 'disgust', 'surprise'])
print('True emotion', emotion_true.shape)
print('Fake emotion', emotion_false.shape)

True emotion (200,)
Fake emotion (200,)


In [47]:
# Stance frames for the reduced splits (their 'Stance' column is read later)
train_ag_dg, test_ag_dg = (
    pd.read_csv('../Proposed_Model/FNC_Data/train_ag_dg_only_fnc.csv'),
    pd.read_csv('../Proposed_Model/FNC_Data/test_ag_dg_only_fnc.csv'),
)
# Per-side frames for train (their 'Emotion_Label' column is read later);
# presumably hypothesis / premise emotion predictions -- confirm.
train_hy_df, train_pre_df = (
    pd.read_csv('../Proposed_Model/FNC_Data/train_ag_dg_hyp_fnc.tsv_k_bal_numb_predictions_bin.csv'),
    pd.read_csv('../Proposed_Model/FNC_Data/train_ag_dg_premise_fnc.tsv_k_bal_numb_predictions_bin.csv'),
)
# Same for test
test_hy_df, test_pre_df = (
    pd.read_csv('../Proposed_Model/FNC_Data/test_ag_dg_hyp_fnc.tsv_k_bal_numb_predictions_bin.csv'),
    pd.read_csv('../Proposed_Model/FNC_Data/test_ag_dg_premise_fnc.tsv_k_bal_numb_predictions_bin.csv'),
)
# The three frames of a split are indexed row-by-row together later on,
# so they must all have the same number of rows.
assert len(train_ag_dg) == len(train_hy_df) and len(train_hy_df) == len(train_pre_df)
assert len(test_ag_dg) == len(test_hy_df) and len(test_hy_df) == len(test_pre_df)

In [48]:
# New kind of adding scaffold labels
def build_emotion_bias(stance_df, premise_df, hypothesis_df, true_vec, false_vec):
    """Build one emotion bias vector per sample.

    Row ``i`` gets ``true_vec`` when the stance is ``'agree'`` and both the
    premise and hypothesis ``Emotion_Label`` are 0; ``false_vec`` when the
    stance is ``'disagree'``, the premise ``Emotion_Label`` is 0 and the
    hypothesis ``Emotion_Label`` is 1; and a zero vector otherwise.

    The three frames are matched by row position and must have equal length.
    Returns a float64 array of shape ``(len(stance_df), len(true_vec))``.

    NOTE(review): the bias depends on ``Stance``, which appears to be the
    target the classifier predicts, and it is built for the test split too --
    this looks like label leakage; confirm it is intended.
    """
    stance = stance_df['Stance']
    premise_label = premise_df['Emotion_Label'].to_numpy()
    hypothesis_label = hypothesis_df['Emotion_Label'].to_numpy()
    agree_rows = (stance == 'agree').to_numpy() & (premise_label == 0) & (hypothesis_label == 0)
    disagree_rows = (stance == 'disagree').to_numpy() & (premise_label == 0) & (hypothesis_label == 1)
    bias = np.zeros((len(stance_df), true_vec.shape[0]))
    # The two masks are mutually exclusive, so assignment order is irrelevant.
    bias[agree_rows] = true_vec
    bias[disagree_rows] = false_vec
    return bias

# Kept for backward compatibility with cells that may still read it;
# sized from the embedding instead of a hard-coded 200.
zero_vector = np.zeros(emotion_true.shape)
train_bias_em = build_emotion_bias(train_ag_dg, train_pre_df, train_hy_df, emotion_true, emotion_false)
test_bias_em = build_emotion_bias(test_ag_dg, test_pre_df, test_hy_df, emotion_true, emotion_false)
print('Train bias', train_bias_em.shape)
print('Test bias', test_bias_em.shape)

Train bias (4518, 200)
Test bias (2600, 200)


In [49]:
# Shift the PCA-reduced emotion features by the per-row emotion bias vectors
fnc_train_et_feature = train_em + train_bias_em
fnc_test_et_feature = test_em + test_bias_em
print('Train', fnc_train_et_feature.shape)
print('Test', fnc_test_et_feature.shape)
# Value ranges of the biased emotion features
print('Max Train value', fnc_train_et_feature.max())
print('Min Train value', fnc_train_et_feature.min())
print('Max Test value', fnc_test_et_feature.max())
print('Min Test value', fnc_test_et_feature.min())

Train (4518, 200)
Test (2600, 200)
Max Train value 46.74790608882904
Min Train value -32.49567794799805
Max Test value 46.748016715049744
Min Test value -32.49523162841797


In [50]:
# Fuse the novelty and emotion feature sets by element-wise addition.
# Assumes row i of both matrices refers to the same sample -- TODO confirm.
combined_fnc_train = reduced_fnc_nt_train + fnc_train_et_feature
combined_fnc_test = reduced_fnc_nt_test + fnc_test_et_feature
print('Combined Train', combined_fnc_train.shape)
print('Combined Test', combined_fnc_test.shape)

Combined Train (4518, 200)
Combined Test (2600, 200)


In [53]:
# Final combined model: logistic regression on the fused features.
# NOTE(review): `class_weight_dict` computed earlier is not passed here --
# confirm whether class weighting was meant to be used.
lg_reg_combine = linear_model.LogisticRegression(max_iter=5000).fit(
    combined_fnc_train, reduced_train_labels
)
lg_reg_combine

LogisticRegression(max_iter=5000)

In [54]:
# Evaluate the combined model on the reduced test split
y_pred_combine = lg_reg_combine.predict(combined_fnc_test)
combined_accuracy = metrics.accuracy_score(reduced_test_labels, y_pred_combine) * 100
print("Accuracy of Combined Reduced Logistic Regression model is:", combined_accuracy)
# Per-class precision/recall; label 0 is reported as 'agreed', label 1 as 'disagreed'
combined_report = classification_report(
    reduced_test_labels,
    y_pred_combine,
    target_names=['agreed', 'disagreed'],
)
print(combined_report)

Accuracy of Combined Reduced Logistic Regression model is: 88.1923076923077
              precision    recall  f1-score   support

      agreed       0.89      0.96      0.92      1903
   disagreed       0.87      0.66      0.75       697

    accuracy                           0.88      2600
   macro avg       0.88      0.81      0.84      2600
weighted avg       0.88      0.88      0.88      2600

