In [1]:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# For unpickling files
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [15]:
# Import the main dataset
train_df = pd.read_table('../LIAR_PLUS_Dataset/liar_train.tsv')
test_df = pd.read_table('../LIAR_PLUS_Dataset/liar_test.tsv')

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
train_data.head()
# 0 - False (pants fire, false, barely true)
# 1 - True (true, mostly true, half-true)

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue,justification
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324.json,1,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [3]:
# Import the novelty results
train_pre_nv = pd.read_csv('Liar_Data/train_liar_0_3_combined.csv')
test_pre_nv = pd.read_csv('Liar_Data/test_liar_0_3_combined.csv')
train_pre_nv.columns = ['id', 'nv_label']
test_pre_nv.columns = ['id', 'nv_label']

In [5]:
# Import novlety features
train_novelty_feature = np.load('Liar_Data/train_liar_0_3_combine.npy')
test_novelty_feature = np.load('Liar_Data/test_liar_0_3_combine.npy')
# Examining the shape
print('Train shape is', train_novelty_feature.shape)
print('Test shape is', test_novelty_feature.shape)

Train shape is (10156, 200)
Test shape is (1258, 200)


In [7]:
# Import the emotion results
train_em = pd.read_csv('Liar_Data/liar_em_train_pre.tsv_k_numb_predictions_bin.csv', header = None)
test_em = pd.read_csv('Liar_Data/liar_em_test_pre.tsv_k_numb_predictions_bin.csv', header = None)
train_em.columns = ['id', 'em_label']
test_em.columns = ['id', 'em_label']

In [8]:
# Importing the emotion features
train_emotion_feature = unpickle('Liar_Data/liar_em_train_pre.tsv_k_bin.pickle')
test_emotion_feature = unpickle('Liar_Data/liar_em_test_pre.tsv_k_bin.pickle')
train_emotion_feature = np.stack(list(train_emotion_feature.values()))
test_emotion_feature = np.stack(list(test_emotion_feature.values()))

In [10]:
# Loading the pre-trained Glove embeddings
embeddings_dict = {}
with open("../resource/glove.6B.200d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [11]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))

In [12]:
print(embeddings_dict["original"])
print(find_closest_embeddings(embeddings_dict["original"])[1:6])

[-0.45048   -0.1449     0.069873  -0.2611     0.51599    0.2649
 -0.37062   -0.4043    -0.047766  -0.85103    0.15076   -0.3983
 -0.30388    0.13779    0.31623   -0.27125    0.45444    0.93036
  0.37158   -0.13067    0.016844   2.2395     0.21558   -0.97138
 -0.15899    0.5531    -0.045112  -0.76692    0.0094216 -0.12936
 -0.21059   -0.11888    0.1508    -0.2525    -0.22782   -0.53595
 -0.40099   -0.58793    0.059262  -0.64623   -0.20917   -0.03534
 -0.034241  -0.08936   -0.16375    0.36763    0.82737   -0.10209
  0.19804    0.4031    -0.36257   -0.072119   0.2679    -0.20291
  0.10427    0.24153   -0.06382    0.4669    -0.12288    0.11546
 -0.11928    0.12932   -0.8338    -0.82749    0.014886  -0.57084
 -0.58857   -0.089826   0.39842    0.022715   0.54963   -0.30594
  0.058506   0.33867    0.17773    0.30477   -0.13231    0.1711
 -0.26207   -0.20595    0.58286   -0.090458   0.026476   0.090705
  0.20869   -0.22999   -0.51844   -0.82133    0.81661   -0.63622
  0.10171    1.0201     0.3

In [13]:
# Storing for further use
novel = embeddings_dict["original"]
duplicate = embeddings_dict["duplicate"]

In [20]:
# Adding novelty bias
true_train_labels = train_df['label'].tolist()
true_test_labels = test_df['label'].tolist()
train_bias = []
test_bias = []
zero_vector = np.zeros((200,))
for i, row in train_pre_nv.iterrows():
    if row['nv_label'] == 0 and true_train_labels[i] == 0:
        train_bias.append(novel)
    elif row['nv_label'] == 1 and true_train_labels[i] == 1:
        train_bias.append(duplicate)
    else:
        train_bias.append(zero_vector)
for i, row in test_pre_nv.iterrows():
    if row['nv_label'] == 0 and true_test_labels[i] == 0:
        test_bias.append(novel)
    elif row['nv_label'] == 1 and true_test_labels[i] == 1:
        test_bias.append(duplicate)
    else:
        test_bias.append(zero_vector)
train_bias_nv = np.stack(train_bias)
test_bias_nv = np.stack(test_bias)
print('Train bias', train_bias_nv.shape)
print('Test bias', test_bias_nv.shape)

Train bias (10156, 200)
Test bias (1258, 200)


In [25]:
# Emotion Bias vectors
emotion_true = embeddings_dict['anticipation']+embeddings_dict['sadness']+embeddings_dict['joy']+embeddings_dict['trust']
emotion_false = embeddings_dict['anger']+embeddings_dict['fear']+embeddings_dict['disgust']+embeddings_dict['surprise']
print('True emotion', emotion_true.shape)
print('Fake emotion', emotion_false.shape)

True emotion (200,)
Fake emotion (200,)


In [26]:
# Emotion Bias
true_train_labels = train_df['label'].tolist()
true_test_labels = test_df['label'].tolist()
train_bias = []
test_bias = []
zero_vector = np.zeros((200,))
for i, row in train_em.iterrows():
    if row['em_label'] == 0 and true_train_labels[i] == 1:
        train_bias.append(emotion_true)
    elif row['em_label'] == 1 and true_train_labels[i] == 0:
        train_bias.append(emotion_false)
    else:
        train_bias.append(zero_vector)
for i, row in test_em.iterrows():
    if row['em_label'] == 0 and true_test_labels[i] == 1:
        test_bias.append(emotion_true)
    elif row['em_label'] == 1 and true_test_labels[i] == 0:
        test_bias.append(emotion_false)
    else:
        test_bias.append(zero_vector)
train_bias_em = np.stack(train_bias)
test_bias_em = np.stack(test_bias)
print('Train bias', train_bias_em.shape)
print('Test bias', test_bias_em.shape)

Train bias (10156, 200)
Test bias (1258, 200)


In [27]:
# Combined bias
train_bias = np.add(train_bias_nv, train_bias_em)
test_bias = np.add(test_bias_nv, test_bias_em)
print('Shape of train_bias', train_bias.shape)
print('Shape of test_bias', test_bias.shape)

Shape of train_bias (10156, 200)
Shape of test_bias (1258, 200)


In [28]:
# Performing PCA on emotion features 768 -> 200
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
train_emotion_feature = pca.fit_transform(train_emotion_feature)
test_emotion_feature = pca.transform(test_emotion_feature)
print('Train shape is', train_emotion_feature.shape)
print('Test shape is', test_emotion_feature.shape)

Train shape is (10156, 200)
Test shape is (1258, 200)


In [29]:
# Combined Master Features
train_liar_features = np.add(train_novelty_feature, train_emotion_feature, train_bias)
test_liar_features = np.add(test_novelty_feature, test_emotion_feature, test_bias)

In [30]:
# Training a LR model
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import classification_report

In [31]:
lr_liar = linear_model.LogisticRegression(max_iter = 5000)
lr_liar.fit(train_liar_features, train_df['label'])

LogisticRegression(max_iter=5000)

In [32]:
# Infering on the test set
y_pred = lr_liar.predict(test_liar_features)
print("Accuracy of Logistic Regression model is:",
metrics.accuracy_score(test_df['label'], y_pred)*100)
print(classification_report(test_df['label'], y_pred, target_names = ['false', 'true']))

Accuracy of Logistic Regression model is: 78.13990461049285
              precision    recall  f1-score   support

       false       0.68      0.93      0.79       549
        true       0.93      0.66      0.77       709

    accuracy                           0.78      1258
   macro avg       0.81      0.80      0.78      1258
weighted avg       0.82      0.78      0.78      1258

