# 3.3 Precompute a reduced evidence set for each claim based on Doc2Vec (D2V) similarity

In [1]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import *
from nltk.corpus import stopwords
import string
import pickle

In [2]:
# Load the training claims. The JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open('../data/raw/train-claims.json', encoding='utf-8') as f:
    train_claims = json.load(f)

In [3]:
# Load the development claims. The JSON is read as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open('../data/raw/dev-claims.json', encoding='utf-8') as f:
    dev_claims = json.load(f)

In [4]:
# Containers for the raw claim sentences of each split.
train_claims_text = []
dev_claims_text = []

In [5]:
# Collect, per split, the set of evidence ids cited by at least one claim and
# the claim sentences in the iteration order of the claims dict.
#
# Everything is built with comprehensions so that re-running this cell is
# idempotent (the text lists are rebuilt instead of appended to again) and no
# loop variable leaks into the notebook namespace -- in particular the name
# `evidence`, which is used further down for the evidence corpus.
definitely_related_evidence = {
    evidence_id
    for entry in train_claims.values()
    for evidence_id in entry['evidences']
}
train_claims_text = [entry['claim_text'] for entry in train_claims.values()]

definitely_related_evidence2 = {
    evidence_id
    for entry in dev_claims.values()
    for evidence_id in entry['evidences']
}
dev_claims_text = [entry['claim_text'] for entry in dev_claims.values()]

In [6]:
# Evidence ids cited by training claims only (anything also cited by a dev
# claim is removed).
cluster_train = definitely_related_evidence - definitely_related_evidence2

In [7]:
# Every evidence id cited by a dev claim. This binds the same set object as
# `definitely_related_evidence2` (no copy is made).
cluster_dev = definitely_related_evidence2

In [8]:
# Number of distinct evidence ids cited by the training claims.
len(definitely_related_evidence)

3121

In [9]:
# Number of distinct evidence ids cited by the dev claims.
len(definitely_related_evidence2)

463

In [10]:
# Number of evidence ids cited by training claims but by no dev claim.
len(cluster_train)

2980

In [11]:
# Size of the dev cluster; equals the dev evidence count because
# `cluster_dev` is the same set as `definitely_related_evidence2`.
len(cluster_dev)

463

In [12]:
# Evidence ids cited by a claim in either split.
cluster_full = definitely_related_evidence | definitely_related_evidence2

In [13]:
# Number of distinct evidence ids cited by a claim in either split.
len(cluster_full)

3443

In [14]:
# Load the evidence corpus. The JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open('../data/raw/evidence.json', encoding='utf-8') as f:
    evidence = json.load(f)

In [15]:
# (key, value) pairs of the evidence dict in sorted order, so that a list
# position can serve as a stable integer index for each entry.
evidence_list = sorted(evidence.items())

In [16]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [18]:
# Whitespace-tokenise the text of every evidence entry, keeping the order of
# `evidence_list` so that positions stay aligned.
evidence_tokenised = [text.split() for _, text in evidence_list]

In [20]:
# Tag each tokenised evidence passage with its position in
# `evidence_tokenised`, then train a Doc2Vec model with 10-dimensional
# vectors on the tagged corpus.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(evidence_tokenised)
]
model = Doc2Vec(
    documents,
    vector_size=10,
    window=2,
    min_count=1,
    workers=4,
)

In [26]:
# Infer a Doc2Vec vector for every evidence passage. X[i] corresponds to
# evidence_tokenised[i], i.e. to position i of the sorted evidence list.
X = [model.infer_vector(tokens) for tokens in evidence_tokenised]

X

[array([-0.01106968,  0.05572545,  0.03178676, -0.08181673,  0.21559347,
         0.04156952,  0.07430033,  0.05271063, -0.15588412,  0.01045961],
       dtype=float32),
 array([-0.0666507 , -0.15458545,  0.19163793, -0.09211192,  0.06757217,
         0.03400132,  0.07744221,  0.06509159,  0.09984824, -0.05352056],
       dtype=float32),
 array([-0.04626767,  0.01238379,  0.00188007, -0.09181805,  0.13816231,
         0.03634149,  0.24565986,  0.03836261, -0.01697864,  0.14145365],
       dtype=float32),
 array([-0.02779751, -0.01990937, -0.04429947, -0.12464282, -0.09243864,
         0.00960186,  0.14072642,  0.26409966, -0.03338206,  0.03869378],
       dtype=float32),
 array([-0.06091616, -0.23802815, -0.04328539,  0.0975775 , -0.08822419,
        -0.03841515, -0.16434167, -0.16480525, -0.04081047, -0.00829715],
       dtype=float32),
 array([ 1.4357640e-05, -3.5109885e-02,  1.9263938e-01, -1.0486660e-01,
         1.2533584e-01,  1.4826396e-01,  1.1592293e-01,  3.3782378e-02,
      

In [27]:
# Whitespace-tokenise every training claim, in the order of
# `train_claims_text`.
claims_tokenised = [text.split() for text in train_claims_text]

In [28]:
# Infer a Doc2Vec vector for every training claim with the model trained on
# the evidence corpus. claim_X[j] corresponds to claims_tokenised[j].
claim_X = [model.infer_vector(tokens) for tokens in claims_tokenised]

claim_X

[array([ 0.1022862 ,  0.27758107, -0.03519189, -0.1397265 ,  0.3181662 ,
        -0.38812938,  0.06724302, -0.05380239,  0.00300034, -0.26313493],
       dtype=float32),
 array([-0.09533089,  0.03193947,  0.37375614, -0.05483225, -0.1642918 ,
        -0.01268253,  0.1390761 ,  0.2589292 ,  0.12966609, -0.24631903],
       dtype=float32),
 array([-0.15007475,  0.14865488,  0.08649843, -0.1060974 , -0.17540126,
        -0.08736143,  0.09484517,  0.07476486,  0.0866652 , -0.09534271],
       dtype=float32),
 array([-0.19128114,  0.02485316,  0.00480373, -0.15983026,  0.37337813,
         0.15575849,  0.15817219, -0.2775496 , -0.25856575, -0.02033491],
       dtype=float32),
 array([-0.07933965,  0.06615064,  0.17463924,  0.03932288,  0.12091603,
         0.01613083,  0.12496398, -0.02963853, -0.08336017, -0.05468205],
       dtype=float32),
 array([ 0.09441453,  0.158509  , -0.12381246,  0.08919097,  0.10404443,
        -0.49923036,  0.10503782, -0.22283055,  0.18160796, -0.28297502],
   

In [29]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity between vectors *a* and *b*.

    Args:
        a: Array-like vector of numbers.
        b: Array-like vector of numbers with the same length as *a*.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the ratio is undefined, so ``0.0`` is returned instead of NaN;
        a NaN score would make any later sort on the similarities unreliable.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

In [30]:
# Gold evidence ids of every training claim, in the iteration order of
# `train_claims`.
train_claims_list = [entry['evidences'] for entry in train_claims.values()]

In [31]:
# Position in the sorted `evidence_list` -> evidence key at that position.
EVIDENCE_ID_MAP = {
    position: evidence_key
    for position, (evidence_key, _) in enumerate(evidence_list)
}

In [32]:
import numpy as np

In [33]:
# Recall@K of the Doc2Vec retrieval on the training claims: for each claim,
# the fraction of its gold evidence ids found among the K evidence passages
# whose vectors are most cosine-similar to the claim vector.
TOP_K = 10000

# Stack the evidence vectors once and cache their norms, so each claim costs
# one matrix-vector product instead of a Python loop over every passage.
evidence_matrix = np.asarray(X)
evidence_norms = np.linalg.norm(evidence_matrix, axis=1)

# Never ask for more candidates than there are evidence passages.
top_k = min(TOP_K, len(evidence_matrix))

metric_list = []

for claim_vec, gold_evidence_ids in zip(claim_X, train_claims_list):
    if not gold_evidence_ids:
        # Recall is undefined without gold evidence; skip rather than divide by 0.
        continue

    scores = (evidence_matrix @ claim_vec) / (
        evidence_norms * np.linalg.norm(claim_vec)
    )

    # Stable sort on the negated scores gives descending order with ties kept
    # in index order, matching a stable `sort(reverse=True)` on the scores.
    top_positions = np.argsort(-scores, kind='stable')[:top_k]

    # A set makes each gold-id membership test O(1) instead of O(K).
    retrieved_ids = {EVIDENCE_ID_MAP[pos] for pos in top_positions.tolist()}

    hits = sum(evidence_id in retrieved_ids for evidence_id in gold_evidence_ids)
    metric = hits / len(gold_evidence_ids)
    metric_list.append(metric)
    print(metric)

np.mean(metric_list)
        

0.3333333333333333
0.0
0.5
0.0
0.0
0.0
0.0
0.3333333333333333
0.2
0.0
1.0
0.2
0.0
0.2
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.2
0.0
0.0
0.0
0.2
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.3333333333333333
0.0
0.0
0.0
0.3333333333333333
0.25
0.0
0.0
0.0
0.5
0.2
0.0
0.0
0.0
0.5
0.0
0.2
0.0
1.0
0.0
0.2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.2
0.0
0.0
0.0
0.0
0.25
0.25
0.0
0.0
0.0
0.2
0.0
0.25
0.2
