# 3.4 Precompute reduce evidence conditioned on claim based on W2V

In [1]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import *
from nltk.corpus import stopwords
import string
import pickle
import numpy as np

## Read in Data

In [2]:
with open ('../data/raw/train-claims.json') as f:
    train_claims = json.load(f)

In [3]:
with open ('../data/raw/dev-claims.json') as f:
    dev_claims = json.load(f)

In [4]:
train_claims_text = list()
dev_claims_text = list()

## Prepare Data

In [5]:
definitely_related_evidence = set()
for claim in train_claims:
    for evidence in train_claims[claim]['evidences']:
        definitely_related_evidence.add(evidence)
    
    train_claims_text.append(train_claims[claim]['claim_text'])

definitely_related_evidence2 = set()
for claim in dev_claims:
    for evidence in dev_claims[claim]['evidences']:
        definitely_related_evidence2.add(evidence)
    
    dev_claims_text.append(dev_claims[claim]['claim_text'])

In [6]:
cluster_train = definitely_related_evidence.difference(definitely_related_evidence2)

In [7]:
cluster_dev = definitely_related_evidence2

In [8]:
len(definitely_related_evidence)

3121

In [9]:
len(definitely_related_evidence2)

463

In [10]:
len(cluster_train)

2980

In [11]:
len(cluster_dev)

463

In [12]:
cluster_full = definitely_related_evidence.union(definitely_related_evidence2)

In [13]:
len(cluster_full)

3443

In [14]:
with open ('../data/raw/evidence.json') as f:
    evidence = json.load(f)

In [15]:
evidence_list = list(evidence.items())
evidence_list.sort()

## Word2Vec

In [16]:
from gensim.models.doc2vec import Word2Vec

Preprocessing

In [31]:
evidence_tokenised = list()
ALL_PUNCT = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

for i in range(len(evidence_list)):
    
    text = evidence_list[i][1].lower()

    no_punct_text = str()
    for char in text:
        if char not in ALL_PUNCT:
            no_punct_text += char

    evidence_tokenised.append(no_punct_text.split())

In [60]:
# Train the W2V
model = Word2Vec(sentences=evidence_tokenised, vector_size=50, window=5, min_count=1, workers=4)
# model.save("word2vec.model")

In [61]:
# get the average of W2V embeddings for each token

X = list()

for i in range(len(evidence_tokenised)):
    
    embed = [0 for k in range(50)]
    for word in evidence_tokenised[i]:
        embed = np.add(embed, model.wv[word])
    
    try:
        embed = [embed[j]/len(evidence_tokenised[i]) for j in range(50)] 
    except:   
        pass
    X.append(embed)

X

[[-0.5805553235113621,
  0.18499872769461945,
  -0.1384810800664127,
  0.0629951017908752,
  0.3873922345228493,
  -0.047308631241321564,
  1.53465375083033,
  2.046289199963212,
  0.1074271509423852,
  0.528627245221287,
  -1.318894954281859,
  1.0715504344552755,
  -1.086643850314431,
  2.2335069067776203,
  1.5495578832924366,
  -0.7877736017107964,
  0.3744096178561449,
  0.6482982560992241,
  0.07511200476437807,
  0.48021219624206424,
  -1.8035010867752135,
  -0.6792861800640821,
  1.2186064263805747,
  0.9380707442760468,
  0.401053368113935,
  0.08910735591780394,
  -0.05514431977644563,
  0.48927145381458104,
  -0.1727193519473076,
  2.515961923636496,
  -0.9230796676129103,
  -1.3254121487261727,
  -0.012129105627536774,
  0.16410813573747873,
  2.1080160290002823,
  0.6824697116389871,
  -0.7980004111304879,
  -0.1433539050631225,
  0.12763082422316074,
  -0.6815269803628325,
  0.6462566289119422,
  -1.0556100364774466,
  0.08229412045329809,
  1.9712556349113584,
  -0.41114

In [52]:
# tokenise each claim
claims_tokenised = list()
ALL_PUNCT = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

for i in range(len(train_claims_text)):
    
    text = train_claims_text[i].lower()

    no_punct_text = str()
    for char in text:
        if char not in ALL_PUNCT:
            no_punct_text += char

    claims_tokenised.append(no_punct_text.split())

In [53]:
claims_tokenised

[['not',
  'only',
  'is',
  'there',
  'no',
  'scientific',
  'evidence',
  'that',
  'co2',
  'is',
  'a',
  'pollutant',
  'higher',
  'co2',
  'concentrations',
  'actually',
  'help',
  'ecosystems',
  'support',
  'more',
  'plant',
  'and',
  'animal',
  'life'],
 ['el',
  'niño',
  'drove',
  'record',
  'highs',
  'in',
  'global',
  'temperatures',
  'suggesting',
  'rise',
  'may',
  'not',
  'be',
  'down',
  'to',
  'manmade',
  'emissions'],
 ['in', '1946', 'pdo', 'switched', 'to', 'a', 'cool', 'phase'],
 ['weather',
  'channel',
  'cofounder',
  'john',
  'coleman',
  'provided',
  'evidence',
  'that',
  'convincingly',
  'refutes',
  'the',
  'concept',
  'of',
  'anthropogenic',
  'global',
  'warming'],
 ['january',
  '2008',
  'capped',
  'a',
  '12',
  'month',
  'period',
  'of',
  'global',
  'temperature',
  'drops',
  'on',
  'all',
  'of',
  'the',
  'major',
  'well',
  'respected',
  'indicators'],
 ['the',
  'last',
  'time',
  'the',
  'planet',
  'was',


In [62]:
# Prepare average W2V embeddings for all claims
claim_X = list()

for i in range(len(claims_tokenised)):
    
    embed = [0 for k in range(50)]
    for word in claims_tokenised[i]:
        try:
            embed = np.add(embed, model.wv[word])
        except:
            pass
    
    try:
        embed = [embed[j]/len(claims_tokenised[i]) for j in range(50)]
    except:   
        pass

    claim_X.append(embed)

claim_X

[[-0.8207949480662743,
  -0.28853979691242176,
  0.5149276601150632,
  0.43362100205073756,
  0.17650809635718664,
  1.819657472272714,
  0.48502979741897434,
  0.3662095746646325,
  0.05190194149812063,
  1.0957845744754497,
  -3.379044479380051,
  0.662426283020371,
  1.1111501169701417,
  2.361729564766089,
  -0.21794851558903852,
  1.0814615398024519,
  1.173034232109785,
  -0.7145604262671744,
  1.1651191997031372,
  0.5598318533351024,
  -3.1111326217651367,
  0.28518639504909515,
  0.8848162566622099,
  -1.6147603356900315,
  -2.875085061057083,
  -0.453546055747817,
  0.08020878210663795,
  1.088161184762915,
  -0.702407440985553,
  1.4048739404728015,
  -0.7227600036421791,
  -0.7062943118313948,
  0.07807490136474371,
  1.358102474361658,
  -0.381875761008511,
  1.0765319367249806,
  -1.3345946555879589,
  -0.9119467542817196,
  -0.31224656431004405,
  1.5547594465315342,
  0.4788334872573614,
  1.4928805939853191,
  -0.0664823455736041,
  1.2740958559637268,
  0.096093287381

# Try to get similarity

In [55]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    return dot(a,b)/(norm(a)*norm(b))

In [56]:
train_claims_list = [train_claims[claim]['evidences'] for claim in train_claims]

In [57]:
EVIDENCE_ID_MAP = {x[0]:x[1][0] for x in enumerate(evidence_list)}

In [64]:
# quickly discover that top 10000 evidence often don't contain the ground truth evidence for the train set

metric_list = []

for j in range(len(claim_X)):
    cosine_sim = []
    
    for i in range(len(X)):
        cosine_sim.append((i, cos_sim(claim_X[j], X[i])))

    cosine_sim.sort(reverse=True, key = lambda x:x[1])

    top_claims = [EVIDENCE_ID_MAP[cosine_sim[i][0]] for i in range(10000)]

    metric = sum([1 if claim in top_claims else 0 for claim in train_claims_list[j]])/len(train_claims_list[j])
    metric_list.append(metric)
    print(train_claims_list[j])
    print(metric)

np.mean(metric_list)
        

  return dot(a,b)/(norm(a)*norm(b))


['evidence-442946', 'evidence-1194317', 'evidence-12171']
0.0
['evidence-338219', 'evidence-1127398']
1.0
['evidence-530063', 'evidence-984887']
0.0
['evidence-1177431', 'evidence-782448', 'evidence-540069', 'evidence-352655', 'evidence-1007867']
0.2
['evidence-1010750', 'evidence-91661', 'evidence-722725', 'evidence-554161', 'evidence-430839']
0.0
['evidence-226174', 'evidence-1049316', 'evidence-358301', 'evidence-493329', 'evidence-610497']
0.0
['evidence-974673', 'evidence-602109']
0.0
['evidence-707654', 'evidence-28478', 'evidence-491579']
0.0
['evidence-863309', 'evidence-61462', 'evidence-639818', 'evidence-757821', 'evidence-263527']
0.0
['evidence-439640']
0.0
['evidence-217743']
0.0
['evidence-222694', 'evidence-905909', 'evidence-600745', 'evidence-337702', 'evidence-1015241']
0.2
['evidence-386828', 'evidence-535248', 'evidence-1078011', 'evidence-975483', 'evidence-177166']
0.2
['evidence-515817', 'evidence-1018575', 'evidence-791159', 'evidence-1009205', 'evidence-878835

KeyboardInterrupt: 