# 3.2 Precompute reduce evidence conditioned on claim based on TFIDF+SVD similarity

In [2]:
import json
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import *
from nltk.corpus import stopwords
import string
import pickle

In [3]:
with open ('../data/raw/train-claims.json') as f:
    train_claims = json.load(f)

In [4]:
with open ('../data/raw/dev-claims.json') as f:
    dev_claims = json.load(f)

In [20]:
train_claims_text = list()
dev_claims_text = list()

In [21]:
definitely_related_evidence = set()
for claim in train_claims:
    for evidence in train_claims[claim]['evidences']:
        definitely_related_evidence.add(evidence)
    
    train_claims_text.append(train_claims[claim]['claim_text'])

definitely_related_evidence2 = set()
for claim in dev_claims:
    for evidence in dev_claims[claim]['evidences']:
        definitely_related_evidence2.add(evidence)
    
    dev_claims_text.append(dev_claims[claim]['claim_text'])

In [6]:
cluster_train = definitely_related_evidence.difference(definitely_related_evidence2)

In [7]:
cluster_dev = definitely_related_evidence2

In [8]:
len(definitely_related_evidence)

3121

In [9]:
len(definitely_related_evidence2)

463

In [10]:
len(cluster_train)

2980

In [11]:
len(cluster_dev)

463

In [12]:
cluster_full = definitely_related_evidence.union(definitely_related_evidence2)

In [13]:
len(cluster_full)

3443

In [65]:
with open ('../data/raw/evidence.json') as f:
    evidence = json.load(f)

In [17]:
evidence_list = list(evidence.items())
evidence_list.sort()

In [62]:
EVIDENCE_ID_MAP = {x[0]:x[1][0] for x in enumerate(evidence_list)}
ALL_PUNCT = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
STOP_WORDS = set(stopwords.words('english'))

In [29]:
stemmer = PorterStemmer()

In [45]:
def preprocess(text):
    text = text.lower()

    no_punct_text = str()
    for char in text:
        if char not in ALL_PUNCT:
            no_punct_text += char

    word_list = no_punct_text.split()

    no_stop_words_word_list = [word for word in word_list if word not in STOP_WORDS]
    # has_alpha_words_word_list = [word for word in no_stop_words_word_list if all_alpha(word)]
    
    # stemmed_word_list = [stemmer.stem(word) for word in has_alpha_words_word_list]
    stemmed_word_list = [stemmer.stem(word) for word in no_stop_words_word_list]

    output = ' '.join(stemmed_word_list)
    
    return output

preprocessed_evidence = [preprocess(evidence_list[i][1]) for i in range(len(evidence_list))]
preprocessed_evidence

['john bennet law english entrepreneur agricultur scientist',
 'lindberg began profession career age 16 eventu move new york citi 1977',
 'matroid dual go back origin paper hassler whitney defin matroid',
 'sauc may tomatobas depend michigan purchas',
 'see big west confer final season san diego state member pcaa',
 'philippa gail 1942 1999 british theatr film televis actress',
 'myanmar thailand peninsular malaysia sumatra',
 'compet two winter olymp best finish fifth fourman event salt lake citi 2002',
 'cbc monkstown cup time four win',
 'streamlin motorcycl design charli perethian 185 cc yamaha motor achiev 372 mpgu 1983 challeng display smithsonian institut',
 'lead ship anzacclass frigat',
 'xblade also divis unit kingdom locat doncast establish 2012',
 'move unit state studi univers oregon receiv bs',
 'ground host first game rugbi leagu march 1936 st georg dragon defeat newtown bluebag exhibit match',
 'name founder natali zahl consist two independ run primari school gymnasium'

In [46]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_evidence)

In [None]:
with open('../data/curated/processed_tfidf_nonalpha.pickle', 'wb') as f:
    pickle.dump(X, f)

In [34]:
with open('../data/curated/processed_tfidf_nonalpha.pickle', 'rb') as f:
    X = pickle.load(f)

In [36]:
from sklearn.decomposition import TruncatedSVD

In [104]:
svd = TruncatedSVD(40)

In [105]:
X_svd = svd.fit_transform(X)

In [41]:
preprocessed_claims = [preprocess(train_claims_text[i]) for i in range(len(train_claims_text))]

In [47]:
prepro_X = vectorizer.transform(preprocessed_claims)

In [106]:
prepro_X_svd = svd.transform(prepro_X)

In [49]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    return dot(a,b)/(norm(a)*norm(b))

In [98]:
train_claims_list = [train_claims[claim]['evidences'] for claim in train_claims]

[['evidence-442946', 'evidence-1194317', 'evidence-12171'],
 ['evidence-338219', 'evidence-1127398'],
 ['evidence-530063', 'evidence-984887'],
 ['evidence-1177431',
  'evidence-782448',
  'evidence-540069',
  'evidence-352655',
  'evidence-1007867'],
 ['evidence-1010750',
  'evidence-91661',
  'evidence-722725',
  'evidence-554161',
  'evidence-430839'],
 ['evidence-226174',
  'evidence-1049316',
  'evidence-358301',
  'evidence-493329',
  'evidence-610497'],
 ['evidence-974673', 'evidence-602109'],
 ['evidence-707654', 'evidence-28478', 'evidence-491579'],
 ['evidence-863309',
  'evidence-61462',
  'evidence-639818',
  'evidence-757821',
  'evidence-263527'],
 ['evidence-439640'],
 ['evidence-217743'],
 ['evidence-222694',
  'evidence-905909',
  'evidence-600745',
  'evidence-337702',
  'evidence-1015241'],
 ['evidence-386828',
  'evidence-535248',
  'evidence-1078011',
  'evidence-975483',
  'evidence-177166'],
 ['evidence-515817',
  'evidence-1018575',
  'evidence-791159',
  'eviden

In [107]:
metric_list = []

for j in range(len(prepro_X_svd)):
    cosine_sim = []
    
    for i in range(len(X_svd)):
        cosine_sim.append((i, cos_sim(prepro_X_svd[j], X_svd[i])))

    cosine_sim.sort(reverse=True, key = lambda x:x[1])

    top_claims = [EVIDENCE_ID_MAP[cosine_sim[i][0]] for i in range(10000)]

    metric = sum([1 if claim in top_claims else 0 for claim in train_claims_list[j]])/len(train_claims_list[j])
    metric_list.append(metric)
    print(metric)

np.mean(metric_list)
        

  return dot(a,b)/(norm(a)*norm(b))


0.3333333333333333
0.0
0.0
0.2
0.0
0.2
0.0
0.0
0.0
0.0
0.0


KeyboardInterrupt: 