In [2]:
import copy
import json
import os
import random

import numpy as np
import pandas as pd
import torch

from make_vocab import lst_gram, n_gram

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
file_path = './data/'
files = ['norm_train_claims.json', 'norm_dev_claims.json', 'norm_climate_evidences.json', 'evidence.json']

# Parse every listed file once, keyed by its file name, then bind each payload
# to the notebook-level name that the later cells read.
loaded_json = {}
for file in files:
    with open(file_path + file, 'r', encoding='utf-8') as f:
        loaded_json[file] = json.load(f)

train_claims = loaded_json['norm_train_claims.json']
dev_claims = loaded_json['norm_dev_claims.json']
evidences = loaded_json['norm_climate_evidences.json']
evidences_all = loaded_json['evidence.json']

In [5]:
# Evidence texts in the iteration order of `evidences`, so position p in this
# list belongs to the p-th key of that dict.
evid_text_list = list(evidences.values())
# The 'norm_claim' text of every claim, in the iteration order of each dict.
train_text_list = [claim['norm_claim'] for claim in train_claims.values()]
dev_text_list = [claim['norm_claim'] for claim in dev_claims.values()]
# Only the last expression of a cell is displayed, so the former mid-cell
# `len(train_text_list)` had no effect and has been dropped.
len(dev_text_list)

154

In [6]:
# The vocabulary and IDF weights are fitted on the evidence texts plus the
# training claims; the dev claims are only transformed with that fitted model.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(evid_text_list + train_text_list)
evid_tfidf = vectorizer.transform(evid_text_list)
train_tfidf = vectorizer.transform(train_text_list)
dev_tfidf = vectorizer.transform(dev_text_list)
# One print per statement: joining the calls with commas built a tuple of the
# prints' return values, which the notebook echoed as (None, None, None).
print(evid_tfidf.shape)
print(train_tfidf.shape)
print(dev_tfidf.shape)

(521503, 445812)
(1228, 445812)
(154, 445812)


(None, None, None)

In [14]:
# Pairwise cosine similarity: one row per training claim, one column per
# evidence passage.
con_sim = cosine_similarity(X=train_tfidf, Y=evid_tfidf)
print(con_sim.shape)

(1228, 521503)


In [15]:
# Column indices of the 100 highest-similarity evidences for every claim row.
# argsort is ascending, so within a row the best match is the LAST column.
# This array gets its own name: binding it to `top_k_similarities` would
# collide with the function of that name defined in this notebook.
top_k_indices = np.argsort(con_sim, axis=1)[:, -100:]
print(top_k_indices.shape)

(1228, 100)


In [7]:
evid_ids_list = list(evidences.keys())

def top_k_similarities(claim_tfidf, evid_tfidf, k, evid_ids=None):
    """Rank evidences for every claim by cosine similarity and keep the top k.

    Args:
        claim_tfidf: Matrix with one row per claim, passed as the first
            argument to ``cosine_similarity``.
        evid_tfidf: Matrix with one row per evidence, passed as the second
            argument to ``cosine_similarity``.
        k: Number of evidences to keep per claim. ``0`` yields empty lists;
            a value larger than the number of evidences yields all of them.
        evid_ids: Optional sequence mapping an evidence row index to its id.
            Defaults to the module-level ``evid_ids_list``.

    Returns:
        ``(ranked_evd_id, ranked_evd_score)``: two lists holding one inner
        list per claim, ordered from most to least similar. The inner lists
        contain evidence ids and the matching similarity scores respectively.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if evid_ids is None:
        evid_ids = evid_ids_list

    con_sim = cosine_similarity(claim_tfidf, evid_tfidf)
    ranked_evd_id, ranked_evd_score = [], []

    for cos_sim_claim in con_sim:
        # Reverse the ascending argsort first and truncate afterwards. Slicing
        # with [-k:] before reversing returns EVERY evidence when k == 0,
        # because -0 == 0.
        top_evd_index = np.argsort(cos_sim_claim)[::-1][:k]
        ranked_evd_id.append([evid_ids[row] for row in top_evd_index.tolist()])
        # Look the scores up through the ranked indices rather than sorting
        # the whole row a second time; the values are the same.
        ranked_evd_score.append(cos_sim_claim[top_evd_index].tolist())
    return ranked_evd_id, ranked_evd_score

In [43]:
# Retrieve the 2000 most similar evidences (ids and scores) for every training claim.
top_evd_id_train, top_evd_score_train = top_k_similarities(
    claim_tfidf=train_tfidf, evid_tfidf=evid_tfidf, k=2000
)

In [45]:
def recall(gt_evid_ids, pred_evid_ids):
    """Return the fraction of ground-truth evidence ids found in the predictions.

    Args:
        gt_evid_ids: Sequence of ground-truth evidence ids for one claim.
        pred_evid_ids: Container of predicted evidence ids for the same claim.

    Returns:
        The number of entries of ``gt_evid_ids`` that occur in
        ``pred_evid_ids`` divided by ``len(gt_evid_ids)``. Returns ``0.0``
        when ``gt_evid_ids`` is empty instead of dividing by zero.
    """
    if len(gt_evid_ids) == 0:
        # Nothing to retrieve: report zero rather than raise ZeroDivisionError.
        return 0.0
    hits = sum(1 for evid_id in gt_evid_ids if evid_id in pred_evid_ids)
    return hits / len(gt_evid_ids)

def avg_recall(gt_claims, pred_claims):
    """Average the per-claim recall over every claim in ``gt_claims``.

    ``gt_claims`` is walked in its own iteration order, and the claim at
    position p is scored against ``pred_claims[p]``; the two inputs must
    therefore be aligned by position.

    Args:
        gt_claims: Mapping from claim id to a record with an ``'evidences'``
            entry holding the ground-truth evidence ids.
        pred_claims: Sequence of predicted evidence-id lists, one per claim.

    Returns:
        The sum of the per-claim recalls divided by ``len(gt_claims)``.
    """
    running_total = 0
    for position, claim_id in enumerate(gt_claims):
        gold_evidences = gt_claims[claim_id]['evidences']
        running_total += recall(gold_evidences, pred_claims[position])
    return running_total / len(gt_claims)

In [47]:
top_k = [500, 1000, 1500, 2000, 3000]
# Rank once at the largest cut-off. The ranking is ordered best-first, so the
# top-k list for any smaller k is just the first k entries; this avoids
# recomputing the full similarity matrix and sort for every k.
ranked_ids_full, ranked_scores_full = top_k_similarities(train_tfidf, evid_tfidf, max(top_k))
for k in top_k:
    top_evd_id_train = [ids[:k] for ids in ranked_ids_full]
    top_evd_score_train = [scores[:k] for scores in ranked_scores_full]
    rec = avg_recall(train_claims, top_evd_id_train)
    print(k, rec)

0.5776465798045601
0.6642100977198694
0.7125135722041248
0.7397258414766548
0.778732356134635


In [48]:
top_k = [4000, 5000, 6000]
# Rank once at the largest cut-off. The ranking is ordered best-first, so the
# top-k list for any smaller k is just the first k entries; this avoids
# recomputing the full similarity matrix and sort for every k.
ranked_ids_full, ranked_scores_full = top_k_similarities(train_tfidf, evid_tfidf, max(top_k))
for k in top_k:
    top_evd_id_train = [ids[:k] for ids in ranked_ids_full]
    top_evd_score_train = [scores[:k] for scores in ranked_scores_full]
    rec = avg_recall(train_claims, top_evd_id_train)
    print(k, rec)

4000 0.8062839305103133
5000 0.8255293159609101
6000 0.8398208469055354


In [8]:
def negative_sample(i, values, top_evd_id_train, claim_df, evids_pool, num_hard, num_easy, rng=None):
    """Build the list of negative evidence ids for a single claim.

    Hard negatives are the highest-ranked retrieved evidences that are not
    ground truth; easy negatives are drawn at random from the evidences that
    are neither retrieved nor ground truth.

    Args:
        i: Position of the claim inside ``top_evd_id_train``.
        values: Key of the claim inside ``claim_df``.
        top_evd_id_train: Sequence of ranked evidence-id lists, one per claim.
        claim_df: Mapping from claim key to a record with an ``'evidences'``
            entry holding the ground-truth evidence ids.
        evids_pool: Sequence of all candidate evidence ids.
        num_hard: Maximum number of hard negatives to take.
        num_easy: Number of easy negatives to sample.
        rng: Optional object exposing ``sample`` (e.g. ``random.Random(seed)``)
            for reproducible draws. Defaults to the global ``random`` module.

    Returns:
        A list with the hard negatives (in ranking order) followed by the
        easy negatives. Fewer than ``num_hard`` hard negatives are returned
        when the ranked list does not contain enough non-ground-truth ids.

    Raises:
        ValueError: Propagated from ``sample`` when ``num_easy`` exceeds the
            number of eligible easy candidates.
    """
    if rng is None:
        rng = random

    pred_claim_evids = top_evd_id_train[i]
    gt_claim_evids = claim_df[values]['evidences']

    # Set membership replaces repeated list scans over the whole pool. The
    # pool is still filtered in its original order, so the sampling population
    # is the same list as before.
    gt_lookup = set(gt_claim_evids)
    excluded = gt_lookup | set(pred_claim_evids)
    easy_pool = [evd for evd in evids_pool if evd not in excluded]
    easy_neg = rng.sample(easy_pool, num_easy)

    # Walk the ranking itself so the loop ends when the candidates run out,
    # instead of indexing past the end of the list.
    hard_neg = []
    for hard_id in pred_claim_evids:
        if len(hard_neg) >= num_hard:
            break
        if hard_id not in gt_lookup:
            hard_neg.append(hard_id)

    return hard_neg + easy_neg

In [9]:
def add_neg(top_evd_ids, claim_df, evids_pool, num_hard, num_easy):
    """Attach a ``'neg_evidences'`` list to every claim record.

    Claims are visited in the iteration order of ``claim_df``; the claim at
    position i uses ``top_evd_ids[i]`` as its ranked candidates. A progress
    line is printed every 200 claims.

    Args:
        top_evd_ids: Sequence of ranked evidence-id lists, one per claim.
        claim_df: Mapping from claim key to claim record; modified in place.
        evids_pool: Sequence of all candidate evidence ids.
        num_hard: Number of hard negatives requested per claim.
        num_easy: Number of easy negatives requested per claim.

    Returns:
        The same ``claim_df`` object that was passed in.
    """
    for i, claim_id in enumerate(claim_df):
        if i % 200 == 0:
            print(f"{i} claims have been processed.")
        claim_df[claim_id]['neg_evidences'] = negative_sample(
            i, claim_id, top_evd_ids, claim_df, evids_pool, num_hard, num_easy
        )
    return claim_df

In [10]:
# Work on a deep copy: add_neg writes into the records it is given, and
# `train_claims` should stay as loaded.
copied_train = copy.deepcopy(train_claims)
# Ten best-matching evidences per training claim, used as hard-negative candidates.
top_evd_id_train, top_evd_score_train = top_k_similarities(train_tfidf, evid_tfidf, k=10)

In [29]:
# Seed the sampler so the randomly drawn easy negatives are identical on every run.
random.seed(42)
# 3 hard negatives from the ranking plus 300 randomly sampled easy negatives per claim.
train_with_neg = add_neg(top_evd_id_train, copied_train, evid_ids_list, num_hard=3, num_easy=300)

0 claims have been processed.
200 claims have been processed.
400 claims have been processed.
600 claims have been processed.
800 claims have been processed.
1000 claims have been processed.
1200 claims have been processed.


In [33]:
# Ten best-matching evidences per dev claim, used as hard-negative candidates.
top_evd_id_dev, top_evd_score_dev = top_k_similarities(dev_tfidf, evid_tfidf, 10)
# Deep copy so that `dev_claims` itself is not modified by add_neg.
copied_dev = copy.deepcopy(dev_claims)
# Seed the sampler so the randomly drawn easy negatives are identical on every run.
random.seed(42)
# 3 hard negatives from the ranking plus 300 randomly sampled easy negatives per claim.
dev_with_neg = add_neg(top_evd_id_dev, copied_dev, evid_ids_list, num_hard=3, num_easy=300)

0 claims have been processed.


In [35]:
# Create the output directory first: open(..., "w") does not create missing
# parent directories, so the dump would otherwise fail on a fresh checkout.
output_dir = "data/output"
os.makedirs(output_dir, exist_ok=True)
# The encoding is stated explicitly to match how the input files were read.
with open("data/output/train_with_neg_303.json", "w", encoding="utf-8") as f:
    json.dump(train_with_neg, f)
with open("data/output/dev_with_neg_303.json", "w", encoding="utf-8") as f:
    json.dump(dev_with_neg, f)