In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import json
import torch
import random
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from scipy.spatial.distance import cdist
from transformers import BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to /Users/relax/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/relax/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
'''Remove stopwords from claim and evidence for reducing the computational consumption'''
def stopwords_func(stop_words, text_type, text_data):
    if text_type == "evidence":
        for i in text_data:
            sentence = text_data[i]
            words = sentence.split()
            filtered_words = [word for word in words if word.lower() not in stop_words]
            filtered_sentence = " ".join(filtered_words)
            text_data[i] = filtered_sentence
    else:
        for i in text_data.values():
            sentence = i["claim_text"]
            words = sentence.split()
            filtered_words = [word for word in words if word.lower() not in stop_words]
            filtered_sentence = " ".join(filtered_words)
            i["claim_text"] = filtered_sentence
    return text_data

'''Function for picking random keys from the dictionary after excluding the specified key(s)'''
def pick_random_keys(dictionary, excluded_keys, num_keys):
    available_keys = [key for key in dictionary.keys() if key not in excluded_keys]
    random_keys = random.sample(available_keys, num_keys)
    return random_keys

'''Function for turning the text into lowercase expression'''
def lower_processing(data, text_type):
    if text_type == "claim_text":
        for i in data:
            data[i][text_type] = data[i][text_type].lower()
    else:
        for i in data:
            data[i] = data[i].lower()
    return data

In [3]:
## Read in data
# Read in training data (claim)
with open('../project-data/train-claims.json', 'r') as tclaim_file:
    tclaim_data = json.load(tclaim_file)

# Read in development data (claim)
with open('../project-data/dev-claims.json', 'r') as dclaim_file:
    dclaim_data = json.load(dclaim_file)

# Read in test data (claim)
with open('../project-data/test-claims-unlabelled.json', 'r') as uclaim_file:
    uclaim_data = json.load(uclaim_file)

# Read in evidence data
with open('../project-data/evidence.json', 'r') as evi_file:
    evi_data = json.load(evi_file)

## Preprocessing - Lowercase operation of the case
tclaim_data = lower_processing(tclaim_data, "claim_text")
dclaim_data = lower_processing(dclaim_data, "claim_text")
uclaim_data = lower_processing(uclaim_data, "claim_text")
evi_data = lower_processing(evi_data, 'evidence')

## Create claim-evidence pair based on training set
tkey_list = list(tclaim_data.keys())
train_pairs = []
evi_keys = []
labels = []
for i in tclaim_data:
    for j in tclaim_data[i]["evidences"]:
        evi_keys.append(j)
        train_pairs.append((i, tclaim_data[i]["claim_text"], evi_data[j], j))
        labels.append(1)
  
## insert negative sample to the training set
random.seed(1)
for i in tclaim_data:
    excluded_keys = tclaim_data[i]["evidences"]
    random_keys = pick_random_keys(evi_data, excluded_keys, len(excluded_keys))
    for j in random_keys:
        evi_keys.append(j)
        train_pairs.append((i, tclaim_data[i]["claim_text"], evi_data[j], j))
        labels.append(0)

In [4]:
# Store the claim, evidence sentence(s) to list for embedded usage
claim_train_sentence = []
claim_dep_sentence = []
claim_test_sentence = []
evidence_full_sentence = []
evidence_sample_sentence = []

for i in tclaim_data.values():
    claim_train_sentence.append(i["claim_text"])
for i in dclaim_data.values():
    claim_dep_sentence.append(i["claim_text"])
for i in uclaim_data.values():
    claim_test_sentence.append(i["claim_text"])
for i in evi_data.values():
    evidence_full_sentence.append(i)
tfidf_keys = []
for i in train_pairs[:int(len(train_pairs)/2)]:
  if i[2] not in evidence_sample_sentence:
    evidence_sample_sentence.append(i[2])
    tfidf_keys.append(i[3])

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample training data
train_data = claim_train_sentence + claim_dep_sentence + evidence_full_sentence

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data
tfidf_vectorizer.fit(train_data)

# Transform the training and test data using the trained vectorizer
train_embeddings = tfidf_vectorizer.transform(claim_train_sentence)
test_embeddings = tfidf_vectorizer.transform(claim_test_sentence)

In [17]:
train_claim_list = tclaim_data.keys()
train_claim_list

dict_keys(['claim-1937', 'claim-126', 'claim-2510', 'claim-2021', 'claim-2449', 'claim-851', 'claim-2773', 'claim-949', 'claim-1019', 'claim-2834', 'claim-1441', 'claim-1181', 'claim-2417', 'claim-2152', 'claim-826', 'claim-1066', 'claim-3003', 'claim-3059', 'claim-568', 'claim-60', 'claim-378', 'claim-2486', 'claim-1782', 'claim-1235', 'claim-2065', 'claim-1390', 'claim-2789', 'claim-1414', 'claim-377', 'claim-558', 'claim-1447', 'claim-1038', 'claim-2794', 'claim-930', 'claim-2763', 'claim-1920', 'claim-2961', 'claim-1395', 'claim-1404', 'claim-2257', 'claim-646', 'claim-1545', 'claim-1317', 'claim-1547', 'claim-2280', 'claim-118', 'claim-1357', 'claim-69', 'claim-2849', 'claim-499', 'claim-749', 'claim-428', 'claim-1837', 'claim-1875', 'claim-1324', 'claim-1980', 'claim-409', 'claim-2193', 'claim-3084', 'claim-1227', 'claim-1499', 'claim-1790', 'claim-2090', 'claim-2199', 'claim-1498', 'claim-1792', 'claim-2540', 'claim-2474', 'claim-2741', 'claim-3102', 'claim-1112', 'claim-2488', 

In [None]:
k = 11
train_claim_list = list(tclaim_data.keys())
label_potential_list = []
for claim_id, claim_value in uclaim_data.items():
    test_claim_text = claim_value["claim_text"]
    test_claim_emb = tfidf_vectorizer.transform([test_claim_text])
    evi_sim_dict = {}
    sim = cosine_similarity(test_claim_emb, train_embeddings)[0]

    for i in range(len(sim)):
        evi_sim_dict[train_claim_list[i]] = sim[i]
    s_sim = [(k, v) for k, v in sorted(evi_sim_dict.items(), key=lambda item: item[1],reverse=True)][:k]
    label_list = []
    for i in s_sim:
        label_list.append(i[0])
    label_potential_list.append(label_list)

In [25]:
with open('../test-claims-predictions.json', 'r') as final_json:
    final_test = json.load(final_json)
test_key_list = list(final_test.keys())

def most_frequent_element(nums):
    return max(set(nums), key=nums.count)

majority_label_list = []
for i in range(len(label_potential_list)):
    po_label = []
    for j in label_potential_list[i]:
        po_label.append(tclaim_data[j]["claim_label"])
    test_class = (most_frequent_element(po_label))
    final_test[test_key_list[i]]["claim_label"] = test_class

file_path = '../test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json.dump(final_test, json_file)

In [26]:
final_test

{'claim-2967': {'claim_text': 'The contribution of waste heat to the global climate is 0.028 W/m2.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-308923',
   'evidence-22107',
   'evidence-213569',
   'evidence-560239',
   'evidence-88593',
   'evidence-127889']},
 'claim-979': {'claim_text': '“Warm weather worsened the most recent five-year drought, which included the driest four-year period on record in terms of statewide precipitation.',
  'claim_label': 'NOT_ENOUGH_INFO',
  'evidences': ['evidence-937309',
   'evidence-957389',
   'evidence-178433',
   'evidence-268048',
   'evidence-434754',
   'evidence-435808']},
 'claim-1609': {'claim_text': 'Greenland has only lost a tiny fraction of its ice mass.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-776422',
   'evidence-52981',
   'evidence-966882',
   'evidence-264761',
   'evidence-510677',
   'evidence-408551']},
 'claim-1020': {'claim_text': '“The global reef crisis does not necessarily mean extinction for c

In [None]:
with open('../test-claims-predictions.json', 'r') as final_json:
    final_test = json.load(final_json)

label_list = []
for i in test_np:
    similarity = []
    for j in train_np:
        similarity.append(cosine_similarity(np.reshape(i, (1, -1)), np.reshape(j, (1, -1)))[0][0])
    top_index = np.argsort(similarity)[-11:]
    label_list.append(list(top_index))

In [None]:
def most_frequent_element(nums):
    return max(set(nums), key=nums.count)

train_key_list = list(tclaim_data.keys())
test_key_list = list(final_test.keys())
for i in range(len(label_list)):
    label_list_potential = []
    for j in label_list[i]:
        label_list_potential.append(tclaim_data[train_key_list[j]]["claim_label"])
    test_class = (most_frequent_element(label_list_potential))
    # test_class = (tclaim_data[train_key_list[label_list[i]]]["claim_label"])
    final_test[test_key_list[i]]["claim_label"] = test_class

In [None]:
file_path = '../test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json.dump(final_test, json_file)