In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from statistics import mean
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Show floats with exactly one decimal place in every pandas display.
pd.set_option("display.float_format", "{:.1f}".format)

## Read data

In [3]:
# NOTE(review): hardcoded absolute path to one user's machine. Every relative
# "data/..." path and the "eval.py" call further down resolve against this
# working directory, so the notebook will not run elsewhere without editing it.
%cd /Users/macbookpro/Documents/COMP90042_NLP/Project/COMP90042_2024_Project

/Users/macbookpro/Documents/COMP90042_NLP/Project/COMP90042_2024_Project


In [4]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the resulting object.

    The file is decoded explicitly as UTF-8 (the evidence texts contain
    non-ASCII characters), so the result does not depend on the platform's
    default locale encoding.
    """
    with open(path, 'r', encoding="utf-8") as file:
        return json.load(file)


# Claim files: {claim_id: {...}} mappings for the train / dev / test splits.
train_claims = _load_json("data/train-claims.json")
dev_claims = _load_json("data/dev-claims.json")
test_claims_unlabelled = _load_json("data/test-claims-unlabelled.json")
dev_claims_baseline = _load_json("data/dev-claims-baseline.json")

# Evidence files: {evidence_id: passage text} mappings.
evidence = _load_json("data/evidence.json")
# presumably a climate-related subset of `evidence` - TODO confirm provenance
evidence_climate_2 = _load_json("data/external/evidence_climate_2.json")

label_tags = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]

In [33]:
# Number of entries in evidence_climate_2 (the following cell removes entries with duplicate text).
len(evidence_climate_2)

521503

In [5]:
# Drop passages whose text is an exact duplicate of another passage.
# Step 1: text -> id. A repeated text keeps a single slot, and the id stored in
# that slot is the one of its last occurrence.
inverted_dict = {text: evd_id for evd_id, text in evidence_climate_2.items()}

# Step 2: flip back to id -> text, now with one id per distinct text.
cleaned_evidence = {evd_id: text for text, evd_id in inverted_dict.items()}

evidence_climate_2 = cleaned_evidence
len(evidence_climate_2)

514740

Preprocessing evidence

In [4]:
with open("data/external/preprocessed_evidences.json", 'r', encoding="utf-8") as file:
    prepro_evd_texts = json.load(file)

# The preprocessed file carries no ids: its i-th entry is paired with the i-th
# key of `evidence` purely by position, so make sure both have the same length
# before pairing them up.
assert len(prepro_evd_texts) == len(evidence), (
    "preprocessed_evidences.json is not aligned with evidence.json: "
    f"{len(prepro_evd_texts)} vs {len(evidence)} entries"
)

# Each entry is joined with single spaces (presumably a list of tokens - TODO confirm).
prepro_evidence = {
    evd_id: " ".join(tokens)
    for evd_id, tokens in zip(evidence, prepro_evd_texts)
}
len(prepro_evidence)

1208827

In [36]:
# Preprocessed text for exactly the ids present in evidence_climate_2.
prepro_climated_evidence = {
    evd_id: prepro_evidence[evd_id] for evd_id in evidence_climate_2
}
len(prepro_climated_evidence)

521503

In [5]:
def _with_prepro_text(claims, token_lists):
    """Return a copy of ``claims`` whose 'claim_text' is the preprocessed text.

    Args:
        claims: mapping of claim id to a dict holding at least 'claim_label'
            and 'evidences'.
        token_lists: one entry per claim, in the same order as ``claims``;
            each entry is joined with single spaces to form the new text.

    Returns:
        A new dict {claim_id: {'claim_text', 'claim_label', 'evidences'}}.
        The 'evidences' lists are shared with ``claims``, not copied.

    Raises:
        ValueError: if the two inputs differ in length, since entries are
            matched by position only.
    """
    if len(token_lists) != len(claims):
        raise ValueError(
            f"expected {len(claims)} preprocessed texts, got {len(token_lists)}"
        )
    return {
        claim_id: {
            'claim_text': " ".join(tokens),
            'claim_label': claim['claim_label'],
            'evidences': claim['evidences'],
        }
        for (claim_id, claim), tokens in zip(claims.items(), token_lists)
    }


with open("data/external/preprocessed_train_claims.json", 'r', encoding="utf-8") as file:
    prepro_claim_texts = json.load(file)

prepro_train = _with_prepro_text(train_claims, prepro_claim_texts)
print("number of train claims: ", len(prepro_train))

with open("data/external/preprocessed_dev_claims.json", 'r', encoding="utf-8") as file:
    prepro_claim_texts_dev = json.load(file)

prepro_dev = _with_prepro_text(dev_claims, prepro_claim_texts_dev)
print("number of development claims: ", len(prepro_dev))


number of train claims:  1228
number of development claims:  154


In [6]:
# Preprocessed counterpart of evidence_climate_2: same ids, text taken from prepro_evidence.
prepro_clim_evid = {evd_id: prepro_evidence[evd_id] for evd_id in evidence_climate_2}
len(prepro_clim_evid)

521503

In [68]:
# Inspect the raw passages. NOTE(review): this displays the whole dict (hundreds
# of thousands of entries); showing a handful of items would be enough.
evidence_climate_2

{'evidence-5': 'With peak winds of 110 mph (175 km/h) and a minimum pressure of 972 mbar (hPa ; 28.71 inHg), Florence was the strongest storm of the 1994 Atlantic hurricane season.',
 'evidence-6': 'He is currently a professor of piano at the University of Wisconsin -- Madison since August 2000.',
 'evidence-7': 'In addition to known and tangible risks, unforeseeable black swan extinction events may occur, presenting an additional methodological problem.',
 'evidence-9': 'Aslan Tlebzu (Аслъан ЛIыбзэу [- adyaːsɬaːn ɬʼəbzaw], Russian : Аслан Тлебзу), born 24 February 1981, Teuchezhsk, Adygea, USSR ; is a Russian Adyghe folk musician.',
 'evidence-13': 'His forced recusal from the case brought by the widow of Giuseppe Pinelli against the police commissioner Luigi Calabresi in 1971 became a cause célèbre.',
 'evidence-16': 'He is best known as author of The Prize : The Epic Quest for Oil, Money, and Power (1991) and The Quest : Energy, Security, and the Remaking of the Modern World (2011).

In [67]:
# Inspect the preprocessed passages for the same ids. NOTE(review): this
# displays the whole dict; showing a handful of items would be enough.
prepro_clim_evid

{'evidence-5': 'peak minimum pressure florence storm atlantic hurricane season',
 'evidence-6': 'currently professor piano university since august',
 'evidence-7': 'addition know tangible unforeseeable black swan extinction may occur additional methodological problem',
 'evidence-9': 'bear folk musician',
 'evidence-13': 'force case bring widow police commissioner cause',
 'evidence-16': 'best know author prize epic quest oil money power quest energy security modern world',
 'evidence-17': 'operating system distribution pro high processor',
 'evidence-19': 'academic chronicle chronicle late compilation',
 'evidence-20': 'commune march one nationwide',
 'evidence-21': 'people live informal often minimal access education urban economy',
 'evidence-24': 'connect toll lead activation protein',
 'evidence-25': 'specie beetle family specie genus',
 'evidence-27': 'census population',
 'evidence-28': 'area mathematical logic computer science know type theory type constructor feature formal la

In [7]:
# Disabled switch: uncommenting these two lines rebinds train_claims / dev_claims
# to the preprocessed versions, so every later cell would use preprocessed claim text.
# train_claims = prepro_train
# dev_claims = prepro_dev

In [6]:
def _claim_records(claims, labelled=True):
    """Flatten a {claim_id: claim} mapping into a list of flat record dicts.

    Args:
        claims: mapping of claim id to a dict with 'claim_text' and, for
            labelled splits, 'claim_label' and 'evidences'.
        labelled: when False only 'claim_id' and 'claim_text' are emitted
            (used for the unlabelled test split).

    Returns:
        One dict per claim with keys 'claim_id', 'claim_text' and, if
        ``labelled``, 'claim_label' and 'evidences_id'.
    """
    records = []
    for claim_id, claim in claims.items():
        record = {"claim_id": claim_id, "claim_text": claim["claim_text"]}
        if labelled:
            record["claim_label"] = claim["claim_label"]
            record["evidences_id"] = claim["evidences"]
        records.append(record)
    return records


train_claims_dic = _claim_records(train_claims)
train_claims_df = pd.json_normalize(train_claims_dic)

dev_claims_dic = _claim_records(dev_claims)
dev_claims_df = pd.json_normalize(dev_claims_dic)

dev_claims_baseline_dic = _claim_records(dev_claims_baseline)
dev_claims_baseline_df = pd.json_normalize(dev_claims_baseline_dic)

test_claims_dic = _claim_records(test_claims_unlabelled, labelled=False)
test_claims_df = pd.json_normalize(test_claims_dic)

# One row per evidence passage: its id and its raw text.
evidence_dic = [{"evidence_id": evd_id, "evidence_text": text}
                for evd_id, text in evidence.items()]
evidence_df = pd.json_normalize(evidence_dic)

In [7]:
# Add an 'evidence_text' list to every dev claim holding the raw text of its
# listed evidence ids. This mutates the dicts inside dev_claims in place.
for claim in dev_claims.values():
    claim['evidence_text'] = [evidence[evd_id] for evd_id in claim['evidences']]

#### TFIDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Parallel lists over evidence_climate_2: evidId[j] is the id of the passage
# whose text is evidlist[j].
evidId = [*evidence_climate_2]
evidlist = [*evidence_climate_2.values()]

In [10]:
def TFIDF(evidlist):
    """Fit a TF-IDF vectorizer and use it to vectorise claims and evidence.

    The vectorizer (English stop words removed) is fitted on ``evidlist``
    followed by the training claim texts; the train, test and dev claim texts
    and ``evidlist`` itself are then transformed with it.

    Reads the module-level frames ``train_claims_df``, ``test_claims_df`` and
    ``dev_claims_df`` (their 'claim_text' column).

    Args:
        evidlist: list of evidence strings. It must be a ``list`` because it
            is concatenated with the list of training claim texts.

    Returns:
        Tuple ``(train_claim_tfidf, test_claim_tfidf, dev_claim_tfidf,
        evid_tfidf, vectorizer)``.
    """
    def claim_texts(frame):
        # Plain Python list of the claim strings held in a claims frame.
        return frame['claim_text'].tolist()

    tfidf_model = TfidfVectorizer(stop_words="english")
    tfidf_model.fit(evidlist + claim_texts(train_claims_df))

    # Same order as the returned tuple: train, test, dev.
    claim_matrices = [
        tfidf_model.transform(claim_texts(frame))
        for frame in (train_claims_df, test_claims_df, dev_claims_df)
    ]
    evidence_matrix = tfidf_model.transform(evidlist)

    return (*claim_matrices, evidence_matrix, tfidf_model)

In [11]:
# Fit the vectorizer and vectorise the three claim splits plus the evidence texts in evidlist.
train_claim_tfidf,test_claim_tfidf,dev_claim_tfidf, evid_tfidf,vectorizer = TFIDF(evidlist)

In [12]:
# Sanity check: print the shape of the train-claim, dev-claim and evidence matrices.
for tfidf_matrix in (train_claim_tfidf, dev_claim_tfidf, evid_tfidf):
    print(tfidf_matrix.shape)

(1228, 416701)
(154, 416701)
(514740, 416701)


In [15]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it, and
# convert its array to a plain list so the result type matches the old method.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

### Task 1: Evidence Retrieval

In [13]:
# For each claim, find the k evidence passages with the highest cosine similarity
def top_k_evd(dev_claim_tfidf, evid_tfidf, k, evidence_id):
    """Rank evidence by cosine similarity and keep the best ``k`` per claim.

    Args:
        dev_claim_tfidf: claim vectors, one row per claim.
        evid_tfidf: evidence vectors, one row per evidence passage.
        k: number of passages to keep per claim. Values <= 0 yield empty
            lists; values larger than the number of passages yield all of
            them.
        evidence_id: sequence where ``evidence_id[j]`` is the id of the
            passage stored in row ``j`` of ``evid_tfidf``.

    Returns:
        ``(ranked_evd_id, ranked_evd_score)``: two lists with one inner list
        per claim, holding the selected evidence ids and their similarity
        scores, both ordered from most to least similar.
    """
    # One row of similarities per claim, one column per evidence passage.
    cos_sim = cosine_similarity(dev_claim_tfidf, evid_tfidf)

    ranked_evd_id = []
    ranked_evd_score = []
    for claim_scores in cos_sim:
        ascending = np.argsort(claim_scores)
        # Guard k <= 0 explicitly: the slice [-0:] would select *every*
        # passage instead of none.
        top_evd_index = ascending[-k:][::-1] if k > 0 else ascending[:0]
        # Ids and scores come from the same sort, so they always line up.
        ranked_evd_id.append([evidence_id[j] for j in top_evd_index.tolist()])
        ranked_evd_score.append(claim_scores[top_evd_index].tolist())
    return ranked_evd_id, ranked_evd_score

In [14]:
# Retrieve the 5 most similar evidence passages (ids and scores) for every dev
# claim and every test claim.
ranked_evd_ids_dev, ranked_evd_scores_dev = top_k_evd(
    dev_claim_tfidf, evid_tfidf, k=5, evidence_id=evidId
)
ranked_evd_ids_test, ranked_evd_scores_test = top_k_evd(
    test_claim_tfidf, evid_tfidf, k=5, evidence_id=evidId
)

In [15]:
def write_result(type, ranked_evd_ids):
    """Attach retrieved evidence ids to a claims file and write the result.

    The claims are re-read from disk, the i-th list in ``ranked_evd_ids`` is
    stored as the 'evidences' of the i-th claim (file order), and the result
    is dumped as JSON.

    * ``"dev"``: reads data/dev-claims.json, writes data/dev_pred.json. Each
      claim additionally gets 'evidence_texts' (raw text of its gold evidence)
      and 'pre_evidence_texts' (raw text of the retrieved evidence), looked
      up in the module-level ``evidence`` dict.
    * ``"test"``: reads data/test-claims-unlabelled.json, writes
      data/test-output.json. Only 'evidences' is added; no 'claim_label' is
      set here.

    Args:
        type: ``"dev"`` or ``"test"``. (The name shadows the builtin inside
            this function; it is kept for backward compatibility.)
        ranked_evd_ids: one list of evidence ids per claim, in the same order
            as the claims appear in the file.

    Returns:
        The dict that was written to disk.

    Raises:
        ValueError: if ``type`` is not a known split, or if
            ``ranked_evd_ids`` does not have exactly one entry per claim.
    """
    if type == "dev":
        source_path, output_path = "data/dev-claims.json", "data/dev_pred.json"
    elif type == "test":
        source_path, output_path = "data/test-claims-unlabelled.json", "data/test-output.json"
    else:
        raise ValueError(f"unknown result type {type!r}; expected 'dev' or 'test'")

    with open(source_path, 'r', encoding="utf-8") as file:
        result = json.load(file)

    # Claims and ranked lists are matched purely by position, so a length
    # mismatch would silently pair evidence with the wrong claims.
    if len(ranked_evd_ids) != len(result):
        raise ValueError(
            f"got {len(ranked_evd_ids)} ranked lists for {len(result)} claims"
        )

    for claim_value, evd_ids in zip(result.values(), ranked_evd_ids):
        if type == "dev":
            # Look up the gold texts before 'evidences' is overwritten below.
            claim_value['evidence_texts'] = [evidence[evd_id] for evd_id in claim_value['evidences']]
            claim_value['pre_evidence_texts'] = [evidence[evd_id] for evd_id in evd_ids]
        claim_value['evidences'] = evd_ids

    # dump the predicted file
    with open(output_path, "w", encoding="utf-8") as file:
        json.dump(result, file)

    return result

In [16]:
# Pass the split name directly instead of assigning it to a global named
# `type`, which would shadow the builtin type() for the rest of the session.
dev_result = write_result("dev", ranked_evd_ids_dev)
test_result = write_result("test", ranked_evd_ids_test)

In [17]:
import subprocess

# Score the dev predictions against the gold file with the project's eval.py.
# The command is passed as an argument list (no shell needed), and text=True
# decodes the captured stdout so each metric prints on its own line instead of
# as a single bytes repr.
output = subprocess.check_output(
    ["python", "eval.py",
     "--predictions", "data/dev_pred.json",
     "--groundtruth", "data/dev-claims.json"],
    text=True,
)
print(output)

b'Evidence Retrieval F-score (F)    = 0.1013399299113585\nClaim Classification Accuracy (A) = 1.0\nHarmonic Mean of F and A          = 0.18403024744506424\n'


In [59]:
# Inspect the dev predictions. NOTE(review): this displays the whole dict;
# a single claim such as dev_result['claim-752'] would be enough.
dev_result

{'claim-752': {'claim_text': '[South Australia] has the most expensive electricity in the world.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-509525',
   'evidence-252686',
   'evidence-786054',
   'evidence-580844',
   'evidence-240255'],
  'evidence_texts': ['[citation needed] South Australia has the highest retail price for electricity in the country.',
   '"South Australia has the highest power prices in the world".'],
  'pre_evidence_texts': ['It was the most expensive record I ever made.',
   'It is found in South Australia and Western Australia.',
   'It is found in Australia, where it has been recorded from South Australia.',
   'It is found in Australia, where it has been recorded from South Australia.',
   'It is found in Australia, where it has been recorded from South Australia.']},
 'claim-375': {'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per cent, then no a

In [None]:
# Scratch note, evaluates to a 4-tuple of strings and has no side effects.
# Presumably the content words of claim-752 after preprocessing - TODO confirm or delete this cell.
"south", "expensive", "electricity", "world"

In [65]:
# Preprocessed text of a single passage, looked up by id.
prepro_climated_evidence['evidence-328382']

'electricity'

In [66]:
# Inspect the dev predictions again. NOTE(review): this displays the whole
# dict; a single claim such as dev_result['claim-752'] would be enough.
dev_result

{'claim-752': {'claim_text': '[South Australia] has the most expensive electricity in the world.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-509525',
   'evidence-252686',
   'evidence-786054',
   'evidence-580844',
   'evidence-240255'],
  'evidence_texts': ['[citation needed] South Australia has the highest retail price for electricity in the country.',
   '"South Australia has the highest power prices in the world".'],
  'pre_evidence_texts': ['It was the most expensive record I ever made.',
   'It is found in South Australia and Western Australia.',
   'It is found in Australia, where it has been recorded from South Australia.',
   'It is found in Australia, where it has been recorded from South Australia.',
   'It is found in Australia, where it has been recorded from South Australia.']},
 'claim-375': {'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per cent, then no a

In [49]:
# NOTE(review): `precess_dev_result` is not assigned anywhere in the visible
# notebook, so this cell fails on a fresh kernel. It presumably held the dev
# predictions of a run on preprocessed text from a since-removed cell - confirm,
# then recreate the variable or delete this cell.
precess_dev_result

{'claim-752': {'claim_text': '[South Australia] has the most expensive electricity in the world.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-793855',
   'evidence-532662',
   'evidence-421845',
   'evidence-902195',
   'evidence-328382'],
  'evidence_texts': ['[citation needed] South Australia has the highest retail price for electricity in the country.',
   '"South Australia has the highest power prices in the world".'],
  'pre_evidence_texts': ["Hydropower provides 16.3% of the world's electricity.",
   'CSP has other uses than electricity.',
   'electricity companies and electricity trusts',
   "Sähkö (``electricity'' in Finnish) was founded by Tommi Grönlund in 1993.",
   'It transmits electricity from the hydropower plants in the Yunnan to the Guangdong, including cities of Guangzhou and Shenzhen.']},
 'claim-375': {'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per ce

In [23]:
import subprocess

# Retrieve the 5 most similar evidence passages for every training claim.
ranked_evd_ids_train, ranked_evd_scores_train = top_k_evd(train_claim_tfidf, evid_tfidf, 5, evidId)

with open("data/train-claims.json", "r", encoding="utf-8") as file:
    train_pred = json.load(file)

# Match the predicted evidence with the claim ids: the i-th ranked list belongs
# to the i-th claim of train_claims.
for i, claim_id in enumerate(train_claims):
    evd_ids = ranked_evd_ids_train[i]
    train_pred[claim_id]['evidences'] = evd_ids
    # Raw text of the gold evidence, taken from the untouched train_claims.
    train_pred[claim_id]['evidence_texts'] = [evidence[evd_id] for evd_id in train_claims[claim_id]['evidences']]
    # NOTE(review): the retrieved passages are shown in their *preprocessed* form
    # here, whereas write_result() stores the raw `evidence` text under the same
    # key - confirm which one is intended.
    train_pred[claim_id]['pre_evidence_texts'] = [prepro_evidence[evd_id] for evd_id in evd_ids]

with open("data/train/train_pred_regularTFIDF.json", "w") as file:
    json.dump(train_pred, file)

# Score the train predictions with eval.py; text=True decodes stdout so the
# metrics print one per line instead of as a bytes repr.
output = subprocess.check_output(
    ["python", "eval.py",
     "--predictions", "data/train/train_pred_regularTFIDF.json",
     "--groundtruth", "data/train-claims.json"],
    text=True,
)
print(output)

b'Evidence Retrieval F-score (F)    = 0.09441600744532341\nClaim Classification Accuracy (A) = 1.0\nHarmonic Mean of F and A          = 0.17254134954717465\n'


In [27]:
# NOTE(review): a cell only displays its last expression, so the claim on the
# first line is evaluated but never shown; only the evidence text appears.
train_claims['claim-284']
evidence['evidence-1094467']

'The authors found that 3974 of the abstracts expressed a position on anthropogenic global warming, and that 97.1% of those endorsed the consensus that humans are causing global warming.'

In [24]:
# Inspect the train predictions. NOTE(review): this displays the whole dict;
# a single claim would be enough.
train_pred

{'claim-1937': {'claim_text': 'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.',
  'claim_label': 'DISPUTED',
  'evidences': ['evidence-668884',
   'evidence-66273',
   'evidence-364767',
   'evidence-98914',
   'evidence-55991'],
  'evidence_texts': ['At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.',
   'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limitation on other nutrients.',
   'Higher carbon dioxide concentrations will favourably affect plant growth and demand for water.'],
  'pre_evidence_texts': ['',
   'high atmospheric lead incre

In [29]:
# Inspect the dev claims, which by now also carry the 'evidence_text' lists
# added earlier. NOTE(review): this displays the whole dict.
dev_claims

{'claim-752': {'claim_text': '[South Australia] has the most expensive electricity in the world.',
  'claim_label': 'SUPPORTS',
  'evidences': ['evidence-67732', 'evidence-572512'],
  'evidence_text': ['[citation needed] South Australia has the highest retail price for electricity in the country.',
   '"South Australia has the highest power prices in the world".']},
 'claim-375': {'claim_text': 'when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod\xaduces 1.3 per cent of this 3 per cent, then no amount of emissions reductio\xadn here will have any effect on global climate.',
  'claim_label': 'NOT_ENOUGH_INFO',
  'evidences': ['evidence-996421',
   'evidence-1080858',
   'evidence-208053',
   'evidence-699212',
   'evidence-832334'],
  'evidence_text': ['The 2011 UNEP Green Economy report states that "[a]agricultural operations, excluding land use changes, produce approximately 13 per cent of anthropogenic global GHG emissions.',
   'With

In [104]:
# Load the unlabelled test claims from disk; predictions are attached to this dict.
with open("data/test-claims-unlabelled.json", 'r') as file:
    result = json.load(file)

# match the predicted evidence with claim id
# (by position: the i-th ranked list goes to the i-th claim in file order)
i = 0
for claim_id, claim_value in result.items():
    evd_ids = ranked_evd_ids_test[i]
    # Every test claim receives the constant label 'SUPPORTS'; no classifier is applied here.
    result[claim_id]['claim_label'] = 'SUPPORTS'
    result[claim_id]['evidences'] = evd_ids
    i += 1