In [14]:
import pandas as pd
import numpy as np
from semantic_roberta import get_similarity_scores_triples, get_topk_similar_evidences
from textual_entailment import get_entailment

In [2]:
# Load the working dataframe from disk.
# NOTE(review): unpickling can execute arbitrary code, so 'dev_test.pkl' must
# come from a trusted source.
data_df = pd.read_pickle('dev_test.pkl')

In [3]:
def print_triples_sizes(df):
    """Print summary statistics of the per-row knowledge-graph sizes.

    Side effect: adds (or overwrites) the 'claim_kg_size' and
    'main_text_kg_size' columns on ``df`` in place, each holding the length
    of the corresponding 'claim_kg' / 'main_text_kg' entry.

    Args:
        df: DataFrame with 'claim_kg' and 'main_text_kg' columns whose cells
            support ``len()``.

    Returns:
        None. The ``describe()`` table of the two size columns is printed.
    """
    # Source column -> derived size column, in the order they are reported
    size_column_for = {
        'claim_kg': 'claim_kg_size',
        'main_text_kg': 'main_text_kg_size',
    }

    for kg_column, size_column in size_column_for.items():
        df[size_column] = df[kg_column].apply(len)

    summary = df[list(size_column_for.values())].describe()
    print(summary)
    
# Report how many triples each claim / main-text knowledge graph holds.
# This call also adds the two *_size columns to data_df.
print_triples_sizes(data_df)

       claim_kg_size  main_text_kg_size
count    2452.000000        2452.000000
mean        3.113785          23.469005
std         0.534892          15.511103
min         1.000000           2.000000
25%         3.000000          13.000000
50%         3.000000          21.000000
75%         3.000000          30.000000
max         9.000000         151.000000


In [42]:
def get_cleaned_claim_triples(claim_kg, threshold=0.9):
    """Format knowledge-graph triples as strings and drop near-duplicates.

    Each triple dict is rendered as "(head, type, tail)". A triple is dropped
    when its similarity score with any *later* triple is at or above
    ``threshold``, so within a group of near-duplicates the last one survives.
    The comparison is made against every later triple, including ones that
    are themselves dropped.

    Args:
        claim_kg: iterable of dicts with 'head', 'type' and 'tail' keys.
        threshold: score at or above which two triples count as duplicates.
            Defaults to 0.9, the value that used to be hard-coded.

    Returns:
        list[str]: the formatted triples that were kept, in original order.
    """
    claim_triples = [f"({triple['head']}, {triple['type']}, {triple['tail']})" for triple in claim_kg]
    n = len(claim_triples)

    # With fewer than two triples there is nothing to compare, so skip the
    # similarity computation entirely (this also covers an empty graph).
    if n < 2:
        return claim_triples

    # NOTE(review): assumes a pairwise score table indexable as [i, j] —
    # confirm against semantic_roberta.get_similarity_scores_triples.
    sim_scores = get_similarity_scores_triples(claim_triples)

    # Keep triple i only if no later triple j reaches the threshold
    return [
        claim_triples[i]
        for i in range(n)
        if not any(sim_scores[i, j] >= threshold for j in range(i + 1, n))
    ]

In [44]:
# Deduplicate each row's claim triples; the new column holds lists of
# formatted "(head, type, tail)" strings.
data_df['clean_claim_triples'] = data_df['claim_kg'].apply(get_cleaned_claim_triples)

In [45]:
# print_triples_sizes measures the raw claim_kg / main_text_kg columns, so
# this summary is not affected by the clean_claim_triples column.
print_triples_sizes(data_df)

       claim_kg_size  main_text_kg_size
count    2452.000000        2452.000000
mean        3.113785          23.469005
std         0.534892          15.511103
min         1.000000           2.000000
25%         3.000000          13.000000
50%         3.000000          21.000000
75%         3.000000          30.000000
max         9.000000         151.000000


In [4]:
# Distribution of the number of claim triples left after deduplication
data_df['clean_claim_triples'].apply(len).describe()

count    2452.000000
mean        2.207178
std         0.777266
min         1.000000
25%         2.000000
50%         2.000000
75%         3.000000
max         8.000000
Name: clean_claim_triples, dtype: float64

In [5]:
# Distribution of the number of main-text triples left after deduplication.
# NOTE(review): 'clean_main_text_triples' is assigned in a cell that appears
# further down; on a fresh kernel this only works if the loaded pickle
# already carries that column.
data_df['clean_main_text_triples'].apply(len).describe()

count    2452.000000
mean       17.347064
std        11.499945
min         1.000000
25%        10.000000
50%        15.000000
75%        22.000000
max        99.000000
Name: clean_main_text_triples, dtype: float64

In [48]:
# Persist the dataframe. NOTE: this overwrites 'dev_test.pkl', the same file
# data_df was read from.
data_df.to_pickle('dev_test.pkl')

In [49]:
# Apply the same near-duplicate removal to the main-text triples
# (get_cleaned_claim_triples is reused here on the 'main_text_kg' column).
data_df['clean_main_text_triples'] = data_df['main_text_kg'].apply(get_cleaned_claim_triples)

In [28]:
# Persist the dataframe again. NOTE: this overwrites 'dev_test.pkl', the same
# file data_df was read from.
data_df.to_pickle('dev_test.pkl')

In [29]:
# Preview the first rows, including the derived columns.
data_df.head()

Unnamed: 0.1,claim_id,claim,date_published,explanation,fact_checkers,main_text,sources,label,subjects,claim_kg,main_text_kg,Unnamed: 0,claim_kg_size,main_text_kg_size,clean_claim_triples,clean_main_text_triples,claim_evidence_1,entailment_scores_1
0,34656,A baby died at an unnamed medical facility be...,"November 10, 2015",Fellow Twitter users suggested @FierceFemtivis...,Kim LaCapria,"On 8 November 2015, former Twitter user @Fierc...",http://webcache.googleusercontent.com/search?q...,unproven,"Politics, fiercefemtivist, racism",[{'head': 'A baby died at an unnamed medical f...,"[{'head': 'Confederate flag', 'type': 'instanc...",,3,14,"[(A baby died at an unnamed medical facility, ...","[(Confederate flag, instance of, racist), (Fie...","{'(A baby died at an unnamed medical facility,...","[[0.66573095, -0.56622183, -0.4859991], [0.430..."
1,3632,Bat from Shawnee County tests positive for rab...,,A bat found in northeastern Kansas has tested ...,,Topeka television station KSNT reports that th...,https://www.ksnt.com/news/bat-tests-positive-f...,true,"Rabies, Health, General News, Kansas, Bats, To...","[{'head': 'Bat from Shawnee County', 'type': '...","[{'head': 'KSNT', 'type': 'located in the admi...",,3,6,"[(Shawnee County, located in the administrativ...","[(KSNT, located in the administrative territor...","{'(Shawnee County, located in the administrati...","[[0.6284045, -0.64509016, -0.4347026], [0.6175..."
2,29558,Germany has banned pork from school canteens b...,"March 7, 2016",What's true: Some politicians complained that ...,Kim LaCapria,"On 7 March 2016, British tabloid Express repor...",http://bnp.org.uk/news/regional/bnp-victory-br...,false,Politics,"[{'head': 'banned pork from school canteens', ...","[{'head': 'Express', 'type': 'country', 'tail'...",,3,36,"[(banned pork from school canteens, country, G...","[(Express, country, British), (Express, instan...","{'(banned pork from school canteens, country, ...","[[0.64209604, -0.69231135, -0.32926825], [0.68..."
3,8416,Coronavirus prompts Canada to roll out safe dr...,"April 16, 2020",Canada’s Pacific province of British Columbia ...,Tessa Vikander,"In March, the Canadian government urged provin...",,true,Health News,"[{'head': 'Coronavirus', 'type': 'country', 't...","[{'head': 'heroin', 'type': 'instance of', 'ta...",,3,21,"[(Coronavirus, country, Canada), (safe drugs, ...","[(heroin, instance of, controlled substance), ...","{'(Coronavirus, country, Canada)': ['(coronavi...","[[-0.1503326, -0.75800896, 0.63468295], [0.660..."
4,7169,"Wayne National Forest plans fires for tree, wi...",,"Nearly 2,000 acres of Wayne National Forest in...",,Forest officials say scientists who study nati...,,true,"Plants, Wildlife, Health, Wildlife health, For...","[{'head': 'Wayne National Forest', 'type': 'in...","[{'head': 'oak forests', 'type': 'located in t...",,3,3,"[(Wayne National Forest, instance of, wildlife...","[(oak forests, located in the administrative t...","{'(Wayne National Forest, instance of, wildlif...","[[0.63135314, -0.7029956, -0.32739952], [0.662..."


In [11]:
def pair_most_related_evidence(row, k):
    """Map every cleaned claim triple of a row to its top-k evidence triples.

    Args:
        row: DataFrame row providing 'clean_claim_triples' and
            'clean_main_text_triples'; ``row.name`` is used for progress
            logging only.
        k: forwarded to ``get_topk_similar_evidences`` as the number of
            evidences to retrieve per claim triple.

    Returns:
        dict: claim triple -> result of ``get_topk_similar_evidences`` for
        that triple against the row's main-text triples.
    """
    claims = row['clean_claim_triples']
    evidence_pool = row['clean_main_text_triples']

    evidences_by_claim = {
        claim: get_topk_similar_evidences(claim, evidence_pool, k)
        for claim in claims
    }

    # Progress log: echo the row label whenever it is a multiple of 100
    row_label = row.name
    if row_label % 100 == 0:
        print(row_label)

    return evidences_by_claim

In [12]:
# For every row, pair each claim triple with the top-1 evidence returned by
# get_topk_similar_evidences (k=1).
data_df['claim_evidence_1'] = data_df.apply(lambda row: pair_most_related_evidence(row, 1), axis=1)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200


In [24]:
def get_entailment_scores(row):
    """Run ``get_entailment`` on the (evidence, claim) pairs of one row.

    For each claim in the row's 'claim_evidence_1' mapping, only the first
    stored evidence is used; it is paired with the claim as
    ``(evidence, claim)``.

    Args:
        row: DataFrame row providing 'claim_evidence_1' (claim -> sequence of
            evidences); ``row.name`` is used for progress logging only.

    Returns:
        Whatever ``get_entailment`` returns for the list of pairs.
    """
    evidence_claim_pairs = [
        (ranked_evidences[0], claim_triple)
        for claim_triple, ranked_evidences in row['claim_evidence_1'].items()
    ]

    # Progress log: echo the row label whenever it is a multiple of 100
    row_label = row.name
    if row_label % 100 == 0:
        print(row_label)

    return get_entailment(evidence_claim_pairs)

In [27]:
# Score every row's (evidence, claim) pairs with get_entailment and store the
# result per row.
data_df['entailment_scores_1'] = data_df.apply(get_entailment_scores, axis=1)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
