In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [2]:
import torch
# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [3]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Return `text` with every character outside the 7-bit ASCII range replaced by a space."""
    # Code points 0..127 are kept as-is; anything above becomes a single space.
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in text)


def remove_punctuation(text):
    """Replace each non-word character (anything `\\w` does not match) with a space.

    Note that whitespace is also a non-word character, and underscores count
    as word characters, so they are left in place.
    """
    non_word = re.compile(r'\W')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Return `text` with all digit characters deleted (nothing is inserted in their place)."""
    # Deleting whole runs of digits yields the same string as deleting them one by one.
    return re.sub(r'\d+', '', text)


def to_lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered


def remove_extra_space(text):
    """Collapse every run of consecutive space characters into a single space.

    Only the literal space character is affected; tabs and newlines are kept.
    """
    # A lone space is already "collapsed", so only runs of two or more need rewriting.
    return re.sub(r' {2,}', ' ', text)


def remove_url(text):
    """Replace each `http...` token (``http`` followed by non-whitespace) with a single space."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub(' ', text)


def remove_underline(text):
    """Return `text` with every underscore turned into a space."""
    pieces = text.split('_')
    return ' '.join(pieces)


def remove_hyphen(text):
    """Return `text` with every hyphen-minus (`-`) turned into a space."""
    hyphen_to_space = {ord('-'): ' '}
    return text.translate(hyphen_to_space)


def remove_leading_whitespace(text):
    """Strip whitespace from the start of `text`; trailing whitespace is kept."""
    stripped = text.lstrip()
    return stripped

def remove_stopwords(text):
    """Drop every whitespace-separated token found in NLTK's English stop-word list.

    Matching is exact and case-sensitive (the text is not lower-cased here).
    The remaining tokens are re-joined with single spaces, so the original
    spacing is not preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token not in english_stopwords:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def decode_special_chars(text):
    """Replace each `-UPPERCASE-` token (e.g. `-LRB-`, `-RRB-`) with a single space."""
    token_pattern = re.compile(r'-[A-Z]+-')
    return token_pattern.sub(' ', text)

def remove_newline(text):
    """Return `text` with each newline character replaced by a space."""
    return text.replace('\n', ' ')

def remove_tabs(text):
    """Return `text` with every tab character deleted (nothing is inserted in its place)."""
    return text.replace('\t', '')

def remove_intext_tabs(text):
    """Replace tabs with a space, except tabs that directly follow a digit.

    The negative look-behind leaves `<digit><TAB>` sequences untouched.
    """
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def remove_special_tokens(text):
    """Delete each `-UPPERCASE-` token (e.g. `-LRB-`, `-RRB-`) without inserting anything."""
    token_pattern = re.compile(r'-[A-Z]+-')
    return token_pattern.sub('', text)

def remove_quotes(text):
    """Strip quote markers from `text` in two passes.

    Pass 1 deletes double backticks and the spaced pair `' '`.
    Pass 2 deletes two adjacent single quotes.
    The passes run sequentially, so a `''` that only appears after pass 1
    is still removed by pass 2.
    """
    first_pass = re.compile(r'(``|\' \')')
    second_pass = re.compile(r"''")
    without_openers = first_pass.sub('', text)
    return second_pass.sub('', without_openers)


def clean_text(df: pd.DataFrame, column: str):
    """Clean the strings in `df[column]` and return `df`.

    The column is rewritten in place on the frame that was passed in. The
    steps run in this order: remove `-UPPERCASE-` tokens, collapse repeated
    spaces, then strip quote markers.

    Args:
        df: Frame holding the text column.
        column: Name of the column to clean.

    Returns:
        The same `df` object, with `column` cleaned.
    """
    cleaning_steps = (remove_special_tokens, remove_extra_space, remove_quotes)
    for step in cleaning_steps:
        df[column] = df[column].apply(step)

    return df

In [4]:

def get_claim_df(cache_dir='/home/rahvk/data/tmp/cache/fever3', splits=('paper_test',)):
    """Load FEVER v1.0 claims into one de-duplicated DataFrame.

    For every requested split the evidence bookkeeping columns are dropped,
    the `claim` text is cleaned with `clean_text`, and `evidence_wiki_url`
    is renamed to `title`. NOT ENOUGH INFO rows are kept.

    Args:
        cache_dir: Directory passed to `load_dataset` as its cache location.
            The default is the original author's path; pass your own.
        splits: Names of the dataset splits to include, in order.

    Returns:
        pd.DataFrame: The concatenated, de-duplicated rows of all splits.
        The original row indices are kept, so the index has gaps after
        de-duplication. An empty frame is returned when `splits` is empty.
    """
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir=cache_dir)
    unused_columns = ['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id', 'id']

    # Collect one frame per split and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    frames = []
    for split in splits:
        claim_d = pd.DataFrame(claim_dataset[split]).drop(columns=unused_columns)
        claim_d = clean_text(df=claim_d, column="claim")
        frames.append(claim_d.rename(columns={'evidence_wiki_url': 'title'}))

    # Release the loaded dataset before building the combined frame.
    del claim_dataset

    if not frames:
        return pd.DataFrame()

    # A claim appears once per evidence annotation; identical rows are collapsed.
    return pd.concat(frames, axis=0).drop_duplicates()

In [5]:
# Build the claim table from the FEVER dataset and preview its first rows.
claim_df = get_claim_df()
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever3/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,label,claim,title
0,NOT ENOUGH INFO,Grease had bad reviews.,
1,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,Ukrainian_Soviet_Socialist_Republic
3,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,United_Nations
8,SUPPORTS,2 Hearts is a musical composition by Minogue.,2_Hearts_-LRB-Kylie_Minogue_song-RRB-
12,REFUTES,The New Jersey Turnpike has zero shoulders.,New_Jersey_Turnpike


In [6]:
# (rows, columns) of the de-duplicated claim table.
claim_df.shape

(11143, 3)

In [7]:
# De-duplication left gaps in the index (0, 1, 3, 8, 12, ...); renumber the rows
# 0..n-1 and discard the old index.
claim_df = claim_df.reset_index(drop=True)
claim_df.head()


Unnamed: 0,label,claim,title
0,NOT ENOUGH INFO,Grease had bad reviews.,
1,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,Ukrainian_Soviet_Socialist_Republic
2,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,United_Nations
3,SUPPORTS,2 Hearts is a musical composition by Minogue.,2_Hearts_-LRB-Kylie_Minogue_song-RRB-
4,REFUTES,The New Jersey Turnpike has zero shoulders.,New_Jersey_Turnpike


In [None]:
# claim_df.to_parquet('processed_fever/fever-with-noi.parquet')

In [8]:
import pandas as pd

def lookup_doc_ids(fever_df, parquet_files):
    """
    Look up doc_ids in the parquet files and attach them to the titles in fever_df.

    Args:
    - fever_df (pd.DataFrame): DataFrame with a 'title' column. It is modified
      in place: a 'doc_id' column is added (or overwritten).
    - parquet_files (list): Paths to parquet files that each contain 'title'
      and 'doc_id' columns.

    Returns:
    - pd.DataFrame: The same fever_df object with the 'doc_id' column set.
      doc_ids are stored as strings. Rows whose title is missing, or is not
      found in any file, get NaN. When a title occurs more than once, the
      last occurrence (later file, later row) wins.
    """
    # Missing titles are excluded so that rows without a title can never be
    # matched to a wiki row whose title is also missing.
    wanted_titles = fever_df['title'].dropna().unique()

    # title -> doc_id (as str), accumulated over all files
    doc_id_mapping = {}

    print("Looking up.")

    for parquet_file in parquet_files:
        # Only the two columns needed for the lookup are read.
        wiki_df = pd.read_parquet(parquet_file, columns=['title', 'doc_id'])

        matched = wiki_df[wiki_df['title'].isin(wanted_titles)]

        # Bulk update instead of a per-row iterrows() loop; later entries
        # overwrite earlier ones for the same title.
        doc_id_mapping.update(
            zip(matched['title'].tolist(), map(str, matched['doc_id'].tolist()))
        )

    # Titles absent from the mapping become NaN.
    fever_df['doc_id'] = fever_df['title'].map(doc_id_mapping)

    return fever_df


In [9]:
read_dir = "processed_wiki_2"  # Directory containing processed Parquet files
# Keep only *.parquet entries so stray files or folders in the directory
# (e.g. notebook checkpoints) are never handed to the parquet reader.
processed_files = [
    read_dir + "/" + name
    for name in sorted(os.listdir(read_dir))
    if name.endswith(".parquet")
]
print(processed_files)

['processed_wiki_2/0.parquet', 'processed_wiki_2/1.parquet', 'processed_wiki_2/2.parquet', 'processed_wiki_2/3.parquet', 'processed_wiki_2/4.parquet', 'processed_wiki_2/5.parquet', 'processed_wiki_2/6.parquet', 'processed_wiki_2/7.parquet', 'processed_wiki_2/8.parquet', 'processed_wiki_2/9.parquet']


In [18]:
# Oversample SUPPORTS/REFUTES claims: some evidence titles have no match in the
# processed wiki files, and those rows are discarded afterwards.
num_supports = 600
num_refutes = 600
num_noi = 334

min_good_rows = 666   # total rows with a doc_id required before accepting a draw
min_per_label = 333   # rows with a doc_id needed per label for the balanced draw that follows
max_attempts = 10     # upper bound so an unlucky run cannot loop forever
SAMPLE_SEED = 42

# One generator for the whole cell: its state advances on every draw, so each
# attempt samples different rows while the overall run stays reproducible.
sample_rng = np.random.RandomState(SAMPLE_SEED)

num_of_not_nan = 0

for attempt in range(max_attempts):

    # Preprocessing step
    supports_df = claim_df[claim_df['label'] == 'SUPPORTS'].sample(n=num_supports, random_state=sample_rng)
    refutes_df = claim_df[claim_df['label'] == 'REFUTES'].sample(n=num_refutes, random_state=sample_rng)
    result_df = pd.concat([supports_df, refutes_df], ignore_index=True)
    result_df = result_df.sample(frac=1, random_state=sample_rng).reset_index(drop=True)

    # Lookup step
    updated_fever_df = lookup_doc_ids(result_df, processed_files)

    # Rows whose title could not be resolved to a doc_id
    condition = updated_fever_df['doc_id'].isna()
    num_of_not_nan = num_supports + num_refutes - condition.sum()

    print(f"Have {num_of_not_nan} good rows.")

    # The total alone is not sufficient: each label must have enough resolved
    # rows, otherwise the per-label sampling downstream raises.
    good_per_label = updated_fever_df.loc[~condition, 'label'].value_counts()
    enough_per_label = all(
        good_per_label.get(label, 0) >= min_per_label
        for label in ('SUPPORTS', 'REFUTES')
    )

    if num_of_not_nan >= min_good_rows and enough_per_label:
        break
else:
    raise RuntimeError(
        f"No draw with at least {min_per_label} resolved rows per label "
        f"after {max_attempts} attempts."
    )


Looking up.
Have 726 good rows.


In [23]:
# Keep only rows whose title resolved to a doc_id. The mask is recomputed from
# the frame itself so this cell can be re-run safely (a mask built before the
# filtering would no longer align with the filtered index).
updated_fever_df = updated_fever_df[updated_fever_df['doc_id'].notna()]

In [24]:
# Preview the filtered frame: a negative n shows every row except the last five.
updated_fever_df.head(-5)

Unnamed: 0,label,claim,title,doc_id
0,REFUTES,Yugoslavia is a member of the Group of 15.,Group_of_15,2001095
1,REFUTES,Louie (season 1) was created by David Benioff.,Louie_-LRB-season_1-RRB-,2975992
2,REFUTES,Gaius Julius Caesar was only the mother of Gai...,Gaius_Julius_Caesar_-LRB-proconsul-RRB-,1799494
3,REFUTES,Key & Peele has won an Academy Award.,Key_&_Peele,2688749
4,SUPPORTS,The American Civil War lasted 4 years.,American_Civil_War,354016
...,...,...,...,...
1184,REFUTES,Floyd Mayweather Jr. is incapable of boxing.,Floyd_Mayweather_Jr.,1805361
1185,SUPPORTS,The Danish language is spoken in North America.,Greenland,2000629
1187,SUPPORTS,One Flew Over the Cuckoo's Nest won Best Actor...,Academy_Award_for_Best_Actor,304336
1188,SUPPORTS,Chile is a member of the Group of 15.,Group_of_15,2001095


In [25]:
# Draw the balanced SUPPORTS / REFUTES subsets from the rows that have a doc_id.
actual_num_supports = 333
actual_num_refutes = 333
SAMPLE_SEED = 42  # fixed seeds so the draw is reproducible

supports_df = updated_fever_df[updated_fever_df['label'] == 'SUPPORTS'].sample(
    n=actual_num_supports, random_state=SAMPLE_SEED
)
# Sized by actual_num_refutes (previously sized by the SUPPORTS count by mistake).
refutes_df = updated_fever_df[updated_fever_df['label'] == 'REFUTES'].sample(
    n=actual_num_refutes, random_state=SAMPLE_SEED + 1
)

In [29]:
# Add the NOT ENOUGH INFO claims (they have no title/doc_id, so those columns
# are NaN after the concat) and shuffle the combined set.
SAMPLE_SEED = 42  # fixed seeds so the saved dataset is reproducible

noi_df = claim_df[claim_df['label'] == 'NOT ENOUGH INFO'].sample(n=num_noi, random_state=SAMPLE_SEED + 2)
result_df = pd.concat([supports_df, refutes_df, noi_df], ignore_index=True)
final_fever_df = result_df.sample(frac=1, random_state=SAMPLE_SEED + 3).reset_index(drop=True)


In [30]:
# Display the shuffled final sample.
final_fever_df

Unnamed: 0,label,claim,title,doc_id
0,NOT ENOUGH INFO,Hyksos practiced cat burials.,,
1,NOT ENOUGH INFO,Key & Peele has won two Academy Awards.,,
2,REFUTES,The great white shark prefers to prey on humans.,Great_white_shark,2042512
3,NOT ENOUGH INFO,Personality is affected by the development of ...,,
4,REFUTES,Night of the Living Dead is a a series of work...,Night_of_the_Living_Dead,3489638
...,...,...,...,...
995,REFUTES,Uganda is in space.,Lake_Victoria,2743824
996,REFUTES,Cable television and the internet are two medi...,Fred_Seibert,1756684
997,REFUTES,Brian De Palma was born in 1912.,Brian_De_Palma,836316
998,SUPPORTS,The ABC islands are part of the Caribbean.,ABC_islands_-LRB-Lesser_Antilles-RRB-,220295


In [40]:
# CHECKS
# Tally the final sample per label; SUPPORTS / REFUTES rows only count as
# "proper" when they carry a doc_id.

print("Number of NOT ENOUGH INFO rows: ", final_fever_df['label'].eq('NOT ENOUGH INFO').sum())
print("Number of proper SUPPORTS rows: ", (final_fever_df['label'].eq('SUPPORTS') & final_fever_df['doc_id'].notna()).sum())
print("Number of proper REFUTES rows: ", (final_fever_df['label'].eq('REFUTES') & final_fever_df['doc_id'].notna()).sum())

Number of NOT ENOUGH INFO rows:  334
Number of proper SUPPORTS rows:  333
Number of proper REFUTES rows:  333


In [41]:
# Persist the final sample; create the target directory first so the write
# does not fail on a fresh checkout.
output_path = 'processed_fever/fever-1000-new.parquet'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_fever_df.to_parquet(output_path)