In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [2]:
# 'evidence_sentence_id', raw_text of the title

In [3]:
import torch

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [10]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Replace every character outside the 7-bit ASCII range with a space."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)


def remove_punctuation(text):
    """Replace each non-word character with one space.

    Letters, digits and underscores count as word characters and are kept.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Delete every digit character from `text` (nothing is put in its place)."""
    digit = re.compile(r'[\d]')
    return digit.sub('', text)


def to_lowercase(text):
    """Return `text` converted to lowercase."""
    lowered = text.lower()
    return lowered


def remove_extra_space(text):
    """Collapse each run of consecutive spaces into a single space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


def remove_url(text):
    """Replace anything starting with ``http`` up to the next whitespace with a space."""
    url = re.compile(r'http\S+')
    return url.sub(' ', text)


def remove_underline(text):
    """Turn every underscore in `text` into a space."""
    pieces = text.split('_')
    return ' '.join(pieces)


def remove_hyphen(text):
    """Turn every hyphen in `text` into a space."""
    pieces = text.split('-')
    return ' '.join(pieces)


def remove_leading_whitespace(text):
    """Strip whitespace from the start of `text`; trailing whitespace is kept."""
    stripped = text.lstrip()
    return stripped

def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are discarded, and the remaining tokens are re-joined with single
    spaces. Matching is case-sensitive, so callers should lowercase first.

    The stop-word set is built on the first call and cached on the function
    object: this function is applied once per row, and re-reading the corpus
    for every row is loop-invariant work.
    """
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

def decode_special_chars(text):
    """Replace hyphen-wrapped upper-case markers (e.g. ``-LRB-``) with a space."""
    marker = re.compile(r'-[A-Z]+-')
    return marker.sub(' ', text)

def remove_newline(text):
    """Replace each newline character with a space."""
    newline = re.compile('\n')
    return newline.sub(' ', text)

def remove_tabs(text):
    """Delete every tab character from `text` (nothing is put in its place)."""
    tab = re.compile('\t')
    return tab.sub('', text)

def remove_intext_tabs(text):
    """Replace tabs with a space, except tabs that directly follow a digit."""
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def remove_special_tokens(text):
    """Delete hyphen-wrapped upper-case markers (e.g. ``-LRB-``) from `text`."""
    marker = re.compile(r'-[A-Z]+-')
    return marker.sub('', text)

def remove_quotes(text):
    """Delete tokenizer-style quote marks from `text`.

    First removes double backticks and the sequence quote-space-quote, then
    removes any remaining pair of adjacent single quotes.
    """
    opening_or_spaced = re.compile(r'(``|\' \')')
    closing = re.compile(r"''")
    without_opening = opening_or_spaced.sub('', text)
    return closing.sub('', without_opening)


def clean_text(df: pd.DataFrame, column: str):
    """Normalize the strings in ``df[column]`` and return `df`.

    The column is overwritten in place on the frame that was passed in; the
    same frame object is returned for convenience.

    Steps, in order:
      1. remove_special_tokens - delete markers such as ``-LRB-``
      2. remove_quotes         - delete tokenizer-style quote marks
      3. remove_punctuation    - non-word characters become spaces
      4. remove_non_ascii      - non-ASCII characters become spaces
      5. to_lowercase
      6. remove_stopwords      - also re-joins tokens with single spaces
      7. remove_extra_space

    Args:
        df: Frame holding the text column; every value is expected to be a str.
        column: Name of the column to clean.

    Returns:
        The same `df`, with `column` replaced by its cleaned version.
    """
    # The token and quote patterns rely on '-', '`' and "'" still being present,
    # so they must run BEFORE punctuation is blanked out; run afterwards they
    # can never match and markers like "-LRB-" leak through as the word "lrb".
    # Tabs are non-word characters, so remove_punctuation already handles them.
    steps = (
        remove_special_tokens,
        remove_quotes,
        remove_punctuation,
        remove_non_ascii,
        to_lowercase,
        remove_stopwords,
        remove_extra_space,
    )

    def _clean(text):
        for step in steps:
            text = step(text)
        return text

    # One pass over the column instead of one pass per step.
    df[column] = df[column].apply(_clean)

    return df

In [11]:

def get_claim_df(cache_dir='/home/rahvk/data/tmp/cache/fever3', splits=('train',)):
    """Load FEVER v1.0 claims and return them as one cleaned DataFrame.

    For every requested split the 'id' column is dropped, the untouched claim
    is copied to 'raw_text', the 'claim' column is cleaned with `clean_text`,
    and the columns are renamed ('evidence_wiki_url' -> 'title',
    'claim' -> 'clean_text'). The per-split frames are then stacked and exact
    duplicate rows are removed (first occurrence kept).

    Args:
        cache_dir: Directory passed to `load_dataset` as its cache location.
            The default is the original author's machine-specific path; pass
            your own.
        splits: Names of the dataset splits to include, in order.

    Returns:
        pd.DataFrame with the original row index of each split preserved, or an
        empty frame when `splits` is empty.
    """
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir=cache_dir)

    # Collect the per-split frames and concatenate once, rather than growing a
    # DataFrame from an empty seed inside the loop.
    frames = []
    for split in splits:
        claim_d = pd.DataFrame(claim_dataset[split]).drop(columns=['id'])

        # Keep the untouched claim next to its cleaned version.
        claim_d['raw_text'] = claim_d['claim']
        claim_d = clean_text(df=claim_d, column="claim")
        frames.append(
            claim_d.rename(columns={'evidence_wiki_url': 'title', 'claim': 'clean_text'})
        )

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=0).drop_duplicates()

In [None]:
# Build the cleaned claim table and preview its first rows.
claim_df = get_claim_df()
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever3/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# KEEP ASIDE NOT ENOUGH INFO ROWS FOR LATER USE
# A boolean mask is used instead of dropping by index so the split stays correct
# even if the index contains repeated labels.
is_nei = claim_df['label'] == 'NOT ENOUGH INFO'
neo_rows = claim_df[is_nei]

# Keep only the remaining claims that point at a concrete evidence sentence
# (evidence_sentence_id == -1 marks rows without one).
claim_df = claim_df[~is_nei & (claim_df['evidence_sentence_id'] != -1)]

In [None]:
# (rows, columns) of claim_df at this point in the notebook.
claim_df.shape

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it as a column.
claim_df = claim_df.reset_index(drop=True)
claim_df.head()


In [None]:

def lookup_doc_ids(fever_df, parquet_files):
    """
    Lookup and map doc_ids from parquet files to titles in fever_df DataFrame.

    Each parquet file must provide 'title' and 'doc_id' columns. When the same
    title occurs more than once (within a file or across files), the last
    occurrence in reading order wins. Titles that are never found get NaN.

    Args:
    - fever_df (pd.DataFrame): DataFrame containing a 'title' column. It is
      modified in place: a 'doc_id' column (string ids) is added or overwritten.
    - parquet_files (list): List of paths to parquet files, read in the given order.

    Returns:
    - pd.DataFrame: The same fever_df object, with the 'doc_id' column added.
    """
    # Load the titles from fever_df into a set for faster lookup
    fever_titles = set(fever_df['title'])

    # title -> doc_id (as str), accumulated over all files
    doc_id_mapping = {}

    for parquet_file in parquet_files:
        # Report the path actually being read; a running counter does not match
        # the file name once the list is sorted lexicographically.
        print(f"Looking up in {parquet_file}")

        # Only the two columns used below are needed from each file.
        wiki_df = pd.read_parquet(parquet_file, columns=['title', 'doc_id'])

        # Keep only rows whose title is referenced by fever_df
        matches = wiki_df[wiki_df['title'].isin(fever_titles)]

        # Vectorized replacement for a row-by-row iterrows() loop; dict.update
        # keeps the "last occurrence wins" behavior.
        doc_id_mapping.update(zip(matches['title'], matches['doc_id'].astype(str)))

    # Map doc_ids to titles in fever_df using the doc_id_mapping dictionary
    fever_df['doc_id'] = fever_df['title'].map(doc_id_mapping)

    return fever_df


In [None]:
# Directory containing processed Parquet files
read_dir = "wiki_docs_parquets"
# Paths are listed in lexicographic file-name order.
processed_files = [f"{read_dir}/{name}" for name in sorted(os.listdir(read_dir))]
print(processed_files)

In [None]:
# Match claim titles against the parquet files. lookup_doc_ids adds the 'doc_id'
# column to claim_df itself and returns that same frame.
updated_fever_df = lookup_doc_ids(claim_df, processed_files)

In [None]:
# Keep only the claims whose title was found in the parquet files (doc_id is NaN otherwise).
df = updated_fever_df.dropna(subset=['doc_id'])


In [None]:
# Inspect the claims that received a doc_id.
df

In [None]:
# Build "<doc_id>_<evidence_sentence_id>" keys. assign() returns a new frame, so the
# dropna() result is not written to in place (avoids pandas' SettingWithCopyWarning).
df = df.assign(joint_id=df['doc_id'].astype(str) + '_' + df['evidence_sentence_id'].astype(str))
df

In [None]:
import pandas as pd

# For every distinct claim (keyed by 'raw_text'), gather all of its 'joint_id' values into one list.
grouped = df.groupby('raw_text')['joint_id'].agg(list).reset_index()

# Keep the first row of each claim, attach that claim's list as 'joint_ids'
# (the row's own 'joint_id' column is left as it is), then renumber the rows.
df = (
    df.drop_duplicates(subset='raw_text')
      .merge(grouped.rename(columns={'joint_id': 'joint_ids'}), on='raw_text', how='left')
      .reset_index(drop=True)
)


In [None]:
# Inspect the frame after collapsing it to one row per claim.
df

In [None]:
# Spot-check a single claim by its exact raw text.
df.loc[df['raw_text'] == 'Jeff Goldblum starred in a film.']

In [None]:
# Drop the per-evidence columns; the 'joint_ids' list column is kept.
# Not re-runnable as-is: a second run raises KeyError because the columns are already gone.
df = df.drop(columns=['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id', 'joint_id', 'doc_id'])


In [None]:
# Inspect the final column set.
df

In [None]:
# Drop the evidence columns from the NOT ENOUGH INFO rows before sampling them.
neo_rows = neo_rows.drop(columns=['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id'])

# Fixed seed so the same 334 rows are drawn on every run.
neo_rows_random_sample = neo_rows.sample(n=334, random_state=42)

# These rows have no evidence, so each gets its own empty list. A comprehension is
# used because `[[]] * n` would make every row share a single list object.
neo_rows_random_sample = neo_rows_random_sample.assign(
    joint_ids=[[] for _ in range(len(neo_rows_random_sample))]
)

In [None]:
# Draw 333 claims per label; the fixed seed makes the subset reproducible across runs.
supports_rows = df[df['label'] == 'SUPPORTS']
supports_rows_random_sample = supports_rows.sample(n=333, random_state=42)

refutes_rows = df[df['label'] == 'REFUTES']
refutes_rows_random_sample = refutes_rows.sample(n=333, random_state=42)


In [None]:
# Stack the three per-label samples (SUPPORTS, then REFUTES, then NOT ENOUGH INFO).
# Each sample keeps its own index labels, so the combined index is not guaranteed to be unique.
concatenated_df = pd.concat([supports_rows_random_sample, refutes_rows_random_sample, neo_rows_random_sample])


In [38]:
# CHECKS
def perform_checks(df):
    """Print how many rows of `df` carry each of the three FEVER labels.

    Reads the 'label' column and prints one count line per label; returns None.
    """
    label_column = df['label']
    report_lines = (
        ("Number of NOT ENOUGH INFO rows: ", 'NOT ENOUGH INFO'),
        ("Number of proper SUPPORTS rows: ", 'SUPPORTS'),
        ("Number of proper REFUTES rows: ", 'REFUTES'),
    )
    for message, label in report_lines:
        print(message, (label_column == label).sum())

In [39]:
# Label counts for `df` (the full one-row-per-claim frame), not for the sampled concatenated_df.
perform_checks(df)

Number of NOT ENOUGH INFO rows:  0
Number of proper SUPPORTS rows:  36473
Number of proper REFUTES rows:  12503


In [22]:
# Persist the 1000-claim sample (333 SUPPORTS + 333 REFUTES + 334 NOT ENOUGH INFO)
# rather than the full `df`, so the files contain what their "-1000" names promise.
os.makedirs('processed_fever', exist_ok=True)

# The sample's index is a mix of labels from different frames; renumber it so the
# parquet file does not carry a stray index column.
fever_sample = concatenated_df.reset_index(drop=True)
fever_sample.to_parquet('processed_fever/fever-train-1000.parquet')
fever_sample.to_json('processed_fever/fever-train-1000.json', orient='records')