In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [3]:
# 'evidence_sentence_id', raw_text of the title

In [4]:
import torch
# Select the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays which device was chosen.
device

device(type='cuda')

In [5]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Replace each character outside the 7-bit ASCII range with one space."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)


def remove_punctuation(text):
    """Replace every non-word character (anything not matched by ``\\w``) with a space.

    Underscores are word characters and are therefore kept; whitespace
    characters are not, so tabs and newlines also become spaces.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Delete every digit character from ``text``."""
    stripped = re.sub(pattern=r'[\d]', repl='', string=text)
    return stripped


def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def remove_extra_space(text):
    """Collapse each run of consecutive space characters into a single space.

    Only the literal space character is affected; tabs and newlines are left
    as they are, and leading/trailing spaces are collapsed but not removed.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


def remove_url(text):
    """Replace every token that starts with ``http`` (up to the next whitespace) with a space."""
    url_like = re.compile(r'http\S+')
    return url_like.sub(' ', text)


def remove_underline(text):
    """Replace every underscore in ``text`` with a space."""
    pieces = text.split('_')
    return ' '.join(pieces)


def remove_hyphen(text):
    """Replace every hyphen in ``text`` with a space."""
    pieces = text.split('-')
    return ' '.join(pieces)


def remove_leading_whitespace(text):
    """Strip whitespace from the start of ``text``; the end is left untouched."""
    left_trimmed = text.lstrip()
    return left_trimmed

# Cache of stop-word sets keyed by language, filled on first use.
# stopwords.words() is asked for the list only once per language instead of
# once per call (this function is applied to every row of a DataFrame).
_STOP_WORDS_CACHE = {}


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are discarded, and the remaining tokens are re-joined with single
    spaces. Matching is case-sensitive, so callers should lowercase first.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; empty if every token was a stop word.
    """
    stop_words = _STOP_WORDS_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOP_WORDS_CACHE['english'] = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

def decode_special_chars(text):
    """Replace each ``-UPPERCASE-`` token (e.g. ``-LRB-``) with a single space."""
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub(' ', text)

def remove_newline(text):
    """Replace every newline character with a space."""
    flattened = re.sub(pattern='\n', repl=' ', string=text)
    return flattened

def remove_tabs(text):
    """Delete every tab character (no replacement is inserted)."""
    tab = re.compile('\t')
    return tab.sub('', text)

def remove_intext_tabs(text):
    """Replace tabs with a space, except tabs that directly follow a digit."""
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def remove_special_tokens(text):
    """Delete each ``-UPPERCASE-`` token (e.g. ``-LRB-``) without inserting a replacement."""
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub('', text)

def remove_quotes(text):
    """Delete quote markers: double backticks, ``' '`` and ``''``.

    Single apostrophes (as in ``it's``) are not affected.
    """
    without_opening = re.sub(r'(``|\' \')', '', text)
    without_closing = re.sub(r"''", '', without_opening)
    return without_closing


def clean_text(df: pd.DataFrame, column: str):
    """Normalise the text in ``df[column]`` and return ``df``.

    The column is overwritten on the DataFrame that was passed in; the same
    object is returned for convenience.

    Step order matters: the ``-UPPERCASE-`` tokens and the quote markers are
    recognised by their hyphens, backticks and apostrophes, so they must be
    removed *before* punctuation is stripped. (Stripping punctuation first
    turns those characters into spaces, after which neither step can match
    and tokens such as ``-LRB-`` survive as ordinary words.)

    Args:
        df: DataFrame holding the text column.
        column: Name of the column to clean; its values must be strings.

    Returns:
        ``df`` with ``column`` replaced by the cleaned text.
    """
    steps = (
        remove_special_tokens,   # needs the surrounding hyphens intact
        remove_quotes,           # needs backticks / apostrophes intact
        remove_punctuation,
        remove_non_ascii,
        to_lowercase,            # stop-word matching is case-sensitive
        remove_stopwords,
        remove_tabs,
        remove_extra_space,
    )

    cleaned = df[column]
    for step in steps:
        cleaned = cleaned.apply(step)

    # Assign once at the end so a failure part-way through leaves df untouched.
    df[column] = cleaned

    return df

In [9]:

def get_claim_df(cache_dir='/home/rahvk/data/tmp/cache/fever3', splits=('train',)):
    """
    Load FEVER v1.0 claims and return them as one cleaned, de-duplicated DataFrame.

    Args:
    - cache_dir (str): Directory passed to ``load_dataset`` as its cache. The
      default is the path this notebook was originally run with; pass your own.
    - splits (iterable of str): Dataset splits to include. Defaults to
      ``('train',)``.

    Returns:
    - pd.DataFrame: The records that survive the evidence filter, with the
      cleaned claim in 'clean_text', the untouched claim in 'raw_text' and
      'evidence_wiki_url' renamed to 'title'. Each split keeps its original
      row index, so the result's index is generally not contiguous.
    """
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir=cache_dir)

    frames = []
    for split in splits:
        claim_d = pd.DataFrame(claim_dataset[split]).drop(columns=['id'])

        # Keep only rows that point at an evidence sentence. The original
        # comment describes this as removing the NOT ENOUGH INFO rows; the
        # filter itself is on evidence_sentence_id, not on the label.
        # .copy() gives an independent frame so the column assignments below
        # do not write into a slice of the unfiltered data.
        claim_d = claim_d[claim_d['evidence_sentence_id'] != -1].copy()

        # Preserve the untouched claim before the 'claim' column is cleaned.
        claim_d['raw_text'] = claim_d['claim']
        claim_d = clean_text(df=claim_d, column="claim")
        claim_d = claim_d.rename(
            columns={'evidence_wiki_url': 'title', 'claim': 'clean_text'}
        )
        frames.append(claim_d)

    del claim_dataset

    if not frames:
        # No splits requested: same result as before (an empty DataFrame).
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0).drop_duplicates()

In [10]:
# Build the cleaned claim table (loads the dataset and runs the text cleaning).
claim_df = get_claim_df()
# Preview the first rows.
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever3/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...
2,SUPPORTS,roman atwood content creator,174271,187498,Roman_Atwood,1,Roman Atwood is a content creator.
3,SUPPORTS,roman atwood content creator,174271,187499,Roman_Atwood,3,Roman Atwood is a content creator.
4,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s..."
11,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...


In [11]:
# (rows, columns) of the claim table.
claim_df.shape

(168291, 7)

In [12]:
# The preview above shows gaps in the index (0, 2, 3, 4, 11, ...); renumber the
# rows 0..n-1 and discard the old index.
claim_df = claim_df.reset_index(drop=True)
claim_df.head()


Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...
1,SUPPORTS,roman atwood content creator,174271,187498,Roman_Atwood,1,Roman Atwood is a content creator.
2,SUPPORTS,roman atwood content creator,174271,187499,Roman_Atwood,3,Roman Atwood is a content creator.
3,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s..."
4,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...


In [18]:

def lookup_doc_ids(fever_df, parquet_files):
    """
    Lookup and map doc_ids from parquet files to titles in fever_df DataFrame.

    The 'doc_id' column is assigned on ``fever_df`` itself (the caller's object
    is modified) and that same object is returned.

    Args:
    - fever_df (pd.DataFrame): DataFrame containing a 'title' column.
    - parquet_files (list): List of paths to parquet files, each providing
      'title' and 'doc_id' columns.

    Returns:
    - pd.DataFrame: ``fever_df`` with a 'doc_id' column added. The value is the
      doc_id as a string, or NaN for titles found in none of the files. When a
      title occurs more than once, the last occurrence read wins.
    """
    # Load the titles from fever_df into a set for faster lookup
    fever_titles = set(fever_df['title'])

    # title -> doc_id (as str), accumulated over all files
    doc_id_mapping = {}

    for parquet_file in parquet_files:
        # Report the file actually being read rather than a running counter,
        # which is only right when the files happen to be named 0..n in order.
        print(f"Looking up in {os.path.basename(parquet_file)}")

        # Only these two columns are used, so skip loading the rest.
        df = pd.read_parquet(parquet_file, columns=['title', 'doc_id'])

        # Keep only rows whose title is one we are looking for
        filtered_df = df[df['title'].isin(fever_titles)]

        # Vectorised replacement for a row-by-row iterrows() loop.
        doc_id_mapping.update(
            zip(filtered_df['title'], filtered_df['doc_id'].astype(str))
        )

    # Map doc_ids to titles in fever_df using the doc_id_mapping dictionary
    fever_df['doc_id'] = fever_df['title'].map(doc_id_mapping)

    return fever_df


In [19]:
read_dir = "wiki_docs_parquets"  # Directory containing processed Parquet files
# Keep only *.parquet entries so that anything else living in read_dir (stray
# files, sub-folders) is not handed to pd.read_parquet later.
# NOTE: sorted() orders names as strings, so "10.parquet" sorts before
# "2.parquet"; that is harmless for the single-digit names used here.
processed_files = [
    read_dir + "/" + name
    for name in sorted(os.listdir(read_dir))
    if name.endswith(".parquet")
]
print(processed_files)

['wiki_docs_parquets/0.parquet', 'wiki_docs_parquets/1.parquet', 'wiki_docs_parquets/2.parquet', 'wiki_docs_parquets/3.parquet', 'wiki_docs_parquets/4.parquet', 'wiki_docs_parquets/5.parquet', 'wiki_docs_parquets/6.parquet', 'wiki_docs_parquets/7.parquet', 'wiki_docs_parquets/8.parquet', 'wiki_docs_parquets/9.parquet']


In [24]:
# lookup_doc_ids assigns the 'doc_id' column on the frame it is given and
# returns that same frame, so claim_df gains the column as well.
updated_fever_df = lookup_doc_ids(claim_df, processed_files)

Looking up in 0.parquet
Looking up in 1.parquet
Looking up in 2.parquet
Looking up in 3.parquet
Looking up in 4.parquet
Looking up in 5.parquet
Looking up in 6.parquet
Looking up in 7.parquet
Looking up in 8.parquet
Looking up in 9.parquet


In [55]:
# Keep only the claims whose evidence title was matched to a doc_id.
df = updated_fever_df[updated_fever_df['doc_id'].notna()]


In [56]:
# Display the rows that have a doc_id.
df

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...,3508521
3,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s...",2247952
4,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104
5,SUPPORTS,boston celtics play home games td garden,49159,58490,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104
8,SUPPORTS,cyndi lauper best new artist award 27th grammy...,56492,66697,Cyndi_Lauper,2,Cyndi Lauper won the Best New Artist award at ...,1251056
...,...,...,...,...,...,...,...,...
168276,SUPPORTS,jeff goldblum starred film,210069,217837,Igby_Goes_Down,1,Jeff Goldblum starred in a film.,2161172
168278,SUPPORTS,jeff goldblum starred film,210069,217838,Jeff_Goldblum,7,Jeff Goldblum starred in a film.,2447223
168285,REFUTES,led zeppelin released eponymous debut album 1960,91851,104659,Led_Zeppelin,6,Led Zeppelin released an eponymous debut album...,2908175
168286,SUPPORTS,stars american actress rooney mara,28520,34848,Her_-LRB-film-RRB-,3,Her stars American actress Rooney Mara.,2134332


In [57]:
import pandas as pd

# Collect, for every distinct claim text, the doc_id of each of its rows
# (repeated doc_ids are kept, one entry per row).
grouped = df.groupby('raw_text')['doc_id'].apply(list).reset_index()

# Keep the first row of every claim, then attach that claim's doc_id list as
# 'doc_ids'. De-duplicating before the merge yields the same rows, in the same
# order, as merging first and de-duplicating afterwards.
df = (
    df.drop_duplicates(subset='raw_text')
    .merge(grouped.rename(columns={'doc_id': 'doc_ids'}), on='raw_text', how='left')
    .reset_index(drop=True)
)


In [58]:
# Display the per-claim table, now with the 'doc_ids' list column.
df

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id,doc_ids
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...,3508521,[3508521]
1,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s...",2247952,"[2247952, 2247952, 2247952, 2247952, 2247952, ..."
2,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104,"[834104, 834104]"
3,SUPPORTS,cyndi lauper best new artist award 27th grammy...,56492,66697,Cyndi_Lauper,2,Cyndi Lauper won the Best New Artist award at ...,1251056,[1251056]
4,SUPPORTS,michael giacchino composed score doctor strange,208457,216486,Michael_Giacchino,1,Michael Giacchino composed the score for Docto...,3318853,[3318853]
...,...,...,...,...,...,...,...,...,...
48971,SUPPORTS,absolute beginners starred david bowie,268538,265101,Absolute_Beginners_-LRB-film-RRB-,1,Absolute Beginners starred David Bowie.,299631,[299631]
48972,REFUTES,neil young singer songwriter,330685,317289,Neil_Young,7,Neil Young is not a singer-songwriter.,3471431,[3471431]
48973,REFUTES,led zeppelin released eponymous debut album 1960,91851,104659,Led_Zeppelin,6,Led Zeppelin released an eponymous debut album...,2908175,[2908175]
48974,SUPPORTS,stars american actress rooney mara,28520,34848,Her_-LRB-film-RRB-,3,Her stars American actress Rooney Mara.,2134332,[2134332]


In [59]:
# Spot check: a claim whose evidence rows point at several different pages.
df.loc[df['raw_text'].eq('Jeff Goldblum starred in a film.')]

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id,doc_ids
35651,SUPPORTS,jeff goldblum starred film,211149,218805,Jeff_Goldblum,3,Jeff Goldblum starred in a film.,2447223,"[2447223, 2447223, 274381, 2447223, 2447223, 2..."


In [60]:
# Drop the per-evidence identifiers and the single doc_id; 'doc_ids' stays.
df = df.drop(
    ['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id', 'doc_id'],
    axis=1,
)


In [61]:
# Display the table after the identifier columns were dropped.
df

Unnamed: 0,label,clean_text,title,raw_text,doc_ids
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau worked with the Fox Broa...,[3508521]
1,SUPPORTS,history art includes architecture dance sculpt...,History_of_art,"History of art includes architecture, dance, s...","[2247952, 2247952, 2247952, 2247952, 2247952, ..."
2,SUPPORTS,boston celtics play home games td garden,Boston_Celtics,The Boston Celtics play their home games at TD...,"[834104, 834104]"
3,SUPPORTS,cyndi lauper best new artist award 27th grammy...,Cyndi_Lauper,Cyndi Lauper won the Best New Artist award at ...,[1251056]
4,SUPPORTS,michael giacchino composed score doctor strange,Michael_Giacchino,Michael Giacchino composed the score for Docto...,[3318853]
...,...,...,...,...,...
48971,SUPPORTS,absolute beginners starred david bowie,Absolute_Beginners_-LRB-film-RRB-,Absolute Beginners starred David Bowie.,[299631]
48972,REFUTES,neil young singer songwriter,Neil_Young,Neil Young is not a singer-songwriter.,[3471431]
48973,REFUTES,led zeppelin released eponymous debut album 1960,Led_Zeppelin,Led Zeppelin released an eponymous debut album...,[2908175]
48974,SUPPORTS,stars american actress rooney mara,Her_-LRB-film-RRB-,Her stars American actress Rooney Mara.,[2134332]


In [12]:
nan_present = True  # loop flag; never cleared, the loop is left via `break`
num_supports = 600
num_refutes = 600
num_noi = 334
# Rows with a resolved doc_id required for EACH of SUPPORTS and REFUTES before
# a draw is accepted. Checking only the total is not enough: a later step draws
# this many rows per label, and a skewed draw could pass a total-only check and
# still leave one label short.
min_good_per_label = 333

while nan_present:

    # Preprocessing step: draw a fresh random sample of each label and shuffle
    supports_df = claim_df[claim_df['label'] == 'SUPPORTS'].sample(n=num_supports)
    refutes_df = claim_df[claim_df['label'] == 'REFUTES'].sample(n=num_refutes)
    result_df = pd.concat([supports_df, refutes_df], ignore_index=True)
    result_df = result_df.sample(frac=1).reset_index(drop=True)

    # Lookup step (reads every parquet file again on each attempt)
    updated_fever_df = lookup_doc_ids(result_df, processed_files)

    # Rows whose title could not be resolved to a doc_id
    condition = updated_fever_df['doc_id'].isna()
    num_of_not_nan = num_supports + num_refutes - condition.sum()

    print(f"Have {num_of_not_nan} good rows.")

    good_supports = (~condition & (updated_fever_df['label'] == 'SUPPORTS')).sum()
    good_refutes = (~condition & (updated_fever_df['label'] == 'REFUTES')).sum()

    if good_supports >= min_good_per_label and good_refutes >= min_good_per_label:
        break


Looking up.
Have 731 good rows.


In [13]:
# Discard the rows flagged in `condition` (those without a doc_id).
updated_fever_df = updated_fever_df.loc[~condition]

In [14]:
# With a negative n, head() returns every row except the last |n|, so this
# displays all rows but the final five.
updated_fever_df.head(-5)

Unnamed: 0,label,clean_text,title,raw_text,doc_id
3,REFUTES,anne boleyn mentioned children stories,Anne_Boleyn,Anne Boleyn is mentioned in all children stories.,501768
4,REFUTES,henry iii france succeeded henry iv founder ho...,Henry_III_of_France,"Henry III of France was succeeded by Henry IV,...",2142984
7,REFUTES,malta lacks archdiocese,Malta,Malta lacks an archdiocese.,3065790
9,SUPPORTS,video game called team fortress 2,Multiplayer_video_game,There is a video game called Team Fortress 2.,3410034
10,SUPPORTS,phoenix arizona southwestern united states,Arizona,"Phoenix, Arizona is in the southwestern United...",519803
...,...,...,...,...,...
1183,SUPPORTS,agent raghav crime branch nominated best weeke...,Agent_Raghav_–_Crime_Branch,Agent Raghav – Crime Branch was nominated for ...,283303
1184,REFUTES,boxing helena debuted cannes,Boxing_Helena,Boxing Helena debuted at Cannes.,846013
1185,REFUTES,keith godchaux still member grateful dead,Grateful_Dead,Keith Godchaux is still a member of the Gratef...,2026543
1186,SUPPORTS,commodore naval rank america,Commodore_-LRB-rank-RRB-,Commodore is a naval rank in America.,1142655


In [15]:
# Number of rows to keep per label in the final balanced sample.
actual_num_supports = 333
actual_num_refutes = 333

supports_df = updated_fever_df[updated_fever_df['label'] == 'SUPPORTS'].sample(n=actual_num_supports)
# Use the REFUTES count here; the SUPPORTS count was used before, which only
# gave the right size because both constants are currently equal.
refutes_df = updated_fever_df[updated_fever_df['label'] == 'REFUTES'].sample(n=actual_num_refutes)

In [16]:
# Draw the NOT ENOUGH INFO rows straight from claim_df (no doc_id lookup).
# NOTE(review): this needs claim_df to still contain NOT ENOUGH INFO rows;
# get_claim_df as shown in this notebook filters on evidence_sentence_id != -1,
# which its own comment describes as removing exactly those rows -- confirm
# which version of claim_df this cell is meant to run against.
noi_df = claim_df[claim_df['label'] == 'NOT ENOUGH INFO'].sample(n=num_noi)
# Stack the three label groups, then shuffle the rows and renumber the index.
result_df = pd.concat([supports_df, refutes_df, noi_df], ignore_index=True)
final_fever_df = result_df.sample(frac=1).reset_index(drop=True)


In [17]:
# Display the shuffled three-label sample.
final_fever_df

Unnamed: 0,label,clean_text,title,raw_text,doc_id
0,SUPPORTS,brian de palma writer,Brian_De_Palma,Brian De Palma is a writer.,836316
1,NOT ENOUGH INFO,hermit crabs belong superfamily incredibles,,Hermit crabs belong to the superfamily the Inc...,
2,SUPPORTS,elizabeth olsen born,Elizabeth_Olsen,Elizabeth Olsen was born.,1569863
3,NOT ENOUGH INFO,clueless film followed series films,,Clueless (film) was followed by a series of fi...,
4,REFUTES,guthrie theater first building stopped operati...,Guthrie_Theater,The Guthrie Theater's first building stopped o...,1952760
...,...,...,...,...,...
995,SUPPORTS,dissociative identity disorder may result disr...,Dissociative_identity_disorder,"Dissociative identity disorder, or DID, may re...",1438283
996,REFUTES,henry cavill famous actor,Henry_Cavill,Henry Cavill is not a famous actor.,2132742
997,NOT ENOUGH INFO,north african champions africa cup nations,,There have been no North African champions at ...,
998,NOT ENOUGH INFO,chaperone film directed stephen herek,,The Chaperone (film) was directed by Stephen H...,


In [68]:
# CHECKS
def perform_checks(df):
    """Print, for each of three label values, how many rows of ``df`` carry it.

    Reads the 'label' column and reports the counts for 'NOT ENOUGH INFO',
    'SUPPORTS' and 'REFUTES', in that order. Returns nothing.
    """
    labels = df['label']
    report = (
        ("Number of NOT ENOUGH INFO rows: ", 'NOT ENOUGH INFO'),
        ("Number of proper SUPPORTS rows: ", 'SUPPORTS'),
        ("Number of proper REFUTES rows: ", 'REFUTES'),
    )
    for message, label in report:
        print(message, (labels == label).sum())

In [69]:
# Report the label counts of the per-claim table that is exported below.
perform_checks(df)

Number of NOT ENOUGH INFO rows:  0
Number of proper SUPPORTS rows:  36473
Number of proper REFUTES rows:  12503


In [70]:
# Make sure the output folder exists so the export also works on a fresh
# checkout (neither writer is relied on here to create the directory).
os.makedirs('processed_fever', exist_ok=True)

# Write the per-claim table in both formats.
df.to_parquet('processed_fever/fever-train-split.parquet')
df.to_json('processed_fever/fever-train-split.json', orient='records')