In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [2]:
# 'evidence_sentence_id', raw_text of the title

In [3]:
import torch
# Use the GPU when CUDA is usable in this environment, otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [4]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Replace every character outside the 7-bit ASCII range with one space."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)


def remove_punctuation(text):
    """Replace each non-word character with a space.

    Anything the regex word class does not match is replaced, which includes
    whitespace such as tabs and newlines; underscores are word characters and
    are therefore kept.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Delete every digit character from ``text`` (nothing is inserted in its place)."""
    digit = re.compile(r'[\d]')
    return digit.sub('', text)


def to_lowercase(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered


def remove_extra_space(text):
    """Collapse each run of one or more space characters into a single space.

    Only the space character is matched; tabs and newlines are left alone.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


def remove_url(text):
    """Replace every ``http...`` token (up to the next whitespace) with one space."""
    url = re.compile(r'http\S+')
    return url.sub(' ', text)


def remove_underline(text):
    """Swap every underscore in ``text`` for a space."""
    pieces = text.split('_')
    return ' '.join(pieces)


def remove_hyphen(text):
    """Swap every hyphen in ``text`` for a space."""
    hyphen_to_space = str.maketrans('-', ' ')
    return text.translate(hyphen_to_space)


def remove_leading_whitespace(text):
    """Strip whitespace from the start of ``text``; trailing whitespace is kept."""
    stripped = text.lstrip()
    return stripped

# Lazily-built cache of the English stop-word set, shared by all calls.
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text):
    """Drop English stop words from ``text`` and re-join the rest with single spaces.

    The text is split on whitespace and each token is compared exactly
    (case-sensitively) against ``stopwords.words('english')``.

    The stop-word set is built on the first call and reused afterwards; this
    function is applied once per DataFrame row, so rebuilding the set from the
    corpus reader on every call is needless repeated work.

    Args:
        text (str): Text to filter.

    Returns:
        str: The remaining tokens joined by a single space.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in _ENGLISH_STOP_WORDS)

def decode_special_chars(text):
    """Replace hyphen-delimited upper-case tokens such as ``-LRB-`` with one space."""
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub(' ', text)

def remove_newline(text):
    """Replace every newline character in ``text`` with a space."""
    lines = text.split('\n')
    return ' '.join(lines)

def remove_tabs(text):
    """Delete every tab character from ``text`` (nothing is inserted in its place)."""
    without_tabs = text.replace('\t', '')
    return without_tabs

def remove_intext_tabs(text):
    """Replace tabs with a space, except tabs that directly follow a digit."""
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def remove_special_tokens(text):
    """Delete hyphen-delimited upper-case tokens such as ``-LRB-`` outright."""
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub('', text)

def remove_quotes(text):
    """Delete quote markers: double backticks, ``' '`` and doubled apostrophes.

    The backtick / spaced-apostrophe forms are removed first, then any
    remaining pair of adjacent apostrophes.
    """
    opening_or_spaced = re.compile(r'(``|\' \')')
    doubled_apostrophe = re.compile(r"''")
    return doubled_apostrophe.sub('', opening_or_spaced.sub('', text))


def clean_text(df: pd.DataFrame, column: str):
    """Normalise the text in ``df[column]`` in place and return ``df``.

    Steps, applied element-wise in this order:
      1. remove ``-XXX-`` style special tokens and quote markers,
      2. replace punctuation and non-ASCII characters with spaces,
      3. collapse repeated spaces, lowercase, drop English stop words,
      4. remove tabs and collapse repeated spaces once more.

    Special tokens and quote markers must be handled BEFORE punctuation is
    stripped: ``remove_punctuation`` turns hyphens, backticks and apostrophes
    into spaces, after which the token/quote patterns can no longer match
    (a token such as ``-LRB-`` would otherwise survive as the word "lrb").

    Args:
        df (pd.DataFrame): Frame holding the text column; modified in place.
        column (str): Name of the column to clean. Values must be strings.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``column`` cleaned.
    """
    steps = (
        remove_special_tokens,
        remove_quotes,
        remove_punctuation,
        remove_non_ascii,
        remove_extra_space,
        to_lowercase,
        remove_stopwords,
        remove_tabs,
        remove_extra_space,
    )
    for step in steps:
        df[column] = df[column].apply(step)

    return df

In [5]:

def get_claim_df(cache_dir='/home/rahvk/data/tmp/cache/fever3', splits=('train',)):
    """Load FEVER v1.0 claims that have an evidence sentence and clean their text.

    For each requested split the ``id`` column is dropped, rows whose
    ``evidence_sentence_id`` is -1 are removed, the original claim is kept as
    ``raw_text``, the ``claim`` column is cleaned with ``clean_text`` and
    renamed to ``clean_text``, and ``evidence_wiki_url`` is renamed to
    ``title``. Exact duplicate rows are dropped from the combined result.

    Args:
        cache_dir (str): Directory passed to ``load_dataset`` as its cache.
            Defaults to the path originally hard-coded here; change it to
            your own path.
        splits (iterable of str): Dataset splits to include.

    Returns:
        pd.DataFrame: Combined claims. The index is whatever survived the
        filtering and de-duplication (it is not reset).
    """
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir=cache_dir)

    frames = []
    for split in splits:
        claim_d = pd.DataFrame(claim_dataset[split]).drop(columns=['id'])

        # Rows with evidence_sentence_id == -1 have no evidence sentence (these
        # are the NOT ENOUGH INFO claims). Copy the filtered result so the column
        # assignments below act on an independent frame rather than on a slice
        # of the unfiltered one (avoids SettingWithCopyWarning).
        claim_d = claim_d[claim_d['evidence_sentence_id'] != -1].copy()

        # Keep the untouched claim next to the cleaned version.
        claim_d['raw_text'] = claim_d['claim']
        claim_d = clean_text(df=claim_d, column="claim")
        claim_d = claim_d.rename(columns={'evidence_wiki_url': 'title', 'claim': 'clean_text'})
        frames.append(claim_d)

    del claim_dataset

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0).drop_duplicates()

In [6]:
# Build the cleaned claims frame (loads the FEVER dataset through get_claim_df)
# and preview the first rows.
claim_df = get_claim_df()
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever3/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...
2,SUPPORTS,roman atwood content creator,174271,187498,Roman_Atwood,1,Roman Atwood is a content creator.
3,SUPPORTS,roman atwood content creator,174271,187499,Roman_Atwood,3,Roman Atwood is a content creator.
4,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s..."
11,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...


In [7]:
# Row/column count after filtering and de-duplication.
claim_df.shape

(168291, 7)

In [8]:
# Filtering and de-duplication left gaps in the index; replace it with a fresh
# 0..n-1 range and discard the old labels.
claim_df = claim_df.reset_index(drop=True)
claim_df.head()


Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...
1,SUPPORTS,roman atwood content creator,174271,187498,Roman_Atwood,1,Roman Atwood is a content creator.
2,SUPPORTS,roman atwood content creator,174271,187499,Roman_Atwood,3,Roman Atwood is a content creator.
3,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s..."
4,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...


In [9]:

def lookup_doc_ids(fever_df, parquet_files):
    """
    Lookup and map doc_ids from parquet files to titles in fever_df DataFrame.

    Each parquet file must provide 'title' and 'doc_id' columns. If a title
    occurs more than once (within a file or across files), the last occurrence
    in iteration order wins. Titles with no match get a missing value.

    Args:
    - fever_df (pd.DataFrame): DataFrame containing titles. It is modified in
      place: a 'doc_id' column is added or overwritten.
    - parquet_files (list): List of paths to parquet files.

    Returns:
    - pd.DataFrame: The same fever_df object with the 'doc_id' column (doc_ids as strings) added.
    """
    # Load the titles from fever_df into a set for faster lookup
    fever_titles = set(fever_df['title'])

    # title -> doc_id (as string)
    doc_id_mapping = {}

    for parquet_file in parquet_files:
        # Report the real file name rather than a positional counter, which is
        # only correct when files happen to be named after their position.
        print(f"Looking up in {os.path.basename(parquet_file)}")

        # Only the two columns used here are read from disk.
        wiki_df = pd.read_parquet(parquet_file, columns=['title', 'doc_id'])

        # Keep rows whose title is referenced by fever_df and record their ids
        # in one vectorised pass instead of iterating row by row.
        matched = wiki_df[wiki_df['title'].isin(fever_titles)]
        doc_id_mapping.update(zip(matched['title'], matched['doc_id'].astype(str)))

    # Map doc_ids to titles in fever_df using the doc_id_mapping dictionary
    fever_df['doc_id'] = fever_df['title'].map(doc_id_mapping)

    return fever_df


In [10]:
read_dir = "wiki_docs_parquets"  # Directory containing processed Parquet files
# Keep only *.parquet entries so stray files or folders in the directory
# (e.g. .ipynb_checkpoints) are never handed to pd.read_parquet.
# NOTE: sorted() orders names lexicographically, so "10.parquet" would come
# before "2.parquet" if more than ten files are present.
processed_files = [
    read_dir + "/" + name
    for name in sorted(os.listdir(read_dir))
    if name.endswith(".parquet")
]
print(processed_files)

['wiki_docs_parquets/0.parquet', 'wiki_docs_parquets/1.parquet', 'wiki_docs_parquets/2.parquet', 'wiki_docs_parquets/3.parquet', 'wiki_docs_parquets/4.parquet', 'wiki_docs_parquets/5.parquet', 'wiki_docs_parquets/6.parquet', 'wiki_docs_parquets/7.parquet', 'wiki_docs_parquets/8.parquet', 'wiki_docs_parquets/9.parquet']


In [11]:
# Attach a doc_id to every claim row by title. lookup_doc_ids adds the column to
# the frame it receives and returns that same frame, so updated_fever_df and
# claim_df refer to one object.
updated_fever_df = lookup_doc_ids(claim_df, processed_files)

Looking up in 0.parquet
Looking up in 1.parquet
Looking up in 2.parquet
Looking up in 3.parquet
Looking up in 4.parquet
Looking up in 5.parquet
Looking up in 6.parquet
Looking up in 7.parquet
Looking up in 8.parquet
Looking up in 9.parquet


In [12]:
# Keep only claims whose title was found in the wiki parquet files. The explicit
# copy makes df an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
df = updated_fever_df.dropna(subset=['doc_id']).copy()


In [13]:
# Inspect the claims that received a doc_id.
df

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...,3508521
3,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s...",2247952
4,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104
5,SUPPORTS,boston celtics play home games td garden,49159,58490,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104
8,SUPPORTS,cyndi lauper best new artist award 27th grammy...,56492,66697,Cyndi_Lauper,2,Cyndi Lauper won the Best New Artist award at ...,1251056
...,...,...,...,...,...,...,...,...
168276,SUPPORTS,jeff goldblum starred film,210069,217837,Igby_Goes_Down,1,Jeff Goldblum starred in a film.,2161172
168278,SUPPORTS,jeff goldblum starred film,210069,217838,Jeff_Goldblum,7,Jeff Goldblum starred in a film.,2447223
168285,REFUTES,led zeppelin released eponymous debut album 1960,91851,104659,Led_Zeppelin,6,Led Zeppelin released an eponymous debut album...,2908175
168286,SUPPORTS,stars american actress rooney mara,28520,34848,Her_-LRB-film-RRB-,3,Her stars American actress Rooney Mara.,2134332


In [14]:
# Build "<doc_id>_<evidence_sentence_id>" as the identifier of one evidence
# sentence. assign() returns a new frame instead of writing into df, which may be
# a slice of another frame (that write is what raises SettingWithCopyWarning).
df = df.assign(
    joint_id=df['doc_id'].astype(str) + '_' + df['evidence_sentence_id'].astype(str)
)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['joint_id'] = df['doc_id'] + '_' + df['evidence_sentence_id'].astype(str)


Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id,joint_id
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...,3508521,3508521_7
3,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s...",2247952,2247952_2
4,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104,834104_3
5,SUPPORTS,boston celtics play home games td garden,49159,58490,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104,834104_3
8,SUPPORTS,cyndi lauper best new artist award 27th grammy...,56492,66697,Cyndi_Lauper,2,Cyndi Lauper won the Best New Artist award at ...,1251056,1251056_2
...,...,...,...,...,...,...,...,...,...
168276,SUPPORTS,jeff goldblum starred film,210069,217837,Igby_Goes_Down,1,Jeff Goldblum starred in a film.,2161172,2161172_1
168278,SUPPORTS,jeff goldblum starred film,210069,217838,Jeff_Goldblum,7,Jeff Goldblum starred in a film.,2447223,2447223_7
168285,REFUTES,led zeppelin released eponymous debut album 1960,91851,104659,Led_Zeppelin,6,Led Zeppelin released an eponymous debut album...,2908175,2908175_6
168286,SUPPORTS,stars american actress rooney mara,28520,34848,Her_-LRB-film-RRB-,3,Her stars American actress Rooney Mara.,2134332,2134332_3


In [15]:
import pandas as pd

# Collect every joint_id recorded for a claim into one list per distinct raw_text.
grouped = df.groupby('raw_text')['joint_id'].apply(list).reset_index()

# Keep the first row of each claim and attach its evidence list as 'joint_ids'.
# De-duplicating before the merge yields the same rows as merging first, without
# attaching the list to rows that are dropped anyway; naming the list column up
# front avoids the joint_id_x / joint_id_y suffixes and the rename they required.
df = (
    df.drop_duplicates(subset='raw_text')
    .merge(grouped.rename(columns={'joint_id': 'joint_ids'}), on='raw_text', how='left')
    .reset_index(drop=True)
)


In [16]:
# Inspect the one-row-per-claim frame with its joint_ids lists.
df

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id,joint_id,joint_ids
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,92206,104971,Nikolaj_Coster-Waldau,7,Nikolaj Coster-Waldau worked with the Fox Broa...,3508521,3508521_7,[3508521_7]
1,SUPPORTS,history art includes architecture dance sculpt...,255136,254645,History_of_art,2,"History of art includes architecture, dance, s...",2247952,2247952_2,"[2247952_2, 2247952_2, 2247952_2, 2247952_2, 2..."
2,SUPPORTS,boston celtics play home games td garden,49158,58489,Boston_Celtics,3,The Boston Celtics play their home games at TD...,834104,834104_3,"[834104_3, 834104_3]"
3,SUPPORTS,cyndi lauper best new artist award 27th grammy...,56492,66697,Cyndi_Lauper,2,Cyndi Lauper won the Best New Artist award at ...,1251056,1251056_2,[1251056_2]
4,SUPPORTS,michael giacchino composed score doctor strange,208457,216486,Michael_Giacchino,1,Michael Giacchino composed the score for Docto...,3318853,3318853_1,[3318853_1]
...,...,...,...,...,...,...,...,...,...,...
48971,SUPPORTS,absolute beginners starred david bowie,268538,265101,Absolute_Beginners_-LRB-film-RRB-,1,Absolute Beginners starred David Bowie.,299631,299631_1,[299631_1]
48972,REFUTES,neil young singer songwriter,330685,317289,Neil_Young,7,Neil Young is not a singer-songwriter.,3471431,3471431_7,[3471431_7]
48973,REFUTES,led zeppelin released eponymous debut album 1960,91851,104659,Led_Zeppelin,6,Led Zeppelin released an eponymous debut album...,2908175,2908175_6,[2908175_6]
48974,SUPPORTS,stars american actress rooney mara,28520,34848,Her_-LRB-film-RRB-,3,Her stars American actress Rooney Mara.,2134332,2134332_3,[2134332_3]


In [17]:
# Spot-check a claim backed by several evidence sentences: it should appear once,
# with all of its evidence ids collected in joint_ids.
df[df['raw_text']=='Jeff Goldblum starred in a film.']

Unnamed: 0,label,clean_text,evidence_annotation_id,evidence_id,title,evidence_sentence_id,raw_text,doc_id,joint_id,joint_ids
35651,SUPPORTS,jeff goldblum starred film,211149,218805,Jeff_Goldblum,3,Jeff Goldblum starred in a film.,2447223,2447223_3,"[2447223_3, 2447223_6, 274381_4, 2447223_7, 24..."


In [18]:
# Drop the per-evidence columns; after de-duplication they describe only the
# first evidence row of each claim, and joint_ids already carries the full list.
df = df.drop(columns=['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id', 'joint_id', 'doc_id'])


In [19]:
# Inspect the final claim-level frame.
df

Unnamed: 0,label,clean_text,title,raw_text,joint_ids
0,SUPPORTS,nikolaj coster waldau worked fox broadcasting ...,Nikolaj_Coster-Waldau,Nikolaj Coster-Waldau worked with the Fox Broa...,[3508521_7]
1,SUPPORTS,history art includes architecture dance sculpt...,History_of_art,"History of art includes architecture, dance, s...","[2247952_2, 2247952_2, 2247952_2, 2247952_2, 2..."
2,SUPPORTS,boston celtics play home games td garden,Boston_Celtics,The Boston Celtics play their home games at TD...,"[834104_3, 834104_3]"
3,SUPPORTS,cyndi lauper best new artist award 27th grammy...,Cyndi_Lauper,Cyndi Lauper won the Best New Artist award at ...,[1251056_2]
4,SUPPORTS,michael giacchino composed score doctor strange,Michael_Giacchino,Michael Giacchino composed the score for Docto...,[3318853_1]
...,...,...,...,...,...
48971,SUPPORTS,absolute beginners starred david bowie,Absolute_Beginners_-LRB-film-RRB-,Absolute Beginners starred David Bowie.,[299631_1]
48972,REFUTES,neil young singer songwriter,Neil_Young,Neil Young is not a singer-songwriter.,[3471431_7]
48973,REFUTES,led zeppelin released eponymous debut album 1960,Led_Zeppelin,Led Zeppelin released an eponymous debut album...,[2908175_6]
48974,SUPPORTS,stars american actress rooney mara,Her_-LRB-film-RRB-,Her stars American actress Rooney Mara.,[2134332_3]


In [20]:
# CHECKS
def perform_checks(df):
    """Print how many rows of ``df`` carry each label value.

    Reads the 'label' column and prints one count line each for
    'NOT ENOUGH INFO', 'SUPPORTS' and 'REFUTES'. Returns nothing.
    """
    labels = df['label']
    report = (
        ("Number of NOT ENOUGH INFO rows: ", 'NOT ENOUGH INFO'),
        ("Number of proper SUPPORTS rows: ", 'SUPPORTS'),
        ("Number of proper REFUTES rows: ", 'REFUTES'),
    )
    for message, label in report:
        print(message, (labels == label).sum())

In [21]:
# Print the label counts of the final frame.
perform_checks(df)

Number of NOT ENOUGH INFO rows:  0
Number of proper SUPPORTS rows:  36473
Number of proper REFUTES rows:  12503


In [22]:
# Create the output directory if needed so the export does not fail at the very
# end of the run on a fresh checkout.
os.makedirs('processed_fever', exist_ok=True)
df.to_parquet('processed_fever/fever-train-split.parquet')
df.to_json('processed_fever/fever-train-split.json', orient='records')