In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [1]:
import torch
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [3]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Replace every character outside the 7-bit ASCII range with one space."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)


def remove_punctuation(text):
    """Replace each non-word character with a space.

    Anything that is not matched by ``\\w`` is replaced, which includes
    whitespace as well as punctuation; underscores count as word characters
    and are kept.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Delete every digit character from ``text`` (nothing is inserted in its place)."""
    digit = re.compile(r'[\d]')
    return digit.sub('', text)


def to_lowercase(text):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered


def remove_extra_space(text):
    """Collapse each run of one or more space characters into a single space.

    Only the literal space character is affected; tabs and newlines are left
    alone, and leading/trailing spaces are reduced to one space, not removed.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)


def remove_url(text):
    """Replace every ``http...`` token (up to the next whitespace) with one space."""
    url = re.compile(r'http\S+')
    return url.sub(' ', text)


def remove_underline(text):
    """Turn every underscore in ``text`` into a space."""
    pieces = text.split('_')
    return ' '.join(pieces)


def remove_hyphen(text):
    """Turn every hyphen in ``text`` into a space."""
    hyphen_to_space = str.maketrans('-', ' ')
    return text.translate(hyphen_to_space)


def remove_leading_whitespace(text):
    """Strip whitespace from the start of ``text``; the end is left untouched."""
    stripped = text.lstrip()
    return stripped

def remove_stopwords(text):
    """Drop English stop words from ``text`` and re-join the rest with single spaces.

    The text is split on whitespace, so any original spacing is normalised to
    single spaces in the result. Matching against the stop-word list is exact
    (case-sensitive).

    The stop-word set is loaded from the NLTK corpus on the first call only and
    then cached on the function object; rebuilding it on every call is wasted
    work when this function is applied row by row to a DataFrame column.
    """
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

def decode_special_chars(text):
    """Replace ``-XYZ-`` style tokens (e.g. ``-LRB-``, ``-RRB-``) with a space.

    Despite the name, the tokens are not translated back to the characters
    they stand for; each hyphen-delimited run of capital letters is simply
    swapped for one space.
    """
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub(' ', text)

def remove_newline(text):
    """Replace each newline character with a space."""
    newline = re.compile('\n')
    return newline.sub(' ', text)

def remove_tabs(text):
    """Delete every tab character (nothing is inserted in its place)."""
    tab = re.compile('\t')
    return tab.sub('', text)

def remove_intext_tabs(text):
    """Replace tabs with a space, except tabs that directly follow a digit.

    A tab immediately preceded by a digit is left in place.
    """
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def remove_special_tokens(text):
    """Delete ``-XYZ-`` style tokens (e.g. ``-LRB-``, ``-RRB-``) from ``text``.

    Each hyphen-delimited run of capital letters is removed outright; no
    replacement character is inserted.
    """
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub('', text)

def remove_quotes(text):
    """Strip tokenised quotation marks from ``text``.

    Runs two passes: the first deletes double backticks and the spaced pair
    ``' '``; the second deletes two adjacent single quotes. The passes are
    kept separate because the first can create a new ``''`` for the second
    to remove.
    """
    first_pass = re.compile(r'(``|\' \')')
    second_pass = re.compile(r"''")
    partially_cleaned = first_pass.sub('', text)
    return second_pass.sub('', partially_cleaned)


def clean_text(df: pd.DataFrame, column: str):
    """Clean the text in ``df[column]`` and return ``df``.

    Steps, applied to every value of the column in this order:

    1. ``remove_special_tokens`` - delete ``-LRB-`` / ``-RRB-`` style tokens.
    2. ``remove_extra_space``    - collapse the gaps those tokens leave behind.
       This has to run before quote removal, because the ``' '`` quote pattern
       only matches when exactly one space separates the two quote marks.
    3. ``remove_quotes``         - delete tokenised quotation marks.
    4. ``remove_extra_space``    - collapse the double spaces left where quote
       marks were removed (previously these stayed in the output).

    Args:
        df: frame holding the text column. It is modified in place.
        column: name of the column to clean; its values must be strings.

    Returns:
        The same ``df`` object, with ``column`` replaced by the cleaned text.
    """
    cleaning_steps = (
        remove_special_tokens,
        remove_extra_space,
        remove_quotes,
        remove_extra_space,
    )
    for step in cleaning_steps:
        df[column] = df[column].apply(step)

    return df

In [4]:

def get_claim_df(cache_dir='/home/rahvk/data/tmp/cache/fever3',
                 splits=('paper_test',),
                 drop_nei=True):
    """Load FEVER v1.0 claims and return them as one cleaned, de-duplicated DataFrame.

    For every requested split the evidence bookkeeping columns are dropped,
    rows labelled "NOT ENOUGH INFO" are optionally removed, the claim text is
    cleaned with ``clean_text`` and ``evidence_wiki_url`` is renamed to
    ``title``. The per-split frames are then concatenated and exact duplicate
    rows are dropped.

    Args:
        cache_dir: directory handed to ``load_dataset`` as its cache location.
            The default is the original author's machine-specific path;
            pass your own.
        splits: names of the dataset splits to load.
        drop_nei: when True (the default), rows whose label is
            "NOT ENOUGH INFO" are removed. Pass False to keep them.

    Returns:
        A DataFrame with the columns that remain after the drop/rename. The
        row index is the one inherited from the source splits (it is not
        reset), so it may contain gaps. An empty DataFrame is returned when
        ``splits`` is empty.
    """
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir=cache_dir)

    # Collect the per-split frames and concatenate once at the end instead of
    # growing a frame inside the loop (which also avoids concatenating with an
    # empty placeholder frame).
    split_frames = []
    for split in splits:
        claim_d = pd.DataFrame(claim_dataset[split])
        claim_d = claim_d.drop(
            columns=['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id']
        )

        if drop_nei:
            # Remove rows with label NOT ENOUGH INFO
            claim_d = claim_d[claim_d['label'] != "NOT ENOUGH INFO"]

        # clean_text assigns into the frame it receives; give it an independent
        # copy so it never writes into a filtered view of another frame.
        claim_d = clean_text(df=claim_d.copy(), column="claim")
        claim_d = claim_d.rename(columns={'evidence_wiki_url': 'title'})
        split_frames.append(claim_d)

    if not split_frames:
        return pd.DataFrame()

    return pd.concat(split_frames, axis=0).drop_duplicates()

In [5]:
# Build the claim table with get_claim_df() (by default: the 'paper_test' split with
# NOT ENOUGH INFO rows removed) and preview the first rows.
claim_df = get_claim_df()
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever3/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,id,label,claim,title
1,163803,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,Ukrainian_Soviet_Socialist_Republic
3,163803,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,United_Nations
8,70041,SUPPORTS,2 Hearts is a musical composition by Minogue.,2_Hearts_-LRB-Kylie_Minogue_song-RRB-
12,202314,REFUTES,The New Jersey Turnpike has zero shoulders.,New_Jersey_Turnpike
18,6032,REFUTES,Aruba is the only ABC Island.,ABC_islands_-LRB-Lesser_Antilles-RRB-


In [6]:
# Sanity check: (rows, columns) of the claim table.
claim_df.shape

(7937, 4)

In [7]:
# The index still holds the row labels left over from filtering/de-duplication
# (1, 3, 8, ... in the preview above); replace it with a fresh 0..n-1 range.
claim_df = claim_df.reset_index(drop=True)
claim_df.head()


Unnamed: 0,id,label,claim,title
0,163803,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,Ukrainian_Soviet_Socialist_Republic
1,163803,SUPPORTS,Ukrainian Soviet Socialist Republic was a foun...,United_Nations
2,70041,SUPPORTS,2 Hearts is a musical composition by Minogue.,2_Hearts_-LRB-Kylie_Minogue_song-RRB-
3,202314,REFUTES,The New Jersey Turnpike has zero shoulders.,New_Jersey_Turnpike
4,6032,REFUTES,Aruba is the only ABC Island.,ABC_islands_-LRB-Lesser_Antilles-RRB-


In [8]:
# Create the output folder first: the write fails if 'processed_fever/' does not
# exist yet (e.g. on a fresh checkout).
os.makedirs('processed_fever', exist_ok=True)
claim_df.to_parquet('processed_fever/fever-without-noi.parquet')

In [9]:

def _sample_label(df, label, n, seed=42):
    """Return ``n`` reproducibly sampled rows of ``df`` whose 'label' equals ``label``.

    Raises:
        ValueError: if ``df`` holds fewer than ``n`` rows with that label. The
            message names the label and the number of rows actually available,
            instead of the generic error ``DataFrame.sample`` would raise.
    """
    label_rows = df[df['label'] == label]
    if len(label_rows) < n:
        raise ValueError(
            f"Cannot sample {n} rows with label {label!r}: "
            f"only {len(label_rows)} such rows are available. "
            "Make sure the claim table was built with these rows retained."
        )
    return label_rows.sample(n=n, random_state=seed)


# Randomly select 333 rows where label = 'SUPPORTS'
supports_df = _sample_label(claim_df, 'SUPPORTS', 333)

# Randomly select 333 rows where label = 'REFUTES'
refutes_df = _sample_label(claim_df, 'REFUTES', 333)

# Randomly select 334 rows where label = 'NOT ENOUGH INFO'
# NOTE: get_claim_df() filters NOT ENOUGH INFO rows out by default, so this
# selection only succeeds when claim_df was built with those rows kept.
noi_df = _sample_label(claim_df, 'NOT ENOUGH INFO', 334)

# Concatenate the DataFrames
result_df = pd.concat([supports_df, refutes_df, noi_df], ignore_index=True)

# Shuffle the resulting DataFrame
result_df = result_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [10]:
# result_df was already re-indexed right after shuffling, so this reset leaves it
# unchanged; the cell mainly serves as a preview of the sampled claims.
result_df = result_df.reset_index(drop=True)
result_df.head()


Unnamed: 0,id,label,claim,title
0,109876,REFUTES,Little Dorrit was only published in the 1980s.,Little_Dorrit
1,137384,REFUTES,Johnny Van Zant is incapable of being a musician.,No_More_Dirty_Deals
2,131309,REFUTES,Henry Cavill is a famous director.,Henry_Cavill
3,213437,REFUTES,The Originals (TV series) began airing on The ...,The_Originals_-LRB-TV_series-RRB-
4,79366,SUPPORTS,Temple Mount is related to an Abrahamic religion.,Bible


In [11]:
# Sanity check: (rows, columns) of the balanced sample.
result_df.shape

(1000, 4)

In [12]:
# Create the output folder first: the write fails if 'processed_fever/' does not
# exist yet (e.g. on a fresh checkout).
os.makedirs('processed_fever', exist_ok=True)
result_df.to_parquet('processed_fever/fever-1000.parquet')