In [None]:
! pip install simpletransformers
! pip install tensorboardX
! pip install Unidecode
! pip install nltk

In [None]:
# import nltk
# nltk.download("stopwords")


In [1]:
import torch
torch.cuda.is_available()

True

In [2]:

from transformers import DPRContextEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoder, DPRQuestionEncoderTokenizer
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset


In [3]:
import cudf

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
def get_wiki_paraquet_files():
    
    ids = [str(i) for i in range(10)]
    base_url = "https://huggingface.co/api/datasets/fever/parquet/wiki_pages/wikipedia_pages/"
    
    for index in ids:
        data_files = {"wikipedia_pages": base_url + f"{index}.parquet"}
        wiki = load_dataset("parquet", data_files=data_files, split="wikipedia_pages", cache_dir='/home/rahvk/data/tmp/cache')
        
        wiki.to_csv(f"wiki_pages_parquets/{index}_parquet_wiki.csv")
        
        del wiki
        
        print(f"completed downloading {index}")

In [None]:
# get_wiki_paraquet_files()

In [6]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]', ' ', text)


def remove_punctuation(text):
    return re.sub(r'[^\w]', ' ', text)

def remove_digits(text):
    return re.sub(r'[\d]', '', text)


def to_lowercase(text):
    return text.lower()


def remove_extra_space(text):
    return re.sub(' +', ' ', text)


def remove_url(text):
    return re.sub(r'http\S+', ' ', text)


def remove_underline(text):
    return text.replace('_', ' ')


def remove_hyphen(text):
    return text.replace('-', ' ')


def remove_leading_whitespace(text):
    return text.lstrip()

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def clean_text(df: pd.DataFrame, column: str):
    
    df[column] = df[column].apply(remove_non_ascii)
    df[column] = df[column].apply(unidecode)
    df[column] = df[column].apply(str.lower)
    df[column] = df[column].apply(remove_punctuation)
    df[column] = df[column].apply(remove_underline)
    df[column] = df[column].apply(remove_hyphen)
    df[column] = df[column].apply(remove_extra_space)
    df[column] = df[column].apply(str.strip)
    # df[column] = df[column].apply(remove_stopwords)
    return df # [column].tolist()

In [7]:

def process_single_file(file_index, uid_start):
    print(f"Started processing file - {file_index}")
    wiki_csv = pd.read_csv(f"wiki_pages_parquets/{file_index}_parquet_wiki.csv")

    # Remove "lines" column
    df_v0 = wiki_csv.drop(columns=['lines'])
    del wiki_csv
    # Remove NaN rows
    df_v0 = df_v0.dropna()

    # Clean Text
    df_v1 = clean_text(df=df_v0, column='text')
    del df_v0
    # Drop rows where `id` is NaN (or empty)
    df_v1['id'].replace('', np.nan, inplace=True)
    df_v2 = df_v1[df_v1['id'].notna()]
    df_v2.rename(columns={'id': 'title'}, inplace=True)

    del df_v1

    # Adjust index to create a unique identifier
    df_v2.reset_index(drop=True, inplace=True)
    df_v2.index += uid_start

    # Convert to cudf
    df_v2_gpu = cudf.DataFrame.from_pandas(df_v2)

    # Return processed DataFrame and last UID
    return df_v2_gpu, df_v2.index[-1] + 1


In [8]:
def process_parquet_files():
    ids = [str(i) for i in range(10)]  # Assuming 10 files
    uid_start = 0
    batch_size = 2  # Define your batch size based on memory constraints
    all_batches = []

    for i in range(0, len(ids), batch_size):
        current_batch = cudf.DataFrame()

        for index in ids[i:i + batch_size]:
            df_processed, uid_start = process_single_file(index, uid_start)
            current_batch = cudf.concat([current_batch, df_processed], axis=0)
            del df_processed
            torch.cuda.empty_cache()

        all_batches.append(current_batch)
        del current_batch
    
    print("Concatenating")
    # Concatenate all batches
    final_df = cudf.concat(all_batches, axis=0)
    del all_batches

    print("Processing complete")
    return final_df


In [9]:
wiki_df = process_parquet_files()


Started processing file - 0
Started processing file - 1
Started processing file - 2
Started processing file - 3
Started processing file - 4
Started processing file - 5
Started processing file - 6
Started processing file - 7
Started processing file - 8
Started processing file - 9
Concatenating
Processing complete


In [10]:
len(wiki_df)

3723220

In [11]:
# Reset the index to make 'id' a column
wiki_df.reset_index(inplace=True)
wiki_df.rename(columns={'level_0': 'uid'}, inplace=True)
wiki_df = wiki_df.to_pandas()

In [12]:
wiki_df.head()

Unnamed: 0,index,title,text
0,0,1928_in_association_football,the following are the football lrb soccer rrb ...
1,1,1986_NBA_Finals,the 1986 nba finals was the championship round...
2,2,1901_Villanova_Wildcats_football_team,the 1901 villanova wildcats football team repr...
3,3,1992_Northwestern_Wildcats_football_team,the 1992 northwestern wildcats team represente...
4,4,1897_Princeton_Tigers_football_team,the 1897 princeton tigers football team repres...


In [13]:
# CHECK IF index is unique
is_unique = wiki_df['index'].nunique() == len(wiki_df)
is_unique

True

In [14]:
is_unique = wiki_df['title'].nunique() == len(wiki_df)
is_unique

True

In [33]:
def get_claim_df():
    claim_dataset = load_dataset('fever', 'v1.0', cache_dir='/home/rahvk/data/tmp/cache')
    claim_df = pd.DataFrame()

    for split in ['train', 'labelled_dev', 'paper_dev', 'paper_test']:
        # Load train file
        claim_split = claim_dataset[split]
        claim_d = pd.DataFrame(claim_split)

        claim_d = claim_d.drop(columns=['evidence_annotation_id', 'evidence_id', 'evidence_sentence_id'])
        # Remove rows with label NOT ENOUGH INFO
        claim_d = claim_d[claim_d['label'] != "NOT ENOUGH INFO"]

        # Clean claim DataFrame
        claim_d = clean_text(df=claim_d, column="claim")
        claim_d.rename(columns={'evidence_wiki_url': 'title'}, inplace=True)
        claim_df = pd.concat([claim_df, claim_d], axis=0).drop_duplicates()

        del claim_split
        del claim_d

    del claim_dataset
    
    return claim_df

In [34]:
claim_df = get_claim_df()
claim_df.head()

Found cached dataset fever (/home/rahvk/data/tmp/cache/fever/v1.0/1.0.0/7f8936e0558704771b08c7ce9cc202071b29a0050603374507ba61d23c00a58e)


  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,id,label,claim,title
0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau
1,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Fox_Broadcasting_Company
2,150448,SUPPORTS,roman atwood is a content creator,Roman_Atwood
4,214861,SUPPORTS,history of art includes architecture dance scu...,History_of_art
5,156709,REFUTES,adrienne bailon is an accountant,Adrienne_Bailon


In [41]:
len(claim_df)

156101

In [44]:
claim_df = claim_df.reset_index(drop=True)
claim_df.head()
# print(len(claim_df))

Unnamed: 0,id,label,claim,title
0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau
1,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Fox_Broadcasting_Company
2,150448,SUPPORTS,roman atwood is a content creator,Roman_Atwood
3,214861,SUPPORTS,history of art includes architecture dance scu...,History_of_art
4,156709,REFUTES,adrienne bailon is an accountant,Adrienne_Bailon


In [45]:
claim_df['title'] = claim_df['title'].str.strip()
wiki_df['title'] = wiki_df['title'].str.strip()

In [77]:
# merged_df = pd.concat([claim_df, wiki_df], axis=1, join="inner")
merged_df = pd.merge(claim_df, wiki_df, on="title")
merged_df = merged_df.dropna(subset=['index'])
merged_df.rename(columns={'id': 'claim_id', 'title': 'wiki_title', 'index': 'wiki_index', 'text':'wiki_text'}, inplace=True)
merged_df = merged_df.drop(columns=['title_index'])

merged_df.head()

Unnamed: 0,claim_id,label,claim,wiki_title,wiki_index,wiki_text
0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
1,58718,REFUTES,nikolaj coster waldau was not in a danish thri...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
2,134655,SUPPORTS,nikolaj coster waldau worked with peter dinklage,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
3,86306,REFUTES,nikolaj coster waldau refused to ever work wit...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
4,149361,SUPPORTS,nikolaj coster waldau was in a film,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...


In [78]:
merged_df.head(-5)

Unnamed: 0,claim_id,label,claim,wiki_title,wiki_index,wiki_text
0,75397,SUPPORTS,nikolaj coster waldau worked with the fox broa...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
1,58718,REFUTES,nikolaj coster waldau was not in a danish thri...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
2,134655,SUPPORTS,nikolaj coster waldau worked with peter dinklage,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
3,86306,REFUTES,nikolaj coster waldau refused to ever work wit...,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
4,149361,SUPPORTS,nikolaj coster waldau was in a film,Nikolaj_Coster-Waldau,3508521,nikolaj coster waldau lrb lsb ne ola k sd ald ...
...,...,...,...,...,...,...
102741,29142,SUPPORTS,pakistan s semi industrialized economy is the ...,Economy_of_Pakistan,1481261,the economy of pakistan is the 25th largest in...
102742,76838,REFUTES,hollow man has one sequel called hollow man 2 ...,Hollow_Man_2,2220699,hollow man 2 is a 2006 american science fictio...
102743,77911,SUPPORTS,the fame was released in or before 2016,52nd_Annual_Grammy_Awards,270081,the 52nd annual grammy awards took place on ja...
102744,75621,REFUTES,south island is separated from north island by...,Cook_Strait,1257124,cook strait lrb te moana o raukawa rrb lies be...


In [81]:
merged_df.to_parquet('processed_df/merged_df.parquet')