In [None]:
! pip install simpletransformers
! pip install tensorboardX
! pip install Unidecode
! pip install nltk

In [1]:
import torch
# Bare expression: the notebook displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import cudf
import os


In [3]:
# Select the CUDA device when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays which device was selected.
device

device(type='cuda')

In [4]:
def get_wiki_paraquet_files(cache_dir='/home/rahvk/data/tmp/cache',
                            output_dir='wiki_pages_parquets',
                            num_files=10):
    """
    Download the FEVER Wikipedia-page Parquet shards and save each one as CSV.

    Shard ``i`` (for ``i`` in ``range(num_files)``) is loaded with
    ``load_dataset`` from the Hugging Face parquet endpoint and written to
    ``{output_dir}/{i}_parquet_wiki.csv``. One shard is held in memory at a time.

    @param cache_dir - cache directory handed to ``load_dataset``. The default
            is the machine-specific path from the original notebook; pass
            your own path when running elsewhere.
    @param output_dir - directory that receives the CSV files (created if missing).
    @param num_files - number of shards to fetch, starting from shard 0.
    ______
    Returns None.
    """
    base_url = "https://huggingface.co/api/datasets/fever/parquet/wiki_pages/wikipedia_pages/"
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    for index in map(str, range(num_files)):
        data_files = {"wikipedia_pages": base_url + f"{index}.parquet"}
        # The original stored the path in `cach_dir` but passed `cache_dir`,
        # which raised NameError on the first shard; both now use one name.
        wiki = load_dataset("parquet", data_files=data_files, split="wikipedia_pages", cache_dir=cache_dir)

        wiki.to_csv(f"{output_dir}/{index}_parquet_wiki.csv")

        # Release the shard before loading the next one.
        del wiki

        print(f"completed downloading {index}")

In [5]:
# get_wiki_paraquet_files()

In [6]:
import re
from nltk.corpus import stopwords
from unidecode import unidecode

# Clean text
def remove_non_ascii(text):
    """Replace every character outside the 7-bit ASCII range with one space."""
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)

def remove_punctuation(text):
    """
    Replace each non-word character with a space.

    Word characters (letters, digits and the underscore) are kept, so
    underscores survive this step while whitespace characters such as tabs
    are themselves turned into spaces.
    """
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', text)

def remove_digits(text):
    """Delete every digit character from the text (nothing is put in its place)."""
    digit = re.compile(r'[\d]')
    return digit.sub('', text)

def to_lowercase(text):
    """Return the text converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_extra_space(text):
    """Collapse each run of space characters into one space (tabs and newlines are untouched)."""
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

def remove_url(text):
    """Replace each token that starts with ``http`` (up to the next whitespace) with one space."""
    url_like = re.compile(r'http\S+')
    return url_like.sub(' ', text)

def remove_underline(text):
    """Turn every underscore into a space."""
    pieces = text.split('_')
    return ' '.join(pieces)

def remove_hyphen(text):
    """Turn every hyphen into a space."""
    pieces = text.split('-')
    return ' '.join(pieces)

def remove_leading_whitespace(text):
    """Strip whitespace from the start of the text; trailing whitespace is kept."""
    trimmed = text.lstrip()
    return trimmed

def remove_stopwords(text):
    """
    Drop English stop words from whitespace-separated text.

    The text is split on whitespace, tokens found in NLTK's English
    stop-word list are removed, and the rest are re-joined with single
    spaces. Matching is exact, so lowercase the text first if
    capitalised stop words should also be removed.

    The stop-word set is built on the first call and cached on the function
    object. This function is applied once per passage, and rebuilding the
    set from the corpus on every call was pure overhead.

    @param text - string to filter.
    ______
    Returns str: the remaining tokens joined by single spaces.
    """
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        # NOTE(review): presumably the NLTK 'stopwords' corpus must already be
        # downloaded for this lookup to succeed - confirm in the environment.
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

def decode_special_chars(text):
    """
    Replace dash-delimited upper-case tokens (e.g. ``-LRB-``) with one space.

    Any hyphen, run of capital letters, hyphen sequence matches, so the token
    is blanked out rather than decoded back to the character it stood for.
    """
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub(' ', text)

def remove_newline(text):
    """Replace each newline character with a space."""
    newline = re.compile('\n')
    return newline.sub(' ', text)

def remove_tabs(text):
    """Delete every tab character (the neighbouring text is joined directly)."""
    tab = re.compile('\t')
    return tab.sub('', text)

def remove_intext_tabs(text):
    """
    Replace tabs with a space unless the tab directly follows a digit.

    Tabs preceded by a digit are left in place by the negative look-behind.
    """
    tab_not_after_digit = re.compile(r'(?<!\d)\t')
    return tab_not_after_digit.sub(' ', text)

def split_wiki_lines(lines):
    """
    Separate the lines of a Wiki page on each "<digits><tab>" marker.

    Every run of digits directly followed by a tab is treated as a
    separator, wherever it occurs in the string. The first and the last
    piece produced by the split are discarded.
    @param lines - a single string holding the page's indexed lines.
    ______
    Returns list[str]: the pieces between the first and the last separator
            (empty when the split yields fewer than three pieces).
    """
    pieces = re.split(r'\d+\t', lines)
    # Drop the text before the first marker and the piece after the last one.
    return pieces[1:-1]

def remove_special_tokens(text):
    """
    Delete dash-delimited upper-case tokens such as ``-LRB-`` / ``-RRB-``.

    Any hyphen, run of capital letters, hyphen sequence is removed with
    nothing put in its place.
    """
    bracket_token = re.compile(r'-[A-Z]+-')
    return bracket_token.sub('', text)

def remove_quotes(text):
    """
    Strip quote markers from the text.

    Two passes run in order: the first deletes double backticks and the
    sequence quote-space-quote, the second deletes two adjacent single quotes.
    """
    for pattern in (r'(``|\' \')', r"''"):
        text = re.sub(pattern, '', text)
    return text

def remove_empty_lines(lines):
    """Return a new list without the entries that are exactly a newline character."""
    kept = []
    for entry in lines:
        if entry != '\n':
            kept.append(entry)
    return kept

def lines_to_dict(lines):
    """
    Parse newline-separated "<index><tab><content>" text into a dictionary.

    Whitespace-only lines are skipped. Everything before the first tab is
    the key; everything after it (further tabs included) is the value. When
    an index repeats, the later line wins. Entries whose final value is
    empty are left out of the result.
    @param lines - a single string holding one page's indexed lines.
    ______
    Returns dict[str, str]: index -> content for the non-empty entries.
    """
    parsed = {}
    for raw_line in lines.split('\n'):
        if not raw_line.strip():
            continue  # Ignore empty lines
        index, _, content = raw_line.partition('\t')
        parsed[index] = content

    # Filter only after all lines are read, so a later empty duplicate
    # still removes an earlier non-empty entry with the same index.
    return {index: content for index, content in parsed.items() if content}


def clean_text(df: pd.DataFrame, column: str):
    """
    Convert one column of raw indexed-line strings into dictionaries.

    Each value of ``df[column]`` is passed through ``lines_to_dict``. The
    column is overwritten on the frame that was passed in.
    @param df - DataFrame holding the column to convert (modified in place).
    @param column - name of the column to convert.
    ______
    Returns pd.DataFrame: the same ``df`` object.
    """
    parsed_column = df[column].apply(lines_to_dict)
    df[column] = parsed_column
    return df

def clean_lines(df: pd.DataFrame, column: str):
    """
    Normalise a text column by running the cleaning helpers in a fixed order.

    The steps are applied one after another to every value of
    ``df[column]``, and the column is overwritten on the frame passed in.
    @param df - DataFrame holding the text column (modified in place).
    @param column - name of the text column to clean.
    ______
    Returns pd.DataFrame: the same ``df`` object.
    """
    # Order matters: each step receives the output of the previous one.
    cleaning_steps = (
        remove_special_tokens,
        remove_punctuation,
        remove_non_ascii,
        to_lowercase,
        remove_stopwords,
        remove_tabs,
        remove_extra_space,
        remove_quotes,
    )
    for step in cleaning_steps:
        df[column] = df[column].apply(step)

    return df

In [7]:
def explode_df(df):
    """
    Expand each document row into one row per passage.

    The ``lines`` string of every row is parsed with ``lines_to_dict``. For
    each (index, content) pair, one output row is produced. It carries all
    other columns of the source row plus ``passage_id`` (the index) and
    ``passage_content`` (the content). Rows whose ``lines`` yield no
    passages contribute nothing.

    Changes from the original implementation:
    - The input frame is no longer modified. The original overwrote
      ``df['lines']`` with dictionaries, so a second call on the same frame
      failed.
    - The per-document fields are extracted once per document instead of
      once per passage.
    - When no passages are found at all, the returned empty frame still has
      the expected columns instead of none.

    @param df - DataFrame with a ``lines`` column of raw indexed-line strings.
    ______
    Returns pd.DataFrame: one row per passage, with a fresh default index.
    """
    print("creating dictionary out of lines column")
    passages_per_doc = df['lines'].apply(lines_to_dict)

    print("splitting lines into separate rows")
    expanded_rows = []
    for doc_fields, passages in zip(df.to_dict('records'), passages_per_doc):
        # Everything except the raw lines is copied onto each passage row.
        del doc_fields['lines']
        for key, value in passages.items():
            expanded_rows.append({**doc_fields, 'passage_id': key, 'passage_content': value})

    if not expanded_rows:
        # Keep the schema stable so callers can still select the passage columns.
        carried = [name for name in df.columns if name != 'lines']
        columns = list(dict.fromkeys([*carried, 'passage_id', 'passage_content']))
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(expanded_rows)

In [8]:
def process_single_file(file_index, uid_start):
    """
    Turn one downloaded wiki CSV shard into a per-passage DataFrame.

    Steps: read ``wiki_pages_parquets/{file_index}_parquet_wiki.csv`` without
    its ``text`` column, drop rows with a missing value, number the
    remaining documents consecutively from ``uid_start`` (``doc_id``),
    expand them to one row per passage with ``explode_df``, keep the
    untouched passage text as ``raw_passage_content`` and clean the other
    copy with ``clean_lines``.

    Changes from the original implementation:
    - Only empty CSV fields count as missing. With pandas' default NA
      parsing, page titles such as "NaN", "null" or "NA" were read as
      missing and those pages were dropped by ``dropna``.
    - ``id`` and ``lines`` are always read as strings.
    - The ``text`` column is skipped while reading instead of being loaded
      and then dropped.
    - The next id is computed from the row count, so a shard with no valid
      rows no longer raises IndexError.

    @param file_index - shard identifier used in the CSV file name.
    @param uid_start - first ``doc_id`` to assign in this shard.
    ______
    Returns tuple: (DataFrame with ``id`` renamed to ``title`` and
            ``passage_content`` renamed to ``clean_passage_content``,
            the ``uid_start`` to use for the next shard).
    """
    print(f"Started processing file - {file_index}")
    df_v0 = pd.read_csv(
        f"wiki_pages_parquets/{file_index}_parquet_wiki.csv",
        usecols=lambda name: name != 'text',   # Remove "text" column
        dtype={'id': str, 'lines': str},
        keep_default_na=False,                 # literal "NaN"/"null"/"NA" are real titles
        na_values=[''],                        # only empty fields are missing
    )

    # Remove NaN rows
    df_v0 = df_v0.dropna().reset_index(drop=True)

    # Consecutive unique identifiers, continuing from the previous shard
    n_docs = len(df_v0)
    df_v0['doc_id'] = list(range(uid_start, uid_start + n_docs))
    next_uid_start = uid_start + n_docs
    carried_columns = [name for name in df_v0.columns if name != 'lines']

    # split the lines column into separate rows
    df_v1 = explode_df(df_v0)
    del df_v0

    print("Further cleaning lines")
    if df_v1.empty:
        # Nothing to clean: return an empty frame that still has the output schema.
        df_v2 = pd.DataFrame(
            columns=carried_columns + ['passage_id', 'passage_content', 'raw_passage_content']
        )
    else:
        # Copy first: clean_lines overwrites 'passage_content' in place.
        df_v1['raw_passage_content'] = df_v1['passage_content']
        df_v2 = clean_lines(df=df_v1, column='passage_content')
    del df_v1

    df_v2.rename(columns={'id': 'title', 'passage_content': 'clean_passage_content'}, inplace=True)

    # Return processed DataFrame and the first free UID
    return df_v2, next_uid_start


In [9]:

def process_and_store_parquet_files_with_doc_id():
    """
    Process the 10 wiki CSV shards and write one Parquet file per shard.

    Shards "0" to "9" are passed to ``process_single_file`` in order, with
    the running document id threaded from one shard to the next. Each
    resulting frame gets a ``joint_id`` column ("<doc_id>_<passage_id>") and
    is stored as ``wiki_passages_parquets_2/<shard>.parquet``.

    Side effect: the running id is kept in the module-level name
    ``uid_start``, which is reset to 0 at the start of every call.
    """
    global uid_start

    output_dir = "wiki_passages_parquets_2"  # Directory to store processed Parquet files
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist
    uid_start = 0

    # Process each file and store the processed DataFrame as a separate Parquet file
    for shard_number in range(10):  # 10 files
        index = str(shard_number)

        df_processed, uid_start = process_single_file(index, uid_start)

        doc_part = df_processed['doc_id'].astype(str)
        passage_part = df_processed['passage_id'].astype(str)
        df_processed['joint_id'] = doc_part + '_' + passage_part

        # Store the df
        output_filename = os.path.join(output_dir, f"{index}.parquet")
        df_processed.to_parquet(output_filename, index=False)

        torch.cuda.empty_cache()
        print("Finish\n")
    

In [10]:
# Run the CSV -> per-passage Parquet step for all shards (progress is printed per shard).
process_and_store_parquet_files_with_doc_id()

Started processing file - 0
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 1
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 2
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 3
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 4
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 5
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 6
creating dictionary out of lines column
splitting lines into separate rows
Further cleaning lines
Finish

Started processing file - 7
creating dictionary out of lines c

In [11]:
def process_parquet_files_to_json():
    """
    Convert every processed Parquet shard into a JSON file of records.

    Each ``*.parquet`` file in ``wiki_passages_parquets_2`` is read, its
    ``passage_id``, ``doc_id`` and ``joint_id`` columns are cast to strings,
    and the frame is written to ``wiki_passages_jsons_2/<stem>.json`` with
    ``orient='records'``.

    Changes from the original implementation:
    - Only entries ending in ``.parquet`` are processed. Every directory
      entry used to be handed to ``read_parquet``, so a stray file or
      hidden folder aborted the run.
    - The output stem is taken with ``os.path.splitext``, so a name with
      extra dots keeps its full stem.
    """
    read_dir = "wiki_passages_parquets_2"  # Directory containing processed Parquet files
    output_dir = "wiki_passages_jsons_2"
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    processed_files = sorted(
        name for name in os.listdir(read_dir) if name.endswith(".parquet")
    )

    for filename in processed_files:
        # Read the Parquet file
        wiki_df = pd.read_parquet(os.path.join(read_dir, filename))

        # Convert the identifier columns to strings
        for id_column in ('passage_id', 'doc_id', 'joint_id'):
            wiki_df[id_column] = wiki_df[id_column].astype(str)

        # Write to JSON file
        json_filename = os.path.splitext(filename)[0] + ".json"
        wiki_df.to_json(os.path.join(output_dir, json_filename), orient='records')

        print(f"Processed {filename} and saved as {json_filename}")

    print("Conversion completed")


In [12]:
# Export every processed Parquet shard as a JSON file of records.
process_parquet_files_to_json()

Processed 0.parquet and saved as 0.json
Processed 1.parquet and saved as 1.json
Processed 2.parquet and saved as 2.json
Processed 3.parquet and saved as 3.json
Processed 4.parquet and saved as 4.json
Processed 5.parquet and saved as 5.json
Processed 6.parquet and saved as 6.json
Processed 7.parquet and saved as 7.json
Processed 8.parquet and saved as 8.json
Processed 9.parquet and saved as 9.json
Conversion completed


In [13]:
def convert_json_to_pyserini_format():
    '''
    Write the cleaned passages as JSON files with "id" / "contents" fields.

    Despite the name, the input is the processed Parquet shards in
    ``wiki_passages_parquets_2``. For each shard, the ``passage_id``,
    ``title``, ``doc_id`` and ``raw_passage_content`` columns are dropped,
    ``clean_passage_content`` becomes ``contents``, ``joint_id`` becomes
    ``id``, and the result is written to
    ``pyserini_format_docs_clean_texts/<stem>.json`` with ``orient='records'``.

    Required format:
    {
      "id": "doc1",
      "contents": "this is the contents."
    }

    Change from the original implementation: only entries ending in
    ``.parquet`` are processed. Every directory entry used to be handed to
    ``read_parquet``, so a stray file or hidden folder aborted the run.
    '''
    read_dir = "wiki_passages_parquets_2"  # Directory containing processed Parquet files
    output_dir = "pyserini_format_docs_clean_texts"
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    processed_files = sorted(
        name for name in os.listdir(read_dir) if name.endswith(".parquet")
    )

    for filename in processed_files:
        # Read the Parquet file, discard unused columns and rename the rest
        wiki_df = (
            pd.read_parquet(os.path.join(read_dir, filename))
            .drop(columns=['passage_id', 'title', 'doc_id', 'raw_passage_content'])
            .rename(columns={'clean_passage_content': 'contents', 'joint_id': 'id'})
        )

        # Write to JSON file
        json_filename = os.path.splitext(filename)[0] + ".json"
        wiki_df.to_json(os.path.join(output_dir, json_filename), orient='records')

        print(f"Processed {filename} and saved as {json_filename}")

    print("Conversion completed")


In [14]:
# Write the cleaned passages as id/contents JSON files.
convert_json_to_pyserini_format()

Processed 0.parquet and saved as 0.json
Processed 1.parquet and saved as 1.json
Processed 2.parquet and saved as 2.json
Processed 3.parquet and saved as 3.json
Processed 4.parquet and saved as 4.json
Processed 5.parquet and saved as 5.json
Processed 6.parquet and saved as 6.json
Processed 7.parquet and saved as 7.json
Processed 8.parquet and saved as 8.json
Processed 9.parquet and saved as 9.json
Conversion completed


In [15]:
def convert_json_to_pyserini_format_raw_texts():
    '''
    Write the raw (uncleaned) passages as JSON files with "id" / "contents" fields.

    Despite the name, the input is the processed Parquet shards in
    ``wiki_passages_parquets_2``. For each shard, the ``passage_id``,
    ``title``, ``doc_id`` and ``clean_passage_content`` columns are dropped,
    ``raw_passage_content`` becomes ``contents``, ``joint_id`` becomes
    ``id``, and the result is written to
    ``pyserini_format_docs_raw_texts/<stem>.json`` with ``orient='records'``.

    Required format:
    {
      "id": "doc1",
      "contents": "this is the contents."
    }

    Change from the original implementation: only entries ending in
    ``.parquet`` are processed. Every directory entry used to be handed to
    ``read_parquet``, so a stray file or hidden folder aborted the run.
    '''
    read_dir = "wiki_passages_parquets_2"  # Directory containing processed Parquet files
    output_dir = "pyserini_format_docs_raw_texts"
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    processed_files = sorted(
        name for name in os.listdir(read_dir) if name.endswith(".parquet")
    )

    for filename in processed_files:
        # Read the Parquet file, discard unused columns and rename the rest
        wiki_df = (
            pd.read_parquet(os.path.join(read_dir, filename))
            .drop(columns=['passage_id', 'title', 'doc_id', 'clean_passage_content'])
            .rename(columns={'raw_passage_content': 'contents', 'joint_id': 'id'})
        )

        # Write to JSON file
        json_filename = os.path.splitext(filename)[0] + ".json"
        wiki_df.to_json(os.path.join(output_dir, json_filename), orient='records')

        print(f"Processed {filename} and saved as {json_filename}")

    print("Conversion completed")


In [16]:
# Write the raw (uncleaned) passages as id/contents JSON files.
convert_json_to_pyserini_format_raw_texts()

Processed 0.parquet and saved as 0.json
Processed 1.parquet and saved as 1.json
Processed 2.parquet and saved as 2.json
Processed 3.parquet and saved as 3.json
Processed 4.parquet and saved as 4.json
Processed 5.parquet and saved as 5.json
Processed 6.parquet and saved as 6.json
Processed 7.parquet and saved as 7.json
Processed 8.parquet and saved as 8.json
Processed 9.parquet and saved as 9.json
Conversion completed
