# Functions

In [1]:
from bs4 import BeautifulSoup

def html_to_text(html: str) -> str:
    """
    Converts an HTML string into plain text.

    Args:
        html (str): The HTML content as a string. Empty or ``None`` input
            is accepted and yields an empty string.

    Returns:
        str: The text of the document, with each text fragment stripped of
            surrounding whitespace and fragments joined by a single space.
    """
    # A missing page has nothing to parse; short-circuit rather than handing
    # a non-string to BeautifulSoup and failing the whole batch on one record.
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator=" ", strip=True)

# Main df Processing code

In [None]:
# Important note
# The original datasets are overwritten in place because there was no memory available on the HPC (Terumo).
# To restore the original datasets, run the 1-main.py script located in the datasets folder.

import os
from tqdm import tqdm
import pandas as pd
import ast

# Build list of parquet files
# NOTE: os.listdir() returns entries in arbitrary order, so the [-2:] slice
# below selects whichever two files happen to be listed last.
files = ['crag_dataset/parquet/' + x for x in os.listdir("crag_dataset/parquet/") if x.endswith(".parquet")]

# Process each file
for file in tqdm(files[-2:], desc="Processing files"):
    df = pd.read_parquet(file)

    # Files are overwritten in place, so a file converted by an earlier run no
    # longer has a `search_results` column; skip it instead of crashing on re-run.
    if 'search_results' not in df.columns:
        continue

    all_page_results = []

    # Only one column is needed, so iterate over it directly instead of
    # building a Series for every row with iterrows().
    for raw_search_results in tqdm(df['search_results'], total=len(df), desc=f"Processing rows in {os.path.basename(file)}", leave=False):
        # The cell is expected to hold the literal text of a list of dicts;
        # literal_eval parses it without executing arbitrary code.
        search_results = ast.literal_eval(raw_search_results)

        # A result without a page (missing key or None) contributes an empty string.
        curr_search_page_results = [
            html_to_text(search_result.get('page_result') or '')
            for search_result in search_results
        ]

        all_page_results.append("\n".join(curr_search_page_results))

    # Add new column and remove old one
    df['page_results_text'] = all_page_results
    df = df.drop(columns=['search_results'])

    # Save processed DataFrame back (overwrite original)
    df.to_parquet(file, index=False)


In [3]:
import os
from tqdm import tqdm
import pandas as pd
import ast
from bs4 import BeautifulSoup

# Function to convert HTML to clean text
def html_to_text(html: str) -> str:
    """
    Converts an HTML string into plain text.

    Args:
        html (str): The HTML content as a string. Empty or ``None`` input
            is accepted and yields an empty string.

    Returns:
        str: The text of the document, with each text fragment stripped of
            surrounding whitespace and fragments joined by a single space,
            after being passed through ``clean_text``.
    """
    # A missing page has nothing to parse; short-circuit rather than handing
    # a non-string to BeautifulSoup and failing the whole batch on one record.
    if not html:
        return ""
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ", strip=True)
    # Strip characters that cannot be encoded as UTF-8 before the text is stored.
    return clean_text(text)

# Function to remove problematic Unicode characters
def clean_text(text: str) -> str:
    """
    Strips code points that cannot be encoded as UTF-8 (lone surrogates).

    Args:
        text (str): The text to sanitise. Non-string values are passed
            through untouched.

    Returns:
        str: ``text`` without the unencodable characters, or the original
            object when it is not a string.
    """
    # Guard clause: anything that is not a str is returned as-is.
    if not isinstance(text, str):
        return text

    # "surrogatepass" lets lone surrogates through the encoder as raw bytes;
    # those bytes are not valid UTF-8, so decoding with "ignore" discards them.
    encoded = text.encode("utf-8", errors="surrogatepass")
    return encoded.decode("utf-8", errors="ignore")

# Build list of parquet files
# NOTE: os.listdir() returns entries in arbitrary order, so the [-2:] slice
# below selects whichever two files happen to be listed last.
files = ['crag_dataset/parquet/' + x for x in os.listdir("crag_dataset/parquet/") if x.endswith(".parquet")]

# Process each file
for file in tqdm(files[-2:], desc="Processing files"):
    df = pd.read_parquet(file)

    # Files are overwritten in place, so a file converted by an earlier run no
    # longer has a `search_results` column; skip it instead of crashing on re-run.
    if 'search_results' not in df.columns:
        continue

    all_page_results = []

    # Only one column is needed, so iterate over it directly instead of
    # building a Series for every row with iterrows().
    for raw_search_results in tqdm(df['search_results'], total=len(df), desc=f"Processing rows in {os.path.basename(file)}", leave=False):
        # The cell is expected to hold the literal text of a list of dicts;
        # literal_eval parses it without executing arbitrary code.
        search_results = ast.literal_eval(raw_search_results)

        # `or ''` also covers a key that is present but set to None, which
        # .get()'s default alone would not catch.
        curr_search_page_results = [
            html_to_text(search_result.get('page_result') or '')
            for search_result in search_results
        ]

        all_page_results.append("\n".join(curr_search_page_results))

    # Add new column and remove old one
    df['page_results_text'] = all_page_results
    df = df.drop(columns=['search_results'])

    # Save processed DataFrame back (overwrite original)
    df.to_parquet(file, index=False, engine="pyarrow")  # pyarrow is better with Unicode


Processing files:   0%|                                   | 0/2 [00:00<?, ?it/s]
Processing rows in crag_task_3_dev_v4_5.parquet:   0%|  | 0/300 [00:00<?, ?it/s][A
Processing rows in crag_task_3_dev_v4_5.parquet:   0%| | 1/300 [00:03<16:38,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   1%| | 2/300 [00:10<26:19,  5.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   1%| | 3/300 [00:12<20:20,  4.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   1%| | 4/300 [00:15<17:19,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   2%| | 5/300 [00:17<15:40,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   2%| | 6/300 [00:20<14:57,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   2%| | 7/300 [00:24<15:43,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   3%| | 8/300 [00:27<16:08,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   3%| | 9/300 [00:30<14:42,  3.[A
Processing rows in crag_task_3_dev_v4_5.parquet:   3%| | 10/300 [00:33<14:51,  

In [13]:
# Display the parquet paths collected in `files` (in the order os.listdir returned them).
files

['crag_dataset/parquet/crag_task_3_dev_v4_9.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_4.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_2.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_6.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_8.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_3.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_7.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_0.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_5.parquet',
 'crag_dataset/parquet/crag_task_3_dev_v4_1.parquet']