In [2]:
from bs4 import BeautifulSoup
import pathlib

def extract_elsevier_tables(xml_path, header_map=None):
    """
    Extracts all tables from an Elsevier XML file (ScienceDirect full-text format).
    
    Args:
        xml_path (str or Path): Path to the XML file.
        header_map (dict): Optional mapping from raw header text to standard names.
    
    Returns:
        list of dict: Each dict contains:
            'headers': list of column names (mapped if header_map provided)
            'rows': list of dicts with column:value pairs
    """
    xml_content = pathlib.Path(xml_path).read_text(encoding="utf-8")
    soup = BeautifulSoup(xml_content, "lxml")
    
    all_tables = []
    
    for table in soup.find_all(lambda tag: tag.name and tag.name.endswith('table')):
        # Extract headers from thead
        headers = []
        thead = table.find(lambda tag: tag.name and tag.name.endswith('thead'))
        if thead:
            header_row = thead.find(lambda tag: tag.name and tag.name.endswith('row'))
            if header_row:
                headers = [
                    entry.get_text(" ", strip=True).replace(" ", " ").replace("\n", " ")
                    for entry in header_row.find_all(lambda tag: tag.name and tag.name.endswith('entry'))
                ]
        
        # Apply header mapping if provided
        if header_map:
            mapped_headers = [header_map.get(h, h) for h in headers]
        else:
            mapped_headers = headers
        
        # Extract data from tbody
        rows = []
        tbody = table.find(lambda tag: tag.name and tag.name.endswith('tbody'))
        if tbody:
            for row in tbody.find_all(lambda tag: tag.name and tag.name.endswith('row')):
                cells = [
                    entry.get_text(" ", strip=True).replace(" ", " ").replace("\n", " ")
                    for entry in row.find_all(lambda tag: tag.name and tag.name.endswith('entry'))
                ]
                # Pad if row shorter than headers
                if len(cells) < len(mapped_headers):
                    cells += [""] * (len(mapped_headers) - len(cells))
                rows.append(dict(zip(mapped_headers, cells)))
        
        if headers and rows:
            all_tables.append({"headers": mapped_headers, "rows": rows})
    
    return all_tables


# Example usage:
HEADER_MAP = {
    "Thickness (nm)": "thickness_nm",
    "Ideal CO₂ permeance (GPU)": "ideal_co2_perm_gpu",
    "Ideal CO2 permeance (GPU)": "ideal_co2_perm_gpu",
    "Ideal CO₂/N₂ selectivity": "ideal_selectivity",
    "Experimental CO₂ permeance (GPU)": "exp_co2_perm_gpu",
    "Experimental CO₂/N₂ selectivity": "exp_selectivity"
}

tables = extract_elsevier_tables("../data/XMLs/10.1016_j.memsci.2023.122272.xml", HEADER_MAP)

for t in tables:
    print("Headers:", t["headers"])
    for row in t["rows"]:
        print(row)


Headers: ['', 'thickness_nm', 'Ideal CO 2 permeance (GPU)', 'Ideal CO 2 /N 2 selectivity', 'Experimental CO 2 permeance (GPU)', 'Experimental CO 2 /N 2 selectivity']
{'': 'CAP ATRP 2h', 'thickness_nm': '145.3\xa0±\xa019.2', 'Ideal CO 2 permeance (GPU)': '1354.9', 'Ideal CO 2 /N 2 selectivity': '69.0', 'Experimental CO 2 permeance (GPU)': '1013', 'Experimental CO 2 /N 2 selectivity': '71.3'}
{'': 'CAP ATRP 3h', 'thickness_nm': '195.9\xa0±\xa019.7', 'Ideal CO 2 permeance (GPU)': '1009.0', 'Ideal CO 2 /N 2 selectivity': '69.2', 'Experimental CO 2 permeance (GPU)': '838', 'Experimental CO 2 /N 2 selectivity': '72.5'}
{'': 'CAP ATRP 4h', 'thickness_nm': '247.7\xa0±\xa07.3', 'Ideal CO 2 permeance (GPU)': '800.0', 'Ideal CO 2 /N 2 selectivity': '69.4', 'Experimental CO 2 permeance (GPU)': '204', 'Experimental CO 2 /N 2 selectivity': '57.5'}
