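#### Test of the CDE v2 Table reader on XML files. Finds the table elements (e.g. `<ce:table>`) in an article XML file and extracts their header and data rows.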

In [13]:
import pathlib
import re

# Read the whole article XML as plain text so it can be probed without parsing.
xml_file = pathlib.Path("../data/XMLs/10.1016_j.memsci.2023.122272.xml")
xml_content = xml_file.read_text(encoding="utf-8")

# Check for table tags.
# A plain substring test is ambiguous: "<table" is a prefix of "<table-wrap"
# (and of any longer tag name that starts with the keyword), so it would be
# reported as found whenever the longer tag is present. Require the tag name
# to end right after the keyword instead: the next character must be
# whitespace, "/" or ">".
for keyword in ["<table", "<ce:table", "<table-wrap"]:
    if re.search(re.escape(keyword) + r"[\s/>]", xml_content):
        print(f"Found '{keyword}' in XML")
    else:
        print(f"Did NOT find '{keyword}' in XML")


Did NOT find '<table' in XML
Found '<ce:table' in XML
Did NOT find '<table-wrap' in XML


In [19]:
from bs4 import BeautifulSoup
import pathlib

# Load the article XML from disk and hand it to BeautifulSoup, using "lxml"
# as the parser backend.
xml_file = pathlib.Path("../data/XMLs/10.1016_j.memsci.2023.122272.xml")
xml_content = xml_file.read_text(encoding="utf-8")
soup = BeautifulSoup(xml_content, "lxml")


def _is_table_tag(tag):
    """Match any tag whose name ends with 'table' (covers prefixed names such as 'ce:table')."""
    return tag.name and tag.name.endswith('table')


tables = soup.find_all(_is_table_tag)
print(f"Found {len(tables)} tables")

# Show the beginning of each table's pretty-printed markup so its structure
# can be inspected by eye.
for t_index, table in enumerate(tables, start=1):
    print(f"\n--- Raw content of table {t_index} ---")
    raw_markup = table.prettify()
    print(raw_markup[:2000])  # first 2000 characters only


Found 1 tables

--- Raw content of table 1 ---
<ce:table colsep="0" frame="topbot" id="tbl1" rowsep="0" xmlns="http://www.elsevier.com/xml/common/cals/dtd">
 <ce:label>
  Table 1
 </ce:label>
 <ce:caption id="cap0060">
  <ce:simple-para id="tspara0010" view="all">
   The ideal gas separation performance of the CAP
   <ce:inf loc="post">
    ATRP
   </ce:inf>
   -UTFC membranes by resistance-in-series model in composite membranes. The CO
   <ce:inf loc="post">
    2
   </ce:inf>
   and N
   <ce:inf loc="post">
    2
   </ce:inf>
   permeabilities of PDMS and PDXL were used according to the literature (PDMS permeabilities: 3,800 Barrer (CO
   <ce:inf loc="post">
    2
   </ce:inf>
   ) and 400 Barrer (N
   <ce:inf loc="post">
    2
   </ce:inf>
   ); PDXL permeabilities: 200 Barrer (CO
   <ce:inf loc="post">
    2
   </ce:inf>
   ) and 2.86 Barrer (N
   <ce:inf loc="post">
    2
   </ce:inf>
   )) [
   <ce:cross-ref id="crosref0615" refid="bib14">
    14
   </ce:cross-ref>
   ,
   <ce:cr

In [20]:
# Look for any <tbody> content in the first table.
# `tables` is produced by an earlier cell. It is empty when the document has
# no matching table tags, and indexing it directly would raise IndexError, so
# only search when there is a table to search in.
tbody = (
    tables[0].find(lambda tag: tag.name and tag.name.endswith('tbody'))
    if tables
    else None
)

if not tables:
    print("No tables found")
elif tbody:
    # Length is measured on the stripped, concatenated text of the body.
    print("Tbody exists, length:", len(tbody.get_text(strip=True)))
    print(tbody.get_text("\n", strip=True)[:500])  # preview first 500 chars
else:
    print("No <tbody> in table")


Tbody exists, length: 113
CAP
ATRP
2h
145.3 ± 19.2
1354.9
69.0
1013
71.3
CAP
ATRP
3h
195.9 ± 19.7
1009.0
69.2
838
72.5
CAP
ATRP
4h
247.7 ± 7.3
800.0
69.4
204
57.5


In [21]:
from itertools import zip_longest
import pathlib

from bs4 import BeautifulSoup

xml_file = pathlib.Path("../data/XMLs/10.1016_j.memsci.2023.122272.xml")
xml_content = xml_file.read_text(encoding="utf-8")
soup = BeautifulSoup(xml_content, "lxml")


def _named_like(suffix):
    """Build a find/find_all predicate matching tags whose name ends with `suffix`."""
    return lambda tag: tag.name and tag.name.endswith(suffix)


def _entry_texts(row):
    """Return the stripped text of every 'entry' tag under `row`, in document order.

    Text fragments inside one entry are joined with a single space.
    """
    return [entry.get_text(" ", strip=True) for entry in row.find_all(_named_like('entry'))]


def _row_to_record(headers, cells):
    """Pair header names with cell values without silently dropping data.

    A plain ``dict(zip(headers, cells))`` loses information in three cases,
    each handled here:

    * more cells than headers (including no headers at all): the extra cells
      are kept under positional keys ``column_<n>`` (1-based);
    * more headers than cells: the missing cells appear as ``None``;
    * duplicate header texts: later duplicates get a `` (column <n>)`` suffix
      so they do not overwrite the earlier value.

    When headers are unique and match the cell count, the result equals
    ``dict(zip(headers, cells))``.
    """
    record = {}
    for position, (header, cell) in enumerate(zip_longest(headers, cells), start=1):
        key = f"column_{position}" if header is None else header
        if key in record:
            key = f"{key} (column {position})"
        record[key] = cell
    return record


tables = soup.find_all(_named_like('table'))
print(f"Found {len(tables)} tables")

for t_index, table in enumerate(tables, start=1):
    # Extract headers from thead/row/entry. Only the first row of the first
    # thead is used as the header row.
    headers = []
    thead = table.find(_named_like('thead'))
    if thead:
        header_row = thead.find(_named_like('row'))
        if header_row:
            headers = _entry_texts(header_row)

    # Extract data rows from the first tbody: one list of cell texts per row.
    data_rows = []
    tbody = table.find(_named_like('tbody'))
    if tbody:
        data_rows = [_entry_texts(row) for row in tbody.find_all(_named_like('row'))]

    print(f"\nTable {t_index} headers:", headers)
    for row in data_rows:
        print(_row_to_record(headers, row))


Found 1 tables

Table 1 headers: ['', 'Thickness (nm)', 'Ideal CO 2 permeance (GPU)', 'Ideal CO 2 /N 2 selectivity', 'Experimental CO 2 permeance (GPU)', 'Experimental CO 2 /N 2 selectivity']
{'': 'CAP ATRP 2h', 'Thickness (nm)': '145.3\xa0±\xa019.2', 'Ideal CO 2 permeance (GPU)': '1354.9', 'Ideal CO 2 /N 2 selectivity': '69.0', 'Experimental CO 2 permeance (GPU)': '1013', 'Experimental CO 2 /N 2 selectivity': '71.3'}
{'': 'CAP ATRP 3h', 'Thickness (nm)': '195.9\xa0±\xa019.7', 'Ideal CO 2 permeance (GPU)': '1009.0', 'Ideal CO 2 /N 2 selectivity': '69.2', 'Experimental CO 2 permeance (GPU)': '838', 'Experimental CO 2 /N 2 selectivity': '72.5'}
{'': 'CAP ATRP 4h', 'Thickness (nm)': '247.7\xa0±\xa07.3', 'Ideal CO 2 permeance (GPU)': '800.0', 'Ideal CO 2 /N 2 selectivity': '69.4', 'Experimental CO 2 permeance (GPU)': '204', 'Experimental CO 2 /N 2 selectivity': '57.5'}
