In [21]:
from pathlib import Path
from chemdataextractor import Document
from chemdataextractor.errors import ReaderError
import chemdataextractor as cde

# PDF path (print absolute path for clarity)
pdf_path = Path("../data/pdf/Test2.pdf")
print("Looking for PDF at:", pdf_path.resolve())

# try CDE PDF reader first, then fallback to pdfminer if it fails
try:
    from chemdataextractor.reader.pdf import PdfReader
    with open(pdf_path, "rb") as f:
        doc = Document.from_file(f, readers=[PdfReader()])
    print("Loaded with CDE PdfReader")
except Exception as e:
    print("CDE PdfReader failed:", e)
    # fallback to pdfminer
    try:
        from pdfminer.high_level import extract_text
    except ImportError:
        raise SystemExit("You need pdfminer.six. Install with: !pip install pdfminer.six")

    text = extract_text(str(pdf_path))
    if not text or not text.strip():
        raise SystemExit("The PDF produced no text")
    doc = Document(text)
    print("Loaded via pdfminer text fallback")

# Try showing basic chemical names only for testing 
print("Entities (cems):", [str(c) for c in doc.cems])

# E) PRINT THE FIRST CLEAR PROPERTY FOUND
"""
def first_property_sentence(doc):
    if not doc.records:
        return "No structured properties found."
    for rec in doc.records:
        data = rec.serialize()
        names = data.get("names") or []
        chem = names[0] if names else "Unknown compound"
        for prop_key, label in [
            ("melting_points", "melting point"),
            ("boiling_points", "boiling point"),
            ("glass_transitions", "glass transition"),
            ("densities", "density"),
            ("solubilities", "solubility"),
        ]:
            vals = data.get(prop_key) or []
            if vals:
                v = vals[0]
                val = v.get("value") or v.get("raw_value") or "?"
                units = v.get("units") or v.get("raw_units") or ""
                return f"{chem} has a {label} of {val} {units}".strip()
    return "No recognized property values found."

print(first_property_sentence(doc))
"""


Looking for PDF at: /Users/oscarrosseneu/Desktop/EPFL/Summer_2025_LAS/MEX/data/pdf/Test2.pdf
Loaded with CDE PdfReader
Entities (cems): ['hydroxyl', 'CO2', 'pentiptycene', 'PAA-C3H7', 'CO2', 'Pent-PI-PEO2000', 'ﬁlm', 'CO2', 'coal', 'iptycene', 'amines', 'poly (ether-b-amide)', '2,3,5,6-tetramethyl-1,4-phenylenediamine', 'iptycene', 'carbamate', 'amino acid', 'CO2/CH4', '2-aminoisobutyric acid', 'triptycene', 'Tf2N', 'epoxide', 'CO2', 'alcohol', 'bisAPAF', 'HPB', 'PEO', 'ﬂue', 'carbamate', 'amines', 'perﬂuoropolymers', 'Zn2+', 'spiro', 'TMAOH', 'CO2/CH4', 'CO2', 'CO2', 'CO2', 'CO2/H2', 'Trip', 'Pentiptycene', 'CO2', 'amine', 'CO2', 'CO2/\nCH4', 'CO2', 'AD60', 'CO2', 'polyaniline', 'TR-α-PBO', 'N2', 'ﬁxed', 'PVA', 'N2', 'hydrogen', 'diacid chloride', 'CO2', 'FDA', 'PVAm/PZ-Gly', 'tetramethylammonium hydroxide', 'Hg', 'amine', 'F', 'perﬂuoropolymers', 'CO2', '6FDA', 'CO2', 'polyimides', 'ﬂexible', 'DABA', 'PVAm', 'PIM-1', 'PEO-ran-PPO-T6T6T', 'histidine', 'Lys', 'ﬂue', 'CO2', 'CO2/N2', 't

'\ndef first_property_sentence(doc):\n    if not doc.records:\n        return "No structured properties found."\n    for rec in doc.records:\n        data = rec.serialize()\n        names = data.get("names") or []\n        chem = names[0] if names else "Unknown compound"\n        for prop_key, label in [\n            ("melting_points", "melting point"),\n            ("boiling_points", "boiling point"),\n            ("glass_transitions", "glass transition"),\n            ("densities", "density"),\n            ("solubilities", "solubility"),\n        ]:\n            vals = data.get(prop_key) or []\n            if vals:\n                v = vals[0]\n                val = v.get("value") or v.get("raw_value") or "?"\n                units = v.get("units") or v.get("raw_units") or ""\n                return f"{chem} has a {label} of {val} {units}".strip()\n    return "No recognized property values found."\n\nprint(first_property_sentence(doc))\n'

In [16]:
from pprint import pprint

print("\n--- SUMMARY ---")
print("CEM count:", len(doc.cems))
print("Record count:", len(doc.records))



--- SUMMARY ---
CEM count: 1172
Record count: 0


In [17]:
print("\n--- RAW RECORDS (serialized) ---")
for i, rec in enumerate(doc.records, 1):
    data = rec.serialize()
    print(f"\n[Record {i}] keys:", list(data.keys()))
    pprint(data, width=100, compact=False)



--- RAW RECORDS (serialized) ---
