In [8]:
import tarfile, pyarrow as pa, pyarrow.parquet as pq
import numpy as np, tqdm

# ---- minimal parser that skips rotA/B/C -------------------------------
PROP_LABELS = [
    "mu", "alpha", "homo", "lumo", "gap",
    "r2", "zpve", "U0", "U", "H", "G", "Cv"
]

def parse_qm9_xyz_stream(f):
    """Read one QM9‑style .xyz from an *open binary stream*.
       Returns (index, smiles, 12‑value list)."""
    def _line():
        return f.readline().decode("utf‑8")

    n_atoms = int(_line())            # 1st line
    header  = _line().split('\t')     # 2nd line
    _, idx  = header[0].split()       # "gdb 3895"
    idx     = int(idx)
    values  = list(map(float, header[4:16]))     # skip 3 rot.const

    for _ in range(n_atoms - 1):      # skip coords
        f.readline()
    penultimate = f.readline().decode("utf‑8")
    smiles      = penultimate.split('\t')[0]
    return idx, smiles, values


In [14]:
tar_path     = "dataset.tar.gz"        # <‑‑ change to your archive
parquet_path = "qm9_trimmed.parquet"   # output file name
BATCH        = 11_000                   # rows per flush (adjust if you like)


In [15]:
ids, smiles, props = [], [], []

schema = pa.schema([
    ("index",  pa.int32()),
    ("smiles", pa.string()),
    ("props",  pa.list_(pa.float32()))
])
writer = pq.ParquetWriter(parquet_path, schema, compression="zstd")

def flush():
    global ids, smiles, props
    table = pa.Table.from_pydict(
        {"index": ids, "smiles": smiles, "props": props},
        schema=schema
    )
    writer.write_table(table)
    ids, smiles, props = [], [], []

with tarfile.open(tar_path, "r:*") as tf:
    for member in tqdm.tqdm(tf.getmembers(), desc="Parsing"):
        if not member.name.endswith(".xyz"):
            continue
        with tf.extractfile(member) as f:
            idx, smi, vals = parse_qm9_xyz_stream(f)
        ids.append(idx); smiles.append(smi); props.append(vals)
        if len(ids) == BATCH:
            flush()
    if ids:                           # tail
        flush()

writer.close()
print("✔ Done – wrote", parquet_path)


Parsing: 100%|██████████| 11002/11002 [00:00<00:00, 23635.84it/s]

✔ Done – wrote qm9_trimmed.parquet





In [16]:
import pandas as pd

df = pd.read_parquet("qm9_trimmed.parquet")
print(df.shape)      # (11000, 3)  → index, smiles, props
df.head()


(11001, 3)


Unnamed: 0,index,smiles,props
0,3895,O,"[3.3067, 46.55, -0.263, -0.0607, 0.2023, 965.3..."
1,3896,H,"[2.4177, 46.66, -0.2701, -0.0861, 0.184, 567.5..."
2,3897,H,"[0.2386, 48.24, -0.2681, -0.1008, 0.1674, 577...."
3,3898,H,"[1.5473, 45.28, -0.2714, -0.12, 0.1514, 556.63..."
4,3899,H,"[1.7309, 41.79, -0.2961, -0.057, 0.2391, 913.3..."


In [21]:
props = pd.DataFrame(df["props"].tolist(), columns=[
    "mu", "alpha", "homo", "lumo", "gap",
    "r2", "zpve", "U0", "U", "H", "G", "Cv"
])
full = pd.concat([df[["index", "smiles"]], props], axis=1)

print(full.shape)    # (11000, 15)  → index, smiles, 12 props

# save to CSV (optional)
full.to_csv("qm9_trimmed.csv", index=False)


(11001, 14)
