In [None]:
from rdkit import Chem
from torch_geometric.data import Data
import torch
from library.functions_to_abstract_data import extract_qm9_data
from torch_geometric.datasets import QM9

In [None]:
def smiles_to_graph(smiles, y_value):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Args:
        smiles: SMILES string describing the molecule.
        y_value: Scalar target value attached to the graph.

    Returns:
        A ``Data`` object with
        - ``x``: long tensor of shape ``(num_atoms, 1)`` holding atomic numbers,
        - ``edge_index``: long tensor of shape ``(2, 2 * num_bonds)``; every
          bond appears once per direction,
        - ``y``: float tensor of shape ``(1,)`` holding ``y_value``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit reports a parse failure by returning None rather than raising;
        # fail here with the offending input instead of an AttributeError below.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Node features (atomic number), one row per atom.
    x = torch.tensor(
        [atom.GetAtomicNum() for atom in mol.GetAtoms()],
        dtype=torch.long,
    ).unsqueeze(1)

    # Edges (bond connections), stored in both directions: the graph is undirected.
    pairs = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        pairs.append((i, j))
        pairs.append((j, i))

    if pairs:
        edge_index = torch.tensor(pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds (e.g. a single heavy atom) would otherwise
        # produce a 1-D tensor of shape (0,); keep the (2, num_edges) layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([y_value], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

In [None]:
# --- Load data ---
# Instantiate QM9 rooted at ../data/QM9 and extract its contents into a DataFrame.
dataset = QM9(root="../data/QM9")
df_qm9 = extract_qm9_data(dataset)

# Inputs for graph construction: the SMILES column and the 'gap' target column.
smiles, gaps = df_qm9["smiles"], df_qm9['gap']

# Rebind `dataset` to a list holding one graph per (SMILES, gap) pair.
dataset = list(map(smiles_to_graph, smiles, gaps))

# Save dataset