In [1]:
from rdkit import Chem
from torch_geometric.data import Data
import torch
from library.functions_to_abstract_data import extract_qm9_data
from torch_geometric.datasets import QM9
from library.GCN import *
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def smiles_to_graph(smiles, y_value):
    """Convert a SMILES string into a ``torch_geometric`` ``Data`` graph.

    The molecule is used exactly as RDKit parses it; ``Chem.AddHs`` is not
    applied in this version.

    NOTE: a later cell in this notebook defines ``smiles_to_graph`` again
    (that version calls ``Chem.AddHs``). Once that cell has run, it shadows
    this definition.

    Args:
        smiles: SMILES string of the molecule.
        y_value: Scalar target value stored on the graph.

    Returns:
        ``Data`` with
        * ``x``: ``(num_atoms, 1)`` long tensor of atomic numbers,
        * ``edge_index``: ``(2, 2 * num_bonds)`` long tensor holding every
          bond in both directions,
        * ``y``: ``(1,)`` float tensor holding ``y_value``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # clear message instead of an AttributeError on the next line.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Node features (atomic number), one row per atom
    x = torch.tensor(
        [atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.long
    ).unsqueeze(1)

    # Edges (bond connections), stored in both directions -> undirected graph
    edge_pairs = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_pairs.append((i, j))
        edge_pairs.append((j, i))

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Without bonds, torch.tensor([]).t() has shape (0,); build the empty
        # edge list explicitly so edge_index always has two rows.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([y_value], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

In [3]:
# --- Load data ---
# NOTE: read_pickle unpickles the file, so only point it at files you trust.
df_qm9 = pd.read_pickle('../data/RDKit/rdkit_only_valid_smiles_qm9.pkl')

# Columns used downstream: the SMILES strings and the matching 'gaps' values
smiles, gaps = df_qm9["SMILES"], df_qm9['gaps']

In [4]:
def smiles_to_graph(smiles, y_value):
    """Convert a SMILES string into a ``torch_geometric`` ``Data`` graph.

    Hydrogens are added with ``Chem.AddHs`` before the graph is built, so
    they appear as nodes and their bonds appear as edges.

    Args:
        smiles: SMILES string of the molecule.
        y_value: Scalar target value stored on the graph.

    Returns:
        ``Data`` with
        * ``x``: ``(num_atoms, 1)`` long tensor of atomic numbers,
        * ``edge_index``: ``(2, 2 * num_bonds)`` long tensor in COO format,
          holding every bond in both directions,
        * ``y``: ``(1,)`` float tensor holding ``y_value``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; check before
        # AddHs so the error names the offending SMILES string.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    mol = Chem.AddHs(mol)

    # Node features (atomic number), one row per atom
    x = torch.tensor(
        [atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.long
    ).unsqueeze(1)

    # Edges (bond connections), stored in both directions -> undirected graph
    edge_pairs = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_pairs.append((i, j))
        edge_pairs.append((j, i))

    # Edges in COO format (see general explanation GCN on GitHub)
    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # Without bonds, torch.tensor([]).t() has shape (0,); build the empty
        # edge list explicitly so edge_index always has two rows.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([y_value], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

In [5]:
# Generate graph dataset: one graph per (SMILES, gap) pair, in row order
dataset = list(map(smiles_to_graph, smiles, gaps))

# Collect the SMILES strings, gaps and graphs side by side in one table
df_graphs = pd.DataFrame({}).assign(SMILES=smiles, gaps=gaps, graphs=dataset)

# Persist the table so it can be reloaded without rebuilding the graphs
df_graphs.to_pickle('../data/RDKit/valid_smiles_graphs.pkl')

In [None]:
# Test import of dataframe: read the pickled table back from the same path
# the graph table is written to, to check that it can be loaded again.
# NOTE: read_pickle unpickles the file, so only point it at files you trust.
df = pd.read_pickle('../data/RDKit/valid_smiles_graphs.pkl')

In [8]:
# How to call specific properties of a graph/molecule

test_idx = 10

# Look the graph up once instead of re-indexing the column for every print
graph = df["graphs"][test_idx]

# Subscripts inside the f-strings use double quotes: reusing the enclosing
# quote character inside a replacement field is a SyntaxError before
# Python 3.12 (PEP 701). The printed text is unchanged.
print(f'SMILES: {df["SMILES"][test_idx]}\n')
print(f'Atomic numbers of atoms present in molecule:\n {graph.x}\n')
print(f'Connections between atoms in the molecule (COO format):\n {graph.edge_index}\n')
print(f'HOMO-LUMO gap of the molecule: {float(graph.y)} eV\n')

SMILES: [H]C(=O)C([H])([H])[H]

Atomic numbers of atoms present in molecule:
 tensor([[6],
        [8],
        [6],
        [1],
        [1],
        [1],
        [1]])

Connections between atoms in the molecule (COO format):
 tensor([[0, 1, 0, 2, 0, 3, 2, 4, 2, 5, 2, 6],
        [1, 0, 2, 0, 3, 0, 4, 2, 5, 2, 6, 2]])

HOMO-LUMO gap of the molecule: 6.372906684875488 eV

