In [8]:
!pip install git+https://github.com/NREL/alfabet.git@0.2.2


Collecting git+https://github.com/NREL/alfabet.git@0.2.2
  Cloning https://github.com/NREL/alfabet.git (to revision 0.2.2) to c:\users\80710\appdata\local\temp\pip-req-build-w3k8arnm
  Resolved https://github.com/NREL/alfabet.git to commit 9942cbd6fceeed549e8126692b15bb135e103f5a
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/NREL/alfabet.git 'C:\Users\80710\AppData\Local\Temp\pip-req-build-w3k8arnm'
  Running command git checkout -q 9942cbd6fceeed549e8126692b15bb135e103f5a


In [9]:
from alfabet.drawing import draw_mol_outlier
from alfabet.fragment import canonicalize_smiles
from alfabet.neighbors import find_neighbor_bonds
from alfabet.prediction import predict_bdes, check_input

In [10]:
import alfabet
alfabet.__version__

'0.2.2'

In [11]:
import rdkit

In [12]:
rdkit.__version__

'2024.03.5'

In [13]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [15]:
import networkx as nx
from rdkit import Chem
import numpy as np

def create_bde_graph_selective_hs(smiles: str, bde_df) -> nx.Graph:
    """
    Build a NetworkX graph from the *original (heavy-atom)* RDKit Mol:
      - Keep all heavy-atom ring & skeleton bonds from the SMILES.
      - Add new H-X bonds (i.e., only the hydrogens needed) when a row in bde_df indicates
        a predicted bond that doesn't already exist in the heavy-atom Mol.
    
    bde_df is expected to have columns:
       - start_atom, end_atom: integer indexes or placeholders
       - bde_pred, bdfe_pred, etc.: predicted data for each bond
       - possibly bond_index (optional)
    
    Steps:
       1) Parse the SMILES without adding Hs (just once).
       2) Build a base Nx graph with all heavy-atom nodes & edges.
       3) Iterate over bde_df. If the row corresponds to an existing heavy–heavy bond,
          update the Nx edge with predicted data. If the row corresponds to an H–X bond,
          add the H node + edge and store the predictions.
    """

    # 1. Parse the SMILES into an RDKit Mol (no AddHs)
    base_mol = Chem.MolFromSmiles(smiles)
    if base_mol is None:
        # Handle parse error, e.g. return empty graph
        return nx.Graph()

    # 2. Create an Nx graph, optionally store the RDKit Mol for reference
    G = nx.Graph(mol=base_mol)

    # 3. Add heavy-atom nodes
    #    We'll store:
    #      - 'symbol': e.g. 'C', 'O', 'N', etc.
    #      - 'rdkit_idx': the integer index assigned by RDKit
    #    Feel free to store other attributes as well.
    for atom in base_mol.GetAtoms():
        atom_idx = atom.GetIdx()
        G.add_node(atom_idx, 
                   symbol=atom.GetSymbol(),
                   rdkit_idx=atom_idx)

    # 4. Add edges for all heavy-atom bonds in the original (no-H) Mol
    #    We won't attach any BDE predictions yet (set them to None).
    #    We'll also store a default bond_index=None if desired.
    for bond in base_mol.GetBonds():
        a1 = bond.GetBeginAtomIdx()
        a2 = bond.GetEndAtomIdx()
        G.add_edge(a1, a2,
                   bond_index=None,
                   bde_pred=None,
                   bdfe_pred=None)

    # 5. Iterate over bde_df.  We'll assume the columns are something like:
    #     start_atom, end_atom, bde_pred, bdfe_pred, bond_index, etc.
    #    - For heavy–heavy predictions, update the existing edge with predicted data.
    #    - For H–X predictions, add the new hydrogen node & edge if not present.
    #    - This approach assumes that for an H–X bond, either start_atom or end_atom
    #      is a placeholder for hydrogen or an integer representing "H" in your dataset.
    for _, row in bde_df.iterrows():
        s = row['start_atom']
        e = row['end_atom']
        
        # Attempt to interpret s and e in the context of the base mol
        # We'll use a simple rule:
        #  - If the index is >= base_mol.GetNumAtoms(), treat it as "this is a hydrogen"
        #  - Or you could have a special marker like -1 for hydrogen
        #    (depends on how your data is structured)
        
        # We also store predicted data
        bde_pred_value = row.get('bde_pred', None)
        bdfe_pred_value = row.get('bdfe_pred', None)
        bond_index_value = row.get('bond_index', None)
        
        # Convert them to integers if needed
        # (In practice, you may need to handle missing or invalid indexes carefully)
        
        # We'll define a helper function to check if an index is "heavy" or "hydrogen"
        def is_heavy(idx):
            return (0 <= idx < base_mol.GetNumAtoms())
        
        # Determine the "types" of s and e
        s_is_heavy = is_heavy(s)
        e_is_heavy = is_heavy(e)

        if s_is_heavy and e_is_heavy:
            # This is a heavy–heavy bond.
            # If it already exists in G, update attributes.
            if G.has_edge(s, e):
                # Just update the existing edge
                G[s][e]['bde_pred'] = bde_pred_value
                G[s][e]['bdfe_pred'] = bdfe_pred_value
                G[s][e]['bond_index'] = bond_index_value
            else:
                # Possibly -?> no, not possible the bond doesn't exist in the original skeleton 
                # (this can happen if the SMILES didn't have it).
                # Add it as a new edge. This is unusual, but let's handle it anyway.
                G.add_edge(s, e,
                           bond_index=bond_index_value,
                           bde_pred=bde_pred_value,
                           bdfe_pred=bdfe_pred_value)

        else:
            # At least one of them is a "hydrogen" or out-of-range index
            # We'll figure out which one is the heavy atom and which is the hydrogen.
            if s_is_heavy and not e_is_heavy:
                heavy_idx, hydrogen_idx = s, e
            elif e_is_heavy and not s_is_heavy:
                heavy_idx, hydrogen_idx = e, s
            else:
                # Both are hydrogens or out-of-range, which might be invalid.
                # For safety, just skip or handle error.
                # Could print a warning, raise an exception, etc.
                continue

            # Step 1: ensure the hydrogen node is present in G
            # We'll generate a unique node key for the H, e.g. "H_{hydrogen_idx}"
            # or something that won't collide with integer-based heavy nodes.
            # You could also store the actual integer if your system allows it.
            h_node = f"H_{hydrogen_idx}"
            if not G.has_node(h_node):
                # Add the hydrogen node with minimal attributes
                G.add_node(h_node,
                           symbol='H',
                           rdkit_idx=None)  # or some other placeholder

            # Step 2: add the H–X bond or update if it already exists
            # The heavy_idx is the integer from RDKit.
            if not G.has_edge(heavy_idx, h_node):
                G.add_edge(heavy_idx, h_node,
                           bond_index=bond_index_value,
                           bde_pred=bde_pred_value,
                           bdfe_pred=bdfe_pred_value)
            else:
                # If it somehow exists, just update attributes
                G[heavy_idx][h_node]['bde_pred'] = bde_pred_value
                G[heavy_idx][h_node]['bdfe_pred'] = bdfe_pred_value
                G[heavy_idx][h_node]['bond_index'] = bond_index_value

    return G


In [16]:
def graph_to_df(bde_graph: nx.Graph) -> pd.DataFrame:
    """
    Convert the edges of bde_graph into a DataFrame with columns:
      ['u', 'v', 'bond_index', 'graph_bde_pred', 'graph_bdfe_pred'].
    """
    rows = []
    for u, v, data in bde_graph.edges(data=True):
        rows.append({
            'u': u,
            'v': v,
            'bond_index': data['bond_index'],
            'graph_bde_pred': data.get('bde_pred', None),
            'graph_bdfe_pred': data.get('bdfe_pred', None)
        })
    return pd.DataFrame(rows)

In [17]:
smiles_list = ['C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@@]2(CC3)C)CCCC)(C)C',
       'C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@@]2(CC3)C)CCC(C)C)(C)C',
       'C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@@]2(CC3)C)CC[C@@H](C)CC)(C)C',
       'C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@@]2(CC3)C)CC[C@H](CCC)C)(C)C',
       'C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@]2(C)CC3)CC[C@H](C)CCCCC)(C)C',
       'C(CCC)C[C@H](C)CC[C@@H]1[C@H](CC[C@H]2[C@]1(CC[C@@H]3[C@@]2(CCCC3(C)C)C)C)C',
       'C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([C@@]2(CC3)C)CC[C@@H](CCCC(C)C)C)(C)C',
       'C(C[C@@H](CC[C@H]1[C@]3([C@H](CC[C@@H]1C)[C@]2(CCCC(C)(C)[C@@H]2CC3)C)C)C)CC(C)C',
       '[C@]23(CC[C@@H]1[C@@](CCCC1(C)C)(C)[C@H]2CC[C@H]4[C@]3(CC[C@]5([C@@H]4CCC5)C)C)C',
       '[C@]12(CC[C@@H]5[C@@]([C@H]1CC[C@H]3[C@@]2(C)CC[C@H]4[C@@]3(CCC4)C)(CCCC5(C)C)C)C',
       'CC[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCC[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCC(C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CC[C@@H](C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCCC(C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCC[C@@H](C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCCCC(C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCCC[C@@H](C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCCCCC(C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C',
       'CCCCC[C@@H](C)[C@@H]1CC[C@]2(C1CCC3(C2CCC4C3(CCC5C4(CCCC5(C)C)C)C)C)C']

In [18]:
import urllib.parse
def quote(x):
    return urllib.parse.quote(x, safe='')

In [20]:
dfs = []
graphs = []  # Optionally keep a list of graphs if you want them separately

for smiles in smiles_list:
    # 1) Canonicalize and sanity-check input
    can_smiles = canonicalize_smiles(smiles)
    is_outlier, missing_atom, missing_bond = check_input(can_smiles)

    # 2) Get DataFrame of predicted BDE/BDFE for each bond
    bde_df = predict_bdes(can_smiles, draw=True)
    bde_df['raw_smiles'] = smiles

    # 3) Deduplicate and store any extra columns you like
    bde_df = bde_df.drop_duplicates(['fragment1', 'fragment2']).reset_index(drop=True)
    bde_df['smiles_link'] = bde_df.molecule.apply(quote)

    # 4) Build a NetworkX graph containing predicted BDE/BDFE
    bde_graph = create_bde_graph_selective_hs(can_smiles, bde_df)

    # 5) (Optional) store the graph in the DataFrame if you want
    #    the same graph for all rows (one per entire molecule)
    bde_df['nx_graph'] = [bde_graph] * len(bde_df)

    # 6) Append to your results
    dfs.append(bde_df)
    graphs.append(bde_graph)   # In case you want them in parallel




In [21]:
# Merge all DataFrame results
alfabet_results_022 = pd.concat(dfs, ignore_index=True)


In [22]:
graph_to_df(graphs[0])

Unnamed: 0,u,v,bond_index,graph_bde_pred,graph_bdfe_pred
0,0,1,0.0,89.382645,75.711853
1,0,H_23,25.0,100.077187,91.049133
2,1,2,1.0,85.872467,71.412849
3,1,H_27,29.0,97.163109,87.689636
4,2,3,2.0,85.041306,70.000275
5,2,H_28,30.0,95.392189,86.257256
6,3,4,3.0,83.115479,66.99527
7,3,H_30,32.0,94.518456,84.748627
8,4,5,,,
9,4,10,,,


In [24]:
import torch
from torch.utils.data import Dataset

class MoleculeEnvDataset(Dataset):
    """
    A simple Dataset that yields (graph_data, env_features, target).
    """
    def __init__(self, df):
        """
        Args:
          df: a pandas DataFrame with columns:
              [ 'nx_graph', 'temperature', 'Seawater', 'Concentration', 'Time', 'degradation_rate', ...]
        """
        self.df = df.reset_index(drop=True)
        
        # Optionally do some numerical encoding of 'Seawater' if it is categorical
        # or handle arbitrary env variables. Here we assume they are numeric or
        # can be turned numeric. For example:
        #   'Seawater' -> 1, 'freshwater' -> 0
        # or keep them as real values if numeric.
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # 1) The graph is a networkx Graph or something we can convert to PyG's Data
        nx_graph = row['nx_graph']  # e.g. from your create_bde_graph_selective_hs()
        # Convert it if needed to PyG Data: you might have a helper function like:
        #   data = convert_nx_to_pyg(nx_graph)
        # For simplicity here, we’ll just return nx_graph directly.
        
        # 2) Env features: 
        # Suppose we stack them in a tensor [temp, concentration, time, ...].
        # If "Seawater" is categorical, encode that too.
        # We'll do a minimal example, but adapt to your actual columns.
        env_features = torch.tensor([
            row['temperature'],
            row['Concentration'],
            row['Time']
            # Possibly also handle 'Seawater' (0 or 1)
        ], dtype=torch.float)
        
        # 3) The target:
        target = torch.tensor(row['degradation_rate'], dtype=torch.float)
        
        return nx_graph, env_features, target



In [25]:
import torch.nn as nn
import torch.nn.functional as F

class EnvPositionalEncoder(nn.Module):
    """
    Maps environment features (temperature, concentration, etc.) into a
    learned positional embedding. Instead of sinusoidal, this is an MLP.
    """
    def __init__(self, env_input_dim, d_model, hidden_dim=64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(env_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model)
        )
        
    def forward(self, env_features):
        """
        env_features: (batch_size, env_input_dim)
        Returns: (batch_size, d_model)
        """
        return self.mlp(env_features)


In [26]:
from torch_geometric.nn import GCNConv, global_mean_pool
# or any Graph Transformer block you'd like

class SimpleGraphModel(nn.Module):
    def __init__(self, 
                 num_node_features,   # e.g. dimension of (atomic symbol) embeddings
                 env_input_dim,       # dimension of environment features
                 hidden_dim=128, 
                 output_dim=1):       # for regression (1 = predicted degradation rate)
        super().__init__()
        
        self.env_encoder = EnvPositionalEncoder(env_input_dim, d_model=hidden_dim)
        
        # Example: We embed node features up to hidden_dim
        self.node_embedding = nn.Linear(num_node_features, hidden_dim)
        
        # A couple of GCNConv layers for demonstration
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        
        # Final MLP to produce a single scalar
        self.fc_out = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    
    def forward(self, data, env_features):
        """
        data: a PyG Data object or something with:
              data.x (node_features) shape = [num_nodes, num_node_features]
              data.edge_index
              data.batch if using a batch of graphs
        env_features: (batch_size, env_input_dim) -> we map to (batch_size, hidden_dim)
        
        Return: predicted degradation rate shape (batch_size,)
        """
        # 1) Encode environment -> positional embedding
        batch_size = env_features.shape[0]
        env_pos_emb = self.env_encoder(env_features)  # (batch_size, hidden_dim)
        
        # 2) Node embedding
        # data.x shape = (total_num_nodes_in_batch, num_node_features)
        x = self.node_embedding(data.x)
        
        # 3) For each graph in the batch, we add the environment embedding:
        #    We need to figure out which of the `batch_size` each node belongs to.
        #    PyG uses data.batch to indicate the graph index for each node.
        #    So we broadcast-add env_pos_emb to x for each node belonging to the same graph.
        # shape of data.batch = (total_num_nodes_in_batch,)
        
        # Expand env_pos_emb for each node. For example:
        x = x + env_pos_emb[data.batch]  # broadcast by indexing the correct row in env_pos_emb
        
        # 4) Pass through GCN layers
        x = self.conv1(x, data.edge_index)
        x = F.relu(x)
        x = self.conv2(x, data.edge_index)
        x = F.relu(x)
        
        # 5) Global pooling to get a single graph-level vector
        #    shape = (batch_size, hidden_dim)
        x = global_mean_pool(x, data.batch)
        
        # 6) Final MLP to get predicted scalar
        out = self.fc_out(x)  # shape (batch_size, 1)
        
        return out.squeeze(-1)  # shape (batch_size,)


In [27]:
import torch.optim as optim
import math
from sklearn.metrics import r2_score

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def train_model(model, dataloader, epochs=20):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        total_samples = 0
        
        for graphs_batch, envs_batch, targets_batch in dataloader:
            graphs_batch = graphs_batch.to(device)
            envs_batch = envs_batch.to(device)
            targets_batch = targets_batch.to(device)

            optimizer.zero_grad()
            outputs = model(graphs_batch, envs_batch)

            # Compute MSE Loss
            loss = criterion(outputs, targets_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * targets_batch.size(0)
            total_samples += targets_batch.size(0)

        avg_loss = total_loss / total_samples  # Compute average MSE loss

        # === Evaluation Step ===
        model.eval()
        with torch.no_grad():
            preds, gts = [], []
            
            for graphs_batch, envs_batch, targets_batch in dataloader:
                graphs_batch = graphs_batch.to(device)
                envs_batch = envs_batch.to(device)
                targets_batch = targets_batch.to(device)

                out = model(graphs_batch, envs_batch)
                preds.append(out.cpu())
                gts.append(targets_batch.cpu())

            preds = torch.cat(preds).numpy()
            gts = torch.cat(gts).numpy()

            mse_val = F.mse_loss(torch.tensor(preds), torch.tensor(gts)).item()
            rmse_val = math.sqrt(mse_val)
            r2_val = r2_score(gts, preds)

        print(f"Epoch {epoch}/{epochs} - Train MSE: {avg_loss:.4f}, Val RMSE: {rmse_val:.4f}, R²: {r2_val:.4f}")




Using device: cpu


In [28]:
# Assuming the environment variable

# 1) Add random environment columns for demonstration
num_rows = len(alfabet_results_022)

# Temperatures between 10°C and 40°C
alfabet_results_022['temperature'] = np.random.uniform(10, 40, size=num_rows)

# Concentration in mg/L, random 1–100
alfabet_results_022['Concentration'] = np.random.uniform(1, 100, size=num_rows)

# Time in hours, random 0–120
alfabet_results_022['Time'] = np.random.uniform(0, 120, size=num_rows)

# Categorical 'Seawater' vs 'fresh' environment
alfabet_results_022['Seawater'] = np.random.choice(['sea', 'fresh'], size=num_rows)

# And a random target: 'degradation_rate' (arbitrary range)
alfabet_results_022['degradation_rate'] = np.random.uniform(0.1, 1.0, size=num_rows)

# 2) Inspect the updated DataFrame
alfabet_results_022.head(5)


Unnamed: 0,molecule,bond_index,bond_type,start_atom,end_atom,fragment1,fragment2,is_valid_stereo,bde_pred,bdfe_pred,bde,bdfe,set,svg,has_dft_bde,raw_smiles,smiles_link,nx_graph,temperature,Concentration,Time,Seawater,degradation_rate
0,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1C...,10,C-C,10,11,CCCC[C@H]1[C]2CC[C@H]3C(C)(C)CCC[C@]3(C)[C@H]2...,[CH3],True,79.460541,64.25386,,,,<?xml version='1.0' encoding='iso-8859-1'?>\n<...,False,C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([...,CCCC%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29CC%5BC...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",32.238788,43.460098,4.486329,sea,0.812391
1,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1C...,21,C-C,21,22,CCCC[C@@H]1[C@@H](C)CC[C@@H]2[C]3CCCC(C)(C)[C@...,[CH3],True,79.644073,64.361122,,,,<?xml version='1.0' encoding='iso-8859-1'?>\n<...,False,C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([...,CCCC%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29CC%5BC...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",39.681957,32.678321,3.274464,fresh,0.584275
2,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1C...,15,C-C,15,16,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1[...,[CH3],True,82.40863,67.338509,,,,<?xml version='1.0' encoding='iso-8859-1'?>\n<...,False,C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([...,CCCC%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29CC%5BC...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",21.003736,97.309748,7.638673,fresh,0.400473
3,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1C...,3,C-C,3,4,[CH2]CCC,C[C@@H]1[CH][C@]2(C)CC[C@H]3C(C)(C)CCC[C@]3(C)...,True,83.115479,66.99527,,,,<?xml version='1.0' encoding='iso-8859-1'?>\n<...,False,C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([...,CCCC%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29CC%5BC...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",32.229716,47.585323,105.86437,fresh,0.281733
4,CCCC[C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1C...,2,C-C,2,3,[CH2]CC,[CH2][C@@H]1[C@@H](C)CC[C@H]2[C@@]1(C)CC[C@H]1...,True,85.041306,70.000275,,,,<?xml version='1.0' encoding='iso-8859-1'?>\n<...,False,C1CC([C@H]3[C@@](C1)(C)[C@H]2CC[C@H](C)[C@H]([...,CCCC%5BC%40%40H%5D1%5BC%40%40H%5D%28C%29CC%5BC...,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",12.298953,35.873991,70.58111,fresh,0.562167


In [31]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class MoleculeDataset(Dataset):
    """
    Dataset for molecular graphs with environmental variables.
    """
    def __init__(self, df, list_of_graphs, env_columns, target_column='degradation_rate'):
        """
        Args:
            df: pandas DataFrame with environment variables and degradation rate.
            list_of_graphs: list of NetworkX graphs (one per molecule).
            env_columns: list of environment variable column names.
            target_column: the name of the target variable in df.
        """
        self.df = df.reset_index(drop=True)
        self.graphs = list_of_graphs  # 20 molecules
        self.env_columns = env_columns
        self.target_column = target_column
        
        # Ensure the number of molecules match across environments
        self.num_molecules = len(self.graphs)
        self.num_env_conditions = len(self.df) // self.num_molecules  # 16 environments
        assert len(self.df) == self.num_molecules * self.num_env_conditions, \
            "Mismatch between molecules and environment conditions."

    def __len__(self):
        return len(self.df)  # Total = 20 molecules * 16 conditions = 320
    
    def __getitem__(self, idx):
        # Get the graph index (ensures each molecule repeats across environments)
        graph_idx = idx % self.num_molecules
        G = self.graphs[graph_idx]
        
        # Extract node features (e.g., atom types mapped to indices)
        node_features = [G.nodes[n]['symbol'] for n in G.nodes()]
        node_features = torch.tensor([self.atom_symbol_to_index(atom) for atom in node_features], dtype=torch.long)

        # Extract environmental variables
        env_data = torch.tensor(self.df.loc[idx, self.env_columns].values.astype(np.float32), dtype=torch.float32)

        # Extract target degradation rate
        y = torch.tensor(self.df.loc[idx, self.target_column], dtype=torch.float32)
        
        return node_features, env_data, y

    @staticmethod
    def atom_symbol_to_index(symbol):
        """Map atom symbols (C, O, H, etc.) to unique indices."""
        symbol_to_idx = {'C': 0, 'O': 1, 'H': 2, 'N': 3}  # Extend as needed
        return symbol_to_idx.get(symbol, len(symbol_to_idx))  # Default to unknown index


In [33]:

# Create dataset
dataset = MoleculeEnvDataset(alfabet_results_022)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=my_collate)


In [38]:

# Build the model
#    For demonstration, suppose node features = 1 (maybe a 1-hot or embedding dimension)
num_node_features = 1
env_input_dim = 3  # e.g. [temp, concentration, time], or more if you encode "Seawater" etc
model = SimpleGraphModel(num_node_features=num_node_features,
                         env_input_dim=env_input_dim,
                         hidden_dim=128,
                         output_dim=1)

# 4) Train
train_model(model, dataloader, device='cpu', epochs=20)


TypeError: train_model() got an unexpected keyword argument 'device'

In [39]:
# Example

import networkx as nx

dummy_g = nx.Graph()
dummy_g.add_nodes_from([0, 1, 2])  # trivial 3-node graph
dummy_g.add_edges_from([(0,1), (1,2)])  # 2 edges

alfabet_results_022['nx_graph'] = [dummy_g]*num_rows

########################################
# 3) Define the Dataset class
########################################
class MoleculeEnvDataset(torch.utils.data.Dataset):
    """
    Yields (nx_graph, env_tensor, target). In practice, you’d convert the Nx graph
    to a PyG Data object in the collate function or here.
    """
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
        # For example, encode 'Seawater' as 1.0 for 'sea' and 0.0 for 'fresh'
        self.water_map = {'sea': 1.0, 'fresh': 0.0}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Nx graph for this row (toy example: they’re all the same)
        nx_graph = row['nx_graph']

        # Build environment tensor. For demonstration, we use [temp, conc, time]
        # If you want to include 'Seawater' too, just do so below.
        env_features = torch.tensor([
            row['temperature'],
            row['Concentration'],
            row['Time']
            # or self.water_map.get(row['Seawater'], 0.0) if you want
        ], dtype=torch.float)

        # Regression target
        target = torch.tensor(row['degradation_rate'], dtype=torch.float)

        return nx_graph, env_features, target

########################################
# 4) Define a collate_fn to handle Nx -> PyG
########################################
from torch_geometric.data import Data, Batch
from torch_geometric.utils import from_networkx

def my_collate(batch):
    """
    batch: list of (nx_graph, env_tensor, target)
    """
    nx_list   = [b[0] for b in batch]
    env_list  = [b[1] for b in batch]
    tgt_list  = [b[2] for b in batch]

    # Convert Nx -> PyG
    pyg_list = []
    for g in nx_list:
        pyg_data = from_networkx(g)
        # Minimal node feature: each node just has 1 feature = node index
        # (In reality, you'd embed atomic symbols, etc.)
        x = []
        for node_idx in range(pyg_data.num_nodes):
            x.append([float(node_idx)])  # toy
        pyg_data.x = torch.tensor(x, dtype=torch.float)

        pyg_list.append(pyg_data)

    # Combine into a single batch
    pyg_batch = Batch.from_data_list(pyg_list)

    # Stack environment + targets
    env_batch = torch.stack(env_list, dim=0)
    tgt_batch = torch.stack(tgt_list, dim=0)

    return pyg_batch, env_batch, tgt_batch

########################################
# 5) Instantiate the dataset / dataloader
########################################
dataset = MoleculeEnvDataset(alfabet_results_022)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=my_collate)

########################################
# 6) Build a simple GNN model
########################################
import torch.nn as nn
import torch.nn.functional as F
import math

# MLP that maps env_features -> embedding
class EnvPositionalEncoder(nn.Module):
    def __init__(self, env_input_dim, d_model, hidden_dim=64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(env_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, d_model)
        )
    def forward(self, env):
        return self.mlp(env)

# A small GCN-based model
from torch_geometric.nn import GCNConv, global_mean_pool

class SimpleGraphModel(nn.Module):
    def __init__(self, 
                 num_node_features,
                 env_input_dim,
                 hidden_dim=128,
                 output_dim=1):
        super().__init__()
        self.env_encoder = EnvPositionalEncoder(env_input_dim, d_model=hidden_dim)
        self.node_encoder = nn.Linear(num_node_features, hidden_dim)
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc_out = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, data, env):
        # data: PyG batch
        # env: (batch_size, env_input_dim)
        env_emb = self.env_encoder(env)  # => (B, hidden_dim)

        x = self.node_encoder(data.x)    # => (num_nodes, hidden_dim)
        x = x + env_emb[data.batch]      # broadcast env to each node

        x = F.relu(self.conv1(x, data.edge_index))
        x = F.relu(self.conv2(x, data.edge_index))

        # Global average pooling => (batch_size, hidden_dim)
        x = global_mean_pool(x, data.batch)

        # Output => (batch_size, 1)
        return self.fc_out(x).squeeze(-1)  # => (batch_size,)

########################################
# 7) Instantiate + Train
########################################
model = SimpleGraphModel(
    num_node_features=1,  # we only gave each node a single feature in my_collate
    env_input_dim=3,      # [temp, concentration, time]
    hidden_dim=64,
    output_dim=1
)

device = 'cpu'
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

epochs = 20
for epoch in range(1, epochs+1):
    model.train()
    total_loss = 0.0
    total_samples = 0

    for pyg_batch, env_batch, tgt_batch in dataloader:
        pyg_batch, env_batch, tgt_batch = (
            pyg_batch.to(device),
            env_batch.to(device),
            tgt_batch.to(device)
        )

        optimizer.zero_grad()
        preds = model(pyg_batch, env_batch)
        loss = criterion(preds, tgt_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * tgt_batch.size(0)
        total_samples += tgt_batch.size(0)

    avg_loss = total_loss / total_samples



    # For a toy example, we'll just print training loss each epoch
    print(f"Epoch {epoch}/{epochs}, MSE: {avg_loss:.4f}, RMSE: {math.sqrt(avg_loss):.4f}")

        # === Evaluation Step ===
    model.eval()
    with torch.no_grad():
        preds, gts = [], []
            
        for graphs_batch, envs_batch, targets_batch in dataloader:
            graphs_batch = graphs_batch.to(device)
            envs_batch = envs_batch.to(device)
            targets_batch = targets_batch.to(device)

            out = model(graphs_batch, envs_batch)
            preds.append(out.cpu())
            gts.append(targets_batch.cpu())

        preds = torch.cat(preds).numpy()
        gts = torch.cat(gts).numpy()

        mse_val = F.mse_loss(torch.tensor(preds), torch.tensor(gts)).item()
        rmse_val = math.sqrt(mse_val)
        r2_val = r2_score(gts, preds)

        print(f"Epoch {epoch}/{epochs} - Train MSE: {avg_loss:.4f}, Val RMSE: {rmse_val:.4f}, R²: {r2_val:.4f}")

Epoch 1/20, MSE: 0.2216, RMSE: 0.4707
Epoch 1/20 - Train MSE: 0.2216, Val RMSE: 0.3000, R²: -0.2757
Epoch 2/20, MSE: 0.1088, RMSE: 0.3298
Epoch 2/20 - Train MSE: 0.1088, Val RMSE: 0.3035, R²: -0.3056
Epoch 3/20, MSE: 0.0962, RMSE: 0.3101
Epoch 3/20 - Train MSE: 0.0962, Val RMSE: 0.2953, R²: -0.2361
Epoch 4/20, MSE: 0.0950, RMSE: 0.3082
Epoch 4/20 - Train MSE: 0.0950, Val RMSE: 0.2965, R²: -0.2463
Epoch 5/20, MSE: 0.0963, RMSE: 0.3103
Epoch 5/20 - Train MSE: 0.0963, Val RMSE: 0.2855, R²: -0.1555
Epoch 6/20, MSE: 0.0849, RMSE: 0.2915
Epoch 6/20 - Train MSE: 0.0849, Val RMSE: 0.2792, R²: -0.1047
Epoch 7/20, MSE: 0.0853, RMSE: 0.2921
Epoch 7/20 - Train MSE: 0.0853, Val RMSE: 0.2886, R²: -0.1803
Epoch 8/20, MSE: 0.0836, RMSE: 0.2891
Epoch 8/20 - Train MSE: 0.0836, Val RMSE: 0.2749, R²: -0.0709
Epoch 9/20, MSE: 0.0803, RMSE: 0.2834
Epoch 9/20 - Train MSE: 0.0803, Val RMSE: 0.2788, R²: -0.1018
Epoch 10/20, MSE: 0.0795, RMSE: 0.2819
Epoch 10/20 - Train MSE: 0.0795, Val RMSE: 0.2806, R²: -0.115