In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
import torch
import numpy as np

import torch
from fpgnn import FP_GNN  # Assuming the model class is named FP_GNN in the repo
from data_utils import load_data  # Hypothetical data loading function
from train_utils import train_model, evaluate_model  # Hypothetical training functions

ModuleNotFoundError: No module named 'torch'

In [None]:
### Generate molecular graphs and MACCS-key fingerprints (not Morgan fingerprints)

def mol_to_graph_and_fp_from_csv(csv_path, smiles_col='canonical_smiles', label_col='label'):
    """
    Read SMILES strings from a CSV and build molecular graphs and MACCS fingerprints.

    Rows whose SMILES entry is missing (not a string, e.g. NaN from an empty
    cell) or cannot be parsed by RDKit are skipped. Labels are filtered with
    the same row selection, so graphs[i], fps[i] and labels[i] always refer
    to the same molecule.

    Args:
      csv_path (str): Path to the CSV file.
      smiles_col (str): Name of the column containing SMILES strings.
      label_col (str): Name of the optional label column (default 'label').
    Returns:
      graphs (list of dict): One dict per kept molecule with 'atom_features'
        (float tensor, one row per atom holding the atomic number) and
        'adjacency' (float tensor built from the RDKit adjacency matrix).
      fps (np.ndarray): int8 array with one MACCS fingerprint row per kept
        molecule. Empty (shape (0,)) when no molecule could be parsed.
      labels (np.ndarray or None): Labels of the kept molecules, or None when
        the CSV has no `label_col` column.
    Raises:
      KeyError: If `smiles_col` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)

    graphs = []
    fps = []
    kept_rows = []  # positional indices of rows that produced a molecule

    for row_pos, smi in enumerate(df[smiles_col].tolist()):
        # Empty CSV cells arrive as float NaN, which MolFromSmiles rejects.
        if not isinstance(smi, str):
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Skip SMILES that RDKit cannot parse
            continue
        mol = Chem.AddHs(mol)  # Add explicit hydrogens

        # Atom features: atomic number as a simple feature (can be expanded)
        atom_features = torch.tensor(
            [[atom.GetAtomicNum()] for atom in mol.GetAtoms()],
            dtype=torch.float,
        )

        # Adjacency matrix
        adj = torch.tensor(Chem.GetAdjacencyMatrix(mol), dtype=torch.float)

        graphs.append({'atom_features': atom_features, 'adjacency': adj})

        # MACCS keys fingerprint. RDKit's MACCS bit vector is 167 bits long
        # (bit 0 is unused), so size the buffer from the fingerprint itself
        # instead of hard-coding 166.
        # NOTE(review): the fingerprint is computed on the H-added molecule, as
        # in the original code; explicit hydrogens may change which keys match
        # -- confirm this is intended.
        fp = MACCSkeys.GenMACCSKeys(mol)
        arr = np.zeros((fp.GetNumBits(),), dtype=np.int8)
        DataStructs.ConvertToNumpyArray(fp, arr)
        fps.append(arr)
        kept_rows.append(row_pos)

    fps = np.array(fps)

    # Extract labels if available, keeping only rows that yielded a molecule
    # so labels stay aligned with graphs and fps.
    labels = None
    if label_col in df.columns:
        labels = df[label_col].values[np.asarray(kept_rows, dtype=int)]

    return graphs, fps, labels

# Example usage: point `csv_file` at a CSV that has a 'canonical_smiles'
# column (and optionally a 'label' column), then summarise what was parsed.
csv_file = 'your_dataset.csv'
graphs, fps, labels = mol_to_graph_and_fp_from_csv(csv_file)

# Collect the summary lines first so they are emitted in a single call.
report = [
    f"Processed {len(graphs)} molecules",
    f"Fingerprint shape: {fps.shape}",
]
if labels is not None:
    # Labels are only reported when the CSV provided them.
    report.append(f"Labels shape: {labels.shape}")
print("\n".join(report))


In [None]:


# Load your dataset (molecular graphs + fingerprints + labels).
# NOTE(review): load_data, FP_GNN, train_model and evaluate_model are imported
# from project modules that are not visible here; their exact contracts
# should be confirmed against those modules.
train_data, val_data, test_data = load_data('your_dataset_path')

# Initialize model
model = FP_GNN(input_dim=train_data.feature_dim, hidden_dim=256, output_dim=train_data.num_classes)

# Define optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    # Training mode for the optimisation pass (enables dropout / batch-norm updates).
    model.train()
    train_loss = train_model(model, train_data, optimizer, criterion)

    # Switch to evaluation mode and disable autograd for validation, so that
    # dropout / batch-norm do not perturb the metric and no graph is built.
    # This is harmless if evaluate_model already does the same internally.
    model.eval()
    with torch.no_grad():
        val_acc = evaluate_model(model, val_data)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Val Acc: {val_acc:.4f}")

# Test evaluation, also in evaluation mode without gradient tracking.
model.eval()
with torch.no_grad():
    test_acc = evaluate_model(model, test_data)
print(f"Test Accuracy: {test_acc:.4f}")
