In [1]:
import chemprop

# Sanity check of the install: show the package's top-level names, then its version
print(dir(chemprop), chemprop.__version__, sep="\n")

['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'conf', 'data', 'exceptions', 'featurizers', 'models', 'nn', 'schedulers', 'types', 'utils']
2.1.1


In [None]:
# First chemprop run
import pandas as pd
from pathlib import Path
from lightning import pytorch as pl
import chemprop
from chemprop import data, featurizers, models, nn
import torch

# --- Run configuration ---
input_path = r"C:\Users\panag\OneDrive\Documents\coding\Projects\AIbiotics\mycobacteria_ml_project\training_data\descriptors\05_descriptors.csv"
num_workers = 0  # dataloader worker count; 0 keeps data loading in the main process
smiles_column = 'SMILES'  # CSV column that holds the SMILES strings
target_columns = ['Hit_Miss']  # CSV column(s) that hold the activity class (0 or 1)

# Read the CSV into a dataframe
df_input = pd.read_csv(input_path)

# Pull the SMILES strings and the target values out as NumPy arrays
smis = df_input[smiles_column].to_numpy()
ys = df_input[target_columns].to_numpy()

# Build one MoleculeDatapoint per CSV row from its SMILES string and target value(s).
# Only these two columns are handed over; no other CSV column is used in this cell.
all_data = list(map(data.MoleculeDatapoint.from_smi, smis, ys))

# Scaffold-balanced 80/10/10 split into train / validation / test,
# chosen in an attempt to reduce the chance of overtraining.
mols = [datapoint.mol for datapoint in all_data]  # the split works on each datapoint's .mol
split_indices = data.make_split_indices(mols, "scaffold_balanced", (0.8, 0.1, 0.1))
train_indices, val_indices, test_indices = split_indices
train_data, val_data, test_data = data.split_data_by_indices(all_data, *split_indices)

# Wrap the first entry of each split in a MoleculeDataset.
# NOTE(review): the datasets are created with default arguments; the descriptor columns
# of the CSV are not passed in anywhere here, so they are not used by this run.
# The [0] presumably selects the first (only) split replicate - confirm against the chemprop docs.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(subset[0]) for subset in (train_data, val_data, test_data)
)

# One dataloader per split; shuffling is switched off for validation and test
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader = data.build_dataloader(val_dset, shuffle=False, num_workers=num_workers)
test_loader = data.build_dataloader(test_dset, shuffle=False, num_workers=num_workers)

# Building blocks of the MPNN: message passing -> aggregation -> prediction head
mp = nn.BondMessagePassing()        # bond-level message passing
agg = nn.MeanAggregation()          # aggregate by mean
ffn = nn.BinaryClassificationFFN()  # binary classification head
batch_norm = False                  # no batch normalization in this first run
metric_list = [
    nn.metrics.BinaryF1Score(),
    nn.metrics.BinaryMCCMetric(),
    nn.metrics.BinaryAccuracy(),
]

# Put the pieces together
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)

# Set float32 matmul precision to 'high' (intended to make use of Tensor Cores)
torch.set_float32_matmul_precision('high')

# Trainer settings, collected in one place before the Trainer is created
trainer_options = dict(
    logger=False,               # no logger is attached in this run
    enable_checkpointing=True,  # save model checkpoints
    enable_progress_bar=True,   # show the progress bar while training
    accelerator="gpu",          # request a GPU accelerator
    devices=1,                  # use a single device
    max_epochs=50,              # upper bound on training epochs
    min_epochs=10,              # train for at least 10 epochs
    log_every_n_steps=10,       # logging interval in steps
    precision="16-mixed",       # 16-bit mixed precision
    deterministic=False,        # deterministic mode is off
)
trainer = pl.Trainer(**trainer_options)

# Fit on the training split, validating on the validation split
trainer.fit(mpnn, train_loader, val_loader)

# Evaluate the model object as it stands after training on the held-out test split
results = trainer.test(mpnn, test_loader)

In [None]:
# Next iteration with improved logging and more epochs
# The model seems to select the final iteration, which is overtrained
import pandas as pd
from pathlib import Path
from lightning import pytorch as pl
import chemprop
from chemprop import data, featurizers, models, nn
import torch
import mlflow
import mlflow.pytorch

# --- Run configuration ---
input_path = r"C:\Users\panag\OneDrive\Documents\coding\Projects\AIbiotics\mycobacteria_ml_project\training_data\descriptors\05_descriptors.csv"
# Dataloader worker count (0 = load in the main process). Raising it keeps being
# recommended, but doing so causes issues in this setup.
num_workers = 0
smiles_column = 'SMILES'  # column with the SMILES strings
target_columns = ['Hit_Miss']  # column(s) with the binary label (0 or 1)

# Read the CSV
df_input = pd.read_csv(input_path)

# SMILES strings and target values as NumPy arrays
smis = df_input[smiles_column].to_numpy()
targets = df_input[target_columns].to_numpy()

# One MoleculeDatapoint per row, built from the SMILES string and its target value(s)
all_data = list(map(data.MoleculeDatapoint.from_smi, smis, targets))

# Scaffold-balanced 80/10/10 train/val/test split
mols = [datapoint.mol for datapoint in all_data]
split_indices = data.make_split_indices(mols, "scaffold_balanced", (0.8, 0.1, 0.1))
train_indices, val_indices, test_indices = split_indices
train_data, val_data, test_data = data.split_data_by_indices(all_data, *split_indices)

# Wrap the first entry of each split in a MoleculeDataset.
# NOTE(review): only SMILES and targets reach the datasets; the descriptor columns
# of the CSV are not passed in anywhere in this cell.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(subset[0]) for subset in (train_data, val_data, test_data)
)

# Dataloaders with batch size 64; validation and test are not shuffled
batch_size = 64
loader_kwargs = {"batch_size": batch_size, "num_workers": num_workers}
train_loader = data.build_dataloader(train_dset, **loader_kwargs)
val_loader = data.build_dataloader(val_dset, shuffle=False, **loader_kwargs)
test_loader = data.build_dataloader(test_dset, shuffle=False, **loader_kwargs)

# Seed the Python / NumPy / PyTorch RNGs so that weight initialisation and batch
# shuffling repeat between runs; deterministic=True alone does not fix the seed.
pl.seed_everything(42, workers=True)

# Learning-rate settings handed to the MPNN itself.
# A Lightning module builds its optimizer in configure_optimizers(); the standalone
# AdamW / ReduceLROnPlateau objects that used to be created here were never passed to
# the model or the Trainer, so they had no effect and the "learning_rate" /
# "weight_decay" values logged to MLflow did not describe the actual training.
# NOTE(review): these values are believed to equal chemprop's defaults - confirm.
init_lr = 1e-4
max_lr = 1e-3
final_lr = 1e-4

# Define model components
mp = nn.AtomMessagePassing()  # Atom-level message passing
agg = nn.MeanAggregation()  # Mean aggregation (still to be justified against alternatives)
ffn = nn.BinaryClassificationFFN(dropout=0.3)  # Dropout added for regularization
batch_norm = True  # Enable batch normalization
metric_list = [
    nn.metrics.BinaryF1Score(),
    nn.metrics.BinaryMCCMetric(),
    nn.metrics.BinaryAccuracy()
]

# Construct the MPNN model with the explicit learning-rate settings
mpnn = models.MPNN(
    mp, agg, ffn, batch_norm, metric_list,
    init_lr=init_lr, max_lr=max_lr, final_lr=final_lr,
)

# Set float32 matmul precision (the 16-bit mixed precision itself is requested on the Trainer)
torch.set_float32_matmul_precision('high')

# Early stopping on the validation loss; it only takes effect because it is
# registered in the Trainer's callbacks below.
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min')

# Initialize MLflow
mlflow.set_experiment("MPNN_Training")

# Create the Lightning logger first and then attach the fluent MLflow API to the run
# it opened. Otherwise the logger and mlflow.start_run() each create their own run and
# the params/model end up separated from the training curves.
# NOTE(review): assumes both resolve to the same tracking store (the defaults, or
# MLFLOW_TRACKING_URI when it is set) - confirm in your environment.
mlflow_logger = pl.loggers.MLFlowLogger(experiment_name="MPNN_Training")

with mlflow.start_run(run_id=mlflow_logger.run_id):
    mlflow.log_param("batch_size", batch_size)
    # Prefixed keys, in case the Lightning logger records the model's own
    # hyperparameters under the plain names in this same run.
    mlflow.log_param("schedule_init_lr", init_lr)
    mlflow.log_param("schedule_max_lr", max_lr)
    mlflow.log_param("schedule_final_lr", final_lr)

    # Define PyTorch Lightning trainer
    trainer = pl.Trainer(
        logger=mlflow_logger,  # Log training/validation metrics to the shared MLflow run
        callbacks=[early_stopping],  # Register early stopping so it is actually applied
        enable_checkpointing=True,  # Save model checkpoints
        enable_progress_bar=True,  # Show progress bar during training
        accelerator="gpu",  # Request a GPU accelerator
        devices=1,  # Use a single device
        max_epochs=100,  # Upper bound on training epochs
        min_epochs=10,  # Minimum epochs before early stopping may end training
        log_every_n_steps=1,  # Log metrics every step
        precision="16-mixed",  # 16-bit mixed precision
        deterministic=True,  # Deterministic mode (paired with the seed set above)
        gradient_clip_val=1.0,  # Clip gradients
    )

    # Train the model
    trainer.fit(mpnn, train_loader, val_loader)

    # Test the trained model (the in-memory weights at the point training stopped)
    results = trainer.test(mpnn, test_loader)

    # Log the final test metrics to MLflow
    for metric, value in results[0].items():
        mlflow.log_metric(metric, value)

    # Save the trained model to MLflow
    mlflow.pytorch.log_model(mpnn, "mpnn_model")


In [None]:
# Run 3: model stopped early due to not enough learning occurring over the epochs.
import pandas as pd
from pathlib import Path
from lightning import pytorch as pl
import chemprop
from chemprop import data, featurizers, models, nn
import torch
import mlflow
import mlflow.pytorch

# --- Inputs for this run ---
input_path = r"C:\Users\panag\OneDrive\Documents\coding\Projects\AIbiotics\mycobacteria_ml_project\training_data\descriptors\05_descriptors.csv"
# Number of dataloader workers (0 uses the main process). Increasing it is the usual
# recommendation, but it causes issues here, so it stays at 0.
num_workers = 0
smiles_column = 'SMILES'  # name of the SMILES column
target_columns = ['Hit_Miss']  # binary classification label column(s) (0 or 1)

# Load the input table
df_input = pd.read_csv(input_path)

# Extract SMILES strings and targets as NumPy arrays
smis = df_input[smiles_column].to_numpy()
targets = df_input[target_columns].to_numpy()

# Convert every (SMILES, target) pair into a MoleculeDatapoint
all_data = list(map(data.MoleculeDatapoint.from_smi, smis, targets))

# Scaffold-balanced split: 80% train, 10% validation, 10% test
mols = [point.mol for point in all_data]
index_groups = data.make_split_indices(mols, "scaffold_balanced", (0.8, 0.1, 0.1))
train_indices, val_indices, test_indices = index_groups
train_data, val_data, test_data = data.split_data_by_indices(all_data, *index_groups)

# Build a MoleculeDataset from the first entry of each split.
# NOTE(review): nothing but SMILES and targets is passed along; the CSV's descriptor
# columns are not consumed in this cell.
train_dset, val_dset, test_dset = (
    data.MoleculeDataset(part[0]) for part in (train_data, val_data, test_data)
)

# Dataloaders (batch size 64); only the training loader keeps the default shuffling
batch_size = 64
shared_loader_args = dict(batch_size=batch_size, num_workers=num_workers)
train_loader = data.build_dataloader(train_dset, **shared_loader_args)
val_loader = data.build_dataloader(val_dset, shuffle=False, **shared_loader_args)
test_loader = data.build_dataloader(test_dset, shuffle=False, **shared_loader_args)

# Seed the Python / NumPy / PyTorch RNGs so that weight initialisation and batch
# shuffling repeat between runs; deterministic=True alone does not fix the seed.
pl.seed_everything(42, workers=True)

# Learning-rate settings handed to the MPNN itself.
# A Lightning module builds its optimizer in configure_optimizers(); the standalone
# AdamW / ReduceLROnPlateau objects that used to be created here were never passed to
# the model or the Trainer, so they had no effect and the "learning_rate" /
# "weight_decay" values logged to MLflow did not describe the actual training.
# NOTE(review): these values are believed to equal chemprop's defaults - confirm.
init_lr = 1e-4
max_lr = 1e-3
final_lr = 1e-4

# Define model components
mp = nn.AtomMessagePassing()  # Atom-level message passing
agg = nn.MeanAggregation()  # Mean aggregation (still to be justified against alternatives)
ffn = nn.BinaryClassificationFFN(dropout=0.3)  # Dropout added for regularization
batch_norm = True  # Enable batch normalization
metric_list = [
    nn.metrics.BinaryF1Score(),
    nn.metrics.BinaryMCCMetric(),
    nn.metrics.BinaryAccuracy()
]

# Construct the MPNN model with the explicit learning-rate settings
mpnn = models.MPNN(
    mp, agg, ffn, batch_norm, metric_list,
    init_lr=init_lr, max_lr=max_lr, final_lr=final_lr,
)

# Set float32 matmul precision (the 16-bit mixed precision itself is requested on the Trainer)
torch.set_float32_matmul_precision('high')

# Stop training once the validation loss has not improved for 5 validation checks
early_stopping = pl.callbacks.EarlyStopping(monitor='val_loss', patience=5, mode='min')

# Keep only the checkpoint with the lowest validation loss
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    filename="best_model"
)

# Initialize MLflow
mlflow.set_experiment("MPNN_Training")

# Create the Lightning logger first and then attach the fluent MLflow API to the run
# it opened. Otherwise the logger and mlflow.start_run() each create their own run and
# the params/model end up separated from the training curves.
# NOTE(review): assumes both resolve to the same tracking store (the defaults, or
# MLFLOW_TRACKING_URI when it is set) - confirm in your environment.
mlflow_logger = pl.loggers.MLFlowLogger(experiment_name="MPNN_Training")

with mlflow.start_run(run_id=mlflow_logger.run_id):
    mlflow.log_param("batch_size", batch_size)
    # Prefixed keys, in case the Lightning logger records the model's own
    # hyperparameters under the plain names in this same run.
    mlflow.log_param("schedule_init_lr", init_lr)
    mlflow.log_param("schedule_max_lr", max_lr)
    mlflow.log_param("schedule_final_lr", final_lr)

    # Define PyTorch Lightning trainer
    trainer = pl.Trainer(
        logger=mlflow_logger,  # Log training/validation metrics to the shared MLflow run
        callbacks=[early_stopping, checkpoint_cb],  # Early stopping + best-checkpoint tracking
        enable_checkpointing=True,  # Save model checkpoints
        enable_progress_bar=True,  # Show progress bar during training
        accelerator="gpu",  # Request a GPU accelerator
        devices=1,  # Use a single device
        max_epochs=100,  # Upper bound on training epochs
        min_epochs=10,  # Minimum epochs before early stopping may end training
        log_every_n_steps=1,  # Log metrics every step
        precision="16-mixed",  # 16-bit mixed precision
        deterministic=True,  # Deterministic mode (paired with the seed set above)
        gradient_clip_val=1.0,  # Clip gradients
    )

    # Train the model
    trainer.fit(mpnn, train_loader, val_loader)

    # After training, reload the checkpoint with the lowest validation loss
    best_model = models.MPNN.load_from_checkpoint(checkpoint_cb.best_model_path)

    # Test the best model (not the final model)
    results = trainer.test(best_model, test_loader)

    # Log the final test metrics to MLflow
    for metric, value in results[0].items():
        mlflow.log_metric(metric, value)

    # Save the best model to MLflow
    mlflow.pytorch.log_model(best_model, "mpnn_model")
