In [1]:
import torch 
import pandas as pd 
from data_preparation import *
from tensorconversion import *
from dataloader import *
from torch.utils.data import DataLoader
from GAT_MLP_Network import *
import torch.optim as optim
from train_evaluate import *
from hyperparameters import *

In [2]:
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')

In [3]:
print(len(train_df),len(test_df))

6644 977


In [4]:
def preprocess(smiles,labels,batch_size=None,shuffle=None,generator=None):
    processor = MoleculeProcessor(smiles,labels)
    full_graph_batch = processor.node_edge_feature_extraction()
    fps = np.array(processor.calculate_fingerprint(radius=3,fpsize=4096))
    tensor_converter = TensorConversion()
    fps_data = tensor_converter.convert_to_tensor(fps)
    combined_dataset = SynchronizedDataset(full_graph_batch,fps_data)
    combined_dataloader = DataLoader(
    combined_dataset, 
    batch_size = batch_size,
    shuffle = shuffle,
    generator = generator, 
    collate_fn=collate_fn
    # generator=torch.Generator().manual_seed(42)
)
    return combined_dataloader

In [5]:
train_dataloader = preprocess(train_df['Smiles'],train_df['Label'],batch_size=64, shuffle=True, generator=torch.Generator().manual_seed(42))

2026-01-30 11:18:55 - data_preparation.py:45 - _generate_molecules - Total valid SMILES found: 6644
2026-01-30 11:18:59 - tensorconversion.py:24 - convert_to_tensor - Converting fingerprint array to tensor. Shape: (6644, 4096)
2026-01-30 11:18:59 - tensorconversion.py:26 - convert_to_tensor - Converted tensor shape: torch.Size([6644, 4096])


In [6]:
test_dataloader = preprocess(test_df['Smiles'],test_df['Label'],batch_size=64)

2026-01-30 11:18:59 - data_preparation.py:45 - _generate_molecules - Total valid SMILES found: 977
2026-01-30 11:19:00 - tensorconversion.py:24 - convert_to_tensor - Converting fingerprint array to tensor. Shape: (977, 4096)
2026-01-30 11:19:00 - tensorconversion.py:26 - convert_to_tensor - Converted tensor shape: torch.Size([977, 4096])


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CombinedModelGATMLP(config.gnn_in_channels, config.gnn_hidden_channels, config.gnn_out_channels,
                      config.mlp_input_size, config.mlp_hidden_sizes, config.mlp_output_size, config.dense_size_1, config.dense_size_2).to(device)
criterion = torch.nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=config.lr,weight_decay= config.weight_decay)

In [8]:
print(model)

CombinedModelGATMLP(
  (gnn): GATModel(
    (gat_conv): GATConv(9, 57, heads=1)
    (lin): Linear(in_features=57, out_features=34, bias=True)
  )
  (mlp): MLPModel(
    (mlp): Sequential(
      (0): Linear(in_features=4096, out_features=2469, bias=True)
      (1): BatchNorm1d(2469, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.17988426707538552, inplace=False)
      (4): Linear(in_features=2469, out_features=185, bias=True)
    )
  )
  (fc1): Linear(in_features=219, out_features=112, bias=True)
  (batchnorm1): BatchNorm1d(112, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=112, out_features=1, bias=True)
)


In [9]:
trainer = ModelTrainer(model,criterion,optimizer,device)

2026-01-30 11:19:00 - train_evaluate.py:23 - __init__ - 
Model Architecture:
####################################################################################################
# CombinedModelGATMLP(                                                                             #
#   (gnn): GATModel(                                                                               #
#     (gat_conv): GATConv(9, 57, heads=1)                                                          #
#     (lin): Linear(in_features=57, out_features=34, bias=True)                                    #
#   )                                                                                              #
#   (mlp): MLPModel(                                                                               #
#     (mlp): Sequential(                                                                           #
#       (0): Linear(in_features=4096, out_features=2469, bias=True)                                #
#       (1): B

In [10]:
best_model, train_losses, train_accuracies, final_true_labels, final_predicted_labels = trainer.train(train_dataloader)

Epoch 1, Loss: 0.5302, Accuracy: 0.7407
Epoch 2, Loss: 0.4505, Accuracy: 0.7992
Epoch 3, Loss: 0.4460, Accuracy: 0.7967
Epoch 4, Loss: 0.4345, Accuracy: 0.8042
Epoch 5, Loss: 0.4282, Accuracy: 0.8030
Epoch 6, Loss: 0.4176, Accuracy: 0.8141
Epoch 7, Loss: 0.4058, Accuracy: 0.8197
Epoch 8, Loss: 0.3995, Accuracy: 0.8283
Epoch 9, Loss: 0.3923, Accuracy: 0.8283
Epoch 10, Loss: 0.3665, Accuracy: 0.8435
Epoch 11, Loss: 0.3543, Accuracy: 0.8496
Epoch 12, Loss: 0.3286, Accuracy: 0.8662
Epoch 13, Loss: 0.3162, Accuracy: 0.8692
Epoch 14, Loss: 0.2913, Accuracy: 0.8843
Epoch 15, Loss: 0.2711, Accuracy: 0.8858
Epoch 16, Loss: 0.2515, Accuracy: 0.9023
Epoch 17, Loss: 0.2326, Accuracy: 0.9113
Epoch 18, Loss: 0.2140, Accuracy: 0.9177
Epoch 19, Loss: 0.2031, Accuracy: 0.9217
Epoch 20, Loss: 0.1888, Accuracy: 0.9296
Epoch 21, Loss: 0.1748, Accuracy: 0.9363
Epoch 22, Loss: 0.1768, Accuracy: 0.9321
Epoch 23, Loss: 0.1660, Accuracy: 0.9415
Epoch 24, Loss: 0.1557, Accuracy: 0.9448
Epoch 25, Loss: 0.1512, A

2026-01-30 11:20:32 - train_evaluate.py:106 - train - Training Completed.
#####################################
# Loss: 0.1146 and Accuracy: 0.9624 #
#####################################


Epoch 53, Loss: 0.1200, Accuracy: 0.9603

Early stopping triggered. Best loss: 0.1146

Final Results:
Accuracy: 0.9624
Precision: 0.9643
Recall: 0.9577
Specificity for class_0: 0.9577
Specificity for class_1: 0.9667
F1 Score: 0.9610
Confusion Matrix:
[[3313  114]
 [ 136 3081]]

Classification Report:
              precision    recall  f1-score   support

     class_0       0.96      0.97      0.96      3427
     class_1       0.96      0.96      0.96      3217

    accuracy                           0.96      6644
   macro avg       0.96      0.96      0.96      6644
weighted avg       0.96      0.96      0.96      6644



In [11]:
all_probs,all_preds, all_targets,avg_loss, accuracy, conf_matrix, class_report, roc_auc, fpr, tpr = trainer.evaluate(best_model, test_dataloader)

2026-01-30 11:20:32 - train_evaluate.py:152 - evaluate - Evaluation Completed.
###################################
# Loss:0.5341 and Accuracy:0.8352 #
###################################
2026-01-30 11:20:32 - train_evaluate.py:153 - evaluate - 
Classification Report:
#########################################################
#               precision    recall  f1-score   support #
#                                                       #
#          0.0       0.82      0.79      0.81       426 #
#          1.0       0.84      0.87      0.86       551 #
#                                                       #
#     accuracy                           0.84       977 #
#    macro avg       0.83      0.83      0.83       977 #
# weighted avg       0.83      0.84      0.83       977 #
#                                                       #
#########################################################


Loss: 0.5341
Accuracy: 0.8352
[[337  89]
 [ 72 479]]
Specificity class_0 : 0.8693
Specificity class_1 : 0.7911
              precision    recall  f1-score   support

         0.0       0.82      0.79      0.81       426
         1.0       0.84      0.87      0.86       551

    accuracy                           0.84       977
   macro avg       0.83      0.83      0.83       977
weighted avg       0.83      0.84      0.83       977



In [12]:
# torch.save(best_model, 'Muta_MLP_best_model1.pth')