In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
from rdkit import Chem,DataStructs
from rdkit.Chem import AllChem,PandasTools
import rdkit
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader,random_split
from rdkit import Chem

In [3]:
df=pd.read_csv(r"C:\Users\19189\Desktop\original_data.csv")

In [4]:
df.head(2)

Unnamed: 0,CASRN,CATMoS_LD50_mgkg,Canonical_QSARr,LogLD50
0,68523-18-2,460.0,CC1(C)C(C1C=C(Cl)Cl)C(=O)OC(C#N)C1C=CC=C(N=1)O...,2.662758
1,88-04-0,3830.0,CC1C=C(O)C=C(C)C=1Cl,3.583199


In [5]:
PandasTools.AddMoleculeColumnToFrame(df,'Canonical_QSARr','Molecule')
df.head(2)

Unnamed: 0,CASRN,CATMoS_LD50_mgkg,Canonical_QSARr,LogLD50,Molecule
0,68523-18-2,460.0,CC1(C)C(C1C=C(Cl)Cl)C(=O)OC(C#N)C1C=CC=C(N=1)O...,2.662758,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...
1,88-04-0,3830.0,CC1C=C(O)C=C(C)C=1Cl,3.583199,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...


In [6]:
def MACCSfp(mol):
    fp = rdkit.Chem.rdMolDescriptors.GetMACCSKeysFingerprint(mol)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [7]:
def mol2fp(mol):
    fp = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=2048)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [8]:
df["MACCS_FPs"] =df.Molecule.apply(MACCSfp)
df.head(2)

Unnamed: 0,CASRN,CATMoS_LD50_mgkg,Canonical_QSARr,LogLD50,Molecule,MACCS_FPs
0,68523-18-2,460.0,CC1(C)C(C1C=C(Cl)Cl)C(=O)OC(C#N)C1C=CC=C(N=1)O...,2.662758,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,88-04-0,3830.0,CC1C=C(O)C=C(C)C=1Cl,3.583199,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
df["Morgan_FPs"] = df.Molecule.apply(mol2fp)
df.head(2)

Unnamed: 0,CASRN,CATMoS_LD50_mgkg,Canonical_QSARr,LogLD50,Molecule,MACCS_FPs,Morgan_FPs
0,68523-18-2,460.0,CC1(C)C(C1C=C(Cl)Cl)C(=O)OC(C#N)C1C=CC=C(N=1)O...,2.662758,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,88-04-0,3830.0,CC1C=C(O)C=C(C)C=1Cl,3.583199,<rdkit.Chem.rdchem.Mol object at 0x000001AF92A...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
Morgan_FPs_list = df["Morgan_FPs"]
MACCS_FPs_list = df["MACCS_FPs"]
labels = df["LogLD50"]

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter

In [12]:
# Create a custom dataset
class FPSDataset(torch.utils.data.Dataset):
    def __init__(self, fingerprint_list, labels):
        self.fingerprint_list = fingerprint_list
        self.labels = labels

    def __len__(self):
        return len(self.fingerprint_list)

    def __getitem__(self, idx):
        fingerprint = self.fingerprint_list[idx]
        label = self.labels[idx]
        return torch.Tensor(fingerprint), torch.Tensor([label])

In [13]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.activation1 = nn.ReLU()
        self.activation2 = nn.Sigmoid()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.activation1(out)
        out = self.fc2(out)
        out = self.activation1(out)
        out = self.fc3(out)
        return out

In [14]:
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [34]:
from sklearn.model_selection import KFold
from tqdm import tqdm
import optuna
from sklearn.metrics import r2_score
x=Morgan_FPs_list
y=labels
criterion = nn.MSELoss()
def objective(trial):
    batch_size=trial.suggest_int("batch_size", 5,100,step=5)
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    R2=[]
    for train_index, test_index in tqdm(kfold.split(x, y)):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        train_set = FPSDataset(x_train,y_train)
        input_size = 1024  # Size of Morgan fingerprint
        hidden_size = trial.suggest_int("hidden_size", 60,200, step=5)
        output_size = 1
        model = Net(input_size, hidden_size, output_size).to(device)
        learning_rate=trial.suggest_float("learning_rate", 1e-8, 1.0, log=True)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        train_data_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        num_epochs=10
        for epoch in range(num_epochs):
            for batch_inputs, batch_labels in train_data_loader:
                # Move the inputs and labels to the GPU
                batch_inputs = batch_inputs.to(device)
                batch_labels = batch_labels.to(device)
                # Forward pass
                outputs = model(batch_inputs)
                loss = criterion(outputs, batch_labels)
                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
        y_pred = []
        model.eval()
        with torch.no_grad():
            for index, row in x_test.iterrows():
                tensor = torch.Tensor(row).unsqueeze(0)
                tensor.to(device)
                input_tensor = tensor
                output = model(input_tensor)
                prediction = output.item()
                y_pred.append(prediction)
        r2=r2_score(y_test,y_pred) 
        R2.append(r2)
    a=R2.mean()
    return a
        
study = optuna.create_study(
    direction="maximize",
    study_name="FCNN_parameter_opt")

study.optimize(objective, n_trials=3)
print(study.best_trial.value)
#df = study.trials_dataframe()
#df.to_csv(r'C:\Users\19189\Desktop\trial1.csv',index=False)



[I 2024-04-06 00:16:44,502] A new study created in memory with name: FCNN_parameter_opt
0it [00:00, ?it/s]
[W 2024-04-06 00:16:44,510] Trial 0 failed with parameters: {'batch_size': 30, 'hidden_size': 190, 'learning_rate': 0.00013269421697104677} because of the following error: KeyError(4176).
Traceback (most recent call last):
  File "D:\anaconda3\envs\CATMOS\Lib\site-packages\pandas\core\indexes\base.py", line 3802, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pandas\_libs\index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 2263, in pandas._libs.hashtable.Int64HashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 2273, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 4176

The above exception was the direct cause of the following exception:

T

KeyError: 4176

In [20]:
# Define hyperparameters
input_size = 2048  # Size of Morgan fingerprint
hidden_size = 64
output_size = 1

# Create an instance of the neural network model
model = Net(input_size, hidden_size, output_size).to(device)

# Define loss function and optimizer
criterion = nn.MSELoss()

Morgan_FPs_list = df["Morgan_FPs"]
MACCS_FPs_list = df["MACCS_FPs"]
labels = df["LogLD50"]

# Create a custom dataset
dataset = FPSDataset(Morgan_FPs_list, labels)
writer = SummaryWriter('runs/loss_visualization')

# Create a data loader
batch_size = 10
generator2 = torch.Generator().manual_seed(42)
train_dataset, test_dataset=random_split(dataset, [0.8,0.2], generator=generator2)
data_loader1= DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
data_loader2= DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Training loop
num_epochs = 100
lr=[0.0001,0.001,0.01,0.1]
for learning_rate in lr:
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        train_loss=0
        val_loss=0
        for batch_inputs, batch_labels in data_loader1:
            # Move the inputs and labels to the GPU
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
    
            # Forward pass
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_loss += loss.item()*batch_size
        for batch_inputs, batch_labels in data_loader2:
            # Move the inputs and labels to the GPU
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
    
            # Forward pass
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            val_loss += loss.item()*batch_size
        dict={'train':train_loss,'val':val_loss}
        name='lr/'+str(learning_rate)
        writer.add_scalars(name,dict, epoch)
        print("Epoch [{}/{}], Loss: {:.4f},{:.4f}".format(epoch+1, num_epochs, train_loss,val_loss))
writer.close()

Epoch [1/100], Loss: 27960.0758,3812.1261
Epoch [2/100], Loss: 12619.5551,2443.5935
Epoch [3/100], Loss: 7209.8816,1596.3058
Epoch [4/100], Loss: 4686.2753,1257.0260
Epoch [5/100], Loss: 3748.5830,1120.5139
Epoch [6/100], Loss: 3311.8663,1083.7844
Epoch [7/100], Loss: 3029.4910,1034.5592
Epoch [8/100], Loss: 2799.8995,1012.8329
Epoch [9/100], Loss: 2559.1577,982.0962
Epoch [10/100], Loss: 2347.3276,952.1829
Epoch [11/100], Loss: 2150.5572,945.1775
Epoch [12/100], Loss: 1964.4579,923.2459
Epoch [13/100], Loss: 1768.0589,906.5797
Epoch [14/100], Loss: 1625.0435,902.4735
Epoch [15/100], Loss: 1464.2182,906.4117
Epoch [16/100], Loss: 1363.9200,899.6494
Epoch [17/100], Loss: 1220.1435,896.2683
Epoch [18/100], Loss: 1116.0821,885.9793
Epoch [19/100], Loss: 1011.5377,900.9386
Epoch [20/100], Loss: 923.3062,910.0396
Epoch [21/100], Loss: 854.5361,918.4527
Epoch [22/100], Loss: 794.4802,920.7369
Epoch [23/100], Loss: 755.1278,931.6073
Epoch [24/100], Loss: 663.6699,937.0281
Epoch [25/100], Loss

In [None]:
writer = SummaryWriter('runs/loss_visualization')
writer.add_scalar('Loss',train_loss, epoch)
writer.close()

In [17]:
lr=[0.0001,0.001,0.01,0.1]
for learning_rate in lr:
    name='lr/'+str(learning_rate)
    print(name)

lr/0.0001
lr/0.001
lr/0.01
lr/0.1
