In [10]:
import pandas as pd 
import os
import sys
from rdkit import Chem 
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
import time

In [4]:
## DataSet Reading
# When this code runs as a script, look for the CSV next to the script itself.
# In a notebook `__file__` is not defined, so use the current working directory.
if '__file__' in globals():
    directory_path = os.path.dirname(os.path.abspath(__file__))
else:
    directory_path = os.getcwd()

lipophilicity_csv = os.path.join(directory_path, 'Lipophilicity.csv')
Data = pd.read_csv(lipophilicity_csv)
display(Data)

Unnamed: 0,CMPD_CHEMBLID,exp,smiles
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.10,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...
...,...,...,...
4195,CHEMBL496929,3.85,OCCc1ccc(NC(=O)c2cc3cc(Cl)ccc3[nH]2)cc1
4196,CHEMBL199147,3.21,CCN(C1CCN(CCC(c2ccc(F)cc2)c3ccc(F)cc3)CC1)C(=O...
4197,CHEMBL15932,2.10,COc1cccc2[nH]ncc12
4198,CHEMBL558748,2.65,Clc1ccc2ncccc2c1C(=O)NCC3CCCCC3


In [5]:
## Scaling target with MinMaxScaler
# The scaler works on 2-D input, so the single `exp` column is reshaped into
# an (n_rows, 1) array before fitting.
exp_data = Data['exp'].to_numpy().reshape(-1, 1)

# NOTE(review): the scaler is fitted on every row of `Data`, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
scaler = MinMaxScaler().fit(exp_data)
Data['exp_scaled'] = scaler.transform(exp_data)

display(Data.head())

Unnamed: 0,CMPD_CHEMBLID,exp,smiles,exp_scaled
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,0.84
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,0.053333
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,0.865
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,0.811667
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,0.766667


In [6]:
# Fingerprint generators come from the local fingerprints.py module
from fingerprints import (
    generate_maccs_fingerprint,
    generate_morgan_fingerprint,
)

# Run each generator once per SMILES string and store the results as new columns
smiles_series = Data['smiles']
Data['Morgan_Fingerprint'] = smiles_series.apply(generate_morgan_fingerprint)
Data['MACCS_Keys'] = smiles_series.apply(generate_maccs_fingerprint)

# Preview the dataframe with the two fingerprint columns added
display(Data.head())



Unnamed: 0,CMPD_CHEMBLID,exp,smiles,exp_scaled,Morgan_Fingerprint,MACCS_Keys
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,0.84,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,0.053333,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,0.865,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,0.811667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,0.766667,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
## Creating Train-Test splits

# A fingerprint column is rebuilt from SMILES unless it already exists and is free of nulls
if not ('Morgan_Fingerprint' in Data.columns and Data['Morgan_Fingerprint'].notnull().all()):
    Data['Morgan_Fingerprint'] = Data['smiles'].apply(generate_morgan_fingerprint)
if not ('MACCS_Keys' in Data.columns and Data['MACCS_Keys'].notnull().all()):
    Data['MACCS_Keys'] = Data['smiles'].apply(generate_maccs_fingerprint)

# Each fingerprint is iterated element by element and every element is cast to
# int, giving one numeric row per molecule (works for any iterable of bit values).
X_morgan = np.array(
    [[int(bit) for bit in fingerprint] for fingerprint in Data['Morgan_Fingerprint']]
)
X_maccs = np.array(
    [[int(bit) for bit in fingerprint] for fingerprint in Data['MACCS_Keys']]
)

Target = Data['exp_scaled'].to_numpy()

# A single train_test_split call over all three arrays keeps the Morgan rows,
# MACCS rows and targets of each split aligned with one another.
(
    X_train_morgan, X_test_morgan,
    X_train_maccs, X_test_maccs,
    Y_train, Y_test,
) = train_test_split(X_morgan, X_maccs, Target, test_size=0.2, random_state=67)

print("Shape of X_train_morgan:", X_train_morgan.shape)
print("Shape of X_test_morgan:", X_test_morgan.shape)
print("Shape of X_train_maccs:", X_train_maccs.shape)
print("Shape of X_test_maccs:", X_test_maccs.shape)

print("Shape of y_train:", Y_train.shape)
print("Shape of y_test:", Y_test.shape)


Shape of X_train_morgan: (3360, 17284)
Shape of X_test_morgan: (840, 17284)
Shape of X_train_maccs: (3360, 167)
Shape of X_test_maccs: (840, 167)
Shape of y_train: (3360,)
Shape of y_test: (840,)


In [8]:
## Regression Models

def _fit_and_predict(label, X_train, X_test, y_train):
    """Train a fresh MLP on one fingerprint representation and predict the test set.

    Parameters
    ----------
    label : str
        Name shown in the progress message (e.g. "Morgan").
    X_train, X_test : array-like
        Train and test feature matrices passed to ``fit`` and ``predict``.
    y_train : array-like
        Training targets passed to ``fit``.

    Returns
    -------
    tuple
        ``(fitted_model, test_predictions, training_seconds)``.
    """
    # A new estimator per call keeps each fitted model available afterwards
    # instead of overwriting one shared instance; hyper-parameters and seed
    # are the same for every representation.
    # NOTE(review): per the scikit-learn docs, `learning_rate` is only used
    # when solver='sgd'; with the default solver it appears to have no
    # effect - confirm whether solver='sgd' was intended.
    model = MLPRegressor(
        hidden_layer_sizes=(5000,),
        learning_rate='adaptive',
        max_iter=1000,
        random_state=47,
    )

    print(f"{label} Model Training")
    start = time.perf_counter()  # monotonic clock, suited to measuring intervals
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    print(f"Training finished in {elapsed:.2f} seconds.")

    return model, model.predict(X_test), elapsed


# Train Morgan Model
mlp_morgan_model, Y_pred_morgan, training_time1 = _fit_and_predict(
    "Morgan", X_train_morgan, X_test_morgan, Y_train)

# Train MACCS Model
mlp_maccs_model, Y_pred_maccs, training_time2 = _fit_and_predict(
    "MACCS", X_train_maccs, X_test_maccs, Y_train)

# `mlp_model` stays defined and refers to the last estimator trained (MACCS)
mlp_model = mlp_maccs_model


print("Shape of y_pred_morgan:", Y_pred_morgan.shape)
print("Data type of y_pred:", Y_pred_morgan.dtype)
display(Y_pred_morgan[:10])

# Label corrected: these are the MACCS predictions, not the Morgan ones
print("Shape of y_pred_maccs:", Y_pred_maccs.shape)
print("Data type of y_pred:", Y_pred_maccs.dtype)
display(Y_pred_maccs[:10])

Morgan Model Training
Training finished in 1104.57 seconds.
MACCS Model Training
Training finished in 42.95 seconds.
Shape of y_pred_morgan: (840,)
Data type of y_pred: float64


array([0.8049931 , 0.58243453, 0.73575334, 0.48344715, 0.39760933,
       0.73945126, 0.45661536, 0.44659296, 0.52920618, 0.65599566])

Shape of y_pred_morgan: (840,)
Data type of y_pred: float64


array([0.86112215, 0.47355983, 0.7375469 , 0.44353725, 0.50749221,
       0.90251731, 0.54056991, 0.42712904, 0.54108671, 0.62274746])

In [None]:
## Unscale the predictions
# The scaler operates on 2-D input, so each vector is reshaped to a single
# column before being mapped back to the scale of the `exp` column.
Y_pred_morgan_unscaled = scaler.inverse_transform(np.reshape(Y_pred_morgan, (-1, 1)))
Y_pred_maccs_unscaled = scaler.inverse_transform(np.reshape(Y_pred_maccs, (-1, 1)))
Y_test_unscaled = scaler.inverse_transform(np.reshape(Y_test, (-1, 1)))

## Metrics

# RMSE of each model on the unscaled test targets, via root_mean_squared_error
Morgan_RMSE = root_mean_squared_error(y_true=Y_test_unscaled, y_pred=Y_pred_morgan_unscaled)
MACCS_RMSE = root_mean_squared_error(y_true=Y_test_unscaled, y_pred=Y_pred_maccs_unscaled)

print(f"Morgan RMSE: {Morgan_RMSE:.4f}")
print(f"MACCS RMSE: {MACCS_RMSE:.4f}")


Morgan RMSE: 0.8025
MACCS RMSE: 0.8112


In [None]:
# Show the active Conda environment name (None when the variable is not set)
os.environ.get("CONDA_DEFAULT_ENV")

'chemprop'

In [21]:
# Write the Conda env name and both RMSE values to the "Deliverables" file
with open("Deliverables", "w") as deliverables_file:
    # writelines adds no separators, so the newlines come from the strings
    # themselves and the last line is written without a trailing newline.
    deliverables_file.writelines([
        f"Conda Env Name: {os.getenv('CONDA_DEFAULT_ENV')}\n",
        f"Morgan RMSE: {Morgan_RMSE:.4f}\n",
        f"MACCS RMSE: {MACCS_RMSE:.4f}",
    ])