<body>
    <h1>Introduction to Model Validation</h1>
    <p>Welcome to our discussion on model validation! In this session, we'll explore the Ersilia eos6oli model and validate it on a "wild" dataset: ChEMBL records whose SMILES do not appear in the AqSol dataset.</p>
</body>

In [None]:
# In this codeblock I will import the necessary packages and specify the paths to relevant folders
%%capture
%env MINICONDA_INSTALLER_SCRIPT=Miniconda3-py37_4.12.0-Linux-x86_64.sh
%env MINICONDA_PREFIX=/usr/local
%env PYTHONPATH= "$PYTHONPATH:/usr/local/lib/python3.7/site-packages"
%env PIP_ROOT_USER_ACTION=ignore

!wget https://repo.anaconda.com/miniconda/$MINICONDA_INSTALLER_SCRIPT
!chmod +x $MINICONDA_INSTALLER_SCRIPT
!./$MINICONDA_INSTALLER_SCRIPT -b -f -p $MINICONDA_PREFIX

!python -m pip install git+https://github.com/ersilia-os/ersilia.git
!python -m pip install requests --upgrade
import sys

_ = sys.path.append("/usr/local/lib/python3.7/site-packages")

In [None]:
import pandas as pd
!pip install rdkit

# Step 1: Read CSV files into pandas DataFrames
chembl_df = pd.read_csv('/content/chembl_stand.csv')
aqsol_df = pd.read_csv('/content/aqsol.csv')

chembl_df.shape


Collecting rdkit
  Using cached rdkit-2023.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


(31099, 3)

In [None]:
# Cell output: (rows, columns) of the AqSol table, for comparison with the ChEMBL table.
aqsol_df.shape

(9982, 2)

In [None]:
# Step 2: flag ChEMBL rows whose SMILES string also occurs in the AqSol table.
# This is an exact string comparison, so two different spellings of the same
# molecule are not treated as a match.
overlap_mask = chembl_df['smiles'].isin(aqsol_df['smiles'])
similar_smiles = chembl_df[overlap_mask]

# Step 3: keep only the ChEMBL rows that were not flagged. Filtering with the
# mask itself (instead of going through index labels) stays correct even if the
# index contains repeated labels.
unique_chembl_df = chembl_df[~overlap_mask]

# Step 4: save the remaining records. The absolute path is the one the later
# cells read from, so the result no longer depends on the working directory.
unique_chembl_df.to_csv('/content/unique_chembl.csv', index=False)

print("Unique records saved to 'unique_chembl.csv'")

Unique records saved to 'unique_chembl.csv'


In [None]:
import pandas as pd

# Folder and column settings. `input_folder` and `output_folder` are defined
# here but not used by the code in this cell.
input_folder = "/content/drive/MyDrive/Ersilia"
output_folder = "/content/output"
smiles_column = "smiles"

# Column that holds the SMILES strings.
smi_col = str(smiles_column)

# Load the de-duplicated ChEMBL records and collect their SMILES into a list.
df = pd.read_csv("/content/unique_chembl.csv")
smiles = df[smi_col].tolist()

# Report how many SMILES were loaded.
print(f"My dataset contains {len(smiles)} SMILES :)")

My dataset contains 31020 SMILES :)


In [None]:
from rdkit import Chem
import pandas as pd


def is_smiles(smiles):
    """Return True if `smiles` is a string that RDKit parses into a molecule.

    Non-string values (for example a missing value read from the CSV) are
    reported as invalid instead of being handed to RDKit.
    """
    if not isinstance(smiles, str):
        return False
    return Chem.MolFromSmiles(smiles) is not None


# Keep only the rows of `df` whose SMILES can be parsed. `df` itself is left
# untouched, so re-running this cell gives the same result.
valid_mask = df['smiles'].apply(is_smiles).astype(bool)
valid_smiles = df[valid_mask]

# Overwrite the dataset file with the filtered rows.
valid_smiles.to_csv('/content/unique_chembl.csv', index=False)

# Keep the in-memory SMILES list in step with the file just written, so the
# predictions made from `smiles` line up row by row with the saved records.
smiles = valid_smiles['smiles'].tolist()

# Report whether any row was dropped by the filter.
if len(valid_smiles) == len(df):
    print("All samples are valid SMILES")
else:
    print("Not all samples are valid SMILES")

All samples are valid SMILES


In [None]:
# Enter the model name
# Ersilia model identifier; the fetch, serve and prediction cells read this variable.
model_name = "eos6oli"

In [None]:
# @title 📥 Fetch the model
import time

begin = time.time()
!ersilia fetch $model_name
end = time.time()

print("Time taken:", round((end - begin), 2), "seconds")

[34m⬇️  Fetching model eos6oli: soltrannet-aqueous-solubility[0m
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
[]
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
[32m🚀 Serving model eos6oli: soltrannet-aqueous-solubility[0m
[0m
[33m   URL: http://127.0.0.1:51931[0m
[33m   PID: 6422[0m
[33m   SRV: conda[0m
[0m
[34m👉 To run model:[0m
[34m   - run[0m
[0m
[34m💁 Information:[0m
[34m   - info[0m
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
[32m⛔ Model eos6oli closed[0m
[32m👍 Model eos6oli fetched successfully![0m
Time taken: 195.37 seconds


In [None]:
# @title ⚙️ Serve the model

# returns available APIs (calculate or predict)

!ersilia serve $model_name

sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
[32m🚀 Serving model eos6oli: soltrannet-aqueous-solubility[0m
[0m
[33m   URL: http://127.0.0.1:35203[0m
[33m   PID: 6895[0m
[33m   SRV: conda[0m
[0m
[34m👉 To run model:[0m
[34m   - run[0m
[0m
[34m💁 Information:[0m
[34m   - info[0m


In [None]:
# @title ⚡ Run predictions
api = "calculate"  # @param {type:"string"}

from ersilia import ErsiliaModel
import time

# NOTE(review): the `api` form field above is not passed to the model call
# below - confirm whether it is still needed.
model = ErsiliaModel(model_name)
try:
    # Run the model on the full SMILES list and time the call; the result is
    # requested as a pandas DataFrame.
    begin = time.time()
    output = model.api(input=smiles, output="pandas")
    end = time.time()

    print("Successful 👍! Time taken:", round((end - begin), 2), "seconds")
finally:
    # Close the model even when the prediction call raises, so it is not left open.
    model.close()

Successful 👍! Time taken: 1531.17 seconds


In [None]:
# @title ✔️ Check your results

# Print the first rows of the prediction table as plain text.
print(output.head())

                           key                                       input  \
0  WDSQNMSWLYROHT-UHFFFAOYSA-N     Brc1ccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)cc1   
1  YQTYPSIBGJUFHX-UHFFFAOYSA-N         Brc1cccc(Nc2nc3c(N4CCCC4)ncnc3s2)c1   
2  QENYREVCVOGEQU-UHFFFAOYSA-N    Brc1cccc(Nc2ncnc3cc(NCCCn4ccnc4)ncc23)c1   
3  PFJRPGGNJFXXDJ-UHFFFAOYSA-N  Brc1cccc(Nc2ncnc3cc(NCCc4c[nH]cn4)ncc23)c1   
4  OJRQYIAKSVOWNZ-UHFFFAOYSA-N   Brc1cccc(Nc2ncnc3cnc(NCCCN4CCOCC4)cc23)c1   

   solubility  
0      -5.071  
1      -5.322  
2      -5.235  
3      -4.338  
4      -3.788  


In [None]:
# @title 💾 Save your results in Google Drive

# Write the prediction table to CSV without the DataFrame index.
# NOTE(review): the title mentions Google Drive, but this path is under /content
# rather than /content/drive - confirm where the file is meant to go.
output_path = "/content/output_pchembl.csv"
output.to_csv(output_path, index=False)


In [32]:
# Reload the saved predictions from disk (this rebinds `output` to the file
# contents) and show the first rows.
output=pd.read_csv("/content/output_pchembl.csv")
output.head()


Unnamed: 0,key,input,solubility
0,WDSQNMSWLYROHT-UHFFFAOYSA-N,Brc1ccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)cc1,-5.071
1,YQTYPSIBGJUFHX-UHFFFAOYSA-N,Brc1cccc(Nc2nc3c(N4CCCC4)ncnc3s2)c1,-5.322
2,QENYREVCVOGEQU-UHFFFAOYSA-N,Brc1cccc(Nc2ncnc3cc(NCCCn4ccnc4)ncc23)c1,-5.235
3,PFJRPGGNJFXXDJ-UHFFFAOYSA-N,Brc1cccc(Nc2ncnc3cc(NCCc4c[nH]cn4)ncc23)c1,-4.338
4,OJRQYIAKSVOWNZ-UHFFFAOYSA-N,Brc1cccc(Nc2ncnc3cnc(NCCCN4CCOCC4)cc23)c1,-3.788


In [36]:
# Load the original ChEMBL file and show its first rows.
# NOTE(review): assigning to `input` shadows Python's built-in input() for the
# rest of the session - consider a different variable name.
input=pd.read_csv("/content/chembl_stand.csv")
input.head()

Unnamed: 0,smiles,logS,weight
0,Brc1ccc(-c2nc(-c3ccc4nc[nH]c4c3)no2)cc1,-4.451,1.0
1,Brc1cccc(Nc2nc3c(N4CCCC4)ncnc3s2)c1,-3.39942,1.0
2,Brc1cccc(Nc2ncnc3cc(NCCCn4ccnc4)ncc23)c1,-1.39794,1.0
3,Brc1cccc(Nc2ncnc3cc(NCCc4c[nH]cn4)ncc23)c1,-1.45593,1.0
4,Brc1cccc(Nc2ncnc3cnc(NCCCN4CCOCC4)cc23)c1,-7.35655,1.0


In [61]:
import warnings

import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Load ground truth (logS) and predictions (solubility) from files
logs_df = pd.read_csv("/content/unique_chembl.csv")
solubility_df = pd.read_csv("/content/output_pchembl.csv")

# The two tables are paired row by row, so verify that pairing instead of
# assuming it: same number of rows, and the same molecule on each row.
if len(logs_df) != len(solubility_df):
    raise ValueError(
        "Ground truth has %d rows but predictions have %d rows; "
        "they cannot be compared row by row." % (len(logs_df), len(solubility_df))
    )
n_mismatched = int(
    (logs_df["smiles"].to_numpy() != solubility_df["input"].to_numpy()).sum()
)
if n_mismatched:
    warnings.warn(
        "%d rows have a ground-truth SMILES that differs from the prediction "
        "input; the metrics below may pair up different molecules." % n_mismatched
    )

ground_truth = logs_df["logS"]
predictions = solubility_df["solubility"]

# A missing value compared with the threshold is False, so it would silently be
# counted as class 0. Exclude such rows and say so.
has_values = (ground_truth.notna() & predictions.notna()).to_numpy()
if not has_values.all():
    warnings.warn(
        "%d rows with a missing logS or solubility value were excluded."
        % int((~has_values).sum())
    )
    ground_truth = ground_truth[has_values]
    predictions = predictions[has_values]

# Convert continuous values to binary classes: 1 when the value is above the
# threshold, 0 otherwise.
threshold = -4  # Adjust this threshold based on your problem
ground_truth_classes = (ground_truth > threshold).astype(int)
predictions_classes = (predictions > threshold).astype(int)

# Calculate confusion matrix (rows: ground truth, columns: predictions).
# Passing both labels keeps the matrix 2x2 even if one class does not occur.
cm = confusion_matrix(ground_truth_classes, predictions_classes, labels=[0, 1])

# Print confusion matrix
print("Confusion Matrix:")
print(cm)

# Calculate classification report
cr = classification_report(ground_truth_classes, predictions_classes)

# Print classification report
print("\nClassification Report:")
print(cr)

Confusion Matrix:
[[15747  1985]
 [ 8568  4720]]

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.89      0.75     17732
           1       0.70      0.36      0.47     13288

    accuracy                           0.66     31020
   macro avg       0.68      0.62      0.61     31020
weighted avg       0.67      0.66      0.63     31020



In [62]:
import warnings

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load ground truth (logS) and predictions (solubility) from files
logs_df = pd.read_csv("/content/unique_chembl.csv")
solubility_df = pd.read_csv("/content/output_pchembl.csv")

# The two tables are paired row by row, so verify that pairing instead of
# assuming it: same number of rows, and the same molecule on each row.
if len(logs_df) != len(solubility_df):
    raise ValueError(
        "Ground truth has %d rows but predictions have %d rows; "
        "they cannot be compared row by row." % (len(logs_df), len(solubility_df))
    )
n_mismatched = int(
    (logs_df["smiles"].to_numpy() != solubility_df["input"].to_numpy()).sum()
)
if n_mismatched:
    warnings.warn(
        "%d rows have a ground-truth SMILES that differs from the prediction "
        "input; the errors below may pair up different molecules." % n_mismatched
    )

ground_truth = logs_df["logS"]
predictions = solubility_df["solubility"]

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(ground_truth, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate Root Mean Squared Error (RMSE) as the square root of the MSE
mse = mean_squared_error(ground_truth, predictions)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

Mean Absolute Error (MAE): 1.3373231591422308
Root Mean Squared Error (RMSE): 1.7280959712008543
