## **The goal of this task is to check model predictions obtained from the Ersilia Model Hub**.

In [1]:
# In this codeblock I will import the necessary packages and specify the paths to relevant folders
# import the necessary packages and specify the paths to relevant folders

%%capture
%env MINICONDA_INSTALLER_SCRIPT=Miniconda3-py37_4.12.0-Linux-x86_64.sh
%env MINICONDA_PREFIX=/usr/local
%env PYTHONPATH="$PYTHONPATH:/usr/local/lib/python3.7/site-packages"
%env PIP_ROOT_USER_ACTION=ignore

!wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.12.0-Linux-x86_64.sh -O miniconda.sh
!bash miniconda.sh -b -f -p $MINICONDA_PREFIX

!python -m pip install git+https://github.com/ersilia-os/ersilia.git
!python -m pip install requests --upgrade
import sys
import pandas as pd
import numpy as np


_ = sys.path.append("/usr/local/lib/python3.7/site-packages")

!pip install rdkit

sys.path.append("/content/drive/MyDrive/Ersilia_Week2Task")

## Mount google drive
from google.colab import drive

drive.mount("/content/drive")


# specify your output folder

output_folder = "/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred"  # @param {type:"string"}

# specify your output folder

output_folder = "/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred"  # @param {type:"string"}

# specify the name of your input csv file

input_filename = "/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/NPC_test.csv"  # @param {type:"string"}


In [2]:
# In this codeblock I will load the data from the /data folder to a Pandas dataframe and understand which headers it has

import pandas as pd
df= pd.read_csv("/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/NPC_test.csv")

# check the first five rows with its header
print(df.shape)
print(df.head())

(185, 3)
                                              smiles solubility (µg/mL)  \
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...                 <1   
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl                 <1   
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1                 <1   
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1                 <1   
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12                 <1   

   solubility_original_class  
0                          1  
1                          1  
2                          1  
3                          1  
4                          1  


In [3]:
# In this codeblock I will convert the molecules to standard SMILES by using the function standardise_smiles from /src
# I will import the function directly from src, not copying it here

from src import standardise_smiles

# Define the column containing the molecules (SMILES strings)
smiles_column = 'smiles'

# Convert molecules to standard SMILES
df['standardized_smiles'] = df[smiles_column].apply(standardise_smiles)

In [4]:
# In this codeblock I will get the Inchikey representation of the molecules using the RDKIT package

from src import get_inchikey

# Apply the function to the "standardise_smile" column
df["inchi_key"] = df["smiles"].apply(get_inchikey)

# Display the DataFrame with the added standardized SMILES column
print(df.head())


                                              smiles solubility (µg/mL)  \
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...                 <1   
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl                 <1   
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1                 <1   
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1                 <1   
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12                 <1   

   solubility_original_class  \
0                          1   
1                          1   
2                          1   
3                          1   
4                          1   

                                 standardized_smiles  \
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...   
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl   
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1   
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1   
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12   

                   


































































In [5]:
# In this codeblock I will save the data as a .csv file containing only the standard smiles and the inchikey as columns.
# All data will be saved with informative names in the /data folder


# Select only the 'standardized_smiles' and 'inchi_key' columns
selected_columns = ['standardized_smiles', 'inchi_key']
df = df[selected_columns]

# Specify the path where you want to save the CSV file
output_folder = '/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred'
output_filename = 'standard_smiles_inchikey.csv'
output_path = f'{output_folder}/{output_filename}'

# Save the selected DataFrame to a CSV file
df.to_csv(output_path, index=False)


In [6]:
print(df.head())

                                 standardized_smiles  \
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...   
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl   
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1   
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1   
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12   

                     inchi_key  
0  ZBGXUVOIWDMMJE-JNGLTUCJSA-N  
1  CTETYYAZBPJBHE-UHFFFAOYSA-N  
2  OCAPBUJLXMYKEJ-UHFFFAOYSA-N  
3  YIBOMRUWOWDFLG-ONEGZZNKSA-N  
4  DOMXUEMWDBAQBQ-WEVVVXLNSA-N  


# Model Predictions



In [7]:
#  Extract SMILES to a list
standardized_smiles_list = df['standardized_smiles'].tolist()

In [8]:

# enter model name
model_name = "eos74bo"  # @param {type:"string"}

# Fetch the Model
import time

begin = time.time()
!ersilia fetch $model_name
end = time.time()

print("Time taken:", round((end - begin), 2), "seconds")


[34m⬇️  Fetching model eos74bo: ncats-solubility[0m
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
  Running command git clone -q https://github.com/ersilia-os/bentoml-ersilia.git /tmp/pip-req-build-v0nqn0vp
Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
Solving environment: - failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - \ | / done


  current version: 4.12.0
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /usr/local/envs/eosbase-bentoml-0.11.0-py37

  added / updated specs:
    - python=3.7


The followi

In [9]:
# Serve the model

!ersilia serve $model_name

sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
sudo: unknown user udockerusername
sudo: error initializing audit plugin sudoers_audit
[32m🚀 Serving model eos74bo: ncats-solubility[0m
[0m
[33m   URL: http://127.0.0.1:58461[0m
[33m   PID: 12925[0m
[33m   SRV: conda[0m
[0m
[34m👉 To run model:[0m
[34m   - run[0m
[0m
[34m💁 Information:[0m
[34m   - info[0m


In [10]:
# Run predictions

api = "predict"  # @param {type:"string"}

from ersilia import ErsiliaModel
import time

model = ErsiliaModel(model_name)
begin = time.time()
output = model.api(input=standardized_smiles_list, output="pandas")
end = time.time()

print("Successful 👍! Time taken:", round((end - begin), 2), "seconds")
model.close()


Successful 👍! Time taken: 15.34 seconds


In [11]:
# Check your results
print(output.head())

# Save my results in Google Drive

#output_path = f'{output_folder}/{model_name}'
output_path = "%s/%s_output.csv" % (output_folder, model_name)
output.to_csv(output_path, index=False)



                           key  \
0  ZBGXUVOIWDMMJE-JNGLTUCJSA-N   
1  CTETYYAZBPJBHE-UHFFFAOYSA-N   
2  OCAPBUJLXMYKEJ-UHFFFAOYSA-N   
3  YIBOMRUWOWDFLG-ONEGZZNKSA-N   
4  DOMXUEMWDBAQBQ-WEVVVXLNSA-N   

                                               input  outcome  
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...    0.997  
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl    1.000  
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1    0.996  
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1    1.000  
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12    0.996  


In [12]:
ersilia_pred= pd.read_csv(r"/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/eos74bo_output.csv")
ersilia_pred.head()

Unnamed: 0,key,input,outcome
0,ZBGXUVOIWDMMJE-JNGLTUCJSA-N,CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...,0.997
1,CTETYYAZBPJBHE-UHFFFAOYSA-N,Clc1cc(Cl)c(OCC#CI)cc1Cl,1.0
2,OCAPBUJLXMYKEJ-UHFFFAOYSA-N,c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1,0.996
3,YIBOMRUWOWDFLG-ONEGZZNKSA-N,Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1,1.0
4,DOMXUEMWDBAQBQ-WEVVVXLNSA-N,CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12,0.996


In [13]:
ersilia_pred.shape

(185, 3)

In [15]:
ersilia_prediction= pd.read_csv(r"/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/NPC_test.csv")

In [16]:
# Extract the 'outcome' column from the output DataFrame
outcome = output.iloc[:, 2]

# Assign the extracted 'outcome' column to the full_pred DataFrame
ersilia_prediction['prediction'] = outcome

# Print the resulting DataFrame
print(ersilia_prediction.head())

                                              smiles solubility (µg/mL)  \
0  CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...                 <1   
1                           Clc1cc(Cl)c(OCC#CI)cc1Cl                 <1   
2            c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1                 <1   
3    Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1                 <1   
4               CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12                 <1   

   solubility_original_class  prediction  
0                          1       0.997  
1                          1       1.000  
2                          1       0.996  
3                          1       1.000  
4                          1       0.996  


In [17]:
import numpy as np
# Define a threshold (e.g., 0.5 for binary classification)
threshold = 0.5

# Convert probabilities to class labels
ersilia_prediction['predicted_class'] = np.where(ersilia_prediction['prediction'] >= threshold, 1, 0)


In [18]:
ersilia_prediction.head()

Unnamed: 0,smiles,solubility (µg/mL),solubility_original_class,prediction,predicted_class
0,CCOC(=O)N[C@@H]1CC[C@@H]2[C@@H](C1)C[C@H]1C(=O...,<1,1,0.997,1
1,Clc1cc(Cl)c(OCC#CI)cc1Cl,<1,1,1.0,1
2,c1ccc(-c2ccc(C(c3ccccc3)n3ccnc3)cc2)cc1,<1,1,0.996,1
3,Cc1cc(/C=C/C#N)cc(C)c1Nc1ccnc(Nc2ccc(C#N)cc2)n1,<1,1,1.0,1
4,CN(C/C=C/C#CC(C)(C)C)Cc1cccc2ccccc12,<1,1,0.996,1




---





In [19]:
ersilia_prediction.to_csv(r"/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/ersilia_prediction.csv")

In [20]:
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, confusion_matrix, cohen_kappa_score

In [21]:
auc_roc = roc_auc_score(ersilia_prediction['solubility_original_class'], ersilia_prediction['predicted_class'])
bacc = balanced_accuracy_score(ersilia_prediction['solubility_original_class'], ersilia_prediction['predicted_class'])
tn, fp, fn, tp = confusion_matrix(ersilia_prediction['solubility_original_class'], ersilia_prediction['predicted_class']).ravel()

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
kappa = cohen_kappa_score(ersilia_prediction['solubility_original_class'], ersilia_prediction['predicted_class'])

In [22]:
# Create a DataFrame to store the metric results
metric_results = pd.DataFrame({
    'Metric': ['AUC-ROC', 'BACC', 'sensitivity', 'specificity',  'Cohen\'s Kappa'],
    'Score': [auc_roc, bacc, sensitivity ,specificity, kappa]
})

print(metric_results.T)

               0         1            2            3              4
Metric   AUC-ROC      BACC  sensitivity  specificity  Cohen's Kappa
Score   0.836467  0.836467     0.804878     0.868056       0.614017


In [None]:
GCNN 0.90 0.80 0.71 0.90 0.59

In [14]:
import numpy as np
df_pred= pd.read_csv(r"/content/drive/MyDrive/Ersilia_ModelValidation/ersilia_model_pred/eos74bo_output.csv")

# Define a threshold (e.g., 0.5 for binary classification)
threshold = 0.5

# Convert probabilities to class labels
df_pred['predicted_class'] = np.where(df_pred['outcome'] >= threshold, 1, 0)




In [15]:
df_pred.head()

Unnamed: 0,key,input,outcome,predicted_class
0,IQPNAANSBPBGFQ-UHFFFAOYSA-N,O=c1cc(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12,0.061,0
1,FVYXIJYOAGAUQK-UHFFFAOYSA-N,C=CCc1ccc(O)c(-c2ccc(O)c(CC=C)c2)c1,0.938,1
2,FEPMHVLSLDOMQC-IYPFLVAKSA-N,CC[C@H]1NC(=O)[C@@H](NC(=O)c2ncccc2O)[C@@H](C)...,0.108,0
3,VEPKQEUBKLEPRA-UHFFFAOYSA-N,O=c1ncn2nc(Sc3ccc(F)cc3F)ccc2c1-c1c(Cl)cccc1Cl,0.496,0
4,AIDVIFPYWYKRCE-UHFFFAOYSA-N,O=C(Cc1ccc(Cl)c(Cl)c1)Nc1ccc(S(=O)(=O)Nc2ccon2...,0.323,0


In [None]:
test_set=pd.read_excel