# Inference

This notebook showcases the different models available in fishbAIT and how to use them efficiently.

In [2]:
import pandas as pd
import numpy as np
from inference_utils.model_utils import load_fine_tuned_model
from inference_utils.pytorch_data_utils import PreProcessDataForInference, BuildInferenceDataLoaderAndDataset
from inference_utils import paths

['.git', 'data', 'development', 'final_model.svg', 'fishbAIT', 'LICENSE', 'README.md', 'tutorials']


In [3]:
# Load a fine-tuned model together with the tokenizer returned alongside it.
# NOTE(review): 'EC50' presumably selects which fine-tuned model variant is loaded — confirm against load_fine_tuned_model.
model, tokenizer = load_fine_tuned_model('EC50')

## Inference example using predefined endpoints, effects and durations

In [13]:
# Read the first example workbook; it already provides duration, effect and endpoint columns next to the SMILES.
example_1_path = '../data/Inference_example_1.xlsx'
data = pd.read_excel(example_1_path)

In [18]:
# Wrap the loaded frame in the preprocessing helper; the processed result is read back from `processor.dataframe`.
processor = PreProcessDataForInference(data)
# Presumably adds the `SMILES_Canonical_RDKit` column seen in the processed frame.
processor.GetCanonicalSMILES()
# Presumably adds the `OneHotEnc_concatenated` column. With only one endpoint and one effect listed, no one-hot
# encoding is produced and a constant 0 input is used instead (see the messages this call prints).
# NOTE(review): the loaded data also contains EC10 endpoints and MPH/GRO effects — confirm that listing only
# EC50/MOR is intended here.
# NOTE(review): `Duration_Value` looks like raw hours (48, 96, ...), not log10 hours — confirm what the model expects.
processor.GetOneHotEnc(list_of_endpoints=['EC50'], list_of_effects=['MOR'])
# Keep a handle on the processed frame for display.
processed_data = processor.dataframe

Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.
Did not return onehotencoding for Effect. Why? You specified only one Effect.
Will use input 0 to network due to no Onehotencodings being present.


In [19]:
# Display the processed frame to inspect the columns added during preprocessing.
processed_data

Unnamed: 0,SMILES,cmpdname,Duration_Value,effect,endpoint,SMILES_Canonical_RDKit,OneHotEnc_concatenated
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin,48.0,MPH,EC10,CC(=O)Oc1ccccc1C(=O)O,[0.0]
1,[Cr],Chromium,96.0,MOR,EC50,[Cr],[0.0]
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride,2400.0,GRO,EC10,CNCCC(Oc1ccc(C(F)(F)F)cc1)c1ccccc1.[Cl-].[H+],[0.0]
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane,96.0,MOR,EC50,Clc1ccc(C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl)cc1,[0.0]
4,[Cu],Copper,48.0,MOR,EC50,[Cu],[0.0]
...,...,...,...,...,...,...,...
995,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,Lead styphnate,168.0,MOR,EC10,O=[N+]([O-])c1cc([N+](=O)[O-])c([O-])c([N+](=O...,[0.0]
996,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,Tebuconazole,672.0,MPH,EC10,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn1cncn1,[0.0]
997,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,OpticalBrightenerBbu220,96.0,MOR,EC10,O=S(=O)([O-])c1ccc(Nc2nc(Nc3ccc(/C=C/c4ccc(Nc5...,[0.0]
998,CNC.OC(=O)COc1ccc(Cl)cc1Cl,"2,4-D dimethylamine salt",48.0,MOR,EC50,CNC.O=C(O)COc1ccc(Cl)cc1Cl,[0.0]


## Inference example using only a list of SMILES

In [4]:
# Read the second example workbook; it only holds SMILES strings and compound names.
example_2_path = '../data/Inference_example_2.xlsx'
data = pd.read_excel(example_2_path)
data

Unnamed: 0,SMILES,cmpdname
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin
1,[Cr],Chromium
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane
4,[Cu],Copper
...,...,...
995,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,Lead styphnate
996,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,Tebuconazole
997,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,OpticalBrightenerBbu220
998,CNC.OC(=O)COc1ccc(Cl)cc1Cl,"2,4-D dimethylamine salt"


Define the exposure duration for which the model should generate a prediction. The duration is given in hours and must be log10-transformed.

In [5]:
# The duration has to be supplied as log10 of the exposure time in hours; use 96 h for every row.
log10_duration_hours = np.log10(96)
data['exposure_duration'] = log10_duration_hours
data

Unnamed: 0,SMILES,cmpdname,exposure_duration
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin,1.982271
1,[Cr],Chromium,1.982271
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride,1.982271
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane,1.982271
4,[Cu],Copper,1.982271
...,...,...,...
995,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,Lead styphnate,1.982271
996,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,Tebuconazole,1.982271
997,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,OpticalBrightenerBbu220,1.982271
998,CNC.OC(=O)COc1ccc(Cl)cc1Cl,"2,4-D dimethylamine salt",1.982271


Preprocess the data to canonicalize the SMILES and generate the one-hot encoding.

In [28]:
# Wrap the frame (now including `exposure_duration`) in the preprocessing helper.
processor = PreProcessDataForInference(data)
# Presumably adds the `SMILES_Canonical_RDKit` column seen in the processed frame.
processor.GetCanonicalSMILES()
# Presumably adds the `OneHotEnc_concatenated` column. With only one endpoint and one effect listed, no one-hot
# encoding is produced and a constant 0 input is used instead (see the messages this call prints).
processor.GetOneHotEnc(list_of_endpoints=['EC50'], list_of_effects=['MOR'])
# Keep only the first 10 processed rows for the remainder of this example.
processed_data = processor.dataframe.iloc[:10]
processed_data

Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.
Did not return onehotencoding for Effect. Why? You specified only one Effect.
Will use input 0 to network due to no Onehotencodings being present.


Unnamed: 0,SMILES,cmpdname,exposure_duration,SMILES_Canonical_RDKit,OneHotEnc_concatenated
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin,1.982271,CC(=O)Oc1ccccc1C(=O)O,[0.0]
1,[Cr],Chromium,1.982271,[Cr],[0.0]
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride,1.982271,CNCCC(Oc1ccc(C(F)(F)F)cc1)c1ccccc1.[Cl-].[H+],[0.0]
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane,1.982271,Clc1ccc(C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl)cc1,[0.0]
4,[Cu],Copper,1.982271,[Cu],[0.0]
5,CCNc1nc(Cl)nc(NC(C)C)n1,Atrazine,1.982271,CCNc1nc(Cl)nc(NC(C)C)n1,[0.0]
6,CN(C)C1=NC(=O)N(C2CCCCC2)C(=O)N1C,Hexazinone,1.982271,CN(C)c1nc(=O)n(C2CCCCC2)c(=O)n1C,[0.0]
7,CC(Br)(CO)[N+]([O-])=O,Debropol,1.982271,CC(Br)(CO)[N+](=O)[O-],[0.0]
8,c1ccc2c(c1)c3cccc4cccc2c34,Fluoranthene,1.982271,c1ccc2c(c1)-c1cccc3cccc-2c13,[0.0]
9,[Cl-].[Cl-].[Zn++],ZINC chloride,1.982271,[Cl-].[Cl-].[Zn+2],[0.0]


Build a dataloader so that our model can run inference quickly.

In [29]:
# Build the inference dataset/dataloader wrapper from the processed frame and keep only its dataloader.
# `variables` names the processed-frame columns handed to the builder.
inference_builder = BuildInferenceDataLoaderAndDataset(
    processed_data,
    variables=['SMILES_Canonical_RDKit', 'exposure_duration', 'OneHotEnc_concatenated'],
    tokenizer=tokenizer,
)
loader = inference_builder.dataloader

Make predictions using our model

In [38]:
import torch

In [46]:
# Put the model in evaluation mode before predicting.
model.eval()
preds = []
# Gradients are not needed for inference, so disable autograd once around the whole loop
# instead of re-entering the context manager for every batch.
with torch.no_grad():
    for batch in loader:
        # Each batch exposes `.values()`; its entries are passed positionally to the model.
        batch_preds = model(*batch.values())
        # Move the result to host memory before converting: `.numpy()` raises on CUDA tensors,
        # and `.cpu()` is a no-op when the tensor is already on the CPU.
        preds.append(batch_preds.cpu().numpy())

# Join the per-batch outputs into a single array along the first axis.
preds = np.concatenate(preds, axis=0)
preds

array([ 0.5207307 ,  0.6819834 , -0.09552312, -0.8089556 , -0.47882724,
        0.5652273 ,  0.68885523,  0.3665309 , -0.7998388 ,  0.12358391],
      dtype=float32)