# Inference

This notebook showcases the different models available in fishbAIT and how to use them efficiently.

In [7]:
import pandas as pd
import numpy as np
from inference_utils.model_utils import *
from inference_utils.pytorch_data_utils import *

In [None]:
# Version identifier handed to `from_pretrained`; kept in one place so it is easy to change.
MODEL_VERSION = 'EC50'

# Instantiate fishbAIT and load the pretrained weights for the chosen version.
# NOTE(review): `fishbAIT` presumably comes from the star import of
# inference_utils.model_utils - confirm. This cell has no recorded execution
# count, so re-run it on a fresh kernel before relying on `model`.
model = fishbAIT().from_pretrained(version=MODEL_VERSION)

## Inference example using predefined endpoints, effects and durations

In [2]:
# Spreadsheet for the first example; the path is relative to the notebook's working directory.
example_1_path = '../data/Inference_example_1.xlsx'
data = pd.read_excel(example_1_path)

In [4]:
# Endpoints and effects passed to the one-hot encoding step.
endpoints_of_interest = ['EC50', 'EC10']
effects_of_interest = ['MOR']

# Wrap the raw frame in the preprocessing helper.
processor = PreProcessDataForInference(data)

# The displayed result contains a `SMILES_Canonical_RDKit` column after this step.
processor.GetCanonicalSMILES()

# The displayed result contains `OneHotEnc_endpoint` and `OneHotEnc_concatenated`
# columns after this step. With a single effect the helper reports that it does not
# return a one-hot encoding for Effect (see the printed message).
# NOTE(review): rows with effects other than 'MOR' still appear in the displayed
# frame - confirm whether `list_of_effects` is expected to filter rows.
processor.GetOneHotEnc(
    list_of_endpoints=endpoints_of_interest,
    list_of_effects=effects_of_interest,
)

# Pull the processed frame back out of the helper.
processed_data = processor.dataframe

Renamed NOEC *EC10* in 0 positions
Did not return onehotencoding for Effect. Why? You specified only one Effect.


In [5]:
# Display the processed frame (pandas truncates the middle rows in the rendered output).
processed_data

Unnamed: 0,CAS,organism,Conc_sign,DOI,Duration_Value,effect,endpoint,mgperL,species_group,SMILES,Pubchem_CID,cmpdname,xlogp,Lineage,SMILES_Canonical_RDKit,OneHotEnc_endpoint,OneHotEnc_concatenated
0,50782,danio rerio,=,,48.0,MPH,EC10,9.09807,fish,CC(=O)Oc1ccccc1C(O)=O,2244,Aspirin,1.2,"['root', 'cellular organisms', 'Eukaryota', 'O...",CC(=O)Oc1ccccc1C(=O)O,"[0, 1]","[0, 1]"
1,7440473,oreochromis mossambicus,=,,96.0,MOR,EC50,179.00000,fish,[Cr],23976,Chromium,,"['root', 'cellular organisms', 'Eukaryota', 'O...",[Cr],"[1, 0]","[1, 0]"
2,56296787,gambusia affinis,=,,2400.0,GRO,EC10,0.07100,fish,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,62857,Fluoxetine hydrochloride,,"['root', 'cellular organisms', 'Eukaryota', 'O...",CNCCC(Oc1ccc(C(F)(F)F)cc1)c1ccccc1.[Cl-].[H+],"[0, 1]","[0, 1]"
3,50293,fundulus majalis,=,,96.0,MOR,EC50,0.00100,fish,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,3036,Clofenotane,6.9,"['root', 'cellular organisms', 'Eukaryota', 'O...",Clc1ccc(C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl)cc1,"[1, 0]","[1, 0]"
4,7440508,,=,,48.0,MOR,EC50,0.84000,fish,[Cu],23978,Copper,,['root'],[Cu],"[1, 0]","[1, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,15245440,pimephales promelas,=,,168.0,MOR,EC10,1.33770,fish,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,61789,Lead styphnate,,"['root', 'cellular organisms', 'Eukaryota', 'O...",O=[N+]([O-])c1cc([N+](=O)[O-])c([O-])c([N+](=O...,"[0, 1]","[0, 1]"
996,107534963,danio rerio,=,,672.0,MPH,EC10,0.18000,fish,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,86102,Tebuconazole,3.7,"['root', 'cellular organisms', 'Eukaryota', 'O...",CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn1cncn1,"[0, 1]","[0, 1]"
997,16470249,danio rerio,=,,96.0,MOR,EC10,27.00000,fish,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,6435854,OpticalBrightenerBbu220,,"['root', 'cellular organisms', 'Eukaryota', 'O...",O=S(=O)([O-])c1ccc(Nc2nc(Nc3ccc(/C=C/c4ccc(Nc5...,"[0, 1]","[0, 1]"
998,2008391,micropterus salmoides,=,,48.0,MOR,EC50,350.00000,fish,CNC.OC(=O)COc1ccc(Cl)cc1Cl,16180,"2,4-D dimethylamine salt",,"['root', 'cellular organisms', 'Eukaryota', 'O...",CNC.O=C(O)COc1ccc(Cl)cc1Cl,"[1, 0]","[1, 0]"


## Inference example using only a list of SMILES

In [8]:
# Spreadsheet for the second example: per the displayed output it holds only
# SMILES strings and compound names.
example_2_path = '../data/Inference_example_2.xlsx'
data = pd.read_excel(example_2_path)
data

Unnamed: 0,SMILES,cmpdname
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin
1,[Cr],Chromium
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane
4,[Cu],Copper
...,...,...
995,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,Lead styphnate
996,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,Tebuconazole
997,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,OpticalBrightenerBbu220
998,CNC.OC(=O)COc1ccc(Cl)cc1Cl,"2,4-D dimethylamine salt"


Define the duration for which the model should generate a prediction. The value must be given as the log10 of the duration in hours (here 96 h).

In [10]:
# Duration (in hours) for which predictions are requested.
duration_hours = 96

# The duration has to be supplied as log10(hours); the same value is
# broadcast to every row of the frame.
# NOTE(review): the first example's displayed frame shows Duration_Value in
# raw hours - confirm where the log10 transform is applied in that workflow.
data['Duration_Value'] = np.log10(duration_hours)
data

Unnamed: 0,SMILES,cmpdname,Duration_Value
0,CC(=O)Oc1ccccc1C(O)=O,Aspirin,1.982271
1,[Cr],Chromium,1.982271
2,[H+].[Cl-].CNCCC(Oc1ccc(cc1)C(F)(F)F)c2ccccc2,Fluoxetine hydrochloride,1.982271
3,Clc1ccc(cc1)C(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,Clofenotane,1.982271
4,[Cu],Copper,1.982271
...,...,...,...
995,[Pb++].[O-]c1c(cc(c([O-])c1[N+]([O-])=O)[N+]([...,Lead styphnate,1.982271
996,CC(C)(C)C(O)(CCc1ccc(Cl)cc1)Cn2cncn2,Tebuconazole,1.982271
997,[Na+].[Na+].[Na+].[Na+].OCCN(CCO)c1nc(Nc2ccc(c...,OpticalBrightenerBbu220,1.982271
998,CNC.OC(=O)COc1ccc(Cl)cc1Cl,"2,4-D dimethylamine salt",1.982271
