# Inference

This script showcases the different models available in TRIDENT and how to use them efficiently.

In [1]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm
sys.path.insert(1, '/cephyr/users/skall/Alvis/Ecotoxformer/inference_scripts_for_website_2/TRIDENT/tutorials/inference_utils/')
sys.path.insert(1, '/cephyr/users/skall/Alvis/Ecotoxformer/inference_scripts_for_website_2/TRIDENT/development/development_utils/')
from inference_utils.TRIDENT_for_inference import TRIDENT_for_inference

Specify the model version and load the model

In [2]:
pd.__version__

'1.1.4'

# For ZENODO

In [3]:
root = '/cephyr/users/skall/Alvis/TRIDENT/'

In [None]:

for MODEL_ENDPOINT in ['EC50', 'EC10', 'EC50EC10']:
    for SPECIES in tqdm(['fish','invertebrates','algae']):

        MODEL_VERSION = f'{MODEL_ENDPOINT}_{SPECIES}'
        print(f'Running predictions on: {MODEL_ENDPOINT}_{SPECIES}\n')
        print('\t Loading model...\n')
        trident = TRIDENT_for_inference(model_version=MODEL_VERSION, path_to_model_weights=root+'TRIDENT/')
        trident.load_fine_tuned_model()
        print('\t Loading data...\n')
        data = pd.read_excel(root+'data/Preprocessed_complete_data.xlsx', sheet_name=f'{MODEL_ENDPOINT}_{SPECIES}')
        data['species_group'] = data.species_group.replace('crustaceans', 'invertebrates')
        
        print('\t Predicting...\n')
        results = trident.predict_toxicity(SMILES = data['SMILES'].tolist(), exposure_duration=data['Duration_Value'].tolist(), endpoint=data['endpoint'].tolist(), effect=data['effect'], return_cls_embeddings=False)

        results['exposure_duration'] = 10**results.exposure_duration

        data = data.sort_values(by=['SMILES','Duration_Value','effect','endpoint'])
        results = results.sort_values(by=['SMILES','exposure_duration','effect','endpoint'])

        data = pd.concat([data,results[['predictions log10(mg/L)','predictions (mg/L)']]], axis=1)

        data = data.sort_index()
        print(f'Saving...\n')
        data.to_csv(root+f'data/Preprocessed_complete_data_pred_{MODEL_ENDPOINT}_{SPECIES}.csv')

# For website

In [4]:
species_groups = ['fish', 'algae', 'invertebrates']
models = ['EC50EC10', 'EC50', 'EC10']
effectordering = {
            'EC50_algae': {'POP':'POP'},
            'EC10_algae': {'POP':'POP'},
            'EC50EC10_algae': {'POP':'POP'}, 
            'EC50_invertebrates': {'MOR':'MOR','ITX':'ITX'},
            'EC10_invertebrates': {'MOR':'MOR','DVP':'DVP','ITX':'ITX', 'REP': 'REP', 'MPH': 'MPH', 'POP': 'POP'} ,
            'EC50EC10_invertebrates': {'MOR':'MOR','DVP':'DVP','ITX':'ITX', 'REP': 'REP', 'MPH': 'MPH', 'POP': 'POP'} ,
            'EC50_fish': {'MOR':'MOR'},
            'EC10_fish': {'MOR':'MOR','DVP':'DVP','ITX':'ITX', 'REP': 'REP', 'MPH': 'MPH', 'POP': 'POP','GRO': 'GRO'} ,
            'EC50EC10_fish': {'MOR':'MOR','DVP':'DVP','ITX':'ITX', 'REP': 'REP', 'MPH': 'MPH', 'POP': 'POP','GRO': 'GRO'} 
            }

endpointordering = {
            'EC50_algae': {'EC50':'EC50'},
            'EC10_algae': {'EC10':'EC10'},
            'EC50EC10_algae': {'EC50':'EC50', 'EC10': 'EC10'}, 
            'EC50_invertebrates': {'EC50':'EC50'},
            'EC10_invertebrates': {'EC10':'EC10'},
            'EC50EC10_invertebrates': {'EC50':'EC50', 'EC10': 'EC10'},
            'EC50_fish': {'EC50':'EC50'},
            'EC10_fish': {'EC10':'EC10'},
            'EC50EC10_fish': {'EC50':'EC50', 'EC10': 'EC10'} 
            }

default_durations = {
    'algae': 72,
    'fish': 96,
    'invertebrates': 48
}

In [5]:
raw_data = pd.read_excel(root+'data/development/Preprocessed_complete_data.xlsx', sheet_name='dataset')

raw_data['species_group'] = raw_data.species_group.replace('crustaceans', 'invertebrates')

In [None]:
SMILES_COLUMN_NAME = 'SMILES_Canonical_RDKit'
for SPECIES in tqdm(species_groups):
    for model in models:
        cls_dict = {}
        MODEL_VERSION = f'{model}_{SPECIES}'
        EXPOSURE_DURATION = default_durations[SPECIES]
        PREDICTION_ENDPOINT = list(endpointordering[MODEL_VERSION].keys())[0]
        PREDICTION_EFFECT = list(effectordering[MODEL_VERSION].keys())[0]
        trident = TRIDENT_for_inference(model_version=MODEL_VERSION, path_to_model_weights=root+'TRIDENT/')
        trident.load_fine_tuned_model()
        data = raw_data.copy()
        
        data = data.drop_duplicates(subset=['SMILES_Canonical_RDKit']).dropna(subset=['SMILES_Canonical_RDKit'])

        results = trident.predict_toxicity(SMILES = data[SMILES_COLUMN_NAME].tolist(), exposure_duration=EXPOSURE_DURATION, endpoint=[PREDICTION_ENDPOINT]*len(data), effect=[PREDICTION_EFFECT]*len(data), return_cls_embeddings=True)
        results.reset_index(drop=True, inplace=True)
        results['CLS_embeddings'] = results['CLS_embeddings'].apply(lambda x: np.asarray(x, dtype=np.float32))
        results = results[['SMILES_Canonical_RDKit','CLS_embeddings']]        
        results.to_pickle(root+f'data/tutorials/predictions/{MODEL_VERSION}_CLS_embeddings.pkl.zip', compression='zip')



  0%|          | 0/3 [00:00<?, ?it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Renamed NOEC *EC10* in 0 positions



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:01<00:26,  1.07s/it][A
  8%|▊         | 2/26 [00:01<00:19,  1.26it/s][A
 12%|█▏        | 3/26 [00:02<00:16,  1.40it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.47it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.50it/s][A
 23%|██▎       | 6/26 [00:04<00:13,  1.53it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.57it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.58it/s][A
 35%|███▍      | 9/26 [00:06<00:10,  1.59it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.59it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.59it/s][A
 46%|████▌     | 12/26 [00:07<00:08,  1.60it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.60it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.60it/s][A
 58%|█████▊    | 15/26 [00:09<00:06,  1.59it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.59it/s][A
 65%|██████▌   | 17/26 [00:11<00:05,  1.59it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.58it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.58it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.
Did not return onehotencoding for Effect. Why? You specified only one Effect.
Will use input 0 to network due to no Onehotencodings being present.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.41it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.54it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.57it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.57it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.58it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.58it/s][A
 27%|██▋       | 7/26 [00:04<00:11,  1.60it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.59it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.59it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.59it/s][A
 42%|████▏     | 11/26 [00:06<00:09,  1.58it/s][A
 46%|████▌     | 12/26 [00:07<00:08,  1.59it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.58it/s][A
 54%|█████▍    | 14/26 [00:08<00:07,  1.58it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.57it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.57it/s][A
 65%|██████▌   | 17/26 [00:10<00:05,  1.57it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.57it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.57it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Renamed NOEC *EC10* in 0 positions
Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.41it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.55it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.55it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.56it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.57it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.57it/s][A
 27%|██▋       | 7/26 [00:04<00:11,  1.60it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.59it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.58it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.58it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.57it/s][A
 46%|████▌     | 12/26 [00:07<00:08,  1.57it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.57it/s][A
 54%|█████▍    | 14/26 [00:08<00:07,  1.57it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.57it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.56it/s][A
 65%|██████▌   | 17/26 [00:10<00:05,  1.56it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.56it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.56it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Renamed NOEC *EC10* in 0 positions
Did not return onehotencoding for Effect. Why? You specified only one Effect.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.41it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.52it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.54it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.55it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.54it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.54it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.57it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.56it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.56it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.55it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.55it/s][A
 46%|████▌     | 12/26 [00:07<00:08,  1.56it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.55it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.56it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.55it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.55it/s][A
 65%|██████▌   | 17/26 [00:10<00:05,  1.55it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.55it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.56it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.
Did not return onehotencoding for Effect. Why? You specified only one Effect.
Will use input 0 to network due to no Onehotencodings being present.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.41it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.53it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.55it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.55it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.55it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.55it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.57it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.56it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.56it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.56it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.55it/s][A
 46%|████▌     | 12/26 [00:07<00:08,  1.56it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.55it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.56it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.55it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.51it/s][A
 65%|██████▌   | 17/26 [00:11<00:05,  1.52it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.53it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.53it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Renamed NOEC *EC10* in 0 positions
Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.
Did not return onehotencoding for Effect. Why? You specified only one Effect.
Will use input 0 to network due to no Onehotencodings being present.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.40it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.53it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.54it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.54it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.55it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.55it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.57it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.56it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.56it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.56it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.55it/s][A
 46%|████▌     | 12/26 [00:07<00:09,  1.55it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.55it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.55it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.54it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.54it/s][A
 65%|██████▌   | 17/26 [00:11<00:05,  1.54it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.54it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.54it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Renamed NOEC *EC10* in 0 positions



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.41it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.53it/s][A
 12%|█▏        | 3/26 [00:01<00:14,  1.54it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.54it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.54it/s][A
 23%|██▎       | 6/26 [00:03<00:12,  1.54it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.57it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.56it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.56it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.56it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.55it/s][A
 46%|████▌     | 12/26 [00:07<00:09,  1.55it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.55it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.55it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.54it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.54it/s][A
 65%|██████▌   | 17/26 [00:11<00:05,  1.53it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.53it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.53it/s]

Downloading config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/145k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/99.0k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Did not return onehotencoding for Endpoint. Why? You specified only one Endpoint or you specified NOEC and EC10 which are coded to be the same endpoint.



  0%|          | 0/26 [00:00<?, ?it/s][A
  4%|▍         | 1/26 [00:00<00:17,  1.40it/s][A
  8%|▊         | 2/26 [00:01<00:15,  1.51it/s][A
 12%|█▏        | 3/26 [00:01<00:15,  1.53it/s][A
 15%|█▌        | 4/26 [00:02<00:14,  1.53it/s][A
 19%|█▉        | 5/26 [00:03<00:13,  1.53it/s][A
 23%|██▎       | 6/26 [00:03<00:13,  1.52it/s][A
 27%|██▋       | 7/26 [00:04<00:12,  1.54it/s][A
 31%|███       | 8/26 [00:05<00:11,  1.54it/s][A
 35%|███▍      | 9/26 [00:05<00:10,  1.55it/s][A
 38%|███▊      | 10/26 [00:06<00:10,  1.55it/s][A
 42%|████▏     | 11/26 [00:07<00:09,  1.54it/s][A
 46%|████▌     | 12/26 [00:07<00:09,  1.54it/s][A
 50%|█████     | 13/26 [00:08<00:08,  1.54it/s][A
 54%|█████▍    | 14/26 [00:09<00:07,  1.54it/s][A
 58%|█████▊    | 15/26 [00:09<00:07,  1.54it/s][A
 62%|██████▏   | 16/26 [00:10<00:06,  1.54it/s][A
 65%|██████▌   | 17/26 [00:11<00:05,  1.53it/s][A
 69%|██████▉   | 18/26 [00:11<00:05,  1.53it/s][A
 73%|███████▎  | 19/26 [00:12<00:04,  1.53it/s]

In [None]:
SMILES_COLUMN_NAME = 'SMILES_Canonical_RDKit'

combined_results = raw_data.drop_duplicates(subset=['SMILES_Canonical_RDKit']).dropna(subset=['SMILES_Canonical_RDKit'])[[SMILES_COLUMN_NAME]]

for SPECIES in tqdm(species_groups):
    
    for model in models:
        MODEL_VERSION = f'{model}_{SPECIES}'
        trident = TRIDENT_for_inference(model_version=MODEL_VERSION, path_to_model_weights=root+'TRIDENT/', device='cuda')
        trident.load_fine_tuned_model()
        data = raw_data.copy()
        
        for PREDICTION_ENDPOINT in endpointordering[MODEL_VERSION]:
            for PREDICTION_EFFECT in effectordering[MODEL_VERSION]:
                print(MODEL_VERSION, PREDICTION_ENDPOINT, PREDICTION_EFFECT)
                try:
                    EXPOSURE_DURATION = data[(data.species_group==SPECIES) & (data.endpoint==PREDICTION_ENDPOINT) & (data.effect==PREDICTION_EFFECT)].Duration_Value.value_counts().index[0]
                except:
                    EXPOSURE_DURATION = default_durations[SPECIES]
                data = combined_results[[SMILES_COLUMN_NAME]]

                results = trident.predict_toxicity(SMILES = data[SMILES_COLUMN_NAME].tolist(), exposure_duration=EXPOSURE_DURATION, endpoint=[PREDICTION_ENDPOINT]*len(data), effect=[PREDICTION_EFFECT]*len(data), return_cls_embeddings=True)
                results.reset_index(drop=True, inplace=True)
                results['exposure_duration'] = EXPOSURE_DURATION
                
                for column in results.columns:
                    results.rename(columns={column: f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} {column}'}, inplace=True)
                
                combined_results[f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} predictions log10(mg/L)'] = results[f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} predictions log10(mg/L)'].tolist()
                combined_results[f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} exposure_duration'] = results[f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} exposure_duration'].tolist()

In [None]:
for col in combined_results.columns:
    if 'exposure_duration' in col:
        combined_results[col] = combined_results[col].astype(np.float32)

In [None]:
combined_results.to_pickle(root+f'data/tutorials/predictions/combined_predictions.pkl.zip', compression='zip')

In [None]:
combined_results

# Get training data matches
Run locally, not on server

In [21]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm
sys.path.insert(1, 'C://Users/Styrbjörn Käll/Documents/Chalmers/TRIDENT/tutorials/')

from figures.figure_utils.preprocess_data import Preprocess10x10Fold, GroupDataForPerformance
from development_utils.preprocessing.Get_data_for_model import PreprocessData
from tqdm.notebook import tqdm
from inference_utils.pytorch_data_utils import check_training_data, check_training_data_from_scratch

In [25]:
training_data = pd.read_excel('../data/development/Preprocessed_complete_data.xlsx', sheet_name='dataset')
training_data['species_group'] = training_data.species_group.replace('crustaceans', 'invertebrates')
all_preds = pd.read_pickle(f'../data/tutorials/predictions/combined_predictions.pkl.zip', compression='zip')

In [26]:
training_data.species_group.isna()


0         False
1         False
2         False
3         False
4         False
          ...  
147453    False
147454    False
147455    False
147456    False
147457    False
Name: species_group, Length: 147458, dtype: bool

In [27]:
training_data.species_group.unique()

array(['fish', 'invertebrates', 'algae'], dtype=object)

In [36]:
for model in models:
    for species in tqdm(species_groups):
        MODELTYPE = f'{model}_{species}'
        
        for endpoint in endpointordering[MODELTYPE]:
            for effect in effectordering[MODELTYPE]:
                if species == 'invertebrates':
                    all_preds = check_training_data_from_scratch(all_preds, model, 'crustaceans', endpoint, effect, path_to_data = '../data/Preprocessed_complete_data.xlsx')
                else:
                    all_preds = check_training_data_from_scratch(all_preds, model, species, endpoint, effect, path_to_data = '../data/Preprocessed_complete_data.xlsx')

                for col in all_preds.columns:
                    if ((col == 'species match') | (col == 'endpoint match') | (col == 'effect match')):
                        all_preds.rename(columns={col: f'{model}_{species}_{endpoint}_{effect} {col}'}, inplace=True)

  0%|          | 0/3 [00:00<?, ?it/s]

  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matche

  0%|          | 0/3 [00:00<?, ?it/s]

  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches


  0%|          | 0/3 [00:00<?, ?it/s]

  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matches
  df['effect match'] = effect_matches
  df['endpoint match'] = endpoint_matche

In [37]:
SMILES_COLUMN_NAME = 'SMILES_Canonical_RDKit'

for SPECIES in tqdm(species_groups):
    
    for model in models:
        MODEL_VERSION = f'{model}_{SPECIES}'
        for PREDICTION_ENDPOINT in endpointordering[MODEL_VERSION]:
            for PREDICTION_EFFECT in effectordering[MODEL_VERSION]:
                print(MODEL_VERSION, PREDICTION_ENDPOINT, PREDICTION_EFFECT, '\n')
                
                print(all_preds[f'{MODEL_VERSION}_{PREDICTION_ENDPOINT}_{PREDICTION_EFFECT} endpoint match'].sum(), '\n')


  0%|          | 0/3 [00:00<?, ?it/s]

EC50EC10_fish EC50 MOR 

EC50EC10_fish_EC50_MOR endpoint match    3542
EC50EC10_fish_EC50_MOR endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 DVP 

EC50EC10_fish_EC50_DVP endpoint match    3542
EC50EC10_fish_EC50_DVP endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 ITX 

EC50EC10_fish_EC50_ITX endpoint match    3542
EC50EC10_fish_EC50_ITX endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 REP 

EC50EC10_fish_EC50_REP endpoint match    3542
EC50EC10_fish_EC50_REP endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 MPH 

EC50EC10_fish_EC50_MPH endpoint match    3542
EC50EC10_fish_EC50_MPH endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 POP 

EC50EC10_fish_EC50_POP endpoint match    3542
EC50EC10_fish_EC50_POP endpoint match    3542
dtype: int64 

EC50EC10_fish EC50 GRO 

EC50EC10_fish_EC50_GRO endpoint match    3542
EC50EC10_fish_EC50_GRO endpoint match    3542
dtype: int64 

EC50EC10_fish EC10 MOR 

EC50EC10_fish_EC10_MOR endpoint match    2321
EC50E

In [38]:
all_preds

Unnamed: 0,SMILES_Canonical_RDKit,EC50EC10_fish_EC50_MOR predictions log10(mg/L),EC50EC10_fish_EC50_MOR exposure_duration,EC50EC10_fish_EC50_DVP predictions log10(mg/L),EC50EC10_fish_EC50_DVP exposure_duration,EC50EC10_fish_EC50_ITX predictions log10(mg/L),EC50EC10_fish_EC50_ITX exposure_duration,EC50EC10_fish_EC50_REP predictions log10(mg/L),EC50EC10_fish_EC50_REP exposure_duration,EC50EC10_fish_EC50_MPH predictions log10(mg/L),...,EC10_invertebrates_EC10_DVP endpoint match,EC10_invertebrates_EC10_DVP effect match,EC10_invertebrates_EC10_ITX endpoint match,EC10_invertebrates_EC10_ITX effect match,EC10_invertebrates_EC10_REP endpoint match,EC10_invertebrates_EC10_REP effect match,EC10_invertebrates_EC10_MPH endpoint match,EC10_invertebrates_EC10_MPH effect match,EC10_invertebrates_EC10_POP endpoint match,EC10_invertebrates_EC10_POP effect match
0,O=[N+]([O-])c1ccc(Cl)cc1,1.172338,96.0,1.058945,96.0,0.979804,96.0,0.817641,96.0,0.761875,...,1,0,1,1,1,1,1,0,1,0
34,Nc1ccc([N+](=O)[O-])cc1,1.860993,96.0,1.756102,96.0,1.623828,96.0,1.559996,96.0,1.465644,...,1,0,1,1,1,0,1,0,1,0
63,O=[N+]([O-])c1ccc(O)cc1,1.350671,96.0,1.287464,96.0,1.167368,96.0,1.168845,96.0,1.143638,...,1,0,1,1,1,1,1,0,1,0
403,CN(C)c1ccc(C=O)cc1,1.750033,96.0,1.598159,96.0,1.482621,96.0,1.293138,96.0,1.157702,...,0,0,0,0,0,0,0,0,0,0
404,O=[N+]([O-])c1ccc([N+](=O)[O-])cc1,-0.168318,96.0,-0.202566,96.0,-0.185144,96.0,-0.237345,96.0,-0.251191,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147453,CCC(C(=O)O)c1ccc(N2C(=O)c3ccccc3C2=O)cc1,0.696419,96.0,0.495500,96.0,0.451064,96.0,-0.008681,96.0,-0.244265,...,0,0,0,0,0,0,0,0,0,0
147454,NC(=O)NC1NC(=O)NC1=O,2.094811,96.0,2.053785,96.0,1.882532,96.0,1.951870,96.0,1.902873,...,0,0,0,0,0,0,0,0,0,0
147455,S=C(SSSSSSC(=S)N1CCCCC1)N1CCCCC1,0.741298,96.0,0.601649,96.0,0.545263,96.0,0.294324,96.0,0.128343,...,0,0,0,0,0,0,0,0,0,0
147456,CC1CCC(C(C)C)CC1,0.768781,96.0,0.717363,96.0,0.643591,96.0,0.646532,96.0,0.607147,...,0,0,0,0,0,0,0,0,0,0


In [39]:
all_preds.to_pickle(f'../data/tutorials/predictions/combined_predictions_and_training_data.pkl.zip', compression='zip')

# Add errors from 10x10 CV

In [40]:
import pandas as pd
import torch
import pandas as pd
import numpy as np
import sys
from tqdm.notebook import tqdm
from figures.figure_utils.preprocess_data import Preprocess10x10Fold, GroupDataForPerformance
from development_utils.preprocessing.Get_data_for_model import PreprocessData
from tqdm.notebook import tqdm

In [41]:
import pandas as pd
training_data = pd.read_pickle('../data/tutorials/predictions/combined_predictions_and_training_data.pkl.zip', compression='zip')
training_data['SMILES'] = training_data['SMILES_Canonical_RDKit'].copy()
training_data = PreprocessData(training_data).GetCanonicalSMILES()

In [42]:
def match(x, error_dict):
    try:
        return error_dict[x]
    except:
        return None

In [43]:
for species_group in ['fish', 'invertebrates','algae']:
    for model in tqdm(['EC50','EC10','EC50EC10']):
        if model != 'EC50EC10':
            cvpreds = Preprocess10x10Fold(name=f'{model}_{species_group}', uselogdata=True, full_filepath=f'../data/results/{model}_{species_group}_predictions_100x_CV_RDkit.pkl.zip')
        else:
            cvpreds = Preprocess10x10Fold(name=f'{model}_{species_group}', uselogdata=True, full_filepath=f'../data/results/{model}_{species_group}_withoverlap_predictions_100x_CV_RDkit.pkl.zip')
        
        wavgcv = GroupDataForPerformance(cvpreds)
        wavgcv['SMILES'] = wavgcv['Canonical_SMILES_figures'].copy()
        wavgcv = PreprocessData(wavgcv).GetCanonicalSMILES()
        error_dict = dict(zip(wavgcv.SMILES_Canonical_RDKit.tolist(), wavgcv.L1error.tolist()))

        training_data[f'{model}_{species_group} L1Error'] = training_data.SMILES_Canonical_RDKit.apply(lambda x: match(x, error_dict))

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [44]:
training_data.to_pickle('../data/tutorials/predictions/combined_predictions_and_errors.pkl.zip', compression='zip')

In [45]:
training_data

Unnamed: 0,SMILES_Canonical_RDKit,EC50EC10_fish_EC50_MOR predictions log10(mg/L),EC50EC10_fish_EC50_MOR exposure_duration,EC50EC10_fish_EC50_DVP predictions log10(mg/L),EC50EC10_fish_EC50_DVP exposure_duration,EC50EC10_fish_EC50_ITX predictions log10(mg/L),EC50EC10_fish_EC50_ITX exposure_duration,EC50EC10_fish_EC50_REP predictions log10(mg/L),EC50EC10_fish_EC50_REP exposure_duration,EC50EC10_fish_EC50_MPH predictions log10(mg/L),...,SMILES,EC50_fish L1Error,EC10_fish L1Error,EC50EC10_fish L1Error,EC50_invertebrates L1Error,EC10_invertebrates L1Error,EC50EC10_invertebrates L1Error,EC50_algae L1Error,EC10_algae L1Error,EC50EC10_algae L1Error
0,O=[N+]([O-])c1ccc(Cl)cc1,1.172338,96.0,1.058945,96.0,0.979804,96.0,0.817641,96.0,0.761875,...,O=[N+]([O-])c1ccc(Cl)cc1,0.010166,0.373479,0.125743,0.065393,0.402708,0.304046,0.698894,,0.677930
34,Nc1ccc([N+](=O)[O-])cc1,1.860993,96.0,1.756102,96.0,1.623828,96.0,1.559996,96.0,1.465644,...,Nc1ccc([N+](=O)[O-])cc1,0.241841,0.292932,0.161388,0.800447,0.360655,0.427892,0.575160,0.567851,0.544045
63,O=[N+]([O-])c1ccc(O)cc1,1.350671,96.0,1.287464,96.0,1.167368,96.0,1.168845,96.0,1.143638,...,O=[N+]([O-])c1ccc(O)cc1,0.436332,0.010344,0.431412,0.184236,0.304895,0.271843,0.560873,0.522284,0.759680
403,CN(C)c1ccc(C=O)cc1,1.750033,96.0,1.598159,96.0,1.482621,96.0,1.293138,96.0,1.157702,...,CN(C)c1ccc(C=O)cc1,0.245164,,0.240691,,,,,,
404,O=[N+]([O-])c1ccc([N+](=O)[O-])cc1,-0.168318,96.0,-0.202566,96.0,-0.185144,96.0,-0.237345,96.0,-0.251191,...,O=[N+]([O-])c1ccc([N+](=O)[O-])cc1,1.076959,,1.117791,3.271399,,3.397170,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147453,CCC(C(=O)O)c1ccc(N2C(=O)c3ccccc3C2=O)cc1,0.696419,96.0,0.495500,96.0,0.451064,96.0,-0.008681,96.0,-0.244265,...,CCC(C(=O)O)c1ccc(N2C(=O)c3ccccc3C2=O)cc1,,,,,,,,0.785128,1.308006
147454,NC(=O)NC1NC(=O)NC1=O,2.094811,96.0,2.053785,96.0,1.882532,96.0,1.951870,96.0,1.902873,...,NC(=O)NC1NC(=O)NC1=O,,,,,,,,0.114482,0.316235
147455,S=C(SSSSSSC(=S)N1CCCCC1)N1CCCCC1,0.741298,96.0,0.601649,96.0,0.545263,96.0,0.294324,96.0,0.128343,...,S=C(SSSSSSC(=S)N1CCCCC1)N1CCCCC1,,,,,,,,2.185539,2.263958
147456,CC1CCC(C(C)C)CC1,0.768781,96.0,0.717363,96.0,0.643591,96.0,0.646532,96.0,0.607147,...,CC1CCC(C(C)C)CC1,,,,,,,,0.179110,0.241676


# Errors for Zenodo

In [None]:
training_data.to_excel('../data/tutorials/predictions/Weighted_avg_error_per_SMILES.xlsx')