# Install Library

[RDKit ](https://github.com/rdkit/rdkit)

[DGL](https://github.com/dmlc/dgl/)

[DGL-LifeSci](https://github.com/awslabs/dgl-lifesci)





# Import Library

In [3]:
import os

import dgl
import sys
import torch
import random
import cv2
import statistics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.optim as optim

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit import DataStructs

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import  History
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, AttentiveFPAtomFeaturizer
from sklearn.model_selection import train_test_split

from Utils.general import DATASET, get_dataset, separate_active_and_inactive_data, get_embedding_vector_class, count_lablel,data_generator, up_and_down_Samplenig
from Utils.gcnpretrained import get_sider_model
from Utils.specialfunctions import is_Membership

from Models.heterogeneous_siamese_sider import siamese_model_attentiveFp_sider

device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')




# Data

In [4]:
cache_path_tox21='./tox21_dglgraph.bin'

df_tox21 = get_dataset("tox21")
ids = df_tox21['mol_id']

df_tox21 = df_tox21.drop(columns=['mol_id'])

In [5]:
cache_path_sider='./sider_dglgraph.bin'

df = get_dataset("sider")



Extracting file to C:\Users\stdso\.dgl/sider


In [6]:
tox21_tasks = df_tox21.columns.values[:12].tolist()
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [7]:
tox21_smiles = np.array(df_tox21['smiles'])
sider_smiles = np.array(df['smiles'])

In [8]:
subscriber = []
for ts in tox21_smiles:
    for ss in sider_smiles:
        if ts == ss:
            subscriber.append(ts)

In [9]:
subscriber

['CC(O)(P(=O)(O)O)P(=O)(O)O',
 'C[N+](C)(C)CC(=O)[O-]',
 'C[N+](C)(C)CCO',
 'CC(=O)NO',
 'CC(=O)OCC[N+](C)(C)C',
 'CC(=O)[O-].[Na+]',
 'CCCC(CCC)C(=O)O',
 'Cl[Zn]Cl',
 'CN(CCCl)CCCl',
 'C[N+](C)(C)CCOC(=O)CCC(=O)OCC[N+](C)(C)C',
 'C1N2CN3CN1CN(C2)C3',
 'CCN(CC)C(=S)SSC(=S)N(CC)CC']

# Required functions

In [10]:
def create_dataset_with_gcn_case_study(dataset, class_embed_vector, GCN, tasks):
    created_data = []
    data = np.arange(len(tasks))
    onehot_encoded = to_categorical(data)
    for i, data in enumerate(dataset):
        smiles, g, labels, mask = data
        g = g.to(device)
        g = dgl.add_self_loop(g)
        graph_feats = g.ndata.pop('h')
        embbed = GCN(g, graph_feats)
        embbed = embbed.to('cpu')
        embbed = embbed.detach().numpy()
        for j, label in enumerate(labels):
            a = (smiles, embbed, onehot_encoded[j], class_embed_vector[j], labels[j], tasks[j])
            created_data.append(a)
    print('Data created!!')
    return created_data


def create_dataset_with_gcn(dataset, subscriber, class_embed_vector, GCN, tasks, numberTask):

    created_data = []
    created_subscriber = []
    data = np.arange(len(tasks))
    onehot_encoded = to_categorical(data)

    for i, data in enumerate(dataset):
        smiles, g, label, mask = data
#         g = g.to(device)
        g = dgl.add_self_loop(g)
        graph_feats = g.ndata.pop('h')
        embbed = GCN(g, graph_feats)
        embbed = embbed.to('cpu')
        embbed = embbed.detach().numpy()
        a = (smiles, embbed, onehot_encoded[numberTask], class_embed_vector[numberTask], label, tasks[numberTask])
        if smiles in subscriber:
            created_subscriber.append(data)
        else:
            created_data.append(a)
    print('Data created!!')
    return created_data, created_subscriber


# Calculation of embedded vectors for each class

In [11]:
print(df_tox21, tox21_tasks)

      NR-AR  NR-AR-LBD  NR-AhR  NR-Aromatase  NR-ER  NR-ER-LBD  NR-PPAR-gamma  \
0       0.0        0.0     1.0           NaN    NaN        0.0            0.0   
1       0.0        0.0     0.0           0.0    0.0        0.0            0.0   
2       NaN        NaN     NaN           NaN    NaN        NaN            NaN   
3       0.0        0.0     0.0           0.0    0.0        0.0            0.0   
4       0.0        0.0     0.0           0.0    0.0        0.0            0.0   
...     ...        ...     ...           ...    ...        ...            ...   
7826    NaN        NaN     NaN           NaN    NaN        NaN            NaN   
7827    1.0        1.0     0.0           0.0    1.0        0.0            NaN   
7828    1.0        1.0     0.0           0.0    1.0        1.0            0.0   
7829    1.0        1.0     0.0           NaN    1.0        1.0            0.0   
7830    0.0        0.0     NaN           0.0    0.0        0.0            0.0   

      SR-ARE  SR-ATAD5  SR-

In [12]:
df_positive, df_negative = separate_active_and_inactive_data(df_tox21, tox21_tasks)

for i,d in enumerate(zip(df_positive,df_negative)):
    print(f'{tox21_tasks[i]}=> positive: {len(d[0])} - negative: {len(d[1])}')

NR-AR=> positive: 309 - negative: 6956
NR-AR-LBD=> positive: 237 - negative: 6521
NR-AhR=> positive: 768 - negative: 5781
NR-Aromatase=> positive: 300 - negative: 5521
NR-ER=> positive: 793 - negative: 5400
NR-ER-LBD=> positive: 350 - negative: 6605
NR-PPAR-gamma=> positive: 186 - negative: 6264
SR-ARE=> positive: 942 - negative: 4890
SR-ATAD5=> positive: 264 - negative: 6808
SR-HSE=> positive: 372 - negative: 6095
SR-MMP=> positive: 918 - negative: 4892
SR-p53=> positive: 423 - negative: 6351


In [13]:
dataset_positive = [DATASET(d,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_tox21) for d in df_positive]
dataset_negative = [DATASET(d,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_tox21) for d in df_negative]

Processing dgl graphs from scratch...


Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/6956
Processing molecule 2000/6956
Processing molecule 3000/6956
Processing molecule 4000/6956
Processing molecule 5000/6956
Processing molecule 6000/6956
Processing dgl graphs from scratch...
Processing molecule 1000/6521
Processing molecule 2000/6521
Processing molecule 3000/6521
Processing molecule 4000/6521
Processing molecule 5000/6521
Processing molecule 6000/6521
Processing dgl graphs from scratch...
Processing molecule 1000/5781
Processing molecule 2000/5781
Processing molecule 3000/5781
Processing molecul

In [14]:
embed_class_tox21 = get_embedding_vector_class(dataset_positive, dataset_negative, radius=2, size = 512)

class vector created!!


# Transfer Learning with BioAct-Het and AttentiveFp GCN

In [15]:
model_name = 'GCN_attentivefp_SIDER'
gcn_model = get_sider_model(model_name)
gcn_model.eval()
# gcn_model = gcn_model.to(device)

Downloading GCN_attentivefp_SIDER_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gcn_attentivefp_sider.pth...


GCN_attentivefp_SIDER_pre_trained.pth: 100%|██████████| 3.09M/3.09M [00:01<00:00, 2.59MB/s]


Pretrained model loaded


GCNPredictor(
  (gnn): GCN(
    (gnn_layers): ModuleList(
      (0): GCNLayer(
        (graph_conv): GraphConv(in=39, out=256, normalization=none, activation=<function relu at 0x000001CB371FE2A0>)
        (dropout): Dropout(p=0.08333992387843633, inplace=False)
        (bn_layer): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1-3): 3 x GCNLayer(
        (graph_conv): GraphConv(in=256, out=256, normalization=none, activation=<function relu at 0x000001CB371FE2A0>)
        (dropout): Dropout(p=0.08333992387843633, inplace=False)
        (bn_layer): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (readout): WeightedSumAndMax(
    (weight_and_sum): WeightAndSum(
      (atom_weighting): Sequential(
        (0): Linear(in_features=256, out_features=1, bias=True)
        (1): Sigmoid()
      )
    )
  )
  (predict): MLPPredictor(
    (predict): Sequential(
      (0): Dropout(p=0.0833399238784

In [16]:
data_ds = []
subscriber_data_ds = []
for i, task in  enumerate(tox21_tasks):
    a = df_tox21[['smiles' , task]]
    a = a.dropna()
    ds = DATASET(a,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider) 
    data, subscriber_data = create_dataset_with_gcn(ds, subscriber, embed_class_tox21, gcn_model, tox21_tasks, i)
    for d in data:
        data_ds.append(d)
    for d in subscriber_data:
        subscriber_data_ds.append(d)

Processing dgl graphs from scratch...
Processing molecule 1000/7265
Processing molecule 2000/7265
Processing molecule 3000/7265
Processing molecule 4000/7265
Processing molecule 5000/7265
Processing molecule 6000/7265
Processing molecule 7000/7265
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6758
Processing molecule 2000/6758
Processing molecule 3000/6758
Processing molecule 4000/6758
Processing molecule 5000/6758
Processing molecule 6000/6758
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6549
Processing molecule 2000/6549
Processing molecule 3000/6549
Processing molecule 4000/6549
Processing molecule 5000/6549
Processing molecule 6000/6549
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/5821
Processing molecule 2000/5821
Processing molecule 3000/5821
Processing molecule 4000/5821
Processing molecule 5000/5821
Data created!!
Processing dgl graphs from scratch...
Processing molecule 1000/6193


In [17]:
from sklearn.model_selection import KFold

Epoch_S = 10

def evaluate_model(dataset, subscriber_dataset, k = 10 , shuffle = False):
    result =[]

    kf = KFold(n_splits=10, shuffle= shuffle, random_state=None)

    for train_index, test_index in kf.split(dataset):

        train_ds = [dataset[index] for index in train_index]

        valid_ds = [dataset[index] for index in test_index]

        label_pos , label_neg, _ , _ = count_lablel(train_ds)
        print(f'train positive label: {label_pos} - train negative label: {label_neg}')

        # train_ds = up_and_down_Samplenig(train_ds, scale_downsampling = 0.5)

        label_pos , label_neg , _ , _ = count_lablel(train_ds)
        print(f'up and down sampling => train positive label: {label_pos} - train negative label: {label_neg}')

        label_pos , label_neg, _ , _ = count_lablel(valid_ds)
        print(f'Test positive label: {label_pos} - Test negative label: {label_neg}')

        l_train = []
        r_train = []
        lbls_train = []
        l_valid = []
        r_valid = []
        lbls_valid = []

        for i , data in enumerate(train_ds):
            smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name = data
            l_train.append(embbed_drug[0])
            r_train.append(embbed_task)
            lbls_train.append(lbl.tolist())

        for i , data in enumerate(valid_ds):
            smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name = data
            l_valid.append(embbed_drug[0])
            r_valid.append(embbed_task)
            lbls_valid.append(lbl.tolist())

        l_train = np.array(l_train).reshape(-1,1024,1)
        r_train = np.array(r_train).reshape(-1,512,1)
        lbls_train = np.array(lbls_train)

        l_valid = np.array(l_valid).reshape(-1,1024,1)
        r_valid = np.array(r_valid).reshape(-1,512,1)
        lbls_valid = np.array(lbls_valid)

        # create neural network model
        siamese_net = siamese_model_attentiveFp_sider()
        
        history = History()
        P = siamese_net.fit([l_train, r_train], lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])

        for j in range(100):
            C=1
            Before = int(P.history['accuracy'][-1]*100)
            for i in range(2,Epoch_S+1):
                if  int(P.history['accuracy'][-i]*100) == Before:
                    C=C+1
                else:
                    C=1
                Before=int(P.history['accuracy'][-i]*100)
                print(Before)
            if C==Epoch_S:
                break
            P = siamese_net.fit([l_train, r_train], lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])
        print(j+1)

        score  = siamese_net.evaluate([l_valid,r_valid], lbls_valid, verbose=1)
        a = (score[1],score[4])
        result.append(a)

    return result

scores = evaluate_model(data_ds, subscriber_data_ds, 10, True)

train positive label: 0 - train negative label: 70047
up and down sampling => train positive label: 0 - train negative label: 70047
Test positive label: 0 - Test negative label: 7783

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
92
92
92
92
92
92
92
92
1
train positive label: 0 - train negative label: 70047
up and down sampling => train positive label: 0 - train negative label: 70047
Test positive label: 0 - Test negative label: 7783
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
93
93
93
93
92
92
92
92
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
93
94
93
93
93
93
93
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
94
94
94
94
94
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


#### Dropout = 0.3 and downsampling = 0.5

In [18]:
scores

[(0.9232943654060364, 0.8668349981307983),
 (0.9387125968933105, 0.8727362751960754),
 (0.9240652918815613, 0.8559340238571167),
 (0.9425671100616455, 0.8711477518081665),
 (0.9380701780319214, 0.8705403804779053),
 (0.9399974346160889, 0.8845497369766235),
 (0.9397404789924622, 0.8698745369911194),
 (0.9366568326950073, 0.8572478890419006),
 (0.9396119713783264, 0.8680788278579712),
 (0.9340870976448059, 0.8563265204429626)]

In [19]:
acc = []
auc = []
for i in scores:
    acc.append(i[0])
    auc.append(i[1])

print(f'accuracy= {np.mean(acc)} AUC= {np.mean(auc)} STD_AUC= {np.std(auc)}')

accuracy= 0.9356803357601166 AUC= 0.867327094078064 STD_AUC= 0.008429134444122603


# **Case study with BioAct-Het**

In [20]:
model_name = 'GCN_attentivefp_SIDER'
gcn_model = get_sider_model(model_name)
gcn_model.eval()
gcn_model = gcn_model.to(device)

Downloading GCN_attentivefp_SIDER_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gcn_attentivefp_sider.pth...


GCN_attentivefp_SIDER_pre_trained.pth: 100%|██████████| 3.09M/3.09M [00:01<00:00, 2.57MB/s]

Pretrained model loaded





In [21]:
sider_smiles = df.smiles.to_numpy()

In [22]:
dir_path = 'C:/Users/stdso/Documents/USTH/Med/BioAct-Het-main/Data'

In [23]:
df_case_study = pd.read_csv(dir_path + '/group2.csv')

In [24]:
df_case_study

Unnamed: 0.1,Unnamed: 0,smiles,Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal and connective tissue disorders,Gastrointestinal disorders,Social circumstances,...,Infections and infestations,"Respiratory, thoracic and mediastinal disorders",Psychiatric disorders,Renal and urinary disorders,"Pregnancy, puerperium and perinatal conditions",Ear and labyrinth disorders,Cardiac disorders,Nervous system disorders,"Injury, poisoning and procedural complications",Drug_Name
0,0,Cl.CN[C@H](CC(C)C)C(=O)N[C@@H]1[C@H](O)C2=CC=C...,,,,,,,,,...,,,,,,,,,,Vancomycin
1,1,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...,,,,,,,,,...,,,,,,,,,,cyclosporine
2,2,Cl.CCCCCCCCC1=CC=C(CCC(N)(CO)CO)C=C1,,,,,,,,,...,,,,,,,,,,fingolimod
3,3,CC(C)CC(C(=NC(CCC(=O)O)C(=NC(CCCCN)C(=NC(CCC(=...,,,,,,,,,...,,,,,,,,,,interferon-beta 1a
4,4,CCC(CC)COC(=O)C(C)NP(=O)(OCC1C(C(C(O1)(C#N)C2=...,,,,,,,,,...,,,,,,,,,,Remdesivir
5,5,CCC(CC)OC1C=C(CC(C1NC(=O)C)N)C(=O)OCC,,,,,,,,,...,,,,,,,,,,Oseltamivir
6,6,CC(C)C1=NC(=CS1)CN(C)C(=O)NC(C(C)C)C(=O)NC(CC2...,,,,,,,,,...,,,,,,,,,,Ritonavir
7,7,CC(C)C(=O)OCC1C(C(C(O1)N2C=CC(=NC2=O)NO)O)O,,,,,,,,,...,,,,,,,,,,Molnupiravir
8,8,CC1(C2C1C(N(C2)C(=O)C(C(C)(C)C)NC(=O)C(F)(F)F)...,,,,,,,,,...,,,,,,,,,,paxlovid (Nirmatrelvir)
9,9,CCC(=O)NC1CCCC2=C1C=NC=C2C3=CC4=C(C=C3)N(C(=O)...,,,,,,,,,...,,,,,,,,,,Baxdrostat


In [25]:
drug_name = df_case_study.Drug_Name.to_numpy()

In [26]:
candidate_smiles = df_case_study.smiles.to_numpy()

In [27]:
is_Membership(sider_smiles, candidate_smiles)

False

In [28]:
sider_tasks = df.columns.values[1:28].tolist()
sider_tasks

['Hepatobiliary disorders',
 'Metabolism and nutrition disorders',
 'Product issues',
 'Eye disorders',
 'Investigations',
 'Musculoskeletal and connective tissue disorders',
 'Gastrointestinal disorders',
 'Social circumstances',
 'Immune system disorders',
 'Reproductive system and breast disorders',
 'Neoplasms benign, malignant and unspecified (incl cysts and polyps)',
 'General disorders and administration site conditions',
 'Endocrine disorders',
 'Surgical and medical procedures',
 'Vascular disorders',
 'Blood and lymphatic system disorders',
 'Skin and subcutaneous tissue disorders',
 'Congenital, familial and genetic disorders',
 'Infections and infestations',
 'Respiratory, thoracic and mediastinal disorders',
 'Psychiatric disorders',
 'Renal and urinary disorders',
 'Pregnancy, puerperium and perinatal conditions',
 'Ear and labyrinth disorders',
 'Cardiac disorders',
 'Nervous system disorders',
 'Injury, poisoning and procedural complications']

In [29]:
print(df, sider_tasks)

                                                 smiles  \
0                                       C(CNCCNCCNCCN)N   
1     CC(C)(C)C1=CC(=C(C=C1NC(=O)C2=CNC3=CC=CC=C3C2=...   
2     CC[C@]12CC(=C)[C@H]3[C@H]([C@@H]1CC[C@]2(C#C)O...   
3       CCC12CC(=C)C3C(C1CC[C@]2(C#C)O)CCC4=CC(=O)CCC34   
4                C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O   
...                                                 ...   
1422  C[C@H]1CN(CC[C@@]1(C)C2=CC(=CC=C2)O)C[C@H](CC3...   
1423  CC[C@@H]1[C@@]2([C@@H]([C@@H](C(=O)[C@@H](C[C@...   
1424  CCOC1=CC=C(C=C1)CC2=C(C=CC(=C2)[C@H]3[C@@H]([C...   
1425  C1CN(CCC1N2C3=CC=CC=C3NC2=O)CCCC(C4=CC=C(C=C4)...   
1426       CCC(=O)C(CC(C)N(C)C)(C1=CC=CC=C1)C2=CC=CC=C2   

      Hepatobiliary disorders  Metabolism and nutrition disorders  \
0                           1                                   1   
1                           0                                   1   
2                           0                                   1   
3              

In [30]:
df_positive, df_negative = separate_active_and_inactive_data(df, sider_tasks)

for i,d in enumerate(zip(df_positive,df_negative)):
    print(f'{sider_tasks[i]}=> positive: {len(d[0])} - negative: {len(d[1])}')

Hepatobiliary disorders=> positive: 743 - negative: 684
Metabolism and nutrition disorders=> positive: 996 - negative: 431
Product issues=> positive: 22 - negative: 1405
Eye disorders=> positive: 876 - negative: 551
Investigations=> positive: 1151 - negative: 276
Musculoskeletal and connective tissue disorders=> positive: 997 - negative: 430
Gastrointestinal disorders=> positive: 1298 - negative: 129
Social circumstances=> positive: 251 - negative: 1176
Immune system disorders=> positive: 1024 - negative: 403
Reproductive system and breast disorders=> positive: 727 - negative: 700
Neoplasms benign, malignant and unspecified (incl cysts and polyps)=> positive: 376 - negative: 1051
General disorders and administration site conditions=> positive: 1292 - negative: 135
Endocrine disorders=> positive: 323 - negative: 1104
Surgical and medical procedures=> positive: 213 - negative: 1214
Vascular disorders=> positive: 1108 - negative: 319
Blood and lymphatic system disorders=> positive: 885 - 

In [31]:
dataset_positive = [DATASET(d,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider) for d in df_positive]
dataset_negative = [DATASET(d,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider) for d in df_negative]

Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1151
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1298
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1024
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1292
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1108
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1318
Processing dgl graphs from scratch...
Processing dgl graphs from scratch...
Processing molecule 1000/1006
Processing dgl graphs from scratch...
Processing molecule 1000/1060


In [32]:
embed_class_sider = get_embedding_vector_class(dataset_positive, dataset_negative, radius=2, size = 512)

class vector created!!


In [33]:
dataset = DATASET(df,smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider) 
ds_train = create_dataset_with_gcn_case_study(dataset, embed_class_sider, gcn_model, sider_tasks)

Processing dgl graphs from scratch...
Processing molecule 1000/1427
Data created!!


In [34]:
dataset_study = DATASET(df_case_study[df_case_study.columns[1:29]],smiles_to_bigraph, 
                        AttentiveFPAtomFeaturizer(), cache_file_path = cache_path_sider)

ds_study = create_dataset_with_gcn_case_study(dataset_study, embed_class_sider, gcn_model, sider_tasks)

Processing dgl graphs from scratch...
Invalid mol found
Invalid mol found
Data created!!


In [35]:
len(data_ds)

77830

### Training algorithm

In [36]:
Epoch_S = 15

l, r , lbls = data_generator(ds_train)

l = np.array(l).reshape(-1,1024,1)
r = np.array(r).reshape(-1,512,1)
lbls=np.array(lbls)

history = History()

siamese_net = siamese_model_attentiveFp_sider()


s = siamese_net.fit([l, r], lbls, epochs = Epoch_S, shuffle=True, batch_size=128, callbacks=[history])

for j in range(1000):
    C=1
    Before = int(s.history['accuracy'][-1]*100)
    for i in range(2,Epoch_S+1):
        if  int(s.history['accuracy'][-i]*100)== Before:
            C=C+1
        else:
            C=1
        Before=int(s.history['accuracy'][-i]*100)
        print(Before)
    if C==Epoch_S:
        break
    s = siamese_net.fit([l, r], lbls, epochs = Epoch_S, shuffle=True, batch_size=128, callbacks=history)
print(j+1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
82
82
82
82
82
82
81
81
81
81
80
80
79
74
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
84
84
84
84
83
83
83
83
83
83
83
83
82
82
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
85
85
85
85
85
84
85
84
84
84
84
84
84
84
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
85
85
86
85
85
85
85
85
85
85
85
85
85
85
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
E

### Model evaluation

In [37]:
valid_ds = {}

for i, task in enumerate(sider_tasks):
    temp = []
    for j , data in enumerate(ds_study):
        smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name = data
        if task ==  task_name:
            temp.append(data)

    valid_ds[task] = temp

In [38]:
task_scores = [sider_tasks for sider_tasks in range(len(sider_tasks))]

for i, task in enumerate(sider_tasks):

    l_val = []
    r_val = []
    lbls_valid = []
    for data in valid_ds[task]:
        smiles, embbed_drug, onehot_task, embbed_task, lbl, task_name = data
        l_val.append(embbed_drug[0])
        r_val.append(embbed_task)
        lbls_valid.append(lbl)

    l1 = np.array(l_val)
    r1 = np.array(r_val)
    lbls_valid = np.array(lbls_valid)

    y_pred = siamese_net.predict([l1,r1])

    result = (y_pred)
    task_scores[i] = task, result
    print(task_scores)

[('Hepatobiliary disorders', array([[0.87093085],
       [0.999991  ],
       [0.3503147 ],
       [0.09576213],
       [0.83620083],
       [0.9508993 ],
       [1.        ],
       [0.995094  ],
       [0.18692899],
       [0.25813988],
       [0.18901543],
       [0.14086032],
       [0.17979096],
       [0.9881474 ],
       [0.22689337],
       [0.83620083],
       [0.9231006 ],
       [0.89340955],
       [0.9999878 ],
       [0.764607  ],
       [0.34570885],
       [0.5420091 ],
       [1.        ],
       [0.20207898]], dtype=float32)), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
[('Hepatobiliary disorders', array([[0.87093085],
       [0.999991  ],
       [0.3503147 ],
       [0.09576213],
       [0.83620083],
       [0.9508993 ],
       [1.        ],
       [0.995094  ],
       [0.18692899],
       [0.25813988],
       [0.18901543],
       [0.14086032],
       [0.17979096],
       [0.9881474 ],
       [0.22689337],
       [0.

In [39]:
for task in task_scores:
    print(" --------------------------------- ")
    print(F'{task[0]}:')
    for i, drug in enumerate(task[1]):
        print(F'{i+1}- {drug_name[i]}: {drug}')

 --------------------------------- 
Hepatobiliary disorders:
1- Vancomycin: [0.87093085]
2- cyclosporine: [0.999991]
3- fingolimod: [0.3503147]
4- interferon-beta 1a: [0.09576213]
5- Remdesivir: [0.83620083]
6- Oseltamivir: [0.9508993]
7- Ritonavir: [1.]
8- Molnupiravir: [0.995094]
9- paxlovid (Nirmatrelvir): [0.18692899]
10- Baxdrostat: [0.25813988]
11- Guanfacine: [0.18901543]
12- Liraglutide: [0.14086032]
13- linagliptin: [0.17979096]
14- Baricitinib-phosphate: [0.9881474]
15- Dexamethasone-Sodium-Phosphate: [0.22689337]
16- Tocilizumab: [0.83620083]
17- Remdesivir (Veklury): [0.9231006]
18- Anakinra: [0.89340955]
19- Chloroquine: [0.9999878]
20- Hydroxychloroquine: [0.764607]
21- Famotidine: [0.34570885]
22- Umifenovir : [0.5420091]
23- Ivermectin-B1a: [1.]
24- Prednisolone: [0.20207898]
 --------------------------------- 
Metabolism and nutrition disorders:
1- Vancomycin: [0.94160634]
2- cyclosporine: [0.9999984]
3- fingolimod: [0.6441632]
4- interferon-beta 1a: [0.47142643]
5- Re