In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [2]:
from scripts.baseline_model import MoleculeModel
from scripts.utils import generate_descriptors
from scripts.nn.TransferLearningModel import train_val_test_nn, transfer_train_val_test
from collections import defaultdict
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from ITMO_FS.embedded import MOS
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import numpy as np
import pandas as pd

## Training base dataset

In [24]:
data_train_val = pd.read_csv('../data/split/data_train_val.csv')
data_test = pd.read_csv('../data/split/data_test.csv')

In [25]:
TARGET_COLUMN = 'active'
FP_SIZE = 32
FP_RADIUS = 2

In [26]:
data_train_val = generate_descriptors(data_train_val, use_fp=False)
data_test = generate_descriptors(data_test, use_fp=False)

  0%|          | 0/388 [00:00<?, ?it/s]

CCN(CC)CC(=O)NC1=C(C=CC=C1C)C
C1=CC=C2C(=C1)C(=CC(=N2)C3=NC4=CC=CC=C4C(=C3)C(=O)[O-])C(=O)[O-].[Na+].[Na+]
CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)C(F)(F)F
CC1C=CC=CC=CC=CC=CC=CC=CC(CC2C(C(CC(O2)(CC(CC(C(CCC(CC(CC(=O)OC(C(C1O)C)C)O)O)O)O)O)O)O)C(=O)O)OC3C(C(C(C(O3)C)O)N)O
C[C@H](CN1C=NC2=C(N=CN=C21)N)OCP(=O)(O)O
CC1=C(C=CC=C1O)C(=O)N[C@@H](CSC2=CC=CC=C2)[C@@H](CN3C[C@H]4CCCC[C@H]4C[C@H]3C(=O)NC(C)(C)C)O
CC(C)(C(=O)O)O/N=C(/C1=CSC(=N1)N)\C(=O)N[C@H]2[C@@H]3N(C2=O)C(=C(CS3)C[N+]4=CC=CC=C4)C(=O)[O-]
C1C(=C(N2[C@H](S1)[C@@H](C2=O)NC(=O)[C@@H](C3=CC=CC=C3)S(=O)(=O)O)C(=O)[O-])C[N+]4=CC=C(C=C4)C(=O)N
CC[C@H](C)[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CC2=CC=C(C=C2)O)C(=O)N[C@@H](CC(C)C)C(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CCC(=O)O)NC(=O)[C@H](CC3=CC=CC=C3)NC(=O)[C@H](CC(=O)O)NC(=O)CNC(=O)[C@H](CC(=O)N)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)[C@@H]4CCCN4C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@@H]5CCCN5C(=O)[C@@H](CC6=CC=CC=C6)N
CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC

C1=CC=C(C=C1)C2=CC=C(C=C2)C(=O)CCC(=O)O
CCCCCCCCCCCCCC(=O)O
C[C@@H]1CCN([C@H](C1)C(=O)O)C(=O)[C@H](CCCN=C(N)N)NS(=O)(=O)C2=CC=CC3=C2NCC(C3)C
CC(=O)O[C@H]1C=C[C@H]2[C@H]3CC4=C5[C@]2([C@H]1OC5=C(C=C4)OC(=O)C)CCN3C
CC(C1=CC(=CC=C1)OC2=CC=CC=C2)C(=O)O
CCC1C(C(C(N(CC(CC(C(C(C(C(C(=O)O1)C)OC2CC(C(C(O2)C)O)(C)OC)C)OC3C(C(CC(O3)C)N(C)C)O)(C)O)C)C)C)O)(C)O
C=C(CC(=O)C1=CC=C(C=C1)C2=CC=CC=C2Cl)C(=O)O
COC1=CC2=C(C=CN=C2C=C1)[C@H]([C@@H]3C[C@@H]4CCN3C[C@@H]4C=C)O
CC1=C(OC(=N1)NS(=O)(=O)C2=CC=C(C=C2)N)C
C[N+]1(CCCC1)CC2=C(N3[C@@H]([C@@H](C3=O)NC(=O)/C(=N\OC)/C4=CSC(=N4)N)SC2)C(=O)[O-]
CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=C(C=CC(=C5CN(C)C)O)N=C4C3=C2)O
CN(C)CCC1=CNC2=C1C=C(C=C2)CN3C=NC=N3
CC1=CN=C(C=N1)C(=O)NCCC2=CC=C(C=C2)S(=O)(=O)NC(=O)NC3CCCCC3
C[C@H]1[C@@H]([C@H]([C@H]([C@@H](O1)OC[C@@H]2[C@H]([C@@H]([C@H]([C@@H](O2)OC3=C(OC4=CC(=CC(=C4C3=O)O)O)C5=CC(=C(C=C5)O)O)O)O)O)O)O)O
CC(=O)CC(C1=CC=C(C=C1)[N+](=O)[O-])C2=C(C3=CC=CC=C3OC2=O)O
C1=NC2=C(N1)C(=S)N=CN2
CC(C1=CC(=CC=C1)C(=O)C2=CC=CC=C2)C(=O)O
C1

CN(C)CCCN1C2=CC=CC=C2SC3=C1C=C(C=C3)Cl
C1=CC(=C(C=C1C2=C(C=C(C=C2)F)F)C(=O)O)O
C1=CC=C(C=C1)N2C(=O)C(C(=O)N2C3=CC=CC=C3)CCS(=O)C4=CC=CC=C4
C[C@H](CCCC(C)(C)O)[C@H]1CC[C@@H]\2[C@@]1(CCC/C2=C\C=C/3\C[C@H](C[C@@H](C3=C)O)O)C
CN1CC[C@@]23C=C[C@@H](C[C@@H]2OC4=C(C=CC(=C34)C1)OC)O
CC1=CN=C(C(=C1OC)C)CS(=O)C2=NC3=C(N2)C=C(C=C3)OC
C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)CCC4=C3C=CC(=C4)O
CC(C1=CC=C(C=C1)C(=O)C2=CC=CS2)C(=O)OC
COC1=C(C=C(C=C1)/C=C/C2=CC(=CC(=C2)O)O)O
C1=CC=C2C(=C1)C(=O)C=C(O2)C(=O)O
CN1C(=O)CN=C(C2=C1C=CC(=C2)[N+](=O)[O-])C3=CC=CC=C3F
C1=CN(C(=O)N=C1N)[C@H]2C([C@@H]([C@H](O2)CO)O)(F)F
CNC(=NC)NCC1=CC=CC=C1
CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4
CCC1=C(C=C2CC(CC2=C1)NCC(C3=C4C=CC(=O)NC4=C(C=C3)O)O)CC
CN1C2=C(C(=O)N(C1=O)C)N(C=N2)CC(CO)O
CCCCC1=C(C2=CC=CC=C2O1)C(=O)C3=CC(=C(C(=C3)I)OCCN(CC)CC)I
CC[C@@H](C(=O)N)N1CCCC1=O
CN1C(=C(C2=CC=CC=C2S1(=O)=O)O)C(=O)NC3=CC=CC=N3
C1=CC=C(C(=C1)C2=CC=C(C=C2)CC(=O)O)[Cl]
C(CC(C(F)F)(C(=O)O)N)CN
C([C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O)O
C1

  0%|          | 0/98 [00:00<?, ?it/s]

CC(=O)OCC1=C(N2[C@@H]([C@@H](C2=O)NC(=O)CCC[C@H](C(=O)O)N)SC1)C(=O)O
C1CCC(CC1)(CC(=O)O)CN
CN(C)CCC(C1=CC=C(C=C1)Cl)C2=CC=CC=N2
CC1=C(C(C(=C(N1)C)C(=O)OC)C2=CC=CC=C2[N+](=O)[O-])C(=O)OC
C1=CC(=C(C=C1[C@H](CN)O)O)O
COC1=C(C2=C(C=C1O)OC=C(C2=O)C3=CC=C(C=C3)O)O
CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C@H]([C@H](CC3=CC=CC=C3)NC(=O)[C@H](CC(=O)N)NC(=O)C4=NC5=CC=CC=C5C=C4)O
CCCC(CCC)C(=O)O
C[C@H]1[C@@H]2CC[C@]3([C@H]([C@]2(CC[C@H]1O)C)[C@@H](C[C@@H]\4[C@@]3(C[C@@H](/C4=C(/CCC=C(C)C)\C(=O)O)OC(=O)C)C)O)C
C1=C(C(=O)NC(=O)N1)F
CCCSC1=CC2=C(C=C1)N=C(N2)NC(=O)OC
CC(C)C1=C(C(=CC=C1)C(C)C)O
CCN1CCN(CC1)CC2=CN=C(C=C2)NC3=NC=C(C(=N3)C4=CC5=C(C(=C4)F)N=C(N5C(C)C)C)F
CC(CC(=O)C1=CC=C(C=C1)C2=CC=CC=C2)C(=O)O
C[C@]([C@H]1C[C@@]23CC[C@@]1([C@H]4[C@@]25CCN([C@@H]3CC6=C5C(=C(C=C6)O)O4)CC7CC7)OC)(C(C)(C)C)O
COC1=CC=C(C=C1)C2=COC3=CC(=CC(=C3C2=O)O)O
CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@H]3[C@@H]([C@@](C2(C)C)(C[C@@H]1OC(=O)[C@@H]([C@H](C5=CC=CC=C5)NC(=O)OC(C)(C)C)O)O)OC(=O)C6=CC=CC=C6)(CO4)OC(=O

In [27]:
X_train_val = data_train_val.drop(columns = [TARGET_COLUMN, 'smiles'])
X_test = data_test.drop(columns = [TARGET_COLUMN, 'smiles'])
y_train_val = data_train_val.active
y_test = data_test.active

scaler = MinMaxScaler()
scaler.fit(pd.concat([X_train_val, X_test]))
X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

# mos = MOS()

# mos.fit(np.concatenate([X_train_val, X_test], axis=0), pd.concat([y_train_val, y_test]).to_numpy())
# X_train_val = mos.transform(X_train_val)
# X_test = mos.transform(X_test)

In [28]:
# for i, col in enumerate(data_train_val.columns):
#     if i in mos.selected_features:
#         print(col)

In [29]:
predicted_values = train_val_test_nn(X_train_val, y_train_val, X_test, y_test, [25, 50, 25, 10], [7, 5],
                                     num_epochs=100, patience=10, lr=0.0001)

Model's state_dict:
fc1.weight 	 torch.Size([25, 257])
fc1.bias 	 torch.Size([25])
sequential.fc2.weight 	 torch.Size([50, 25])
sequential.fc2.bias 	 torch.Size([50])
sequential.fc3.weight 	 torch.Size([25, 50])
sequential.fc3.bias 	 torch.Size([25])
sequential.fc4.weight 	 torch.Size([10, 25])
sequential.fc4.bias 	 torch.Size([10])
fc_out_binding.bs_fc1.weight 	 torch.Size([7, 10])
fc_out_binding.bs_fc1.bias 	 torch.Size([7])
fc_out_binding.bs_fc2.weight 	 torch.Size([5, 7])
fc_out_binding.bs_fc2.bias 	 torch.Size([5])
fc_out_binding.bs_out.weight 	 torch.Size([3, 5])
fc_out_binding.bs_out.bias 	 torch.Size([3])
fc_out_non_binding.weight 	 torch.Size([2, 10])
fc_out_non_binding.bias 	 torch.Size([2])
EarlyStopping counter: 1 out of 10
EarlyStopping counter: 2 out of 10
EarlyStopping counter: 3 out of 10
EarlyStopping counter: 4 out of 10
EarlyStopping counter: 5 out of 10
Epoch [10/100], Loss: 0.3165
Epoch [20/100], Loss: 0.3494
Epoch [30/100], Loss: 0.3230
EarlyStopping counter: 1 ou

In [30]:
print('Test accuracy for the best model %.2f' % accuracy_score(y_test, predicted_values))
print('Test f1-score for the best model %.2f' % f1_score(y_test, predicted_values))

Test accuracy for the best model 0.86
Test f1-score for the best model 0.91


## Transfer learning

In [31]:
data_train_val = pd.read_csv('../data/split/site_train_val.csv')
data_test = pd.read_csv('../data/split/site_test.csv')

In [32]:
data_train_val = generate_descriptors(data_train_val, use_fp=False)
data_test = generate_descriptors(data_test, use_fp=False)

  0%|          | 0/97 [00:00<?, ?it/s]

O=C(O)c1c(I)c(c(I)cc1I)NC(=O)CCCCC(=O)Nc(c(I)cc2I)c(I)c2C(=O)O
C=CC1=C(C)C(=O)NC1=Cc2c(C)c(CCC(=O)O)c([nH]2)Cc([nH]3)c(CCC(=O)O)c(C)c3C=C(NC4=O)C(C)=C4C=C
Nc(n1)scc1C(=NOC)C(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(CS3)COC(=O)C
CNCCCN(c(c12)cccc1)c3c(CC2)cccc3
s1cccc1C(=O)c2ccc(cc2)C(C)C(=O)O
OCCC1CCN(CC1)CCCN2c(cccc3)c3Sc(c24)ccc(c4)S(=O)(=O)N(C)C
CC(S1)(C)[C@H](C(=O)O)N([C@H]12)C(=O)[C@H]2NC(=O)c3c(OC)cccc3OC
CN(C)CCCN(c(c12)cccc1)c3c(CC2)cccc3
CC[C@@]1(O)C(=O)OCc(c2=O)c1cc3n2Cc(c-34)cc5c(n4)cccc5
CCCC(C)C1(CC)C(=O)N=C(S)NC1=O
n1nncn1CC(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(CS3)CSc(s4)nnc4C
Oc1c(C)ncc(c12)C(OC2)c3ccc(Cl)cc3
NC[C@H](O)c1cc(O)c(O)cc1
CCCCC(C1=O)C(=O)N(c2ccccc2)N1c3ccc(O)cc3
c1ccccc1[C@@H](N)C(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(C)CS3
CC(=O)[C@H]1CC[C@H]([C@@]12C)[C@H]3[C@H](CC2)[C@]4(C)C(=CC3)C[C@@H](O)CC4
O=C(O)CCC(=O)c1ccc(cc1)-c2ccccc2
CCCC(C1=O)C(=O)N2N1C(N(C)C)=Nc(c23)ccc(C)c3
s1cccc1CC(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(CS3)COC(=O)C
CC(=O)NCCOC(=O)C(Oc(cc1C

  0%|          | 0/25 [00:00<?, ?it/s]

O=C(O)C(C)c(cc1)cc(c1c23)[nH]c2ccc(Cl)c3
c1cc(Cl)ccc1C(=O)c2c(C)n(CC(=O)O)c(c23)cc(cc3)OC
C1CCCCC1C(c2ccsc2)C(=O)OCCN3CCCCCC3
CC(S1)(C)[C@H](C(=O)O)N([C@H]12)C(=O)[C@H]2NC(=O)c3c(OCC)ccc(c34)cccc4
NC(=O)C(C1=O)=C(O)[C@@H](N(C)C)[C@@H]([C@@]12O)[C@@H](O)[C@H]3C(=C2O)C(=O)c4c([C@@H]3C)cccc4O
C1CCCN1c(cc2C(=O)O)c(c(c2)S(=O)(=O)N)Oc3ccccc3
O=C(O)C(C)c1cc(ccc1)Oc2ccccc2
c1ccccc1C(CC(=O)C)c(c2O)c(=O)oc(c23)cccc3
COc(cc1)c(OC)cc1C(C#N)(C(C)C)CCCN(C)CCc2cc(OC)c(cc2)OC
O=C(O)Cc1cc(=O)oc(c12)cc(cc2)Nc3ccccc3
Nc(n1)scc1C(=NOC(C)(C)C(=O)O)C(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(CS3)C[n+]4ccccc4
Cc1cc(no1)NS(=O)(=O)c(cc2)ccc2N
O=C(O)CN(C)S(=O)(=O)c(ccc1)c(c12)cccc2N(C)C
CN(C)CCCN1c(cccc2)c2Sc(c13)cccc3
c1ccccc1C(N)C(=O)N[C@@H]2C(=O)N([C@@H]23)C(C(=O)O)=C(Cl)CS3
O=C(O)[C@@H](C)c(c1)ccc(c12)cc(cc2)OC
O=C(O)C(C)c(c1)ccc(c12)oc(n2)-c3ccc(Cl)cc3
CCc1nnc(s1)NS(=O)(=O)c(cc2)ccc2N
c1ccccc1CC(=O)N[C@@H]2C(=O)N([C@@H]23)[C@@H](C(=O)O)C(S3)(C)C
c1cccc(Cl)c1CN(C2)CCc(c23)scc3
c1cccc(c1CC(=O)O)Nc2c(Cl)cccc2Cl


In [33]:
TARGET_COLUMN = 'target'

In [34]:
X_train_val = data_train_val.drop(columns = [TARGET_COLUMN, 'smiles'])
X_test = data_test.drop(columns = [TARGET_COLUMN, 'smiles'])
y_train_val = data_train_val.target
y_test = data_test.target

X_train_val = scaler.transform(X_train_val)
X_test = scaler.transform(X_test)

# X_train_val = mos.transform(X_train_val)
# X_test = mos.transform(X_test)

In [35]:
y_train_val = pd.to_numeric(y_train_val)

In [36]:
predicted_values = transfer_train_val_test(X_train_val, y_train_val, X_test, y_test, [25, 50, 25, 10], [7, 5],
                                           '../data/models/checkpoint.pt', num_epochs=1000, patience=100, lr=0.001)

Model's state_dict:
fc1.weight 	 torch.Size([25, 257])
fc1.bias 	 torch.Size([25])
sequential.fc2.weight 	 torch.Size([50, 25])
sequential.fc2.bias 	 torch.Size([50])
sequential.fc3.weight 	 torch.Size([25, 50])
sequential.fc3.bias 	 torch.Size([25])
sequential.fc4.weight 	 torch.Size([10, 25])
sequential.fc4.bias 	 torch.Size([10])
fc_out_binding.bs_fc1.weight 	 torch.Size([7, 10])
fc_out_binding.bs_fc1.bias 	 torch.Size([7])
fc_out_binding.bs_fc2.weight 	 torch.Size([5, 7])
fc_out_binding.bs_fc2.bias 	 torch.Size([5])
fc_out_binding.bs_out.weight 	 torch.Size([3, 5])
fc_out_binding.bs_out.bias 	 torch.Size([3])
fc_out_non_binding.weight 	 torch.Size([2, 10])
fc_out_non_binding.bias 	 torch.Size([2])
EarlyStopping counter: 1 out of 100
EarlyStopping counter: 2 out of 100
EarlyStopping counter: 3 out of 100
EarlyStopping counter: 4 out of 100
EarlyStopping counter: 5 out of 100
EarlyStopping counter: 6 out of 100
EarlyStopping counter: 7 out of 100
EarlyStopping counter: 8 out of 100
E

In [37]:
print('Test accuracy for the best model %.2f' % accuracy_score(y_test, predicted_values))
print('Test f1-score for the best model %.2f' % f1_score(y_test, predicted_values, average='macro'))

Test accuracy for the best model 0.41
Test f1-score for the best model 0.36


In [38]:
predicted_values

[tensor(0),
 tensor(0),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(2),
 tensor(0),
 tensor(0),
 tensor(1),
 tensor(2),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(0),
 tensor(1),
 tensor(1),
 tensor(2),
 tensor(1),
 tensor(2)]