In [74]:
# general tools
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.rdMolDescriptors import (
    GetMACCSKeysFingerprint,
    GetMorganFingerprintAsBitVect,
)
# Pytorch
import torch
from torch import nn
from torch.nn import Linear, MSELoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from datetime import datetime
import time

In [2]:
# Read only the first 100000 rows of the CSV; later cells filter this frame
# through its SPLIT and SMILES columns.
df = pd.read_csv('./dataset_v1.csv', nrows=100000)

In [5]:
def get_mol(smiles_or_mol):
    """Return an RDKit molecule for a SMILES string, or pass the input through.

    Non-string arguments are returned unchanged. For strings, ``None`` is
    returned when the string is empty, when ``Chem.MolFromSmiles`` yields
    ``None``, or when ``Chem.SanitizeMol`` raises ``ValueError``.
    """
    # Anything that is not a string is handed back untouched.
    if not isinstance(smiles_or_mol, str):
        return smiles_or_mol

    if not smiles_or_mol:
        return None

    parsed = Chem.MolFromSmiles(smiles_or_mol)
    if parsed is None:
        return None

    try:
        Chem.SanitizeMol(parsed)
    except ValueError:
        return None
    return parsed

In [75]:
class LogPDataset(Dataset):
    """Dataset of (molecule representation, logP) pairs for one data split.

    Parameters
    ----------
    smiles_df : pandas.DataFrame
        Frame with a ``SMILES`` column and a ``SPLIT`` column.
    dtype : {'smiles', 'maccs', 'morgan'}
        Representation stored in ``self.x``: the raw SMILES strings, the MACCS
        keys fingerprint, or the Morgan bit vector (radius 2, 2048 bits).
    split : str
        Value of ``SPLIT`` selecting the rows to keep.

    Raises
    ------
    ValueError
        If ``dtype`` is not supported or a SMILES string cannot be parsed.
    """

    _DTYPES = ('smiles', 'maccs', 'morgan')

    def __init__(self, smiles_df, dtype='smiles', split='train'):
        # Fail early: an unknown dtype used to leave ``self.x`` undefined.
        if dtype not in self._DTYPES:
            raise ValueError(
                'unsupported dtype %r, expected one of %r' % (dtype, self._DTYPES)
            )
        self.dtype = dtype

        # Drop the original row labels so that ``self.x[idx]`` in __getitem__
        # is positional; a filtered Series keeps the labels of the full frame.
        smiles = smiles_df[smiles_df.SPLIT == split].SMILES.reset_index(drop=True)

        # Single pass: every molecule is parsed once and used for both the
        # fingerprint and the logP target.
        features, targets = [], []
        for s in smiles:
            mol = get_mol(s)
            if mol is None:
                raise ValueError('could not parse SMILES: %r' % (s,))
            targets.append(MolLogP(mol))
            if dtype == 'maccs':
                features.append(GetMACCSKeysFingerprint(mol).ToList())
            elif dtype == 'morgan':
                features.append(
                    GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).ToList()
                )

        self.x = smiles if dtype == 'smiles' else torch.Tensor(features)
        self.y = torch.Tensor(targets)

    def __len__(self):
        """Number of molecules in the selected split."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(representation, logP)`` pair at position ``idx``."""
        return self.x[idx], self.y[idx]

In [76]:
class AEDataset(Dataset):
    """Dataset yielding ``(x, x)`` pairs, i.e. the input is its own target.

    Parameters
    ----------
    smiles_df : pandas.DataFrame
        Frame with a ``SMILES`` column and a ``SPLIT`` column.
    dtype : {'smiles', 'maccs', 'morgan'}
        Representation stored in ``self.x``: the raw SMILES strings, the MACCS
        keys fingerprint, or the Morgan bit vector (radius 2, 2048 bits).
    split : str
        Value of ``SPLIT`` selecting the rows to keep.

    Raises
    ------
    ValueError
        If ``dtype`` is not supported or, for fingerprint dtypes, a SMILES
        string cannot be parsed.
    """

    _DTYPES = ('smiles', 'maccs', 'morgan')

    def __init__(self, smiles_df, dtype='smiles', split='train'):
        # Fail early: an unknown dtype used to leave ``self.x`` undefined.
        if dtype not in self._DTYPES:
            raise ValueError(
                'unsupported dtype %r, expected one of %r' % (dtype, self._DTYPES)
            )
        self.dtype = dtype

        # Drop the original row labels so that ``self.x[idx]`` in __getitem__
        # is positional; a filtered Series keeps the labels of the full frame.
        smiles = smiles_df[smiles_df.SPLIT == split].SMILES.reset_index(drop=True)

        if dtype == 'smiles':
            self.x = smiles
            return

        rows = []
        for s in smiles:
            mol = get_mol(s)
            if mol is None:
                raise ValueError('could not parse SMILES: %r' % (s,))
            if dtype == 'maccs':
                rows.append(GetMACCSKeysFingerprint(mol).ToList())
            else:  # 'morgan'
                rows.append(
                    GetMorganFingerprintAsBitVect(mol, 2, nBits=2048).ToList()
                )
        self.x = torch.Tensor(rows)

    def __len__(self):
        """Number of molecules in the selected split."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the item at position ``idx`` twice: as input and as target."""
        return self.x[idx], self.x[idx]

In [8]:
class Net(torch.nn.Module):
    """Two-layer perceptron: ``input_size -> 10 -> 1`` with a ReLU in between."""

    def __init__(self, input_size):
        super().__init__()
        # Hidden projection followed by a single-output regression head.
        self.fc1 = nn.Linear(input_size, 10)
        self.fc2 = nn.Linear(10, 1)

    def forward(self, data):
        """Map a ``(..., input_size)`` tensor to a ``(..., 1)`` tensor."""
        hidden = F.relu(self.fc1(data))
        return self.fc2(hidden)

In [9]:
class Encoder(torch.nn.Module):
    """Stack of ``Linear`` + ``ReLU`` pairs following the widths in ``sizes``.

    ``sizes[0]`` is the input width and ``sizes[-1]`` the embedding width;
    every linear layer, including the last one, is followed by a ReLU.
    """

    def __init__(self, sizes):
        super().__init__()
        self.sizes = sizes
        stack = []
        # Consecutive width pairs: (sizes[0], sizes[1]), (sizes[1], sizes[2]), ...
        for n_in, n_out in zip(sizes[:-1], sizes[1:]):
            stack += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.layers = nn.Sequential(*stack)

    def forward(self, data):
        """Run ``data`` through the layer stack and return the embedding."""
        return self.layers(data)

class Decoder(torch.nn.Module):
    """Mirror of the encoder: walks ``sizes`` backwards, ending in a sigmoid.

    ``sizes`` is given in encoder order (input width first, embedding width
    last). The decoder maps ``sizes[-1] -> ... -> sizes[0]`` with a ReLU after
    every linear layer except the last, which is followed by ``Sigmoid`` so the
    output lies in ``[0, 1]`` as ``nn.BCELoss`` requires.
    """

    def __init__(self, sizes):
        super().__init__()
        self.sizes = sizes
        widths = list(sizes)[::-1]
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        # Swap the trailing ReLU for a Sigmoid. The earlier code then sliced
        # ``layers[:-1]`` and threw this Sigmoid away, leaving a bare linear
        # output. The guard covers fewer than two sizes (no layers at all).
        if layers:
            layers[-1] = nn.Sigmoid()
        self.layers = nn.Sequential(*layers)

    def forward(self, data):
        """Reconstruct a ``(..., sizes[0])`` tensor from an embedding."""
        return self.layers(data)

class Autoencoder(torch.nn.Module):
    """Chains an encoder module and a decoder module into one model."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, data):
        """Encode ``data`` and decode the result."""
        embedding = self.encoder(data)
        return self.decoder(embedding)

class Predictor(torch.nn.Module):
    """Encoder followed by a single linear output unit.

    The encoder must expose ``sizes``; its last entry is taken as the width of
    the embedding fed into the output layer.
    """

    def __init__(self, encoder):
        super().__init__()
        self.embedding_size = encoder.sizes[-1]
        self.encoder = encoder
        self.out_layer = nn.Linear(self.embedding_size, 1)

    def forward(self, data):
        """Return the flattened output of the linear head on the embedding."""
        embedding = self.encoder(data)
        prediction = self.out_layer(embedding)
        return prediction.flatten()


In [12]:
def train_loop(data, model, criterion, optimizer):
    """Run one optimisation step on a single ``(inputs, targets)`` batch.

    Returns the batch loss as a detached tensor, computed before the weights
    are updated.
    """
    inputs, targets = data
    batch_loss = criterion(model(inputs), targets)
    detached = batch_loss.detach()

    # Clear stale gradients, back-propagate, then apply the update.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    return detached

def train_epoch(loader, model, criterion, optimizer):
    """Train ``model`` for one pass over ``loader``.

    Puts the model in training mode, calls ``train_loop`` on every batch and
    returns the unweighted mean of the per-batch losses.
    """
    model.train()

    # Convert each loss to a Python float right away: ``np.mean`` over a list
    # of tensors relies on implicit tensor -> ndarray conversion, which fails
    # for tensors that do not live on the CPU.
    losses = [
        float(train_loop(batch, model, criterion, optimizer))
        for batch in loader
    ]
    return np.mean(losses)

def test_epoch(loader, model, criterion):
    model.eval()
    losses = []
    with torch.no_grad():
        for (k, (x, y)) in enumerate(loader):
            # compute current value of loss function via forward pass
            output = model(x)
            losses.append(criterion(output, y).detach())

    return np.mean(losses)

In [72]:
def train_net(hidden_size, epochs=30, lr=1e-3):
    """Train a fresh ``Net`` on embeddings produced by the global encoder.

    Reads these notebook globals: ``encoder_model`` (used as a fixed feature
    extractor; its weights are not in the optimiser), ``train_dataloader``,
    ``test_dataloader`` and ``test_scaf_dataloader``. After every epoch it
    prints the mean train, test and scaffold-test MSE.

    Parameters
    ----------
    hidden_size : int
        Passed to ``Net`` as its input width, so it must equal the width of
        the embedding returned by ``encoder_model``.
    epochs : int
        Number of passes over ``train_dataloader`` (default 30).
    lr : float
        SGD learning rate (default 1e-3).
    """
    loss_function = MSELoss()
    model = Net(hidden_size)
    optimiser = torch.optim.SGD(model.parameters(), lr=lr)

    def _evaluate(loader):
        """Mean per-batch loss of ``model`` on encoder embeddings of ``loader``."""
        batch_losses = []
        # len(loader) is the real number of batches, so the bar is accurate.
        for features, target in tqdm(loader, total=len(loader)):
            output = model(encoder_model(features))
            batch_losses.append(loss_function(output[:, 0], target).item())
        return np.mean(batch_losses)

    for epoch in range(epochs):
        losses = []

        # set model to training mode
        model.train()

        # loop over minibatches for training
        for features, target in tqdm(train_dataloader, total=len(train_dataloader)):
            # The encoder is not being optimised here, so skip building its
            # autograd graph (and stop accumulating gradients on its weights).
            with torch.no_grad():
                embedding = encoder_model(features)

            # compute current value of loss function via forward pass
            output = model(embedding)
            loss_function_value = loss_function(output[:, 0], target)
            losses.append(loss_function_value.item())

            # set past gradient to zero
            optimiser.zero_grad()

            # compute current gradient via backward pass
            loss_function_value.backward()

            # update model weights using gradient and optimisation method
            optimiser.step()

        # evaluate in eval mode and without gradients
        model.eval()
        with torch.no_grad():
            test_loss = _evaluate(test_dataloader)
            test_scaf_loss = _evaluate(test_scaf_dataloader)

        test_line = 'train loss: %1.4f test loss: %1.4f scaf loss: %1.4f'
        print(test_line % (np.mean(losses), test_loss, test_scaf_loss))

Определим наилучшие параметры для автоэнкодера на основе maccs

In [13]:
# Build MACCS-fingerprint datasets with logP targets, one per SPLIT value
# ('train', 'test', 'test_scaffolds') found in df.
train = LogPDataset(df, 'maccs', 'train')
test = LogPDataset(df, 'maccs', 'test')
test_scaf = LogPDataset(df, 'maccs', 'test_scaffolds')

In [47]:
# Hyper-parameter grid for the MACCS run; the search cell iterates over the
# Cartesian product of these four lists.
# layer_sizes entries are passed to Encoder/Decoder, input width first.
batch_size = [5, 10, 20]
layer_sizes = [(167, 32, 10), (167, 64, 50), (167, 128, 100)]
learning_rate = [0.1, 0.01, 0.001]
epochs = [10, 20, 30]

In [48]:
# Empty results table; the search cell appends one row per grid combination.
df_result_maccs = pd.DataFrame(columns=['batch_size', 'hidden_size', 'learning_rate', 'epoch', 'time', 'losses_test', 'losses_test_scaf'])

In [49]:
# Grid search on MACCS fingerprints: for every (batch size, layer sizes,
# learning rate, epoch count) combination, build fresh models, train the
# predictor from scratch for `ep` epochs, and append the wall time and the
# last-epoch test losses to df_result_maccs.
for bs in batch_size:
    for ls in layer_sizes:
        for lr in learning_rate:
            for ep in epochs:
                start_time = datetime.now()

                # predictor_model and ae_model share the same encoder_model instance.
                encoder_model = Encoder(ls)
                decoder_model = Decoder(ls)
                predictor_model = Predictor(encoder_model)
                ae_model = Autoencoder(encoder_model, decoder_model)
                
                # create dataloaders (only the training loader is shuffled)
                # NOTE(review): this rebinds the global `batch_size`, the name the
                # outermost loop iterates over. The running loop is unaffected
                # (its iterator already exists), but afterwards `batch_size` is the
                # int 64, so re-running this cell fails. train_net reads this
                # global for its progress-bar totals.
                batch_size = 64
                train_dataloader = DataLoader(dataset = train, batch_size = bs, shuffle = True)
                test_dataloader = DataLoader(dataset = test, batch_size = bs, shuffle = False)
                test_scaf_dataloader = DataLoader(dataset = test_scaf, batch_size = bs, shuffle = False)
                
                # define loss functions
                predictor_loss_function = nn.MSELoss()
                autoencoder_loss_function = nn.BCELoss()
                
                # define optimisers
                # NOTE(review): autoencoder_optimizer and autoencoder_loss_function
                # are created but never used in this cell; only the predictor
                # (encoder + linear head) is trained below.
                autoencoder_optimizer = torch.optim.Adam(ae_model.parameters(), lr = lr)
                predictor_optimizer = torch.optim.SGD(predictor_model.parameters(), lr = lr)
                for epoch in range(ep):
                    train_loss = train_epoch(train_dataloader,
                                             predictor_model,
                                             predictor_loss_function,
                                             predictor_optimizer)
                    test_loss = test_epoch(test_dataloader,
                                           predictor_model,
                                           predictor_loss_function)
                    test_scaf_loss = test_epoch(test_scaf_dataloader,
                                                predictor_model,
                                                predictor_loss_function)
                    test_line = 'train loss: %1.4f\ntest loss: %1.4f\nscaf loss: %1.4f'
                    print( test_line % (train_loss, test_loss, test_scaf_loss))     
                # test_loss / test_scaf_loss hold the values from the final epoch.
                delta = datetime.now() - start_time
                df_result_maccs.loc[len(df_result_maccs.index)] = [bs, ls, lr, ep, delta.total_seconds(), float(test_loss), float(test_scaf_loss)]


train loss: 0.4047
test loss: 0.2828
scaf loss: 0.3059
train loss: 0.3670
test loss: 0.3832
scaf loss: 0.4053
train loss: 0.3608
test loss: 0.5711
scaf loss: 0.6052
train loss: 0.3593
test loss: 0.3230
scaf loss: 0.3442
train loss: 0.3656
test loss: 0.3293
scaf loss: 0.3552
train loss: 0.3709
test loss: 0.4878
scaf loss: 0.4931
train loss: 0.3809
test loss: 1.0034
scaf loss: 1.0194
train loss: 0.3403
test loss: 0.3306
scaf loss: 0.3427
train loss: 0.3514
test loss: 0.3156
scaf loss: 0.3303
train loss: 0.3485
test loss: 0.3342
scaf loss: 0.3648
train loss: 0.7495
test loss: 0.8326
scaf loss: 0.8397
train loss: 0.8645
test loss: 0.8929
scaf loss: 0.8772
train loss: 0.8647
test loss: 0.8347
scaf loss: 0.8484
train loss: 0.8642
test loss: 0.8484
scaf loss: 0.8446
train loss: 0.8639
test loss: 0.8455
scaf loss: 0.8659
train loss: 0.8648
test loss: 0.8324
scaf loss: 0.8403
train loss: 0.8651
test loss: 0.8375
scaf loss: 0.8392
train loss: 0.8646
test loss: 0.9097
scaf loss: 0.9464
train loss

In [54]:
# Show the five grid combinations with the lowest last-epoch test loss.
# NOTE(review): ranking by losses_test means the test split drives model selection.
df_result_maccs.sort_values(by=['losses_test']).head(5)

Unnamed: 0,batch_size,hidden_size,learning_rate,epoch,time,losses_test,losses_test_scaf
77,20,"(167, 128, 100)",0.01,30,97.438087,0.112102,0.131636
14,5,"(167, 64, 50)",0.01,30,282.497749,0.112736,0.134574
23,5,"(167, 128, 100)",0.01,30,313.677102,0.113063,0.13183
22,5,"(167, 128, 100)",0.01,20,207.993233,0.114138,0.132908
49,10,"(167, 128, 100)",0.01,20,111.879361,0.11593,0.133869


Получилось, что наилучшие параметры — это **batch_size = 20**, **hidden_size = (167, 128, 100)**, **learning_rate = 0.01**, **epoch = 30**, при этом обучение занимает всего около 100 секунд. Обучим автоэнкодер с оптимальными параметрами.

In [55]:
# Reload the CSV without the nrows limit (this rebinds `df`) and rebuild the
# MACCS datasets from it.
df = pd.read_csv('./dataset_v1.csv')
train = LogPDataset(df, 'maccs', 'train')
test = LogPDataset(df, 'maccs', 'test')
test_scaf = LogPDataset(df, 'maccs', 'test_scaffolds')
# Single-element lists: the training cell below runs exactly one configuration.
batch_size = [20]
layer_sizes = [(167, 128, 100)	]
learning_rate = [0.01]
epochs = [30]

In [56]:
# Train the predictor on MACCS fingerprints for each combination in the
# (single-element) parameter lists and append the wall time and last-epoch
# test losses to df_result_maccs.
for bs in batch_size:
    for ls in layer_sizes:
        for lr in learning_rate:
            for ep in epochs:
                start_time = datetime.now()

                # predictor_model and ae_model share the same encoder_model instance.
                encoder_model = Encoder(ls)
                decoder_model = Decoder(ls)
                predictor_model = Predictor(encoder_model)
                ae_model = Autoencoder(encoder_model, decoder_model)
                
                # create dataloaders (only the training loader is shuffled)
                # NOTE(review): this rebinds the global `batch_size`, the name the
                # outermost loop iterates over. The running loop is unaffected
                # (its iterator already exists), but afterwards `batch_size` is the
                # int 64, so re-running this cell fails. train_net reads this
                # global for its progress-bar totals.
                batch_size = 64
                train_dataloader = DataLoader(dataset = train, batch_size = bs, shuffle = True)
                test_dataloader = DataLoader(dataset = test, batch_size = bs, shuffle = False)
                test_scaf_dataloader = DataLoader(dataset = test_scaf, batch_size = bs, shuffle = False)
                
                # define loss functions
                predictor_loss_function = nn.MSELoss()
                autoencoder_loss_function = nn.BCELoss()
                
                # define optimisers
                # NOTE(review): autoencoder_optimizer and autoencoder_loss_function
                # are created but never used in this cell; only the predictor
                # (encoder + linear head) is trained below.
                autoencoder_optimizer = torch.optim.Adam(ae_model.parameters(), lr = lr)
                predictor_optimizer = torch.optim.SGD(predictor_model.parameters(), lr = lr)
                for epoch in range(ep):
                    train_loss = train_epoch(train_dataloader,
                                             predictor_model,
                                             predictor_loss_function,
                                             predictor_optimizer)
                    test_loss = test_epoch(test_dataloader,
                                           predictor_model,
                                           predictor_loss_function)
                    test_scaf_loss = test_epoch(test_scaf_dataloader,
                                                predictor_model,
                                                predictor_loss_function)
                    test_line = 'train loss: %1.4f\ntest loss: %1.4f\nscaf loss: %1.4f'
                    print( test_line % (train_loss, test_loss, test_scaf_loss))     
                # test_loss / test_scaf_loss hold the values from the final epoch.
                delta = datetime.now() - start_time
                df_result_maccs.loc[len(df_result_maccs.index)] = [bs, ls, lr, ep, delta.total_seconds(), float(test_loss), float(test_scaf_loss)]


train loss: 0.2690
test loss: 0.2327
scaf loss: 0.2270
train loss: 0.1930
test loss: 0.1781
scaf loss: 0.1814
train loss: 0.1726
test loss: 0.2155
scaf loss: 0.2177
train loss: 0.1600
test loss: 0.1642
scaf loss: 0.1673
train loss: 0.1511
test loss: 0.1594
scaf loss: 0.1690
train loss: 0.1442
test loss: 0.1492
scaf loss: 0.1589
train loss: 0.1384
test loss: 0.1521
scaf loss: 0.1652
train loss: 0.1338
test loss: 0.1382
scaf loss: 0.1478
train loss: 0.1304
test loss: 0.1374
scaf loss: 0.1484
train loss: 0.1266
test loss: 0.1330
scaf loss: 0.1442
train loss: 0.1235
test loss: 0.1364
scaf loss: 0.1476
train loss: 0.1210
test loss: 0.1301
scaf loss: 0.1430
train loss: 0.1182
test loss: 0.1359
scaf loss: 0.1507
train loss: 0.1153
test loss: 0.1267
scaf loss: 0.1400
train loss: 0.1139
test loss: 0.1300
scaf loss: 0.1448
train loss: 0.1117
test loss: 0.1258
scaf loss: 0.1413
train loss: 0.1103
test loss: 0.1228
scaf loss: 0.1361
train loss: 0.1081
test loss: 0.1216
scaf loss: 0.1369
train loss

In [73]:
# Train a Net on the output of the global encoder_model left by the previous
# cell; 100 is Net's input width and matches the last entry of layer_sizes.
train_net(100)

4348it [00:03, 1256.73it/s]                        
478it [00:00, 3024.62it/s]             
471it [00:00, 2906.76it/s]             


train loss: 0.4040 test loss: 0.1597 scaf loss: 0.1781


4348it [00:03, 1224.86it/s]                        
478it [00:00, 2968.28it/s]             
471it [00:00, 2924.79it/s]             


train loss: 0.1305 test loss: 0.1430 scaf loss: 0.1649


4348it [00:03, 1216.63it/s]                        
478it [00:00, 3123.47it/s]             
471it [00:00, 3160.35it/s]             


train loss: 0.1181 test loss: 0.1347 scaf loss: 0.1569


4348it [00:03, 1213.91it/s]                        
478it [00:00, 2896.32it/s]             
471it [00:00, 2888.94it/s]             


train loss: 0.1106 test loss: 0.1292 scaf loss: 0.1519


4348it [00:03, 1239.17it/s]                        
478it [00:00, 2878.89it/s]             
471it [00:00, 2836.72it/s]             


train loss: 0.1053 test loss: 0.1251 scaf loss: 0.1475


4348it [00:03, 1233.55it/s]                        
478it [00:00, 2949.93it/s]             
471it [00:00, 2924.82it/s]             


train loss: 0.1013 test loss: 0.1224 scaf loss: 0.1443


4348it [00:03, 1229.71it/s]                        
478it [00:00, 2913.97it/s]             
471it [00:00, 3098.00it/s]             


train loss: 0.0981 test loss: 0.1198 scaf loss: 0.1416


4348it [00:03, 1213.91it/s]                        
478it [00:00, 2931.83it/s]             
471it [00:00, 3038.02it/s]             


train loss: 0.0956 test loss: 0.1181 scaf loss: 0.1396


4348it [00:03, 1235.30it/s]                        
478it [00:00, 2528.54it/s]             
471it [00:00, 2961.60it/s]             


train loss: 0.0936 test loss: 0.1165 scaf loss: 0.1380


4348it [00:03, 1237.41it/s]                        
478it [00:00, 2715.26it/s]             
471it [00:00, 2980.33it/s]             


train loss: 0.0920 test loss: 0.1155 scaf loss: 0.1367


4348it [00:03, 1236.35it/s]                        
478it [00:00, 2699.95it/s]             
471it [00:00, 2853.89it/s]             


train loss: 0.0907 test loss: 0.1147 scaf loss: 0.1357


4348it [00:03, 1235.65it/s]                        
478it [00:00, 3005.62it/s]             
471it [00:00, 3077.72it/s]             


train loss: 0.0897 test loss: 0.1139 scaf loss: 0.1348


4348it [00:03, 1205.83it/s]                        
478it [00:00, 2913.99it/s]             
471it [00:00, 2961.59it/s]             


train loss: 0.0889 test loss: 0.1136 scaf loss: 0.1343


4348it [00:03, 1203.16it/s]                        
478it [00:00, 3005.61it/s]             
471it [00:00, 2331.16it/s]             


train loss: 0.0883 test loss: 0.1130 scaf loss: 0.1336


4348it [00:03, 1231.45it/s]                        
478it [00:00, 2730.78it/s]             
471it [00:00, 2645.48it/s]             


train loss: 0.0877 test loss: 0.1129 scaf loss: 0.1334


4348it [00:03, 1207.50it/s]                        
478it [00:00, 2699.98it/s]             
471it [00:00, 2630.67it/s]             


train loss: 0.0873 test loss: 0.1126 scaf loss: 0.1330


4348it [00:03, 1200.17it/s]                        
478it [00:00, 2844.60it/s]             
471it [00:00, 2836.70it/s]             


train loss: 0.0869 test loss: 0.1123 scaf loss: 0.1326


4348it [00:03, 1213.91it/s]                        
478it [00:00, 3043.90it/s]             
471it [00:00, 2819.70it/s]             


train loss: 0.0866 test loss: 0.1125 scaf loss: 0.1327


4348it [00:03, 1187.71it/s]                        
478it [00:00, 2878.88it/s]             
471it [00:00, 3097.96it/s]             


train loss: 0.0864 test loss: 0.1123 scaf loss: 0.1324


4348it [00:03, 1218.67it/s]                        
478it [00:00, 3063.40it/s]             
471it [00:00, 2836.72it/s]             


train loss: 0.0861 test loss: 0.1121 scaf loss: 0.1321


4348it [00:03, 1191.62it/s]                        
478it [00:00, 2878.86it/s]             
471it [00:00, 3038.00it/s]             


train loss: 0.0859 test loss: 0.1118 scaf loss: 0.1318


4348it [00:03, 1202.16it/s]                        
478it [00:00, 3063.43it/s]             
471it [00:00, 3203.35it/s]             


train loss: 0.0858 test loss: 0.1118 scaf loss: 0.1317


4348it [00:03, 1234.60it/s]                        
478it [00:00, 2861.62it/s]             
471it [00:00, 2980.36it/s]             


train loss: 0.0856 test loss: 0.1117 scaf loss: 0.1317


4348it [00:03, 1210.87it/s]                        
478it [00:00, 2986.84it/s]             
471it [00:00, 2980.33it/s]             


train loss: 0.0855 test loss: 0.1117 scaf loss: 0.1316


4348it [00:03, 1232.85it/s]                        
478it [00:00, 3103.23it/s]             
471it [00:00, 3018.53it/s]             


train loss: 0.0854 test loss: 0.1115 scaf loss: 0.1313


4348it [00:03, 1201.83it/s]                        
478it [00:00, 3043.91it/s]             
471it [00:00, 2906.76it/s]             


train loss: 0.0852 test loss: 0.1115 scaf loss: 0.1313


4348it [00:03, 1179.33it/s]                        
478it [00:00, 2986.83it/s]             
471it [00:00, 2924.82it/s]             


train loss: 0.0852 test loss: 0.1114 scaf loss: 0.1312


4348it [00:03, 1218.33it/s]                        
478it [00:00, 2968.28it/s]             
471it [00:00, 2786.35it/s]             


train loss: 0.0850 test loss: 0.1117 scaf loss: 0.1314


4348it [00:03, 1188.36it/s]                        
478it [00:00, 3005.61it/s]             
471it [00:00, 2999.32it/s]             


train loss: 0.0850 test loss: 0.1115 scaf loss: 0.1312


4348it [00:03, 1200.50it/s]                        
478it [00:00, 2968.27it/s]             
471it [00:00, 3118.53it/s]             


train loss: 0.0849 test loss: 0.1114 scaf loss: 0.1311


Лучший результат для фингерпринтов maccs после сжатия с помощью автоэнкодера до вектора размерностью 100: **test loss = 0.1114**

Определим теперь оптимальные параметры для фингерпринтов morgan

In [78]:
# Rebuild the three split datasets with Morgan fingerprints (radius 2,
# 2048 bits) from the current `df`; this rebinds train/test/test_scaf.
train = LogPDataset(df, 'morgan', 'train')
test = LogPDataset(df, 'morgan', 'test')
test_scaf = LogPDataset(df, 'morgan', 'test_scaffolds')

In [79]:
# Hyper-parameter grid for the Morgan run; same values as the MACCS grid except
# that layer_sizes start at 2048, the Morgan fingerprint length.
batch_size = [5, 10, 20]
layer_sizes = [(2048, 32, 10), (2048, 64, 50), (2048, 128, 100)]
learning_rate = [0.1, 0.01, 0.001]
epochs = [10, 20, 30]

In [80]:
# Empty results table; the search cell appends one row per grid combination.
df_result_morgan = pd.DataFrame(columns=['batch_size', 'hidden_size', 'learning_rate', 'epoch', 'time', 'losses_test', 'losses_test_scaf'])

In [81]:
# Grid search on Morgan fingerprints: for every (batch size, layer sizes,
# learning rate, epoch count) combination, build fresh models, train the
# predictor from scratch for `ep` epochs, and append the wall time and the
# last-epoch test losses to df_result_morgan.
for bs in batch_size:
    for ls in layer_sizes:
        for lr in learning_rate:
            for ep in epochs:
                start_time = datetime.now()

                # predictor_model and ae_model share the same encoder_model instance.
                encoder_model = Encoder(ls)
                decoder_model = Decoder(ls)
                predictor_model = Predictor(encoder_model)
                ae_model = Autoencoder(encoder_model, decoder_model)
                
                # create dataloaders (only the training loader is shuffled)
                train_dataloader = DataLoader(dataset = train, batch_size = bs, shuffle = True)
                test_dataloader = DataLoader(dataset = test, batch_size = bs, shuffle = False)
                test_scaf_dataloader = DataLoader(dataset = test_scaf, batch_size = bs, shuffle = False)
                
                # define loss functions
                predictor_loss_function = nn.MSELoss()
                autoencoder_loss_function = nn.BCELoss()
                
                # define optimisers
                # NOTE(review): autoencoder_optimizer and autoencoder_loss_function
                # are created but never used in this cell; only the predictor
                # (encoder + linear head) is trained below.
                autoencoder_optimizer = torch.optim.Adam(ae_model.parameters(), lr = lr)
                predictor_optimizer = torch.optim.SGD(predictor_model.parameters(), lr = lr)
                for epoch in range(ep):
                    train_loss = train_epoch(train_dataloader,
                                             predictor_model,
                                             predictor_loss_function,
                                             predictor_optimizer)
                    test_loss = test_epoch(test_dataloader,
                                           predictor_model,
                                           predictor_loss_function)
                    test_scaf_loss = test_epoch(test_scaf_dataloader,
                                                predictor_model,
                                                predictor_loss_function)
                    test_line = 'train loss: %1.4f\ntest loss: %1.4f\nscaf loss: %1.4f'
                    print( test_line % (train_loss, test_loss, test_scaf_loss))     
                # test_loss / test_scaf_loss hold the values from the final epoch.
                delta = datetime.now() - start_time
                df_result_morgan.loc[len(df_result_morgan.index)] = [bs, ls, lr, ep, delta.total_seconds(), float(test_loss), float(test_scaf_loss)]


train loss: 0.3047
test loss: 0.3574
scaf loss: 0.3935
train loss: 0.2281
test loss: 0.2991
scaf loss: 0.3329
train loss: 0.2100
test loss: 0.1696
scaf loss: 0.1916
train loss: 0.1925
test loss: 0.1472
scaf loss: 0.1637
train loss: 0.1777
test loss: 0.2239
scaf loss: 0.2517
train loss: 0.1678
test loss: 0.1516
scaf loss: 0.1638
train loss: 0.1613
test loss: 0.2678
scaf loss: 0.2880
train loss: 0.1550
test loss: 0.1528
scaf loss: 0.1768
train loss: 0.1485
test loss: 0.3203
scaf loss: 0.3678
train loss: 0.1449
test loss: 0.2525
scaf loss: 0.2843
train loss: 0.2243
test loss: 0.2020
scaf loss: 0.2145
train loss: 0.1434
test loss: 0.1328
scaf loss: 0.1366
train loss: 0.1221
test loss: 0.1123
scaf loss: 0.1199
train loss: 0.0978
test loss: 0.0978
scaf loss: 0.1061
train loss: 0.0852
test loss: 0.0944
scaf loss: 0.1057
train loss: 0.0759
test loss: 0.0960
scaf loss: 0.1053
train loss: 0.0680
test loss: 0.0882
scaf loss: 0.1013
train loss: 0.0623
test loss: 0.0858
scaf loss: 0.0975
train loss

In [82]:
# Show the five grid combinations with the lowest last-epoch test loss.
# NOTE(review): ranking by losses_test means the test split drives model selection.
df_result_morgan.sort_values(by=['losses_test']).head(5)

Unnamed: 0,batch_size,hidden_size,learning_rate,epoch,time,losses_test,losses_test_scaf
46,10,"(2048, 128, 100)",0.1,20,274.102138,0.040084,0.053667
47,10,"(2048, 128, 100)",0.1,30,422.142039,0.040182,0.054611
74,20,"(2048, 128, 100)",0.1,30,245.529777,0.040798,0.051974
23,5,"(2048, 128, 100)",0.01,30,611.074524,0.041041,0.055623
22,5,"(2048, 128, 100)",0.01,20,427.120584,0.041159,0.055883


Получилось, что наилучшие параметры — это **batch_size = 10**, **hidden_size = (2048, 128, 100)**, **learning_rate = 0.1**, **epoch = 20**, при этом обучение занимает **274** секунды. Обучим автоэнкодер с оптимальными параметрами.

In [83]:
# Single-element lists: the training cell below runs exactly one configuration
# on the Morgan datasets.
batch_size = [10]
layer_sizes = [(2048, 128, 100)]
learning_rate = [0.1]
epochs = [20]

In [84]:
# Train the predictor on Morgan fingerprints for each combination in the
# (single-element) parameter lists and append the wall time and last-epoch
# test losses to df_result_morgan.
for bs in batch_size:
    for ls in layer_sizes:
        for lr in learning_rate:
            for ep in epochs:
                start_time = datetime.now()

                # predictor_model and ae_model share the same encoder_model instance.
                encoder_model = Encoder(ls)
                decoder_model = Decoder(ls)
                predictor_model = Predictor(encoder_model)
                ae_model = Autoencoder(encoder_model, decoder_model)
                
                # create dataloaders (only the training loader is shuffled)
                # NOTE(review): this rebinds the global `batch_size`, the name the
                # outermost loop iterates over. The running loop is unaffected
                # (its iterator already exists), but afterwards `batch_size` is the
                # int 64, so re-running this cell fails. train_net reads this
                # global for its progress-bar totals.
                batch_size = 64
                train_dataloader = DataLoader(dataset = train, batch_size = bs, shuffle = True)
                test_dataloader = DataLoader(dataset = test, batch_size = bs, shuffle = False)
                test_scaf_dataloader = DataLoader(dataset = test_scaf, batch_size = bs, shuffle = False)
                
                # define loss functions
                predictor_loss_function = nn.MSELoss()
                autoencoder_loss_function = nn.BCELoss()
                
                # define optimisers
                # NOTE(review): autoencoder_optimizer and autoencoder_loss_function
                # are created but never used in this cell; only the predictor
                # (encoder + linear head) is trained below.
                autoencoder_optimizer = torch.optim.Adam(ae_model.parameters(), lr = lr)
                predictor_optimizer = torch.optim.SGD(predictor_model.parameters(), lr = lr)
                for epoch in range(ep):
                    train_loss = train_epoch(train_dataloader,
                                             predictor_model,
                                             predictor_loss_function,
                                             predictor_optimizer)
                    test_loss = test_epoch(test_dataloader,
                                           predictor_model,
                                           predictor_loss_function)
                    test_scaf_loss = test_epoch(test_scaf_dataloader,
                                                predictor_model,
                                                predictor_loss_function)
                    test_line = 'train loss: %1.4f\ntest loss: %1.4f\nscaf loss: %1.4f'
                    print( test_line % (train_loss, test_loss, test_scaf_loss))     
                # test_loss / test_scaf_loss hold the values from the final epoch.
                delta = datetime.now() - start_time
                df_result_morgan.loc[len(df_result_morgan.index)] = [bs, ls, lr, ep, delta.total_seconds(), float(test_loss), float(test_scaf_loss)]


train loss: 0.1750
test loss: 0.1452
scaf loss: 0.1553
train loss: 0.0877
test loss: 0.1012
scaf loss: 0.1085
train loss: 0.0604
test loss: 0.0678
scaf loss: 0.0788
train loss: 0.0454
test loss: 0.0615
scaf loss: 0.0711
train loss: 0.0351
test loss: 0.0675
scaf loss: 0.0794
train loss: 0.0282
test loss: 0.0530
scaf loss: 0.0647
train loss: 0.0228
test loss: 0.0487
scaf loss: 0.0623
train loss: 0.0191
test loss: 0.0469
scaf loss: 0.0584
train loss: 0.0164
test loss: 0.0461
scaf loss: 0.0585
train loss: 0.0139
test loss: 0.0478
scaf loss: 0.0609
train loss: 0.0121
test loss: 0.0431
scaf loss: 0.0558
train loss: 0.0106
test loss: 0.0493
scaf loss: 0.0601
train loss: 0.0095
test loss: 0.0429
scaf loss: 0.0552
train loss: 0.0085
test loss: 0.0507
scaf loss: 0.0637
train loss: 0.0077
test loss: 0.0438
scaf loss: 0.0559
train loss: 0.0070
test loss: 0.0410
scaf loss: 0.0548
train loss: 0.0064
test loss: 0.0427
scaf loss: 0.0544
train loss: 0.0059
test loss: 0.0417
scaf loss: 0.0548
train loss

In [85]:
# Train a Net on the output of the global encoder_model left by the previous
# cell; 100 is Net's input width and matches the last entry of layer_sizes.
train_net(100)

8696it [00:10, 832.11it/s]                        
955it [00:00, 2863.82it/s]             
942it [00:00, 2833.68it/s]             


train loss: 0.2019 test loss: 0.0508 scaf loss: 0.0680


8696it [00:10, 833.53it/s]                        
955it [00:00, 2775.53it/s]             
942it [00:00, 2915.75it/s]             


train loss: 0.0131 test loss: 0.0439 scaf loss: 0.0587


8696it [00:10, 840.34it/s]                        
955it [00:00, 2841.61it/s]             
942it [00:00, 2828.20it/s]             


train loss: 0.0086 test loss: 0.0423 scaf loss: 0.0566


8696it [00:10, 841.16it/s]                        
955it [00:00, 2791.77it/s]             
942it [00:00, 2675.52it/s]             


train loss: 0.0070 test loss: 0.0416 scaf loss: 0.0556


8696it [00:10, 846.91it/s]                        
955it [00:00, 2783.63it/s]             
942it [00:00, 2811.31it/s]             


train loss: 0.0061 test loss: 0.0412 scaf loss: 0.0551


8696it [00:10, 842.31it/s]                        
955it [00:00, 2858.64it/s]             
942it [00:00, 2828.19it/s]             


train loss: 0.0056 test loss: 0.0409 scaf loss: 0.0547


8696it [00:10, 840.53it/s]                        
955it [00:00, 2816.47it/s]             
942it [00:00, 2845.28it/s]             


train loss: 0.0053 test loss: 0.0407 scaf loss: 0.0544


8696it [00:10, 839.98it/s]                        
955it [00:00, 2850.10it/s]             
942it [00:00, 2915.74it/s]             


train loss: 0.0050 test loss: 0.0406 scaf loss: 0.0542


8696it [00:10, 840.71it/s]                        
955it [00:00, 2833.19it/s]             
942it [00:00, 2853.90it/s]             


train loss: 0.0048 test loss: 0.0404 scaf loss: 0.0541


8696it [00:10, 840.30it/s]                        
955it [00:00, 2858.63it/s]             
942it [00:00, 2961.60it/s]             


train loss: 0.0047 test loss: 0.0404 scaf loss: 0.0539


8696it [00:10, 838.13it/s]                        
955it [00:00, 2928.79it/s]             
942it [00:00, 2828.18it/s]             


train loss: 0.0046 test loss: 0.0403 scaf loss: 0.0539


8696it [00:10, 838.77it/s]                        
955it [00:00, 2858.63it/s]             
942it [00:00, 2871.30it/s]             


train loss: 0.0045 test loss: 0.0402 scaf loss: 0.0538


8696it [00:10, 840.81it/s]                        
955it [00:00, 2833.19it/s]             
942it [00:00, 2794.62it/s]             


train loss: 0.0044 test loss: 0.0402 scaf loss: 0.0537


8696it [00:10, 842.13it/s]                        
955it [00:00, 2974.41it/s]             
942it [00:00, 2893.63it/s]             


train loss: 0.0043 test loss: 0.0402 scaf loss: 0.0537


8696it [00:10, 844.85it/s]                        
955it [00:00, 2808.19it/s]             
942it [00:00, 2819.72it/s]             


train loss: 0.0043 test loss: 0.0401 scaf loss: 0.0536


8696it [00:10, 841.12it/s]                        
955it [00:00, 2833.18it/s]             
942it [00:00, 2880.07it/s]             


train loss: 0.0042 test loss: 0.0401 scaf loss: 0.0536


8696it [00:10, 837.55it/s]                        
955it [00:00, 2841.62it/s]             
942it [00:00, 2782.53it/s]             


train loss: 0.0042 test loss: 0.0401 scaf loss: 0.0536


8696it [00:10, 834.06it/s]                        
955it [00:00, 2775.53it/s]             
942it [00:00, 2826.40it/s]             


train loss: 0.0042 test loss: 0.0401 scaf loss: 0.0536


8696it [00:10, 840.94it/s]                        
955it [00:00, 2910.92it/s]             
942it [00:00, 2811.31it/s]             


train loss: 0.0041 test loss: 0.0401 scaf loss: 0.0536


8696it [00:10, 837.60it/s]                        
955it [00:00, 2965.18it/s]             
942it [00:00, 2794.62it/s]             


train loss: 0.0041 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 845.19it/s]                        
955it [00:00, 2983.70it/s]             
942it [00:00, 2845.29it/s]             


train loss: 0.0041 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 844.90it/s]                        
955it [00:00, 2884.53it/s]             
942it [00:00, 2897.80it/s]             


train loss: 0.0041 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 838.20it/s]                        
955it [00:00, 2875.86it/s]             
942it [00:00, 2915.75it/s]             


train loss: 0.0041 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 846.78it/s]                        
955it [00:00, 2833.17it/s]             
942it [00:00, 2828.19it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 840.27it/s]                        
955it [00:00, 2833.19it/s]             
942it [00:00, 2880.08it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 837.26it/s]                        
955it [00:00, 2965.16it/s]             
942it [00:00, 2876.04it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0534


8696it [00:10, 843.32it/s]                        
955it [00:00, 2884.54it/s]             
942it [00:00, 2999.33it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0534


8696it [00:10, 841.27it/s]                        
955it [00:00, 2681.98it/s]             
942it [00:00, 2811.30it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0535


8696it [00:10, 843.11it/s]                        
955it [00:00, 2791.77it/s]             
942it [00:00, 2778.13it/s]             


train loss: 0.0040 test loss: 0.0399 scaf loss: 0.0534


8696it [00:10, 835.12it/s]                        
955it [00:00, 2867.22it/s]             
942it [00:00, 2853.90it/s]             


train loss: 0.0040 test loss: 0.0400 scaf loss: 0.0534


Лучший результат для фингерпринтов morgan после сжатия с помощью автоэнкодера до вектора размерностью 100: **test loss = 0.04**

Этот вариант лучше, чем нейросети без сжатия автоэнкодером на фингерпринтах morgan, которые показывали **0.13**. Однако RNN показывала минимальную ошибку, равную **0.03**, что немного лучше, чем результат данной модели.