**Задание:** 
1. Запустить Jupyter в докере и собрать значения LogP и MORGAN фингерпринты для датасета MOSES.

2. Обучить модели и сравнить качество на двух типах фингерпринтов (MACCS и MORGAN)

In [None]:
# general tools
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
# RDkit
from rdkit import Chem
from rdkit.Chem.rdmolops import GetAdjacencyMatrix
from rdkit.Chem.Crippen import MolLogP
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint, GetMorganFingerprintAsBitVect
# Pytorch
import torch
from torch.nn import Linear, MSELoss
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
df = pd.read_csv('dataset_v1.csv', nrows=10000)

In [4]:
def get_mol(smiles_or_mol):
    '''
    Loads SMILES/molecule into RDKit's object
    '''
    if isinstance(smiles_or_mol, str):
        if len(smiles_or_mol) == 0:
            return None
        mol = Chem.MolFromSmiles(smiles_or_mol)
        if mol is None:
            return None
        try:
            Chem.SanitizeMol(mol)
        except ValueError:
            return None
        return mol
    return smiles_or_mol

In [48]:
# в функцию добавлена возможность выгрузки для определенного типа фингерпринтов
def get_np_data(df, fp_type, morgan_size):
    '''
    Convert SMILES in pandas DataFrame to numpy arrays
    '''
    train = []
    test = []
    test_scaf = []

    for sid, smiles in enumerate(df.SMILES):
        mol = get_mol(smiles) # create rdkit molecule from SMILES
        if fp_type == 'morgan':
            fp = GetMorganFingerprintAsBitVect(mol, 2, nBits = morgan_size).ToList() # get morgan fingerprints
        elif fp_type == 'maccs':
            fp = GetMACCSKeysFingerprint(mol).ToList() # get maccs fingerprints
        logp = MolLogP(mol) # calculate logp

        if df.SPLIT[sid] == 'train':
            train.append(fp + [logp])
        elif df.SPLIT[sid] == 'test':
            test.append(fp + [logp])
        elif df.SPLIT[sid] == 'test_scaffolds':
            test_scaf.append(fp + [logp])

    train = np.array(train, dtype=np.float32)
    test = np.array(test, dtype=np.float32)
    test_scaf = np.array(test_scaf, dtype=np.float32)

    return train, test, test_scaf

In [6]:
# через указание функции активации можно осуществить ее выбор
class Net(torch.nn.Module):
    def __init__(self, input_size, hidden_size, activation):
        super(Net, self).__init__()
        self.fc1 = Linear(input_size, hidden_size)
        self.fc2 = Linear(hidden_size, 1)

    def forward(self, data):
        x = self.fc1(data)
        if activation == 'relu':
            x = F.relu(x)
        elif activation == 'leaky_relu':
            x = F.leaky_relu(x)
        elif activation == 'tanh':
            x = torch.tanh(x)
        elif activation == 'sigmoid':
            x = torch.sigmoid(x)
        x = self.fc2(x)
        
        return x

In [27]:
# реализована функция, которая оптимизирует batch_size, input_size, hidden_size, activation
def train_network(model_type, batch_size, input_size, hidden_size, activation):
    
    df_result = pd.DataFrame(columns=['model_type', 'batch_size', 'input_size', 'hidden_size', 'activation', 'losses', 'losses_test', 'losses_test_scaf'])

    for i_s in input_size:
        
        train, test, test_scaf = get_np_data(df, fp_type = model_type, morgan_size = i_s)
        
        for bs in batch_size:
            for h_s in hidden_size:
                for act in activation:
                    
                    if model_type == 'maccs':
                        model = Net(167, h_s, act)
                    elif model_type == 'morgan':
                        model = Net(i_s, h_s, act)

                    # create dataloader for training
                    train_dataloader = DataLoader(dataset = train, batch_size = bs, shuffle = True)
                    test_dataloader = DataLoader(dataset = test, batch_size = bs, shuffle = False)
                    test_scaf_dataloader = DataLoader(dataset = test_scaf, batch_size = bs, shuffle = False)

                    # define loss function
                    loss_function = MSELoss()

                    # define optimiser
                    optimiser = torch.optim.SGD(model.parameters(), lr = 1e-3)
    
    
                    # loop over 10 training epochs
                    for epoch in range(10):

                        losses = []
                        losses_test = []
                        losses_test_scaf = []

                        # set model to training mode
                        model.train()

                        # loop over minibatches for training
                        for (k, batch) in tqdm(enumerate(train_dataloader), total = len(train) // 2**7):
                            # compute current value of loss function via forward pass
                            x, y = batch[:, :-1], batch[:, -1]
                            output = model(x)
                            loss_function_value = loss_function(output[:, 0], y)
                            losses.append(loss_function_value.detach())

                            # set past gradient to zero
                            optimiser.zero_grad()

                            # compute current gradient via backward pass
                            loss_function_value.backward()

                            # update model weights using gradient and optimisation method
                            optimiser.step()

                        with torch.no_grad():

                            for (k, batch) in tqdm(enumerate(test_dataloader), total = len(test) // bs):
                                x, y = batch[:, :-1], batch[:, -1]
                                # compute current value of loss function via forward pass
                                output = model(x)
                                loss_function_value = loss_function(output[:, 0], y)
                                losses_test.append(loss_function_value.detach())

                            for (k, batch) in tqdm(enumerate(test_scaf_dataloader), total = len(test_scaf) // bs):
                                x, y = batch[:, :-1], batch[:, -1]
                                # compute current value of loss function via forward pass
                                output = model(x)
                                loss_function_value = loss_function(output[:, 0], y)
                                losses_test_scaf.append(loss_function_value.detach())
                            test_line = 'train loss: %1.4f\ntest loss: %1.4f\nscaf loss: %1.4f'
                        #print( test_line % (np.mean(losses), np.mean(losses_test), np.mean(losses_test_scaf)))
                    
                    df_result.loc[len(df_result.index)] = [model_type, bs, i_s, h_s, act, np.mean(losses), np.mean(losses_test), np.mean(losses_test_scaf)]
    
    return df_result

In [42]:
batch_size = [5, 10, 15, 20]
input_size_maccs = [167]
input_size_morgan = [2048, 1024, 512, 128]
hidden_size = [10, 25, 50, 100]
activation = ['relu', 'leaky_relu', 'tanh', 'sigmoid']

In [43]:
df_result_maccs = train_network(model_type = 'maccs', batch_size = batch_size, input_size = input_size_maccs, hidden_size = hidden_size, activation = activation)

1660it [00:00, 3210.18it/s]           
177it [00:00, 7671.56it/s]             
164it [00:00, 7441.54it/s]             
1660it [00:00, 3180.35it/s]           
177it [00:00, 8039.59it/s]             
164it [00:00, 7967.22it/s]             
1660it [00:00, 3174.31it/s]           
177it [00:00, 8019.70it/s]             
164it [00:00, 7261.64it/s]             
1660it [00:00, 3197.82it/s]           
177it [00:00, 7858.58it/s]             
164it [00:00, 7444.11it/s]             
1660it [00:00, 2873.19it/s]           
177it [00:00, 7689.59it/s]             
164it [00:00, 7803.36it/s]             
1660it [00:00, 3173.52it/s]           
177it [00:00, 7690.87it/s]             
164it [00:00, 7448.79it/s]             
1660it [00:00, 3216.86it/s]           
177it [00:00, 7831.22it/s]             
164it [00:00, 7858.72it/s]             
1660it [00:00, 3101.13it/s]           
177it [00:00, 7682.99it/s]             
164it [00:00, 6288.14it/s]             
1660it [00:00, 3169.06it/s]           
177it [00

In [49]:
df_result_morgan = train_network(model_type = 'morgan', batch_size = batch_size, input_size = input_size_morgan, hidden_size = hidden_size, activation = activation)

1660it [00:00, 3021.29it/s]           
177it [00:00, 7561.62it/s]             
164it [00:00, 7105.76it/s]             
1660it [00:00, 3024.08it/s]           
177it [00:00, 7499.51it/s]             
164it [00:00, 6824.74it/s]             
1660it [00:00, 2959.82it/s]           
177it [00:00, 7673.38it/s]             
164it [00:00, 7161.76it/s]             
1660it [00:00, 3043.82it/s]           
177it [00:00, 7564.62it/s]             
164it [00:00, 7067.79it/s]             
1660it [00:00, 3052.45it/s]           
177it [00:00, 7824.53it/s]             
164it [00:00, 7438.64it/s]             
1660it [00:00, 3031.86it/s]           
177it [00:00, 7363.17it/s]             
164it [00:00, 7480.87it/s]             
1660it [00:00, 3026.89it/s]           
177it [00:00, 7370.26it/s]             
164it [00:00, 7449.76it/s]             
1660it [00:00, 3018.43it/s]           
177it [00:00, 7371.51it/s]             
164it [00:00, 7111.49it/s]             
1660it [00:00, 3028.57it/s]           
177it [00

#### Результаты

В результате видим, что для фингерпринтов MACCS нет определенной зависимости от hidden_size, однако функция актвации leaky_relu показывает лучшие результаты наряду с tanh.

In [50]:
df_result_maccs.sort_values(by=['losses_test']).head(5)

Unnamed: 0,model_type,batch_size,input_size,hidden_size,activation,losses,losses_test,losses_test_scaf
1,maccs,5,167,10,leaky_relu,0.236788,0.235108,0.243886
13,maccs,5,167,100,leaky_relu,0.236332,0.236743,0.242199
10,maccs,5,167,50,tanh,0.235685,0.237568,0.241355
5,maccs,5,167,25,leaky_relu,0.237715,0.237727,0.243002
2,maccs,5,167,10,tanh,0.237591,0.238063,0.245326


В случае фингерпринтов MORGAN hidden_size также влияет неопределенно, а наилучше функции активации sigmoid, relu и leaky_relu. При этом есть сильная зависимость от размера фингерпринта, с увеличением которого происходит уменьшение ошибки

In [51]:
df_result_morgan.sort_values(by=['losses_test']).head(5)

Unnamed: 0,model_type,batch_size,input_size,hidden_size,activation,losses,losses_test,losses_test_scaf
15,morgan,5,2048,100,sigmoid,0.130928,0.147829,0.166266
12,morgan,5,2048,100,relu,0.130881,0.14794,0.169731
3,morgan,5,2048,10,sigmoid,0.129678,0.147957,0.169699
1,morgan,5,2048,10,leaky_relu,0.130238,0.148105,0.167051
9,morgan,5,2048,50,leaky_relu,0.130465,0.148562,0.165993


Также для обоих типов фингерпринтов видно, что batch_size должен быть минимальным для уменьшения ошибки