In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

from rdkit import DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import PandasTools, AllChem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

from typing import List, Union

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


In [2]:
def morgan_binary_features_generator(mol: Union[str, Chem.Mol], plot_img = False,
                                     radius: int = 2,
                                     num_bits: int = 2048) -> np.ndarray:
    """Compute a binary Morgan fingerprint for one molecule.

    :param mol: A SMILES string or an already-built RDKit molecule.
    :param plot_img: When true, render the molecule with the notebook's ``display``.
    :param radius: Morgan radius passed to RDKit.
    :param num_bits: Requested fingerprint length (``nBits``).
    :return: 1-D float array holding the fingerprint bits (expected length ``num_bits``).
    :raises ValueError: If ``mol`` is None or a SMILES string RDKit cannot parse.
    """
    if isinstance(mol, str):
        smiles = mol
        # MolFromSmiles returns None (it does not raise) for an unparsable string.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    elif mol is None:
        raise ValueError("mol must be a SMILES string or an RDKit Mol, got None")

    if plot_img:
        # `display` is provided by the IPython/Jupyter runtime.
        display(mol)

    features_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=num_bits)
    # Placeholder buffer; ConvertToNumpyArray is relied on to resize it to the bit-vector length.
    features = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [3]:
def getMolDescriptors(mol: Union[str, Chem.Mol], missingVal=None):
    ''' Calculate the full list of RDKit descriptors for a molecule.

        mol may be a SMILES string or an RDKit molecule.
        missingVal is used for any descriptor that cannot be calculated.

        Returns a dict mapping each descriptor name in Descriptors._descList
        to its value (or to missingVal).
    '''
    import traceback
    import warnings

    mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
    if mol is None:
        # Nothing can be computed without a molecule: report once instead of
        # printing one traceback per descriptor, and keep the same dict shape.
        warnings.warn("getMolDescriptors: molecule is None (unparsable SMILES?); "
                      "all descriptors set to missingVal")
        return {nm: missingVal for nm, _ in Descriptors._descList}

    res = {}
    for nm, fn in Descriptors._descList:
        # some of the descriptor functions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            # print the error message:
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [4]:
# Load the raw train / test tables.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Impute missing AlogP with each table's own column mean.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form does not update the frame under pandas
# Copy-on-Write.
# NOTE(review): the test table is imputed with its own mean rather than the
# training mean -- confirm this is intended.
train_df["AlogP"] = train_df["AlogP"].fillna(train_df["AlogP"].mean())
test_df["AlogP"] = test_df["AlogP"].fillna(test_df["AlogP"].mean())

# Drop any remaining incomplete training rows, then renumber the index 0..n-1 so
# that positional indexing and the later index-aligned concat stay consistent.
train_df = train_df.dropna(axis=0).reset_index(drop=True)

# One fingerprint row per molecule, in the same row order as the source table.
train_fps = pd.DataFrame(train_df["SMILES"].apply(morgan_binary_features_generator).tolist())
test_fps = pd.DataFrame(test_df["SMILES"].apply(morgan_binary_features_generator).tolist())

# Prefix the integer bit positions so the columns read FPS_0, FPS_1, ...
train_fps = train_fps.rename(columns=lambda x: "FPS_" + str(x))
test_fps = test_fps.rename(columns=lambda x: "FPS_" + str(x))


In [5]:
# Descriptor table per split: one dict of descriptors per SMILES, stacked into a frame.
train_descriptor = pd.DataFrame(list(map(getMolDescriptors, train_df["SMILES"])))
test_descriptor = pd.DataFrame(list(map(getMolDescriptors, test_df["SMILES"])))

In [6]:
# Append the descriptor columns to each table.
# concat(axis=1) aligns rows on index labels, and the descriptor frames carry a
# fresh 0..n-1 index, so the tables are renumbered first to guarantee row i of
# the table is paired with row i of its descriptors.
# NOTE: re-running this cell appends the descriptor columns again.
train_df = pd.concat([train_df.reset_index(drop=True), train_descriptor], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), test_descriptor], axis=1)

In [7]:
# Fill remaining gaps in numeric columns with that column's mean, per table.
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(test_df.mean(numeric_only=True))

In [8]:
class CustomDataset(Dataset):
    """Dataset pairing tabular features with fingerprint features per row.

    Each item is the concatenation of the processed tabular row and the
    processed fingerprint row; in training mode the target value is returned
    alongside it.

    :param tab_df: Table with ``id`` and ``SMILES`` columns plus feature columns
        (and the ``MLM`` / ``HLM`` target columns when ``is_test`` is False).
    :param fps_df: Fingerprint table with one row per row of ``tab_df``.
    :param target: Name of the target column to return as the label.
    :param tab_scaler: Two-step sequence ``[selector, scaler]`` applied to the
        tabular features; fitted here in training mode, only applied in test mode.
    :param fps_scaler: Same as ``tab_scaler`` but for the fingerprint features.
    :param is_test: When true, no labels are returned and the steps are not refitted.
    :raises ValueError: If ``tab_df`` and ``fps_df`` have different row counts.
    """

    def __init__(self, tab_df, fps_df,  target: str, tab_scaler, fps_scaler, is_test=False):
        if len(tab_df) != len(fps_df):
            raise ValueError(
                f"tab_df has {len(tab_df)} rows but fps_df has {len(fps_df)}; "
                "they must describe the same molecules row for row"
            )

        self.tab_df = tab_df
        self.fps_df = fps_df
        self.target = target
        self.is_test = is_test
        self.tab_scaler = tab_scaler
        self.fps_scaler = fps_scaler

        # Non-feature columns; the targets are only dropped in training mode.
        if self.is_test:
            self.drop_col = ["id", "SMILES"]
        else:
            self.drop_col = ["id", "SMILES", "MLM", "HLM"]

        tab_raw = self.tab_df.drop(columns=self.drop_col)
        self.tab_features = self._select_and_scale(tab_raw, self.tab_scaler)
        self.fps_features = self._select_and_scale(self.fps_df, self.fps_scaler)

    def _select_and_scale(self, frame, steps):
        """Run ``frame`` through ``steps[0]`` then ``steps[1]``, fitting both unless in test mode."""
        selector, scaler = steps[0], steps[1]
        if self.is_test:
            return scaler.transform(selector.transform(frame))
        return scaler.fit_transform(selector.fit_transform(frame))

    def __getitem__(self, index):
        feature = np.concatenate([self.tab_features[index], self.fps_features[index]])

        if self.is_test:
            return torch.tensor(feature).float()
        else:
            # Positional lookup: the feature arrays are positional, so the label
            # must be too, whatever index labels tab_df carries.
            label = self.tab_df[self.target].iloc[index]
            return torch.tensor(feature).float(), torch.tensor(label).float().unsqueeze(dim=-1)

    def __len__(self):
        return len(self.tab_df)

In [9]:
# One [variance filter, min-max scaler] pair per feature family. The same pairs
# are shared by all four datasets: the training datasets fit them, the test
# datasets only apply them, so the construction order below matters.
tab_scaler = [VarianceThreshold(threshold=0.05), MinMaxScaler()]
fps_scaler = [VarianceThreshold(threshold=0.05), MinMaxScaler()]
_scalers = dict(tab_scaler=tab_scaler, fps_scaler=fps_scaler)

train_MLM = CustomDataset(tab_df=train_df, fps_df=train_fps, target="MLM", is_test=False, **_scalers)
test_MLM = CustomDataset(tab_df=test_df, fps_df=test_fps, target="MLM", is_test=True, **_scalers)

train_HLM = CustomDataset(tab_df=train_df, fps_df=train_fps, target="HLM", is_test=False, **_scalers)
test_HLM = CustomDataset(tab_df=test_df, fps_df=test_fps, target="HLM", is_test=True, **_scalers)


In [10]:
# Network input width: retained fingerprint columns plus retained tabular columns.
# (For a tabular-only model, use train_MLM.tab_features.shape[1] alone.)
input_size = sum(block.shape[1] for block in (train_MLM.fps_features, train_MLM.tab_features))
print(input_size)

309


In [11]:
# Hyperparameters shared by both models and their training runs.
CFG = dict(
    BATCH_SIZE=256,
    EPOCHS=8000,
    INPUT_SIZE=input_size,
    HIDDEN_SIZE=1024,
    OUTPUT_SIZE=1,
    DROPOUT_RATE=0.8,
    LEARNING_RATE=0.0001,
)

In [12]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    A check counts as an improvement only if the loss is lower than the best
    loss seen so far by more than ``min_delta``. Every other check (including an
    unchanged or NaN loss) increments a counter; an improvement resets it.

    :param patience: Number of consecutive non-improving checks that triggers a stop.
    :param min_delta: Minimum decrease required to count as an improvement.
    """

    def __init__(self, patience=1, min_delta=0.0):
        self.patience = patience  # number of non-improving checks tolerated before stopping
        self.min_delta = min_delta  # the minimum change to be counted as improvement
        self.counter = 0  # consecutive checks without sufficient improvement
        self.min_validation_loss = np.inf

    def __call__(self, validation_loss):
        """Record one validation loss; return True once ``patience`` consecutive checks failed to improve."""
        if (validation_loss + self.min_delta) < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0  # reset the counter if validation loss decreased at least by min_delta
        else:
            # `else` rather than a second comparison: an equal or NaN loss makes
            # both `<` and `>` false and would otherwise never be counted.
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [13]:
# Hold out 20% of each training dataset for validation. The seed is fixed so
# both targets use the same split.
train_MLM_dataset, valid_MLM_dataset = train_test_split(
    train_MLM,
    test_size=0.2,
    random_state=42,
)
train_HLM_dataset, valid_HLM_dataset = train_test_split(
    train_HLM,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Batch iterators: training batches are reshuffled every epoch, validation
# batches keep a fixed order.
_batch_size = CFG['BATCH_SIZE']

train_MLM_loader = DataLoader(dataset=train_MLM_dataset, batch_size=_batch_size, shuffle=True)
valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset, batch_size=_batch_size, shuffle=False)

train_HLM_loader = DataLoader(dataset=train_HLM_dataset, batch_size=_batch_size, shuffle=True)
valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset, batch_size=_batch_size, shuffle=False)

In [15]:
def initialize_weights(model: nn.Module):
    """
    Initializes the weights of a model in place.

    Parameters with one dimension (biases) are set to 0 and all others get
    Xavier-normal initialization. Parameters owned directly by a normalization
    layer are left at their defaults: zeroing a 1-D normalization gain would
    make that layer output zeros for every input.

    :param model: An nn.Module.
    """
    norm_layers = (
        nn.LayerNorm, nn.GroupNorm,
        nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
        nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d,
    )
    seen = set()  # guards against re-initializing a parameter shared by several modules
    for module in model.modules():
        if isinstance(module, norm_layers):
            continue
        for param in module.parameters(recurse=False):
            if id(param) in seen:
                continue
            seen.add(id(param))
            if param.dim() == 1:
                nn.init.constant_(param, 0)
            else:
                nn.init.xavier_normal_(param)

In [16]:
class Net(nn.Module):
    """Feed-forward regressor: four hidden stages followed by a linear output.

    Each hidden stage is Linear -> LayerNorm -> LeakyReLU -> Dropout, all with
    width ``hidden_size``; a single activation module and a single dropout
    module are reused by every stage.

    :param input_size: Number of input features.
    :param hidden_size: Width of every hidden layer.
    :param dropout_rate: Drop probability handed to ``nn.Dropout``.
    :param out_size: Number of output values.
    """

    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super().__init__()

        # Four fully connected hidden layers and the output layer
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)

        # Normalization, one per hidden layer
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)
        self.ln4 = nn.LayerNorm(hidden_size)

        # Activation function
        self.activation = nn.LeakyReLU()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Pass ``x`` through the four hidden stages and the output layer."""
        stages = (
            (self.fc1, self.ln1),
            (self.fc2, self.ln2),
            (self.fc3, self.ln3),
            (self.fc4, self.ln4),
        )
        hidden = x
        for linear, norm in stages:
            hidden = self.dropout(self.activation(norm(linear(hidden))))
        return self.fc_out(hidden)

In [17]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell no longer fails on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE']).to(device)
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE']).to(device)

In [18]:
# Re-initialize both networks in place (MLM first, then HLM).
for _net in (model_MLM, model_HLM):
    initialize_weights(_net)

In [19]:
# Bare expression: the notebook renders the module repr so the layer layout can be inspected.
model_MLM

Net(
  (fc1): Linear(in_features=309, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_out): Linear(in_features=1024, out_features=1, bias=True)
  (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (ln3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (ln4): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  (activation): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.8, inplace=False)
)

In [20]:
def _lr_decay(step):
    """Multiplier applied to the base learning rate after `step` scheduler steps."""
    return 0.95**(step)


# Shared loss; one Adam optimizer and one decaying scheduler per model.
criterion = nn.MSELoss()

optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_MLM = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer_MLM, lr_lambda=_lr_decay)

optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_HLM = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer_HLM, lr_lambda=_lr_decay)


In [21]:
def train(train_loader, valid_loader, model, criterion, optimizer, scheduler, epochs,
          eval_every=100, patience=3, min_delta=1):
    """Train ``model`` with periodic validation, LR decay and early stopping.

    Batches are moved to whichever device the model's parameters live on, so
    the function works on both CPU and GPU.

    :param train_loader: Iterable of ``(inputs, targets)`` training batches.
    :param valid_loader: Iterable of ``(inputs, targets)`` validation batches.
    :param model: The network to optimize (updated in place).
    :param criterion: Loss function called as ``criterion(output, targets)``.
    :param optimizer: Optimizer over the model's parameters.
    :param scheduler: LR scheduler; stepped once after each validation pass.
    :param epochs: Maximum number of epochs.
    :param eval_every: Validate (and step the scheduler) every this many epochs.
    :param patience: Non-improving validation passes tolerated before stopping.
    :param min_delta: Minimum decrease of the summed validation loss that
        counts as an improvement.
    :return: The trained model (same object as ``model``).
    """
    device = next(model.parameters()).device
    earlyStop = EarlyStopping(patience=patience, min_delta=min_delta)
    model.train()

    for epoch in range(epochs):
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()

            output = model(inputs.to(device))
            loss = criterion(output, targets.to(device))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if epoch % eval_every == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs.to(device))
                    loss = criterion(output, targets.to(device))
                    valid_loss += loss.item()

            # Reported values are square roots of the mean per-batch loss; the
            # validation mean uses the loader passed in, not a global one.
            print(f'Epoch: {epoch}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {np.sqrt(running_loss/len(train_loader))}, Valid Loss: {np.sqrt(valid_loss/len(valid_loader))}')
            # Early stopping watches the summed validation loss.
            if earlyStop(valid_loss):
                break

            model.train()
            scheduler.step()

    return model

In [22]:
# Train the MLM model first, then the HLM model, each with its own loaders,
# optimizer and scheduler but the shared loss and epoch budget.
print("Training Start: MLM")
model_MLM = train(
    train_loader=train_MLM_loader,
    valid_loader=valid_MLM_loader,
    model=model_MLM,
    criterion=criterion,
    optimizer=optimizer_MLM,
    scheduler=scheduler_MLM,
    epochs=CFG['EPOCHS'],
)

print("Training Start: HLM")
model_HLM = train(
    train_loader=train_HLM_loader,
    valid_loader=valid_HLM_loader,
    model=model_HLM,
    criterion=criterion,
    optimizer=optimizer_HLM,
    scheduler=scheduler_HLM,
    epochs=CFG['EPOCHS'],
)

Training Start: MLM
Epoch: 0/1000 with lr 0.000100000, Train Loss: 51.47840270919502, Valid Loss: 52.603925071639935
Epoch: 100/1000 with lr 0.000095000, Train Loss: 43.069307825233786, Valid Loss: 44.234277953060854
