In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error

from rdkit import DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

from typing import List, Union

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
class EarlyStopping:
    """Signal when the validation loss has stopped improving.

    A loss counts as an improvement when
    ``validation_loss + min_delta < best loss seen so far``. With a positive
    ``min_delta`` the loss must beat the best value by more than that margin;
    with a negative ``min_delta`` (the default) a loss up to ``abs(min_delta)``
    worse than the best is tolerated and resets the counter.

    Args:
        patience: number of consecutive non-improving calls allowed before
            ``__call__`` returns True.
        min_delta: margin added to the incoming loss before it is compared
            with the best loss.
    """

    def __init__(self, patience=5, min_delta : float=-1.0):
        self.patience = patience    # non-improving calls allowed before stopping
        self.min_delta = min_delta  # margin added to the loss before comparison
        self.counter = 0            # consecutive non-improving calls so far
        self.min_validation_loss = np.inf  # lowest validation loss observed

    def __call__(self, validation_loss, verbose=False):
        """Record one validation loss and report whether training should stop.

        Args:
            validation_loss: loss measured at the current validation check.
            verbose: when True, print the current and best loss on every
                non-improving call.

        Returns:
            True once ``patience`` consecutive calls brought no improvement,
            otherwise False.
        """
        if (validation_loss + self.min_delta) < self.min_validation_loss:
            # With a negative min_delta a tolerated loss can be worse than the
            # best one, so take the minimum: the baseline must never move up.
            self.min_validation_loss = min(self.min_validation_loss, validation_loss)
            self.counter = 0
            return False

        # Worse, exactly equal, or NaN (every comparison with NaN is False):
        # all of these are non-improvements and count toward patience.
        self.counter += 1
        if verbose:
            print(f"  >> now{validation_loss:.3f} > best{self.min_validation_loss:.3f}")
        return self.counter >= self.patience

In [3]:
# Load the training and test feature tables.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train_features.csv", "../input/test_features.csv")
)

In [4]:
class CustomDataset(Dataset):
    """Expose a feature table as a torch ``Dataset`` for one regression target.

    Args:
        tab_df: DataFrame holding the ``id`` and ``SMILES`` columns, the
            feature columns and, for labelled data, the ``MLM`` and ``HLM``
            target columns.
        target: name of the column used as the label (read only when
            ``is_test`` is False).
        is_test: True for unlabelled data; items are then feature tensors only.

    Attributes set for labelled data only:
        label: target values as a column vector of shape ``(n_rows, 1)``.
        range_class: per-row bucket index of the target (values capped at 100,
            bucket width 10), intended as a stratification key.
    """

    def __init__(self, tab_df, target: str, is_test=False):
        self.tab_df = tab_df
        self.target = target
        self.is_test = is_test

        if not self.is_test:
            self.label = self.tab_df[target].values.reshape(-1, 1)

        # Identifier columns are never model inputs; labelled frames also
        # carry both target columns, which must be removed as well.
        self.drop_col = ["id", "SMILES"] if self.is_test else ["id", "SMILES", "MLM", "HLM"]
        self.tab_features = self.tab_df.drop(columns=self.drop_col, axis=1).values

        if not self.is_test:
            # Coarse target bucket used to stratify the train/validation split.
            self.range_class = self.tab_df[target].apply(
                lambda value: np.int8(min(value, 100) // 10)
            )

    def __getitem__(self, index):
        """Return the float feature tensor, plus the float label tensor when labelled."""
        features = torch.tensor(self.tab_features[index]).float()
        if self.is_test:
            return features
        return features, torch.tensor(self.label[index]).float()

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.tab_df)

In [5]:
# Labelled / unlabelled dataset pair for the MLM target.
train_MLM = CustomDataset(train_df, "MLM")
test_MLM = CustomDataset(test_df, "MLM", is_test=True)

# Same pair for the HLM target (same feature columns, different label column).
train_HLM = CustomDataset(train_df, "HLM")
test_HLM = CustomDataset(test_df, "HLM", is_test=True)


In [6]:
# Model input width = number of columns in the tabular feature matrix.
input_size = train_MLM.tab_features.shape[-1]
print(input_size)

233


In [7]:
# Hyperparameters shared by the MLM and HLM models.
CFG = dict(
    BATCH_SIZE=256,
    EPOCHS=8000,
    INPUT_SIZE=input_size,
    HIDDEN_SIZE=1024,
    OUTPUT_SIZE=1,
    DROPOUT_RATE=0.8,
    LEARNING_RATE=0.001,
)

In [8]:
# Class balance of the binned MLM target that is used as the stratification key.
train_MLM.range_class.value_counts()

0    1333
9     407
1     252
8     239
7     237
2     219
4     198
6     196
5     191
3     189
Name: MLM, dtype: int64

In [9]:
# Train/validation split (80/20) for each target.
# The binned target (range_class) is passed as the stratification key.
train_MLM_dataset, valid_MLM_dataset = train_test_split(
    train_MLM,
    test_size=0.2,
    random_state=42,
    stratify=train_MLM.range_class,
)
train_HLM_dataset, valid_HLM_dataset = train_test_split(
    train_HLM,
    test_size=0.2,
    random_state=42,
    stratify=train_HLM.range_class,
)

In [10]:
# Only the training loaders shuffle; validation and test loaders keep row order.

# MLM loaders
train_MLM_loader = DataLoader(train_MLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_MLM_loader = DataLoader(valid_MLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)
test_MLM_loader = DataLoader(test_MLM, batch_size=CFG['BATCH_SIZE'], shuffle=False)

# HLM loaders
train_HLM_loader = DataLoader(train_HLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_HLM_loader = DataLoader(valid_HLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)
test_HLM_loader = DataLoader(test_HLM, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [12]:
class Net(nn.Module):
    """Fully connected regressor: four hidden stages and a linear output head.

    Every hidden stage applies Linear -> BatchNorm1d -> LeakyReLU -> Dropout.
    One activation module and one dropout module are shared by all stages.

    Args:
        input_size: number of input features.
        hidden_size: width of every hidden layer.
        dropout_rate: probability passed to ``nn.Dropout``.
        out_size: number of output values per sample.
    """

    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super().__init__()

        # Four hidden linear layers followed by the output projection.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)

        # Normalisation: one BatchNorm1d per hidden layer
        # (the attributes are named ln*, but they are batch-norm modules).
        self.ln1 = nn.BatchNorm1d(hidden_size)
        self.ln2 = nn.BatchNorm1d(hidden_size)
        self.ln3 = nn.BatchNorm1d(hidden_size)
        self.ln4 = nn.BatchNorm1d(hidden_size)

        # Activation function
        self.activation = nn.LeakyReLU()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run ``x`` through the four hidden stages and the output layer."""
        hidden_stages = (
            (self.fc1, self.ln1),
            (self.fc2, self.ln2),
            (self.fc3, self.ln3),
            (self.fc4, self.ln4),
        )
        hidden = x
        for linear, norm in hidden_stages:
            hidden = self.dropout(self.activation(norm(linear(hidden))))
        return self.fc_out(hidden)

In [13]:
# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on hosts without CUDA.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One independent network per target, both built from the same hyperparameters.
model_MLM = Net(CFG['INPUT_SIZE'], CFG['HIDDEN_SIZE'], CFG['DROPOUT_RATE'], CFG['OUTPUT_SIZE']).to(DEVICE)
model_HLM = Net(CFG['INPUT_SIZE'], CFG['HIDDEN_SIZE'], CFG['DROPOUT_RATE'], CFG['OUTPUT_SIZE']).to(DEVICE)

In [14]:
# Show the layer layout, then the number of trainable parameters.
print(model_MLM)
sum(param.numel() for param in model_MLM.parameters() if param.requires_grad)

Net(
  (fc1): Linear(in_features=233, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (fc4): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_out): Linear(in_features=1024, out_features=1, bias=True)
  (ln1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ln2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ln3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ln4): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (activation): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.8, inplace=False)
)


3397633

In [15]:
# Shared loss; each model gets its own Adam optimizer and LR schedule.
criterion = nn.MSELoss()

# The schedule scales the base learning rate by 0.95**k, where k is the
# counter LambdaLR passes to the lambda (advanced by scheduler.step()).
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_MLM = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer_MLM,
    lr_lambda=lambda step_count: 0.95 ** step_count,
)

optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_HLM = torch.optim.lr_scheduler.LambdaLR(
    optimizer=optimizer_HLM,
    lr_lambda=lambda step_count: 0.95 ** step_count,
)

In [16]:
def train(train_loader, valid_loader, model, criterion, optimizer, scheduler,  epochs,
          eval_every=20, patience=8, min_delta=-10, restore_best=True):
    """Train ``model`` with periodic validation, LR decay and early stopping.

    Every ``eval_every`` epochs the model is evaluated on ``valid_loader``,
    a progress line is printed, the early-stopping criterion is updated and
    the scheduler is stepped once.

    Args:
        train_loader: iterable of ``(inputs, targets)`` training batches.
        valid_loader: iterable of ``(inputs, targets)`` validation batches.
        model: network to optimise; batches are moved to the device its
            parameters live on.
        criterion: loss function called as ``criterion(output, targets)``.
        optimizer: optimizer bound to ``model``'s parameters.
        scheduler: LR scheduler, stepped once per validation check.
        epochs: maximum number of training epochs.
        eval_every: run a validation check on every ``eval_every``-th epoch.
        patience: non-improving validation checks tolerated before stopping.
        min_delta: margin forwarded to ``EarlyStopping``.
        restore_best: when True, reload the weights from the validation check
            with the lowest summed validation loss before returning, so the
            returned model is not the over-trained state reached while
            patience was running out.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    # Follow the model's device rather than assuming a GPU is present.
    device = next(model.parameters()).device

    earlyStop = EarlyStopping(patience=patience, min_delta=min_delta)
    best_valid_loss = float("inf")
    best_state = None

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()  # gradients accumulate, so clear them per batch

            output = model(inputs.to(device))
            loss = criterion(output, targets.to(device))
            loss.backward()

            optimizer.step()  # adjust the weights
            running_loss += loss.item()

        if epoch % eval_every == 0:
            model.eval()
            valid_loss = 0  # sum of per-batch losses over the validation set
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs.to(device))
                    loss = criterion(output, targets.to(device))
                    valid_loss += loss.item()

            # Square root of the mean per-batch loss for both splits.
            print(f"Epoch: {epoch:4d}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {np.sqrt(running_loss/len(train_loader))}, Valid Loss: {np.sqrt(valid_loss/len(valid_loader))}")

            if valid_loss < best_valid_loss:
                # Snapshot the best weights; clone so later updates do not
                # overwrite the saved tensors.
                best_valid_loss = valid_loss
                best_state = {name: tensor.detach().clone()
                              for name, tensor in model.state_dict().items()}

            if earlyStop(valid_loss, verbose=True):
                break

            scheduler.step()

    if restore_best and best_state is not None:
        model.load_state_dict(best_state)

    return model

In [17]:
# Fit one model per target; each call uses that target's loaders, optimizer and scheduler.
model_MLM = train(
    train_loader=train_MLM_loader,
    valid_loader=valid_MLM_loader,
    model=model_MLM,
    criterion=criterion,
    optimizer=optimizer_MLM,
    scheduler=scheduler_MLM,
    epochs=CFG["EPOCHS"],
)
model_HLM = train(
    train_loader=train_HLM_loader,
    valid_loader=valid_HLM_loader,
    model=model_HLM,
    criterion=criterion,
    optimizer=optimizer_HLM,
    scheduler=scheduler_HLM,
    epochs=CFG["EPOCHS"],
)


Epoch:    0/8000 with lr 0.001000000, Train Loss: 50.456423082960455, Valid Loss: 51.28092943734867
Epoch:   20/8000 with lr 0.000950000, Train Loss: 31.892685932220193, Valid Loss: 32.44545484156914
Epoch:   40/8000 with lr 0.000902500, Train Loss: 30.10859574860259, Valid Loss: 32.205474476601346
Epoch:   60/8000 with lr 0.000857375, Train Loss: 29.389683395729314, Valid Loss: 31.952241863305083
Epoch:   80/8000 with lr 0.000814506, Train Loss: 28.57994565175083, Valid Loss: 32.48521486107661
  >> now3165.868 > best3062.837
Epoch:  100/8000 with lr 0.000773781, Train Loss: 27.348483762322388, Valid Loss: 33.29391875465648
  >> now3325.455 > best3062.837
Epoch:  120/8000 with lr 0.000735092, Train Loss: 26.72987281358306, Valid Loss: 32.94741021698838
  >> now3256.596 > best3062.837
Epoch:  140/8000 with lr 0.000698337, Train Loss: 26.118323562740066, Valid Loss: 35.188130221191656
  >> now3714.614 > best3062.837
Epoch:  160/8000 with lr 0.000663420, Train Loss: 25.217308602282593, Va

In [None]:
def inference(test_loader, model, label_scaler=None, verbose=False):
    """Predict with ``model`` for every batch in ``test_loader``.

    Args:
        test_loader: iterable yielding feature batches (no labels).
        model: trained network; batches are moved to the device its
            parameters live on.
        label_scaler: optional object with ``inverse_transform``; when given,
            each batch of raw outputs is passed through it (after being moved
            to the CPU) before being collected.
        verbose: when True, print the standard deviation of every input row
            of every batch (debugging aid; off by default because it emits
            one value per sample).

    Returns:
        Flat list of predictions in loader order.
    """
    model.eval()

    # Follow the model's device rather than assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    preds = []

    with torch.no_grad():
        for idx, inputs in enumerate(test_loader):
            if verbose:
                print(f"{idx:3d}th: ", end=" ")
                for d in inputs:
                    std = np.std(d.detach().numpy())
                    print(f"{std} ", end=" ")
                print()
            output = model(inputs.to(device))
            if label_scaler is not None:
                output = label_scaler.inverse_transform(output.cpu())
            preds.extend(output.flatten().tolist())

    return preds

# Raw model outputs are used directly: no label scaler is applied.
predictions_MLM = inference(test_MLM_loader, model_MLM)
predictions_HLM = inference(test_HLM_loader, model_HLM)

In [None]:
# Fill the sample submission template with the predictions for both targets.
submission = pd.read_csv('../input/sample_submission.csv').assign(
    MLM=predictions_MLM,
    HLM=predictions_HLM,
)
submission

In [None]:
# Write the predictions without the index column, then display summary
# statistics of the submitted values as a quick sanity check.
submission.to_csv('../output/submission.csv', index=False)
submission.describe()