<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import</a></span></li><li><span><a href="#Pre-Processing" data-toc-modified-id="Pre-Processing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pre-Processing</a></span></li><li><span><a href="#Custom-Dataset" data-toc-modified-id="Custom-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Custom Dataset</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Inference" data-toc-modified-id="Inference-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inference</a></span></li><li><span><a href="#Submission" data-toc-modified-id="Submission-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Submission</a></span></li></ul></div>

## Import

In [1]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem

In [2]:
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

In [3]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)

seed_everything(42) # Seed 고정

## Pre-Processing

In [4]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [5]:
PandasTools.AddMoleculeColumnToFrame(train,'SMILES','Molecule')
PandasTools.AddMoleculeColumnToFrame(test,'SMILES','Molecule')

In [6]:
def mol2fp(mol):
    fp = AllChem.GetHashedMorganFingerprint(mol, 6, nBits=4096)
    ar = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, ar)
    return ar

In [7]:
# FPs column 추가
train["FPs"] = train.Molecule.apply(mol2fp)
test["FPs"] = test.Molecule.apply(mol2fp)

In [8]:
# 사용할 column만 추출
train = train[['FPs','MLM', 'HLM', 'AlogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors', 'Molecular_PolarSurfaceArea']]
test = test[['FPs', 'AlogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors', 'Molecular_PolarSurfaceArea']]

In [9]:
# 컬럼 정규화
scaler = StandardScaler()
train_ss = torch.tensor(scaler.fit_transform(train.iloc[:,3:])).float()
test_ss = torch.tensor(scaler.fit_transform(test.iloc[:,1:])).float()

## Custom Dataset

In [10]:
class CustomDataset(Dataset):
    def __init__(self, df, target, other, transform, is_test=False):
        self.df = df
        self.target = target # HLM or MLM

        self.is_test = is_test # train,valid / test

        self.feature_select = transform
        if not self.is_test: 
            self.fp = self.feature_select.fit_transform(np.stack(df['FPs']))

        else: # valid or test
            self.fp = self.feature_select.transform(np.stack(df['FPs']))


    def __getitem__(self, index):
        fp = self.fp[index]
        if not self.is_test: # test가 아닌 경우(label 존재)
            label = self.df[self.target][index]
            return torch.tensor(fp).float(), torch.tensor(label).float().unsqueeze(dim=-1) # feature, label

        else: # test인 경우
            return torch.tensor(fp).float() # feature
        
    def __len__(self):
        return len(self.df)

In [11]:
transform = VarianceThreshold(threshold=0.05)

train_MLM = CustomDataset(df=train, target='MLM', other=train_ss, transform=transform, is_test=False)
train_HLM = CustomDataset(df=train, target='HLM', other=train_ss, transform=transform, is_test=False)

input_size = train_MLM.fp.shape[1]
input_size

251

In [12]:
# Hyperparameter
CFG = {'BATCH_SIZE': 512,
       'EPOCHS': 1000,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 512,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.9,
       'LEARNING_RATE': 0.0001}

In [13]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)

In [14]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)


train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

## Model

In [15]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super(Net, self).__init__()
        
        # fc 레이어 3개와 출력 레이어
#         self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size) 
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, out_size)
        
        # 정규화
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)        
        self.ln4 = nn.LayerNorm(hidden_size)
        
        # 활성화 함수
        self.activation = nn.LeakyReLU()
        
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
     
    def forward(self, x):
        out = self.fc1(x)
        out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        
        out = self.fc1(x)
        out = self.ln1(out)
        out = self.activation(out)
        out = self.dropout(out)
        
        out = self.fc2(out)
        out = self.ln2(out)
        out = self.activation(out)
        out = self.dropout(out)
        
#         out = self.fc3(out)
#         out = self.ln3(out)
#         out = self.activation(out)
#         out = self.dropout(out)
        
#         out = self.fc4(out)
#         out = self.ln4(out)
#         out = self.activation(out)
#         out = self.dropout(out)

        out = self.fc_out(out)
        return out

In [16]:
model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'])

In [17]:
criterion = nn.MSELoss()
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])

## Training

In [18]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()

    def forward(self, y_hat, y):
        loss = torch.sqrt(self.mse(y_hat,y))
        return loss

In [19]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
    model.train()
    criterion = RMSELoss().to(device)
    
    for epoch in range(epochs):
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            
            output = model(inputs)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        
        if epoch % 100 == 0:
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs)
                    loss = criterion(output, targets)
                    valid_loss += loss.item()
                    
            print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_HLM_loader)}')
            
            model.train()
    
    return model

In [20]:
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=CFG['EPOCHS'])

print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=CFG['EPOCHS'])  

Training Start: MLM
Epoch: 0/1000, Train Loss: 51.625274658203125, Valid Loss: 52.74587440490723
Epoch: 100/1000, Train Loss: 38.11333974202474, Valid Loss: 39.68567085266113
Epoch: 200/1000, Train Loss: 36.46472295125326, Valid Loss: 38.06254196166992
Epoch: 300/1000, Train Loss: 36.04678090413412, Valid Loss: 38.15666389465332
Epoch: 400/1000, Train Loss: 33.28947003682455, Valid Loss: 35.751089096069336
Epoch: 500/1000, Train Loss: 30.352963129679363, Valid Loss: 36.11362266540527
Epoch: 600/1000, Train Loss: 29.12225341796875, Valid Loss: 35.52095031738281
Epoch: 700/1000, Train Loss: 28.083141326904297, Valid Loss: 36.51152801513672
Epoch: 800/1000, Train Loss: 26.871156692504883, Valid Loss: 37.183841705322266
Epoch: 900/1000, Train Loss: 25.89925702412923, Valid Loss: 35.664350509643555
Training Start: HLM
Epoch: 0/1000, Train Loss: 64.44739087422688, Valid Loss: 62.609514236450195
Epoch: 100/1000, Train Loss: 45.835109074910484, Valid Loss: 44.8192195892334
Epoch: 200/1000, Tra

## Inference

In [21]:
test_MLM = CustomDataset(df=test, target=None, other=test_ss, transform=transform, is_test=True)
test_HLM = CustomDataset(df=test, target=None, other=test_ss, transform=transform, is_test=True)

test_MLM_loader = DataLoader(dataset=test_MLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                             batch_size=CFG['BATCH_SIZE'],
                             shuffle=False)

In [22]:
def inference(test_loader, model):
    model.eval()
    preds = []
    
    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())
    
    return preds

In [23]:
predictions_MLM = inference(test_MLM_loader, model_MLM)
predictions_HLM = inference(test_HLM_loader, model_HLM)

## Submission

In [24]:
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
478,TEST_478,0,0
479,TEST_479,0,0
480,TEST_480,0,0
481,TEST_481,0,0


In [25]:
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,8.592646,46.538429
1,TEST_001,61.721275,72.102516
2,TEST_002,29.988794,57.158615
3,TEST_003,54.985352,56.561943
4,TEST_004,31.279552,66.371712
...,...,...,...
478,TEST_478,21.226883,35.888062
479,TEST_479,74.712845,79.459862
480,TEST_480,25.621298,67.227943
481,TEST_481,54.093544,71.336952


In [26]:
submission.to_csv('baseline_submission.csv', index=False)