<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import" data-toc-modified-id="Import-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import</a></span></li><li><span><a href="#Pre-Processing" data-toc-modified-id="Pre-Processing-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Pre-Processing</a></span></li><li><span><a href="#Custom-Dataset" data-toc-modified-id="Custom-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Custom Dataset</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Inference" data-toc-modified-id="Inference-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Inference</a></span></li><li><span><a href="#Submission" data-toc-modified-id="Submission-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Submission</a></span></li></ul></div>

## Import

In [None]:
import random
import os

import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.utils import from_smiles
from torch_geometric.loader import DataLoader

from rdkit import DataStructs
from rdkit.Chem import PandasTools, AllChem



In [None]:
# Run on Apple's Metal (MPS) backend when PyTorch reports it as available;
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Integer seed. It is written to the ``PYTHONHASHSEED``
            environment variable (as a string) and passed to Python's
            ``random`` module, NumPy's global generator and PyTorch.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed so runs are repeatable

## Pre-Processing

In [None]:
# Load the training table and discard its "id" column.
train = pd.read_csv('./train.csv')
train = train.drop(columns=["id"])

# Load the test table and discard its "id" column.
test = pd.read_csv('./test.csv')
test = test.drop(columns=["id"])

In [None]:
# Keep only the columns that are used downstream: the SMILES string, the two
# targets (training table only) and five numeric descriptors.
descriptor_columns = ['AlogP', 'Molecular_Weight', 'Num_H_Acceptors', 'Num_H_Donors', 'Molecular_PolarSurfaceArea']
train = train[['SMILES', 'MLM', 'HLM'] + descriptor_columns]
test = test[['SMILES'] + descriptor_columns]

In [None]:
# Fill missing descriptor values with an iterative imputer backed by a random
# forest. The imputer is fitted on the training descriptors only and then
# applied to the test descriptors.
#
# The imputed arrays are written back into `train` / `test`; calling
# fit_transform / transform without storing the result leaves the frames
# (and their NaNs) untouched.
train_descriptor_cols = [col for col in train.columns if col not in ('SMILES', 'MLM', 'HLM')]
test_descriptor_cols = [col for col in test.columns if col != 'SMILES']

imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=42),  # seeded so re-runs match
    random_state=42,
)

# Work on copies so the assignments below never write into a view of the
# frames loaded earlier; this also keeps the cell safe to re-run.
train = train.copy()
test = test.copy()
train[train_descriptor_cols] = imputer.fit_transform(train[train_descriptor_cols])
test[test_descriptor_cols] = imputer.transform(test[test_descriptor_cols])

# Sanity check: no missing descriptor values may remain after imputation.
assert not train[train_descriptor_cols].isna().any().any()
assert not test[test_descriptor_cols].isna().any().any()

## Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset pairing a molecular graph with scaled numeric descriptors.

    Each item is ``(graph, others, label)`` for labelled data, or
    ``(graph, others)`` when ``is_test`` is true.

    Args:
        df: DataFrame with a ``'SMILES'`` column. For labelled data the
            descriptor columns are taken from position 3 onward; for test
            data from position 1 onward.
        target: Name of the label column (``'MLM'`` or ``'HLM'``); unused
            when ``is_test`` is true.
        is_test: True for unlabelled data.
        scaler: Optional, already fitted scaler. When given, it is applied
            with ``transform`` instead of fitting a new one, so the data is
            scaled exactly like the data the scaler was fitted on.

    Raises:
        ValueError: If ``is_test`` is true, no ``scaler`` is passed and no
            training dataset has been built beforehand.
    """

    # Scaler fitted by the most recently constructed training dataset. It is
    # the fallback for test datasets built without an explicit `scaler`, so
    # that test descriptors are never transformed by an unfitted scaler.
    # Passing `scaler=` explicitly is the clearer option.
    _last_fitted_scaler = None

    def __init__(self, df, target, is_test=False, scaler=None):
        self.df = df
        self.target = target    # label column: 'MLM' or 'HLM'
        self.is_test = is_test  # False: train/valid (labelled), True: test
        self.graph = self.smiles2mol(df['SMILES'])

        # Labelled frames carry SMILES + two target columns before the
        # descriptors; test frames carry only SMILES.
        first_feature_col = 1 if self.is_test else 3
        features = df.iloc[:, first_feature_col:]

        if scaler is not None:
            # Reuse the caller's fitted scaler as-is.
            self.scaler = scaler
            self.others = self.scaler.transform(features)
        elif not self.is_test:
            self.scaler = MinMaxScaler()
            self.others = self.scaler.fit_transform(features)
            CustomDataset._last_fitted_scaler = self.scaler
        else:
            self.scaler = CustomDataset._last_fitted_scaler
            if self.scaler is None:
                raise ValueError(
                    "A test dataset needs a fitted scaler: pass `scaler=` or "
                    "build a training dataset first."
                )
            self.others = self.scaler.transform(features)

    def smiles2mol(self, smiles_list):
        """Convert SMILES strings to graph objects via ``from_smiles``.

        The ``smiles`` and ``edge_attr`` fields of every graph are set to
        ``None`` before it is stored.
        """
        graph_list = []
        for smiles in smiles_list:
            graph_data = from_smiles(smiles)
            graph_data.smiles = None
            graph_data.edge_attr = None
            graph_list.append(graph_data)
        return graph_list

    def __getitem__(self, index):
        graph = self.graph[index]
        others = torch.tensor(self.others[index]).float()
        if self.is_test:
            # Unlabelled data: features only.
            return graph, others

        # Positional lookup so that a frame with a non-default index (for
        # example after filtering rows) still yields the row matching
        # `self.graph[index]` / `self.others[index]`.
        label = self.df[self.target].iloc[index]
        return graph, others, torch.tensor(label).float().unsqueeze(dim=-1)

    def __len__(self):
        return len(self.df)

In [None]:

# One labelled dataset per target; both are built from the same frame.
train_MLM = CustomDataset(df=train, target='MLM', is_test=False)
train_HLM = CustomDataset(df=train, target='HLM', is_test=False)

# Model input widths, read from the data rather than hard-coded so that the
# first graph-convolution layer always matches the node-feature matrix of the
# parsed graphs.
input_size1 = train_MLM.graph[0].x.shape[1]  # node features per atom
input_size2 = train_MLM.others.shape[1]      # numeric descriptors per molecule
print(input_size1, input_size2)

In [None]:
# Hyperparameters shared by the data loaders, the models and the training loop.
CFG = dict(
    BATCH_SIZE=256,
    EPOCHS=100,
    INPUT_SIZE1=input_size1,   # node-feature width fed to the GNN branch
    INPUT_SIZE2=input_size2,   # descriptor width fed to the dense branch
    HIDDEN_SIZE=512,
    OUTPUT_SIZE=1,
    DROPOUT_RATE=0.8,
    LEARNING_RATE=1e-3,
)

In [None]:
# Hold out 20% of each labelled dataset for validation.
split_options = dict(test_size=0.2, random_state=42)
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, **split_options)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, **split_options)

In [None]:
def _make_loader(dataset, shuffle):
    """Wrap `dataset` in a DataLoader that uses the configured batch size."""
    return DataLoader(dataset=dataset,
                      batch_size=CFG['BATCH_SIZE'],
                      shuffle=shuffle)


# Training loaders use shuffle=True; validation loaders use shuffle=False.
train_MLM_loader = _make_loader(train_MLM_dataset, shuffle=True)
valid_MLM_loader = _make_loader(valid_MLM_dataset, shuffle=False)

train_HLM_loader = _make_loader(train_HLM_dataset, shuffle=True)
valid_HLM_loader = _make_loader(valid_HLM_dataset, shuffle=False)

## Model

In [None]:
class Net(nn.Module):
    """Fully connected branch for the numeric descriptors.

    The forward pass applies two ``Linear -> LayerNorm -> ReLU -> Dropout``
    stages followed by a linear output layer. ``fc3``/``ln3`` and
    ``fc4``/``ln4`` are created as well but are not used by ``forward``.

    Args:
        input_size2: Number of input descriptor features.
        hidden_size: Width of every hidden layer.
        dropout_rate: Probability passed to ``nn.Dropout``.
        output_size: Width of the output layer.
    """

    def __init__(self, input_size2, hidden_size, dropout_rate, output_size):
        super().__init__()

        # Linear layers (created in this fixed order).
        self.fc1 = nn.Linear(input_size2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, output_size)

        # One layer normalisation per hidden layer.
        self.ln1 = nn.LayerNorm(hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)
        self.ln3 = nn.LayerNorm(hidden_size)
        self.ln4 = nn.LayerNorm(hidden_size)

        # Shared activation and dropout modules.
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Map a batch of descriptor vectors to ``output_size`` values each."""
        hidden = x
        active_stages = ((self.fc1, self.ln1), (self.fc2, self.ln2))
        for linear, norm in active_stages:
            hidden = self.dropout(self.activation(norm(linear(hidden))))
        return self.fc_out(hidden)

In [None]:
# GNN model definition
class GNN(nn.Module):
    """Graph branch: two GCN layers, mean-pool read-out and a linear head.

    Args:
        input_size1: Number of features per node.
        hidden_size: Width of both graph-convolution layers.
        dropout_rate: Probability passed to ``nn.Dropout``.
        output_size: Width of the output layer.
    """

    def __init__(self, input_size1, hidden_size, dropout_rate, output_size):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_size1, hidden_size)
        self.conv2 = GCNConv(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, output_size)

        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)

        # Activation function
        self.activation = nn.ReLU()

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, batch):
        """Compute one output vector per graph.

        Args:
            x: Batched graph object exposing ``x`` (node features) and
                ``edge_index``.
            batch: Tensor assigning every node to its graph; used by the
                mean-pool read-out.

        Returns:
            Tensor with one row of ``output_size`` values per graph.
        """
        # Feed the node-feature matrix (not the whole graph object) to the
        # convolution, cast to float to match the layers' floating-point
        # weights. NOTE(review): graphs built by `from_smiles` presumably
        # store integer node features - confirm.
        out, edge_index = x.x.float(), x.edge_index

        # First graph-convolution layer
        out = self.conv1(out, edge_index)
        out = self.bn1(out)
        out = self.activation(out)
        out = self.dropout(out)

        # Second graph-convolution layer
        out = self.conv2(out, edge_index)
        out = self.bn2(out)
        out = self.activation(out)
        out = self.dropout(out)

        out = global_mean_pool(out, batch)  # read-out: average nodes per graph

        out = self.fc_out(out)
        return out


In [None]:
class ClassificationModel(nn.Module):
    """Two-branch model combining a graph branch and a descriptor branch.

    Each branch produces ``output_size`` values; the two outputs are
    concatenated and passed through a final linear layer. The head is a plain
    ``nn.Linear`` with no activation.

    Args:
        input_size1: Node-feature width for the ``GNN`` branch.
        input_size2: Descriptor width for the ``Net`` branch.
        hidden_size: Hidden width used by both branches.
        drop_rate: Dropout probability used by both branches.
        output_size: Width of each branch output and of the final output.
    """

    def __init__(self, input_size1, input_size2, hidden_size, drop_rate, output_size):
        super().__init__()
        self.GNN_extractor = GNN(input_size1, hidden_size, drop_rate, output_size)
        self.Net_extractor = Net(input_size2, hidden_size, drop_rate, output_size)
        # The head consumes the two concatenated branch outputs.
        self.classifier = nn.Linear(in_features=output_size * 2, out_features=output_size)

    def forward(self, graph, others, batch):
        """Return the fused prediction for a batch of graphs and descriptors."""
        branch_outputs = (
            self.GNN_extractor(graph, batch),
            self.Net_extractor(others),
        )
        fused = torch.cat(branch_outputs, dim=-1)
        return self.classifier(fused)

In [None]:
# Both targets use the same architecture; each gets its own model instance.
model_args = (
    CFG['INPUT_SIZE1'],
    CFG['INPUT_SIZE2'],
    CFG['HIDDEN_SIZE'],
    CFG['DROPOUT_RATE'],
    CFG['OUTPUT_SIZE'],
)
model_MLM = ClassificationModel(*model_args)
model_HLM = ClassificationModel(*model_args)


In [None]:
# One Adam optimizer per model, each paired with a ReduceLROnPlateau scheduler
# that halves the learning rate (factor=0.5) with a patience of 40.
plateau_options = dict(mode='min', factor=0.5, patience=40, threshold_mode='abs',
                       min_lr=0, eps=1e-8, verbose=True)

optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])

scheduler_MLM = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_MLM, **plateau_options)
scheduler_HLM = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer_HLM, **plateau_options)

## Training

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train `model` with an MSE loss for ``CFG['EPOCHS']`` epochs.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the training DataFrame. Re-running those
    earlier cells after this one requires reloading the data first.

    Args:
        model: Module called as ``model(graph_batch, others, batch)``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Yields ``(graph_batch, others, targets)`` triples.
        val_loader: Loader with the same item layout, used for validation.
        scheduler: Scheduler stepped once per epoch with the mean validation
            loss, or ``None`` to disable scheduling.
        device: Device the model and every batch are moved to.

    Returns:
        The trained model (the same object that was passed in).
    """
    model.to(device)
    criterion = nn.MSELoss().to(device)

    for epoch in range(CFG['EPOCHS']):
        # validation() leaves the model in eval mode, so training mode is
        # re-enabled at the start of every epoch.
        model.train()
        running_loss = 0.0

        for inputs, others, targets in train_loader:
            inputs = inputs.to(device)
            others = others.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            output = model(inputs, others, inputs.batch)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validate every epoch so the scheduler always sees a fresh value.
        # validation() returns the sum of the batch losses; normalise it by
        # the loader that was actually evaluated.
        train_loss = running_loss / max(len(train_loader), 1)
        val_loss = validation(model, criterion, val_loader, device) / max(len(val_loader), 1)

        if epoch % 100 == 0:
            print(f'Epoch: {epoch}, Train Loss: {train_loss}, Valid Loss: {val_loss}')

        if scheduler is not None:
            scheduler.step(val_loss)
            # NOTE(review): ReduceLROnPlateau appears to reset
            # `num_bad_epochs` to 0 inside step() whenever it exceeds
            # `patience`, so this early-stop condition may never be true.
            # Confirm, and track the best loss explicitly if real early
            # stopping is wanted.
            if scheduler.num_bad_epochs > scheduler.patience:
                print(f'Early stopping at epoch {epoch}...')
                break
    return model

In [None]:
def validation(model, criterion, val_loader, device):
    """Evaluate `model` on `val_loader` without tracking gradients.

    The model is switched to eval mode and is left in that mode.

    Args:
        model: Module called as ``model(graph_batch, others, batch)``.
        criterion: Loss function applied to ``(output, targets)``.
        val_loader: Yields ``(graph_batch, others, targets)`` triples.
        device: Device every batch is moved to.

    Returns:
        The sum of the per-batch loss values (not averaged).
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for graphs, descriptors, labels in val_loader:
            graphs = graphs.to(device)
            descriptors = descriptors.to(device)
            labels = labels.to(device)
            node_to_graph = graphs.batch.to(device)

            prediction = model(graphs, descriptors, node_to_graph)
            batch_losses.append(criterion(prediction, labels).item())

    return sum(batch_losses)

In [None]:
# Train the MLM model first, then the HLM model, each with its own optimizer,
# loaders and scheduler.
print("Training Start: MLM")
model_MLM = train(
    model=model_MLM,
    optimizer=optimizer_MLM,
    train_loader=train_MLM_loader,
    val_loader=valid_MLM_loader,
    scheduler=scheduler_MLM,
    device=device,
)

print("Training Start: HLM")
model_HLM = train(
    model=model_HLM,
    optimizer=optimizer_HLM,
    train_loader=train_HLM_loader,
    val_loader=valid_HLM_loader,
    scheduler=scheduler_HLM,
    device=device,
)

## Inference

In [None]:
# Unlabelled test datasets, one per target model (no target column is needed).
test_MLM = CustomDataset(df=test, target=None, is_test=True)
test_HLM = CustomDataset(df=test, target=None, is_test=True)

# Test loaders keep the row order (shuffle=False) so predictions line up with
# the rows of the test table.
loader_options = dict(batch_size=CFG['BATCH_SIZE'], shuffle=False)
test_MLM_loader = DataLoader(dataset=test_MLM, **loader_options)
test_HLM_loader = DataLoader(dataset=test_HLM, **loader_options)

In [None]:
def inference(test_loader, model, device=None):
    """Run `model` over an unlabelled loader and collect its predictions.

    Args:
        test_loader: Yields ``(graph_batch, others)`` pairs.
        model: Module called as ``model(graph_batch, others, batch)``.
        device: Device to run on. Defaults to the device of the model's first
            parameter, or the CPU when the model has no parameters.

    Returns:
        list[float]: All model outputs, flattened, in loader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.to(device)
    model.eval()
    preds = []

    with torch.no_grad():
        for inputs, others in test_loader:
            inputs = inputs.to(device)
            others = others.to(device)
            # The model's forward takes the node-to-graph assignment as its
            # third argument; it is read from the batched graph object.
            output = model(inputs, others, inputs.batch)
            preds.extend(output.cpu().numpy().flatten().tolist())

    return preds

In [None]:
# Predict both targets for the test set, each with its own trained model.
predictions_MLM, predictions_HLM = (
    inference(test_MLM_loader, model_MLM),
    inference(test_HLM_loader, model_HLM),
)

## Submission

In [None]:
# Load the submission template and display it.
sample_submission_path = './sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission

In [None]:
# Write the predictions into the two target columns and display the result.
submission = submission.assign(MLM=predictions_MLM, HLM=predictions_HLM)
submission

In [None]:
# Save the filled-in submission without the DataFrame index.
submission_path = 'baseline_submission.csv'
submission.to_csv(submission_path, index=False)