In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error

from rdkit import DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

from typing import List, Union

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
def morgan_binary_features_generator(mol: Union[str, Chem.Mol], plot_img = False,
                                     radius: int = 6,
                                     num_bits: int = 4096) -> np.ndarray:
    """Build a hashed Morgan fingerprint vector for a single molecule.

    Args:
        mol: A SMILES string or an RDKit molecule object.
        plot_img: When true, the molecule is rendered with ``display`` first
            (``display`` is expected to be provided by the notebook runtime).
        radius: Morgan radius forwarded to RDKit.
        num_bits: Length of the hashed fingerprint (forwarded as ``nBits``).

    Returns:
        A 1-D ``int8`` numpy array filled by ``DataStructs.ConvertToNumpyArray``.

    Raises:
        ValueError: If a SMILES string cannot be parsed into a molecule, or
            ``None`` is passed in directly.

    NOTE(review): despite "binary" in the name, the vector comes from
    ``AllChem.GetHashedMorganFingerprint``, which is understood to yield hashed
    counts rather than on/off bits -- confirm this is intended.
    """
    source = mol
    if isinstance(mol, str):
        mol = Chem.MolFromSmiles(mol)
    if mol is None:
        # Fail early with a readable message instead of an opaque RDKit
        # argument error further down.
        raise ValueError(f"Could not build an RDKit molecule from {source!r}")

    if plot_img:
        display(mol)

    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    # One-element placeholder; presumably resized by ConvertToNumpyArray to the
    # fingerprint length -- TODO confirm against the RDKit documentation.
    features = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [None]:
def getMolDescriptors(mol: Union[str, Chem.Mol], missingVal=None):
    ''' Calculate the full list of RDKit descriptors for a molecule.

        mol may be a SMILES string or an RDKit molecule; strings are parsed
        with Chem.MolFromSmiles first.

        missingVal is stored for every descriptor that cannot be calculated.

        Returns a dict mapping each descriptor name in Descriptors._descList
        to its value (or to missingVal on failure).
    '''
    import traceback

    mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
    res = {}
    for nm, fn in Descriptors._descList:
        # some of the descriptor functions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit still propagate so a long run can be interrupted.
            # print the error message:
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [None]:
class EarlyStopping:
    """Patience-based stop signal driven by a validation loss.

    A check counts as an improvement when
    ``validation_loss + min_delta < best loss seen so far``. With a positive
    ``min_delta`` the loss has to drop by more than that amount; with a
    negative ``min_delta`` a loss up to ``|min_delta|`` above the best is
    still tolerated.

    Args:
        patience: Number of non-improving checks allowed before stopping.
        min_delta: Offset added to the incoming loss before it is compared
            with the best loss.
    """

    def __init__(self, patience=5, min_delta=-1):
        self.patience = patience  # number of non-improving checks tolerated before stopping
        self.min_delta = min_delta  # offset added to the new loss before comparing with the best
        self.counter = 0  # consecutive checks without improvement
        self.min_validation_loss = np.inf  # lowest validation loss observed so far

    def __call__(self, validation_loss, verbose=False):
        """Record one validation loss.

        Returns:
            True once ``patience`` consecutive checks have failed to improve,
            otherwise False.
        """
        if (validation_loss + self.min_delta) < self.min_validation_loss:
            # Keep the true minimum. Storing a tolerated-but-worse loss here
            # would let the baseline drift upward step by step and the stop
            # condition might never be reached.
            self.min_validation_loss = min(self.min_validation_loss, validation_loss)
            self.counter = 0
            return False

        # Everything else (worse, exactly equal, or NaN) is a non-improvement.
        self.counter += 1
        if verbose:
            print(f"  >> now{validation_loss:.3f} > best{self.min_validation_loss:.3f}")
        return self.counter >= self.patience

In [None]:
# Load the raw train / test tables.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Fill missing AlogP values with each table's own column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selected column is a chained in-place update, which pandas deprecates
# and which leaves the frame unchanged when Copy-on-Write is active.
train_df["AlogP"] = train_df["AlogP"].fillna(value=train_df["AlogP"].mean())
test_df["AlogP"] = test_df["AlogP"].fillna(value=test_df["AlogP"].mean())
# Remove training rows that still contain a missing value.
train_df.dropna(axis=0, inplace=True)

# One fingerprint row per SMILES string; the frames get a fresh 0..n-1 index.
train_fps = pd.DataFrame(train_df["SMILES"].apply(morgan_binary_features_generator).tolist())
test_fps = pd.DataFrame(test_df["SMILES"].apply(morgan_binary_features_generator).tolist())

# Prefix the positional column labels so they read FPS_0, FPS_1, ...
train_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)
test_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)

In [None]:
# Descriptor table: one row (a dict of descriptor name -> value) per SMILES string.
train_descriptor = pd.DataFrame(list(map(getMolDescriptors, train_df['SMILES'])))
test_descriptor = pd.DataFrame(list(map(getMolDescriptors, test_df['SMILES'])))

In [None]:
# pd.concat(axis=1) aligns rows by index label. train_df keeps its original
# labels after the earlier dropna, while the descriptor frames are numbered
# 0..n-1, so both sides are renumbered positionally before joining. Without
# this, any dropped row would shift the descriptors onto the wrong molecules
# and add all-NaN rows.
train_df = pd.concat(
    [train_df.reset_index(drop=True), train_descriptor.reset_index(drop=True)],
    axis=1,
)
test_df = pd.concat(
    [test_df.reset_index(drop=True), test_descriptor.reset_index(drop=True)],
    axis=1,
)

In [None]:
# Remove the same six columns from both tables.
train_df = train_df.drop(
    columns=['AlogP', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP']
)
test_df = test_df.drop(
    columns=['AlogP', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP']
)

In [None]:
# Replace remaining gaps in numeric columns with that table's own column means.
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(test_df.mean(numeric_only=True))

In [None]:
class CustomDataset(Dataset):
    """Dataset that serves one row of tabular features followed by fingerprint features.

    ``tab_scaler`` and ``fps_scaler`` are two-element sequences applied in
    order (element 0, then element 1). In training mode both elements are
    fitted via ``fit_transform``; in test mode they are only applied via
    ``transform``, so they must already be fitted.

    Training mode additionally stores:
      * ``label``: the target column as an (n, 1) array, optionally passed
        through ``label_scaler.fit_transform``.
      * ``range_class``: the target capped at 100 and floor-divided by 10,
        used as a bucket label for stratified splitting.
    """

    def __init__(self, tab_df, fps_df,  target: str, tab_scaler, fps_scaler, label_scaler=None, is_test=False):
        self.tab_df = tab_df
        self.fps_df = fps_df
        self.target = target
        self.is_test = is_test
        self.tab_scaler = tab_scaler
        self.fps_scaler = fps_scaler

        # Columns that never enter the tabular feature matrix.
        if self.is_test:
            self.drop_col = ["id", "SMILES"]
        else:
            self.drop_col = ["id", "SMILES", "MLM", "HLM"]

        tab_first, tab_second = self.tab_scaler[0], self.tab_scaler[1]
        fps_first, fps_second = self.fps_scaler[0], self.fps_scaler[1]
        tab_inputs = self.tab_df.drop(columns = self.drop_col, axis=1)

        if self.is_test:
            # Re-use the already fitted stages; no labels exist for test rows.
            self.tab_features = tab_second.transform(tab_first.transform(tab_inputs))
            self.fps_features = fps_second.transform(fps_first.transform(self.fps_df))
            return

        self.tab_features = tab_second.fit_transform(tab_first.fit_transform(tab_inputs))
        self.fps_features = fps_second.fit_transform(fps_first.fit_transform(self.fps_df))

        target_column = self.tab_df[target]
        if label_scaler is not None:
            self.label = label_scaler.fit_transform(self.tab_df[[target]])
        else:
            self.label = target_column.values.reshape(-1, 1)

        # Bucket labels (width 10, capped at 100) for stratified splitting.
        self.range_class = target_column.apply(lambda score: np.int8(min(score, 100)//10))

    def __getitem__(self, index):
        """Return the feature tensor, plus the label tensor in training mode."""
        merged = np.concatenate([self.tab_features[index], self.fps_features[index]])
        features = torch.tensor(merged).float()

        if not self.is_test:
            return features, torch.tensor(self.label[index]).float()
        return features

    def __len__(self):
        """Number of rows in the tabular frame."""
        return len(self.tab_df)

In [None]:
# Two-stage preprocessing per feature group: a variance filter followed by
# min-max scaling. The same stage objects are shared by every dataset below.
tab_scaler = [VarianceThreshold(threshold=0.01), MinMaxScaler()]
fps_scaler = [VarianceThreshold(threshold=0.05), MinMaxScaler()]
#fps_scaler = VarianceThreshold(threshold=0.05)
# No label scaling: targets are used as stored in the frame.
label_scaler = None

# Order matters: each training dataset (is_test=False) fits the shared stages,
# and the test dataset built right after it only applies them.
train_MLM = CustomDataset(tab_df = train_df, fps_df = train_fps, target="MLM", tab_scaler = tab_scaler, fps_scaler=fps_scaler, label_scaler=label_scaler, is_test= False)
test_MLM = CustomDataset(tab_df = test_df, fps_df = test_fps, target="MLM", tab_scaler = tab_scaler, fps_scaler=fps_scaler, label_scaler=label_scaler, is_test= True)

# The HLM training dataset re-fits the same stage objects on the same inputs.
train_HLM = CustomDataset(tab_df = train_df, fps_df = train_fps, target="HLM", tab_scaler = tab_scaler, fps_scaler=fps_scaler, label_scaler=label_scaler, is_test= False)
test_HLM = CustomDataset(tab_df = test_df, fps_df = test_fps, target="HLM", tab_scaler = tab_scaler, fps_scaler=fps_scaler, label_scaler=label_scaler, is_test= True)


In [None]:
# Model input width = fingerprint columns + tabular columns kept after preprocessing.
# (Fingerprint-only alternative: train_MLM.fps_features.shape[1])
input_size = sum(
    matrix.shape[1] for matrix in (train_MLM.fps_features, train_MLM.tab_features)
)
print(input_size)

In [None]:
# Hyperparameters shared by both models and the training loop.
CFG = dict(
    BATCH_SIZE=256,
    EPOCHS=8000,
    INPUT_SIZE=input_size,
    HIDDEN_SIZE=1024,
    OUTPUT_SIZE=1,
    DROPOUT_RATE=0.8,
    LEARNING_RATE=0.001,
)

In [None]:
# Row count per stratification bucket of the MLM target.
train_MLM.range_class.value_counts()

In [None]:
# Train / validation split (80 / 20), stratified on the target bucket so both
# parts cover the target range similarly. The fixed random_state keeps the
# split repeatable.
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42, stratify=train_MLM.range_class)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42, stratify=train_HLM.range_class)

In [None]:
# Only the training loaders shuffle; validation and test keep their row order
# so predictions line up with the input rows.
train_MLM_loader = DataLoader(train_MLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_MLM_loader = DataLoader(valid_MLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

train_HLM_loader = DataLoader(train_HLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
valid_HLM_loader = DataLoader(valid_HLM_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

test_MLM_loader = DataLoader(test_MLM, batch_size=CFG['BATCH_SIZE'], shuffle=False)
test_HLM_loader = DataLoader(test_HLM, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [None]:
class Net(nn.Module):
    """Fully connected regressor made of five identical hidden stages.

    Every hidden stage is Linear -> BatchNorm1d -> LeakyReLU -> Dropout and a
    final Linear layer maps to ``out_size``. The batch-norm layers are stored
    as ``ln1`` .. ``ln5``.

    Args:
        input_size: Width of one input row.
        hidden_size: Width of every hidden stage.
        dropout_rate: Drop probability shared by all stages.
        out_size: Width of the output.
    """

    def __init__(self, input_size, hidden_size, dropout_rate, out_size):
        super().__init__()

        # Five fully connected layers followed by the output layer.
        self.fc1 = nn.Linear(input_size, hidden_size)
        for stage in range(2, 6):
            setattr(self, f"fc{stage}", nn.Linear(hidden_size, hidden_size))
        self.fc_out = nn.Linear(hidden_size, out_size)

        # Normalisation: one BatchNorm1d per hidden stage.
        for stage in range(1, 6):
            setattr(self, f"ln{stage}", nn.BatchNorm1d(hidden_size))

        # Activation function shared by all stages.
        self.activation = nn.LeakyReLU()

        # Dropout shared by all stages.
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Run ``x`` through the five hidden stages and the output layer."""
        hidden = x
        for stage in range(1, 6):
            linear = getattr(self, f"fc{stage}")
            norm = getattr(self, f"ln{stage}")
            hidden = self.dropout(self.activation(norm(linear(hidden))))
        return self.fc_out(hidden)

In [None]:
# Seed torch's global RNG so weight initialisation (and the shuffling and
# dropout that follow) repeat from a fresh kernel.
torch.manual_seed(42)

# Use the GPU when one is present; otherwise fall back to the CPU instead of
# failing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE']).to(device)
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE']).to(device)

In [None]:
# Show the layer layout, then the number of trainable parameters as the cell output.
print(model_MLM)
sum(p.numel() for p in model_MLM.parameters() if p.requires_grad)

In [None]:
# Shared mean-squared-error loss; each model gets its own Adam optimizer and scheduler.
criterion = nn.MSELoss()
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
# The lambda scales the base learning rate by 0.95 raised to the scheduler's step count.
scheduler_MLM = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer_MLM, lr_lambda= lambda epoch : 0.95**(epoch))
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_HLM = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer_HLM, lr_lambda= lambda epoch : 0.95**(epoch))

In [None]:
def _batch_metric(loss, output, targets, label_scaling):
    """Return the value one batch adds to the running loss."""
    if label_scaling is None:
        return loss.item()

    scaler, metric = label_scaling[0], label_scaling[1]
    # Map BOTH predictions and targets back to original label units before
    # scoring; comparing unscaled predictions with still-scaled targets would
    # mix two different scales.
    unscaled_output = scaler.inverse_transform(output.detach().cpu().tolist())
    unscaled_targets = scaler.inverse_transform(targets.detach().cpu().tolist())
    return metric(unscaled_output, unscaled_targets)


def train(train_loader, valid_loader, model, criterion, optimizer, scheduler,  epochs, label_scaling:Union[None, List] = None):
    """Train ``model`` with periodic validation and early stopping.

    Args:
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        valid_loader: Iterable of ``(inputs, targets)`` validation batches.
        model: Network to optimise; batches are moved to the device its
            parameters already live on.
        criterion: Loss used for back-propagation.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per validation round that does
            not trigger early stopping.
        epochs: Maximum number of epochs.
        label_scaling: ``None`` to accumulate ``criterion`` values, or a
            two-element list ``[scaler, metric]``; then every batch is scored
            with ``metric`` after ``scaler.inverse_transform``.

    Returns:
        The same ``model`` object in its state after the last executed epoch.

    Validation runs every 100th epoch (starting with epoch 0). The value
    handed to early stopping is the SUM of the per-batch validation values,
    while the printed figures are the square root of the per-batch mean.
    """
    # Follow the model instead of assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    earlyStop = EarlyStopping(patience= 8, min_delta=-10)

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad() # Zero your gradients for every batch!

            output = model(inputs.to(device))
            loss = criterion(output, targets.to(device))
            loss.backward()

            optimizer.step() # Adjust learning weights

            running_loss += _batch_metric(loss, output, targets, label_scaling)

        if epoch % 100 == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs.to(device))
                    loss = criterion(output, targets.to(device))
                    valid_loss += _batch_metric(loss, output, targets, label_scaling)

            print(f"Epoch: {epoch:4d}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {np.sqrt(running_loss/len(train_loader))}, Valid Loss: {np.sqrt(valid_loss/len(valid_loader))}")

            if earlyStop(valid_loss, verbose=True):
                break

            # The learning rate decays once per validation round, not once per epoch.
            scheduler.step()

    return model

In [None]:
# Train the two target models one after the other; label_scaling=None means the
# reported losses are the criterion values themselves.
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, scheduler_MLM, epochs=CFG["EPOCHS"], label_scaling=None)
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, scheduler_HLM, epochs=CFG["EPOCHS"], label_scaling=None)


In [None]:
def inference(test_loader, model, label_scaler=None, verbose=False):
    """Predict every batch of ``test_loader`` and return a flat list of floats.

    Args:
        test_loader: Iterable yielding input batches (no labels).
        model: Trained network; batches are moved to the device its
            parameters live on.
        label_scaler: Optional object whose ``inverse_transform`` maps model
            outputs back to original label units.
        verbose: When true, print the standard deviation of every input row
            of each batch (diagnostic output, off by default because it emits
            one number per sample).

    Returns:
        Predictions in loader order, flattened into one Python list.
    """
    model.eval()
    # Follow the model instead of assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    preds = []

    with torch.no_grad():
        for idx, inputs in enumerate(test_loader):
            if verbose:
                print(f"{idx:3d}th: ", end=" ")
                for d in inputs:
                    std = np.std(d.detach().cpu().numpy())
                    print(f"{std} ", end=" ")
                print()
            output = model(inputs.to(device))
            if label_scaler is not None:
                output = label_scaler.inverse_transform(output.cpu().numpy())
            preds.extend(output.flatten().tolist())

    return preds

In [None]:
# Predict both targets for the test rows; label_scaler is passed through so the
# outputs are mapped back whenever a label scaler is configured.
predictions_MLM = inference(test_MLM_loader, model_MLM, label_scaler=label_scaler)
predictions_HLM = inference(test_HLM_loader, model_HLM, label_scaler=label_scaler)

In [None]:
# Fill the submission template with the two prediction columns and show it.
submission = pd.read_csv('../input/sample_submission.csv').assign(
    MLM=predictions_MLM,
    HLM=predictions_HLM,
)
submission

In [None]:
# Summary statistics of the submission columns as a quick sanity check.
submission.describe()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('../output/submission.csv', index=False)