In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error

from rdkit import DataStructs
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors

from typing import List, Union

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
def morgan_binary_features_generator(mol: Union[str, Chem.Mol], plot_img = False,
                                     radius: int = 6,
                                     num_bits: int = 4096) -> np.ndarray:
    
    mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
    if plot_img:
        display(mol)
    
    features_vec = AllChem.GetHashedMorganFingerprint(mol, radius, nBits=num_bits)
    features = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(features_vec, features)

    return features

In [3]:
def getMolDescriptors(mol: Union[str, Chem.Mol], missingVal=None):
    ''' calculate the full list of descriptors for a molecule

        missingVal is used if the descriptor cannot be calculated
    '''
    mol = Chem.MolFromSmiles(mol) if type(mol) == str else mol
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [4]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=-1):
        self.patience = patience  # number of times to allow for no improvement before stopping the execution
        self.min_delta = min_delta  # the minimum change to be counted as improvement
        self.counter = 0  # count the number of times the validation accuracy not improving
        self.min_validation_loss = np.inf

    # return True when encountering _patience_ times decrease in validation loss 
    def __call__(self, validation_loss, verbose=False):
        if ((validation_loss+self.min_delta) < self.min_validation_loss):
            self.min_validation_loss = validation_loss
            self.counter = 0  # reset the counter if validation loss decreased at least by min_delta
        elif ((validation_loss+self.min_delta) > self.min_validation_loss):
            self.counter += 1 # increase the counter if validation loss is not decreased by the min_delta
            if verbose:
                print(f"  >> now{validation_loss:.3f} > best{self.min_validation_loss:.3f}")
            if self.counter >= self.patience:
                return True
        return False

In [5]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

train_df["AlogP"].fillna(value=train_df["AlogP"].mean(), inplace=True)
test_df["AlogP"].fillna(value=test_df["AlogP"].mean(), inplace=True)
train_df.dropna(axis=0, inplace=True)
train_df.drop_duplicates(["SMILES"], inplace=True)

train_fps = pd.DataFrame(train_df["SMILES"].apply(morgan_binary_features_generator).tolist())
test_fps = pd.DataFrame(test_df["SMILES"].apply(morgan_binary_features_generator).tolist())

train_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)
test_fps.rename(columns=lambda x: "FPS_" + str(x), inplace=True)

In [6]:
train_descriptor = pd.DataFrame([getMolDescriptors(smile) for smile in train_df['SMILES']])
test_descriptor =  pd.DataFrame([getMolDescriptors(smile) for smile in test_df['SMILES']])

In [7]:
train_df = pd.concat([train_df, train_descriptor], axis=1)
test_df = pd.concat([test_df, test_descriptor], axis=1)

In [8]:
train_df.drop(columns=['AlogP', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP'], inplace=True)
test_df.drop(columns=['AlogP', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP'], inplace=True)

In [9]:
train_df.fillna(train_df.mean(numeric_only=True), inplace=True)
test_df.fillna(test_df.mean(numeric_only=True), inplace=True)

In [10]:
class CustomDataset(Dataset):
    def __init__(self, tab_df, fps_df,  target: str, tab_scaler, label_scaler=None, is_test=False):
        self.tab_df = tab_df
        self.fps_df = fps_df
        self.target = target
        self.is_test = is_test
        self.tab_scaler = tab_scaler


        if self.is_test:
            self.drop_col = ["id", "SMILES"]
            self.tab_features = self.tab_scaler[1].transform(self.tab_scaler[0].transform(self.tab_df.drop(columns = self.drop_col, axis=1)))
            self.fps_features = self.fps_df.values

        else:
            self.drop_col = ["id", "SMILES", "MLM", "HLM"]
            self.tab_features = self.tab_scaler[1].fit_transform(self.tab_scaler[0].fit_transform(self.tab_df.drop(columns = self.drop_col, axis=1)))
            self.fps_features = self.fps_df.values

            if label_scaler is None:
                self.label = self.tab_df[target].values.reshape(-1, 1)
            else:
                self.label = label_scaler.fit_transform(self.tab_df[[target]])

            self.range_class = self.tab_df[target].apply(lambda x : np.int8(min(x, 100)//10)) # 구간 균등화 startify를 위함
            

    def __getitem__(self, index):
        tab_feautres = self.tab_features[index]
        fps_feautres = self.fps_features[index]

        if self.is_test:
            return torch.tensor(tab_feautres).float(), torch.tensor(fps_feautres).float()
        else:
            label = self.label[index]
            return torch.tensor(tab_feautres).float(), torch.tensor(fps_feautres).float(), torch.tensor(label).float()

    def __len__(self):
        return len(self.tab_df)

In [11]:
tab_scaler = [VarianceThreshold(threshold=0), MinMaxScaler()]
label_scaler = None

train_MLM = CustomDataset(tab_df = train_df, fps_df = train_fps, target="MLM", tab_scaler = tab_scaler,  label_scaler=label_scaler, is_test= False)
test_MLM = CustomDataset(tab_df = test_df, fps_df = test_fps, target="MLM", tab_scaler = tab_scaler, label_scaler=label_scaler, is_test= True)

train_HLM = CustomDataset(tab_df = train_df, fps_df = train_fps, target="HLM", tab_scaler = tab_scaler, label_scaler=label_scaler, is_test= False)
test_HLM = CustomDataset(tab_df = test_df, fps_df = test_fps, target="HLM", tab_scaler = tab_scaler, label_scaler=label_scaler, is_test= True)


In [39]:
input_size = train_MLM.tab_features.shape[1] + 32 
#input_size = train_MLM.fps_features.shape[1]
print(input_size)

209


In [40]:
# Hyperparameter
CFG = {'BATCH_SIZE': 256,
       'EPOCHS': 8000,
       'INPUT_SIZE': input_size,
       'HIDDEN_SIZE': 1024,
       'OUTPUT_SIZE': 1,
       'DROPOUT_RATE': 0.8,
       'LEARNING_RATE': 0.001}

In [41]:
train_MLM.range_class.value_counts()

0     1345
9      410
1      254
8      244
7      240
2      219
4      201
6      200
5      193
3      189
10       3
Name: MLM, dtype: int64

In [42]:
# train,valid split
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42, stratify=train_MLM.range_class)
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42, stratify=train_HLM.range_class)

In [43]:
train_MLM_loader = DataLoader(dataset=train_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

train_HLM_loader = DataLoader(dataset=train_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=True)

valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

test_MLM_loader = DataLoader(dataset=test_MLM,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

test_HLM_loader = DataLoader(dataset=test_HLM,
                              batch_size=CFG['BATCH_SIZE'],
                              shuffle=False)

In [97]:
class FpsAutoEncoder(nn.Module):
    def __init__(self, input_size, output_size):
        super(FpsAutoEncoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, output_size)
        )

        self.decoder = nn.Sequential(
            nn.Linear(output_size, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_size)
        )

    def forward(self, x):
        out = self.encoder(x)
        out = self.decoder(out)

        return out
    
    def get_codes(self, x):
        return self.encoder(x)

In [124]:
model_fps_encoder = FpsAutoEncoder(4096, 32)
model_fps_encoder.load_state_dict(torch.load("../archive_model/autoEncoder.pt"))

for name, param in model_fps_encoder.named_parameters():
    print(f"{name}: {param.shape}")



encoder.0.weight: torch.Size([512, 4096])
encoder.0.bias: torch.Size([512])
encoder.1.weight: torch.Size([512])
encoder.1.bias: torch.Size([512])
encoder.4.weight: torch.Size([256, 512])
encoder.4.bias: torch.Size([256])
encoder.5.weight: torch.Size([256])
encoder.5.bias: torch.Size([256])
encoder.8.weight: torch.Size([32, 256])
encoder.8.bias: torch.Size([32])
decoder.0.weight: torch.Size([256, 32])
decoder.0.bias: torch.Size([256])
decoder.2.weight: torch.Size([512, 256])
decoder.2.bias: torch.Size([512])
decoder.4.weight: torch.Size([4096, 512])
decoder.4.bias: torch.Size([4096])


In [125]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_rate, out_size, encoder):
        super(Net, self).__init__()

        self.fps_encoder = encoder
        
        for p in self.fps_encoder.parameters():
            p.requires_grad = False ## 모델 freeze
        
        # fc 레이어 3개와 출력 레이어
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        self.fc_out = nn.Linear(hidden_size +input_size, out_size)
        
        # 정규화
        self.ln1 = nn.BatchNorm1d(hidden_size)
        self.ln2 = nn.BatchNorm1d(hidden_size)

        # 활성화 함수
        self.activation = nn.LeakyReLU()
        
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
     
    def forward(self, tab_x, fps_x):

        enc = self.fps_encoder.get_codes(fps_x)

        x = torch.cat([enc, tab_x], dim=1)        

        out1 = self.fc1(x)
        out1 = self.ln1(out1)
        out1 = self.activation(out1)
        out1 = self.dropout(out1)
        
        out2 = self.fc2(out1)
        out2 = self.ln2(out2)
        out2 = self.activation(out2)
        out2 = self.dropout(out2)

        out3 = torch.cat([x , out2], dim=1)
        
        out = self.fc_out(out3)
        return out

In [126]:
model_MLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'], model_fps_encoder).to("cuda")
model_HLM = Net(CFG['INPUT_SIZE'],CFG['HIDDEN_SIZE'],CFG['DROPOUT_RATE'],CFG['OUTPUT_SIZE'], model_fps_encoder).to("cuda")

In [127]:
print(model_MLM)
sum(p.numel() for p in model_MLM.parameters() if p.requires_grad)

Net(
  (fps_encoder): FpsAutoEncoder(
    (encoder): Sequential(
      (0): Linear(in_features=4096, out_features=512, bias=True)
      (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=512, out_features=256, bias=True)
      (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=256, out_features=32, bias=True)
    )
    (decoder): Sequential(
      (0): Linear(in_features=32, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=512, bias=True)
      (3): ReLU()
      (4): Linear(in_features=512, out_features=4096, bias=True)
    )
  )
  (fc1): Linear(in_features=209, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc_out): Linear(in_features=1233, out_fe

1269970

In [128]:
criterion = nn.MSELoss()
optimizer_MLM = torch.optim.Adam(model_MLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_MLM = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer_MLM, lr_lambda= lambda epoch : 0.95**(epoch))
optimizer_HLM = torch.optim.Adam(model_HLM.parameters(), lr=CFG['LEARNING_RATE'])
scheduler_HLM = torch.optim.lr_scheduler.LambdaLR(optimizer = optimizer_HLM, lr_lambda= lambda epoch : 0.95**(epoch))

In [129]:
def train(train_loader, valid_loader, model, criterion, optimizer, scheduler,  epochs, label_scaling:Union[None, List] = None):

    earlyStop = EarlyStopping(patience= 8, min_delta=-10)
    
    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for inputs_tab, inputs_fps, targets in train_loader:
            optimizer.zero_grad() # Zero your gradients for every batch!
            
            output = model(inputs_tab.to("cuda"), inputs_fps.to("cuda"))
            loss = criterion(output, targets.to("cuda"))
            loss.backward()
            
            optimizer.step() # Adjust learning weights

            if label_scaling is None:
                running_loss += loss.item()

            else:
                metric_loss = label_scaling[1](label_scaling[0].inverse_transform(output.tolist()), targets.tolist())
                running_loss += metric_loss
            
        
        if epoch % 100 == 0:
            model.eval()
            valid_loss = 0
            with torch.no_grad():
                for inputs_tab, inputs_fps, targets in valid_loader:
                    output = model(inputs_tab.to("cuda"), inputs_fps.to("cuda"))
                    loss = criterion(output, targets.to("cuda"))


                    if label_scaling is None:
                        valid_loss += loss.item()
                    else:
                        valid_metric_loss = label_scaling[1](label_scaling[0].inverse_transform(output.tolist()), targets.tolist())
                        valid_loss += valid_metric_loss
                    
            print(f"Epoch: {epoch:4d}/{epochs} with lr {scheduler.get_last_lr()[0]:.9f}, Train Loss: {np.sqrt(running_loss/len(train_loader))}, Valid Loss: {np.sqrt(valid_loss/len(valid_loader))}")
            
            if earlyStop(valid_loss, verbose=True):
                break

            scheduler.step()    
        
    
    return model

In [130]:
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, scheduler_MLM, epochs=CFG["EPOCHS"], label_scaling=None)
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, scheduler_HLM, epochs=CFG["EPOCHS"], label_scaling=None)


Epoch:    0/8000 with lr 0.001000000, Train Loss: 49.96390409621241, Valid Loss: 50.416865745130366
Epoch:  100/8000 with lr 0.000950000, Train Loss: 25.218523707550393, Valid Loss: 30.480642711040804
Epoch:  200/8000 with lr 0.000902500, Train Loss: 21.909402039979714, Valid Loss: 30.6965254118977
  >> now2826.830 > best2787.209
Epoch:  300/8000 with lr 0.000857375, Train Loss: 20.426559612477458, Valid Loss: 30.826711022544195
  >> now2850.858 > best2787.209
Epoch:  400/8000 with lr 0.000814506, Train Loss: 18.445734766040804, Valid Loss: 31.212087027746225
  >> now2922.583 > best2787.209
Epoch:  500/8000 with lr 0.000773781, Train Loss: 17.1402408287778, Valid Loss: 30.975967443843157
  >> now2878.532 > best2787.209
Epoch:  600/8000 with lr 0.000735092, Train Loss: 16.572738429985385, Valid Loss: 31.356484916546602
  >> now2949.687 > best2787.209
Epoch:  700/8000 with lr 0.000698337, Train Loss: 15.79337207945729, Valid Loss: 31.304814556039474
  >> now2939.974 > best2787.209
Epoch:

In [131]:
def inference(test_loader, model, label_scaler=None):
    model.eval()
    preds = []
    
    with torch.no_grad():
        for inputs_tab, inputs_fps in test_loader:
            output = model(inputs_tab.to("cuda"), inputs_fps.to("cuda"))
            if label_scaler is not None:
                output = label_scaler.inverse_transform(output.cpu())
            preds.extend(output.flatten().tolist())
    
    return preds

In [132]:
predictions_MLM = inference(test_MLM_loader, model_MLM, label_scaler=label_scaler)
predictions_HLM = inference(test_HLM_loader, model_HLM, label_scaler=label_scaler)

In [133]:
submission = pd.read_csv('../input/sample_submission.csv')
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,0.201058,9.581748
1,TEST_001,57.406219,80.907387
2,TEST_002,56.564281,73.834068
3,TEST_003,50.966148,67.596542
4,TEST_004,57.561031,63.305035
...,...,...,...
478,TEST_478,28.630735,8.052299
479,TEST_479,87.059135,84.735931
480,TEST_480,7.877865,26.742342
481,TEST_481,53.881615,59.072704


In [134]:
submission.describe()

Unnamed: 0,MLM,HLM
count,483.0,483.0
mean,32.541354,45.860658
std,26.377052,25.51147
min,-2.368363,-2.204836
25%,8.044451,25.029202
50%,28.395479,43.250042
75%,53.696426,68.332569
max,104.684044,113.410904


In [135]:
submission.to_csv('../output/submission.csv', index=False)