In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np

In [3]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule

        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [4]:
#데이터 로드 & 결측치 처리
train = pd.read_csv('../input/train.csv')
train['AlogP'] = np.where(pd.isna(train['AlogP']), train['LogD'], train['AlogP'])
test = pd.read_csv('../input/test.csv')
test['AlogP'] = np.where(pd.isna(test['AlogP']), test['LogD'], test['LogD'])

In [5]:
train['Molecule'] = train['SMILES'].apply(Chem.MolFromSmiles)
test['Molecule'] = test['SMILES'].apply(Chem.MolFromSmiles)

In [6]:
train.drop_duplicates(['SMILES'], keep=False, inplace=True)
train.reset_index(drop=True,inplace=True)

In [7]:
train_dsc = [getMolDescriptors(mol) for mol in train['Molecule']]
test_dsc = [getMolDescriptors(mol) for mol in test['Molecule']]

In [8]:
# drop_col = ['AlogP', 'Molecule', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP']
# drop_col = ['AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','MolLogP']
drop_col = ['AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']

In [9]:
train_dsc = pd.DataFrame(train_dsc)
test_dsc = pd.DataFrame(test_dsc)

In [10]:
train_data = pd.concat([train, train_dsc], axis=1)
test_data = pd.concat([test, test_dsc], axis=1)

In [11]:
train_data[train_data.isnull().any(axis=1)]

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
986,TRAIN_1000,O=c1c2ccccc2[se]n1-c1ccc(S(=O)(=O)Nc2ccccn2)cc1,31.223,43.963,3.103,430.339,4,1,4,3.126,...,0,1,0,0,0,0,0,0,0,0
1380,TRAIN_1399,Brc1cnc2n[se]nc2c1,91.192,99.9,1.82,262.953,3,0,0,1.82,...,0,0,0,0,0,0,0,0,0,0


In [12]:
train_data = train_data.dropna(axis=0).reset_index(drop=True)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [14]:
scaler = MinMaxScaler()
col = ['id','SMILES', 'MLM','HLM','Molecule']
col_list = [colum for colum in train_data.columns if colum not in col]

In [15]:
train_data[col_list] = scaler.fit_transform(train_data[col_list])
test_data[col_list] = scaler.transform(test_data[col_list])

In [16]:
drop_col = ['id','SMILES','Molecule','AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']
target = ['HLM','MLM']

---
Split out the code: taking the pre-processed dataframe

In [22]:
test_data.shape

(483, 219)

In [21]:
train_data.to_csv("../input/train_descriptor.csv", index=False)
test_data.to_csv("../input/test_descriptor.csv", index=False)


In [38]:
HLM = train_data['HLM']
MLM = train_data['MLM']
train_df = train_data.drop(columns = drop_col,axis = 1 )
train_df = train_df.drop(columns = target, axis = 1)
test_df = test_data.drop(columns = drop_col, axis = 1)

In [82]:
from torch.utils.data import Dataset, DataLoader
import torch
class CustomDataset(Dataset):
  def __init__(self, df, target=None, is_test=False):
    self.df = df
    self.is_test = is_test
    if not self.is_test:
      self.target = target
    self.features = self.df.values
  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    ft = self.features[idx]
    if not self.is_test:
      label = self.target[idx]
      return torch.tensor(ft).float(), torch.tensor(label).float().unsqueeze(dim=-1)
    else:
      return torch.tensor(ft).float()

In [83]:
train_HLM = CustomDataset(train_df, HLM)
train_MLM = CustomDataset(train_df, MLM)

In [119]:
test_HLM = CustomDataset(df = test_df,target=None,is_test=True)
test_MLM = CustomDataset(df = test_df,target=None,is_test=True)
test_HLM_loader = DataLoader(dataset=test_HLM,batch_size=64,shuffle=False)
test_MLM_loader = DataLoader(dataset=test_MLM,batch_size=64,shuffle=False)

In [120]:
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)

In [139]:
train_HLM_loader = DataLoader(dataset=train_HLM_dataset, batch_size = 256,shuffle=True)
valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset, batch_size = 256,shuffle=False)

train_MLM_loader = DataLoader(dataset=train_MLM_dataset, batch_size = 256,shuffle=True)
valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset, batch_size = 256,shuffle=False)

In [130]:
input_size = train_HLM.features.shape[1]

In [131]:
len(train_HLM_dataset)

2753

In [73]:
import torch.nn as nn
import torch.optim as optim

In [124]:
class LinModel(nn.Module):
  def __init__(self, input_size, dropout_rate):
    super(LinModel, self).__init__()

    self.fc1 = nn.Sequential(
        nn.Linear(input_size, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(dropout_rate)
    )
    self.fc2 = nn.Sequential(
        nn.Linear(256, 1024),
        nn.ReLU(),
        nn.BatchNorm1d(1024),
        nn.Dropout(dropout_rate)
    )
    self.fc3 = nn.Sequential(
        nn.Linear(1024,512),
        nn.ReLU(),
        nn.BatchNorm1d(512),
        nn.Dropout(dropout_rate)
    )
    self.fc4 = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(dropout_rate)
    )
    self.fc5 = nn.Sequential(
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128)
    )

    self.out = nn.Linear(128,1)

  def forward(self, x):
    x = self.fc1(x)
    x = self.fc2(x)
    x = self.fc3(x)
    x = self.fc4(x)
    x = self.fc5(x)
    x = self.out(x)
    return x

In [143]:
model_HLM = LinModel(input_size, 0.2)
model_MLM = LinModel(input_size, 0.2)

In [144]:
criterion = nn.MSELoss()
optimizer_HLM = optim.AdamW(model_HLM.parameters(), lr=0.001)
optimizer_MLM = optim.AdamW(model_MLM.parameters(), lr=0.001)

In [145]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
    model.train()

    for epoch in range(epochs):
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if epoch % 100 == 0:
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs)
                    loss = criterion(output, targets)
                    valid_loss += loss.item()

            print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_loader)}')
            model.train()

    return model

In [146]:
print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=500)
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=500)

Training Start: HLM
Epoch: 0/500, Train Loss: 4045.017156427557, Valid Loss: 4118.683430989583
Epoch: 100/500, Train Loss: 87.5599198774858, Valid Loss: 1558.084716796875
Epoch: 200/500, Train Loss: 66.6342228976163, Valid Loss: 1567.322021484375
Epoch: 300/500, Train Loss: 46.59217036854137, Valid Loss: 1614.1225179036458
Epoch: 400/500, Train Loss: 39.507639451460406, Valid Loss: 1646.2931722005208
Training Start: MLM
Epoch: 0/500, Train Loss: 2575.7985617897725, Valid Loss: 2702.2242024739585
Epoch: 100/500, Train Loss: 66.58714017001066, Valid Loss: 1505.1709391276042
Epoch: 200/500, Train Loss: 58.14421254938299, Valid Loss: 1503.3934326171875
Epoch: 300/500, Train Loss: 45.88111530650746, Valid Loss: 1436.4686279296875
Epoch: 400/500, Train Loss: 38.449436534534804, Valid Loss: 1556.6769612630208


In [147]:
def inference(test_loader, model):
    model.eval()
    preds = []

    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())

    return preds

In [148]:
predictions_MLM = inference(test_MLM_loader,model_MLM)
predictions_HLM = inference(test_HLM_loader,model_HLM)

In [149]:
submission = pd.read_csv('./sample_submission.csv')
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,7.978578,57.683105
1,TEST_001,62.255535,80.932808
2,TEST_002,30.180443,37.984024
3,TEST_003,78.276054,95.201408
4,TEST_004,82.833420,42.715679
...,...,...,...
478,TEST_478,15.161558,6.189603
479,TEST_479,94.719543,87.633453
480,TEST_480,14.936805,46.551128
481,TEST_481,2.495585,91.809273
