In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np

In [2]:
def getMolDescriptors(mol, missingVal=None):
    ''' calculate the full list of descriptors for a molecule

        missingVal is used if the descriptor cannot be calculated
    '''
    res = {}
    for nm,fn in Descriptors._descList:
        # some of the descriptor fucntions can throw errors if they fail, catch those here:
        try:
            val = fn(mol)
        except:
            # print the error message:
            import traceback
            traceback.print_exc()
            # and set the descriptor value to whatever missingVal is
            val = missingVal
        res[nm] = val
    return res

In [3]:
#데이터 로드 & 결측치 처리
train = pd.read_csv('../input/train.csv')
train['AlogP'] = np.where(pd.isna(train['AlogP']), train['LogD'], train['AlogP'])
test = pd.read_csv('../input/test.csv')
test['AlogP'] = np.where(pd.isna(test['AlogP']), test['LogD'], test['LogD'])

In [4]:
train['Molecule'] = train['SMILES'].apply(Chem.MolFromSmiles)
test['Molecule'] = test['SMILES'].apply(Chem.MolFromSmiles)

In [5]:
train.drop_duplicates(['SMILES'], keep=False, inplace=True)
train.reset_index(drop=True,inplace=True)

In [6]:
train_dsc = [getMolDescriptors(mol) for mol in train['Molecule']]
test_dsc = [getMolDescriptors(mol) for mol in test['Molecule']]

In [7]:
# drop_col = ['AlogP', 'Molecule', 'MolWt', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'MolLogP']
# drop_col = ['AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','MolLogP']
drop_col = ['AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']

In [8]:
train_dsc = pd.DataFrame(train_dsc)
test_dsc = pd.DataFrame(test_dsc)

In [9]:
train_data = pd.concat([train, train_dsc], axis=1)
test_data = pd.concat([test, test_dsc], axis=1)

In [10]:
train_data[train_data.isnull().any(axis=1)]

Unnamed: 0,id,SMILES,MLM,HLM,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
986,TRAIN_1000,O=c1c2ccccc2[se]n1-c1ccc(S(=O)(=O)Nc2ccccn2)cc1,31.223,43.963,3.103,430.339,4,1,4,3.126,...,0,1,0,0,0,0,0,0,0,0
1380,TRAIN_1399,Brc1cnc2n[se]nc2c1,91.192,99.9,1.82,262.953,3,0,0,1.82,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_data = train_data.dropna(axis=0).reset_index(drop=True)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [13]:
scaler = MinMaxScaler()
col = ['id','SMILES', 'MLM','HLM','Molecule']
col_list = [colum for colum in train_data.columns if colum not in col]

In [14]:
train_data[col_list] = scaler.fit_transform(train_data[col_list])
test_data[col_list] = scaler.transform(test_data[col_list])

In [15]:
drop_col = ['id','SMILES','Molecule','AlogP','Molecule','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds']
target = ['HLM','MLM']

---
Split out the code: taking the pre-processed dataframe

In [16]:
train_df = train_data.drop(columns = drop_col,axis = 1 )
test_df = test_data.drop(columns = drop_col, axis = 1)

In [17]:
train_df.head()

Unnamed: 0,MLM,HLM,LogD,Molecular_PolarSurfaceArea,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,26.01,50.68,0.57966,0.38927,0.696343,0.696343,0.184646,0.803121,0.627006,0.19214,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
1,29.27,50.59,0.502622,0.239538,0.669702,0.669702,0.029993,0.828822,0.975653,0.108753,...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
2,5.586,80.892,0.461021,0.201951,0.152663,0.152663,0.602919,0.93401,0.760547,0.105346,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,5.71,2.0,0.594968,0.304785,0.776741,0.776741,0.241502,0.320881,0.558604,0.27138,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,93.27,99.99,0.514316,0.133668,0.708802,0.708802,0.014885,0.831125,0.840289,0.080899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
test_df.head()

Unnamed: 0,LogD,Molecular_PolarSurfaceArea,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.535436,0.305331,0.487946,0.487946,0.17241,0.862465,0.83099,0.15933,0.153919,0.159263,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
1,0.390149,0.221938,0.741037,0.741037,0.137684,0.758096,0.868162,0.166811,0.166518,0.166807,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.652729,0.305672,0.544276,0.544276,0.051763,0.823021,0.448666,0.14747,0.146643,0.147459,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.475904,0.265937,0.731173,0.731173,0.10694,0.796889,0.580201,0.14574,0.150253,0.14574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.360666,0.197517,0.731834,0.731834,0.030648,0.814153,0.730907,0.152522,0.150249,0.15252,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
train_df.to_csv("../input/train_descriptor.csv", index=False)
test_df.to_csv("../input/test_descriptor.csv", index=False)


In [19]:
HLM = train_data['HLM']
MLM = train_data['MLM']
train_df = train_data.drop(columns = drop_col,axis = 1 )
train_df = train_df.drop(columns = target, axis = 1)
test_df = test_data.drop(columns = drop_col, axis = 1)

In [20]:
from torch.utils.data import Dataset, DataLoader
import torch
class CustomDataset(Dataset):
  def __init__(self, df, target=None, is_test=False):
    self.df = df
    self.is_test = is_test
    if not self.is_test:
      self.target = target
    self.features = self.df.values
  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    ft = self.features[idx]
    if not self.is_test:
      label = self.target[idx]
      return torch.tensor(ft).float(), torch.tensor(label).float().unsqueeze(dim=-1)
    else:
      return torch.tensor(ft).float()

In [21]:
train_HLM = CustomDataset(train_df, HLM)
train_MLM = CustomDataset(train_df, MLM)

In [22]:
test_HLM = CustomDataset(df = test_df,target=None,is_test=True)
test_MLM = CustomDataset(df = test_df,target=None,is_test=True)
test_HLM_loader = DataLoader(dataset=test_HLM,batch_size=64,shuffle=False)
test_MLM_loader = DataLoader(dataset=test_MLM,batch_size=64,shuffle=False)

In [23]:
train_HLM_dataset, valid_HLM_dataset = train_test_split(train_HLM, test_size=0.2, random_state=42)
train_MLM_dataset, valid_MLM_dataset = train_test_split(train_MLM, test_size=0.2, random_state=42)

In [24]:
train_HLM_loader = DataLoader(dataset=train_HLM_dataset, batch_size = 256,shuffle=True)
valid_HLM_loader = DataLoader(dataset=valid_HLM_dataset, batch_size = 256,shuffle=False)

train_MLM_loader = DataLoader(dataset=train_MLM_dataset, batch_size = 256,shuffle=True)
valid_MLM_loader = DataLoader(dataset=valid_MLM_dataset, batch_size = 256,shuffle=False)

In [25]:
input_size = train_HLM.features.shape[1]

In [26]:
len(train_HLM_dataset)

2753

In [27]:
import torch.nn as nn
import torch.optim as optim

In [28]:
class LinModel(nn.Module):
  def __init__(self, input_size, dropout_rate):
    super(LinModel, self).__init__()

    self.fc1 = nn.Sequential(
        nn.Linear(input_size, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(dropout_rate)
    )
    self.fc2 = nn.Sequential(
        nn.Linear(256, 1024),
        nn.ReLU(),
        nn.BatchNorm1d(1024),
        nn.Dropout(dropout_rate)
    )
    self.fc3 = nn.Sequential(
        nn.Linear(1024,512),
        nn.ReLU(),
        nn.BatchNorm1d(512),
        nn.Dropout(dropout_rate)
    )
    self.fc4 = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.BatchNorm1d(256),
        nn.Dropout(dropout_rate)
    )
    self.fc5 = nn.Sequential(
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.BatchNorm1d(128)
    )

    self.out = nn.Linear(128,1)

  def forward(self, x):
    x = self.fc1(x)
    x = self.fc2(x)
    x = self.fc3(x)
    x = self.fc4(x)
    x = self.fc5(x)
    x = self.out(x)
    return x

In [29]:
model_HLM = LinModel(input_size, 0.2)
model_MLM = LinModel(input_size, 0.2)

In [30]:
criterion = nn.MSELoss()
optimizer_HLM = optim.AdamW(model_HLM.parameters(), lr=0.001)
optimizer_MLM = optim.AdamW(model_MLM.parameters(), lr=0.001)

In [31]:
def train(train_loader, valid_loader, model, criterion, optimizer, epochs):
    model.train()

    for epoch in range(epochs):
        running_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if epoch % 100 == 0:
            valid_loss = 0
            with torch.no_grad():
                for inputs, targets in valid_loader:
                    output = model(inputs)
                    loss = criterion(output, targets)
                    valid_loss += loss.item()

            print(f'Epoch: {epoch}/{epochs}, Train Loss: {running_loss/len(train_loader)}, Valid Loss: {valid_loss/len(valid_loader)}')
            model.train()

    return model

In [32]:
print("Training Start: HLM")
model_HLM = train(train_HLM_loader, valid_HLM_loader, model_HLM, criterion, optimizer_HLM, epochs=500)
print("Training Start: MLM")
model_MLM = train(train_MLM_loader, valid_MLM_loader, model_MLM, criterion, optimizer_MLM, epochs=500)

Training Start: HLM
Epoch: 0/500, Train Loss: 4053.0500266335225, Valid Loss: 4134.94873046875
Epoch: 100/500, Train Loss: 78.38030797784978, Valid Loss: 1503.505126953125
Epoch: 200/500, Train Loss: 53.32106538252397, Valid Loss: 1654.6580810546875
Epoch: 300/500, Train Loss: 59.037040363658555, Valid Loss: 1632.1141357421875
Epoch: 400/500, Train Loss: 34.87889497930353, Valid Loss: 1606.3430989583333
Training Start: MLM
Epoch: 0/500, Train Loss: 2568.3782182173295, Valid Loss: 2700.8321940104165
Epoch: 100/500, Train Loss: 77.79888638583097, Valid Loss: 1510.2305908203125
Epoch: 200/500, Train Loss: 65.90286844426936, Valid Loss: 1476.4521077473958
Epoch: 300/500, Train Loss: 33.457498203624375, Valid Loss: 1557.892822265625
Epoch: 400/500, Train Loss: 31.386333638971504, Valid Loss: 1514.7577311197917


In [33]:
def inference(test_loader, model):
    model.eval()
    preds = []

    with torch.no_grad():
        for inputs in test_loader:
            output = model(inputs)
            preds.extend(output.cpu().numpy().flatten().tolist())

    return preds

In [34]:
predictions_MLM = inference(test_MLM_loader,model_MLM)
predictions_HLM = inference(test_HLM_loader,model_HLM)

In [36]:
submission = pd.read_csv('../input/sample_submission.csv')
submission['MLM'] = predictions_MLM
submission['HLM'] = predictions_HLM
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,13.487659,44.302662
1,TEST_001,79.915855,78.493149
2,TEST_002,3.403304,50.642715
3,TEST_003,78.031914,91.980125
4,TEST_004,72.422691,28.262505
...,...,...,...
478,TEST_478,22.854019,25.724598
479,TEST_479,86.857521,93.575180
480,TEST_480,18.096180,26.369389
481,TEST_481,16.308226,80.234985


In [None]:
submission.to_csv("../output/subsub.csv", index=False)