In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Load the raw train/test tables.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

# Wherever AlogP is missing, fall back to the LogD value of the same row.
train_df['AlogP'] = train_df['AlogP'].fillna(train_df['LogD'])
test_df['AlogP'] = test_df['AlogP'].fillna(test_df['LogD'])

In [None]:
from rdkit.Chem import Descriptors
def getMolDescriptors(mol, missingVal=None):
    """Evaluate every descriptor listed in ``Descriptors._descList`` on ``mol``.

    Args:
        mol: Molecule object, passed unchanged to each descriptor function.
        missingVal: Value stored for any descriptor whose function raises.

    Returns:
        dict mapping each descriptor name to its computed value, or to
        ``missingVal`` when the computation raised an exception.
    """
    import traceback

    res = {}
    for nm, fn in Descriptors._descList:
        # Some of the descriptor functions can throw errors if they fail; catch those here.
        try:
            val = fn(mol)
        except Exception:
            # Only ordinary errors are swallowed: KeyboardInterrupt / SystemExit
            # must still be able to stop a long descriptor run.
            traceback.print_exc()
            # Record the failed descriptor as missing and keep going.
            val = missingVal
        res[nm] = val
    return res

In [None]:
from rdkit import Chem
# Parse each SMILES string with Chem.MolFromSmiles and keep the result in a new column.
train_df['Molecule'] = [Chem.MolFromSmiles(smiles) for smiles in train_df['SMILES']]
test_df['Molecule'] = [Chem.MolFromSmiles(smiles) for smiles in test_df['SMILES']]

In [None]:
# One descriptor dict per molecule, collected straight into a frame
# (one row per molecule, one column per descriptor name).
train_desc = pd.DataFrame([getMolDescriptors(molecule) for molecule in train_df['Molecule']])
test_desc = pd.DataFrame([getMolDescriptors(molecule) for molecule in test_df['Molecule']])

In [None]:
# Place the descriptor columns next to the original columns (column-wise concat).
train_desc = pd.concat((train_df, train_desc), axis="columns")
test_desc = pd.concat((test_df, test_desc), axis="columns")

In [None]:
# keep=False removes every row whose SMILES occurs more than once (no copy is kept);
# the index is then renumbered from zero.
unique_smiles_rows = train_desc.drop_duplicates(subset=['SMILES'], keep=False)
train_desc = unique_smiles_rows.reset_index(drop=True)

In [None]:
# Collect every training column that contains at least one missing value ...
col_list = train_desc.columns
drop_list = [name for name in col_list if train_desc[name].isna().any()]

# ... and remove those columns from both the training and the test frame.
train_desc = train_desc.drop(columns=drop_list)
test_desc = test_desc.drop(columns=drop_list)

In [None]:
from rdkit.Chem import MACCSkeys

def toMACCKeys(mol):
    """Return the MACCS-keys fingerprint of ``mol`` as a list of integer bits.

    The fingerprint from ``MACCSkeys.GenMACCSKeys`` is rendered as a bit
    string and each character of that string is converted to an ``int``.
    """
    bit_string = MACCSkeys.GenMACCSKeys(mol).ToBitString()
    return list(map(int, bit_string))

In [None]:
# One fingerprint bit list per molecule, held in a single-column frame named "Molecule".
train_bit_lists = train_desc["Molecule"].apply(toMACCKeys)
test_bit_lists = test_desc["Molecule"].apply(toMACCKeys)
train_fps = pd.DataFrame(train_bit_lists)
test_fps = pd.DataFrame(test_bit_lists)


In [None]:
# Expand each bit list into its own set of columns fp0, fp1, ...;
# the column count is taken from the first row's list.
train_bit_rows = train_fps["Molecule"].tolist()
train_fps = pd.DataFrame(train_bit_rows, columns=[f"fp{i}" for i in range(len(train_bit_rows[0]))])

test_bit_rows = test_fps["Molecule"].tolist()
test_fps = pd.DataFrame(test_bit_rows, columns=[f"fp{i}" for i in range(len(test_bit_rows[0]))])

In [None]:
# Append the fingerprint bit columns to the descriptor frames (column-wise concat).
train_desc = pd.concat((train_desc, train_fps), axis="columns")
test_desc = pd.concat((test_desc, test_fps), axis="columns")

In [None]:
# Preview the first rows of the combined training frame (original columns, descriptors, fingerprint bits).
train_desc.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Single scaler instance with default settings, shared by the train and test feature matrices.
scaler = MinMaxScaler()

In [None]:
# Columns that are not model features: identifiers, both targets and the molecule object.
drop_col_list = ['id', 'SMILES', 'MLM', 'HLM', 'Molecule']

# Keep the two target columns aside before they are dropped from the feature matrix.
target_mlm, target_hlm = train_desc['MLM'], train_desc['HLM']

In [None]:
# Feature matrices: the test frame has no target columns, so only the
# identifier and molecule columns are removed from it.
train = train_desc.drop(columns=drop_col_list)
test = test_desc.drop(columns=['id','SMILES','Molecule'])

In [None]:
# Names of the feature columns, in their order of appearance in `train`.
non_feature_names = set(drop_col_list)
feature = [name for name in train.columns if name not in non_feature_names]

In [None]:
# Fit the scaler on the training features only, then reuse the fitted
# scaler for the test features.
train_scaled = scaler.fit_transform(train[feature])
train[feature] = train_scaled

test_scaled = scaler.transform(test[feature])
test[feature] = test_scaled

In [None]:
# One training table per target: scaled features plus the respective label column.
train_MLM = pd.concat((train, target_mlm), axis="columns")
train_HLM = pd.concat((train, target_hlm), axis="columns")

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
from IPython.display import Image, display

In [None]:
def _fit_rmse_predictor(label, data):
    """Fit a TabularPredictor for `label` on `data`, using RMSE as the evaluation metric."""
    return TabularPredictor(label=label, eval_metric='root_mean_squared_error', verbosity=False).fit(data)


def _report_leaderboard(tag, predictor, data):
    """Compute the leaderboard of `predictor` on `data`, print it with the best model name, and return it."""
    board = predictor.leaderboard(data, silent=True)
    print("="*20, tag, "="*20)
    print(board)
    print(f"Best: {predictor.get_model_best()}")
    return board


# Fit both predictors first, then report on each.
predictor_MLM = _fit_rmse_predictor('MLM', train_MLM)
predictor_HLM = _fit_rmse_predictor('HLM', train_HLM)

# NOTE(review): each leaderboard is computed on the same frame the predictor was fitted on.
ld_board_MLM = _report_leaderboard("MLM", predictor_MLM, train_MLM)
ld_board_HLM = _report_leaderboard("HLM", predictor_HLM, train_HLM)

In [None]:
# Render the ensemble diagram of each predictor and show the resulting image inline.
for fitted_predictor in (predictor_MLM, predictor_HLM):
    path_to_png = fitted_predictor.plot_ensemble_model()
    display(Image(filename=path_to_png))

In [None]:
# Write the results file
pred_MLM = predictor_MLM.predict(test)
pred_HLM = predictor_HLM.predict(test)

submission = pd.DataFrame()
submission["id"] = test_df["id"]
submission["MLM"] = pred_MLM
submission["HLM"] = pred_HLM

# to_csv does not create missing parent directories; make sure the output
# folder exists so the predictions are not lost after a long training run.
submission_path = Path("../output/submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

In [None]:
# Show the submission frame as the cell output for a final visual check.
submission