In [None]:
# Download the DILI (drug-induced liver injury) dataset from TDC.
# Label Y: 1 = the drug causes liver damage, 0 = it does not.
!pip install PyTDC

In [None]:
from tdc.single_pred import Tox

# Load the DILI dataset and let TDC split it with its default settings.
# NOTE(review): get_split() is called without arguments, so the split
# method/fractions/seed are whatever TDC defaults to — confirm if
# reproducibility matters.
data = Tox(name='DILI')
split = data.get_split()

# Feature tables for each split; the 'Drug' column is later parsed as SMILES.
train, test = split['train'], split['test']

# Labels for each split, taken from the 'Y' column.
y_train, y_test = train['Y'], test['Y']

In [None]:
import rdkit.Chem as Chem
from rdkit.Chem import Descriptors

import pandas as pd
import numpy as np

In [None]:
# Function to convert SMILES strings to RDKit Mol objects

def smiles2mol(dataset):
    """Parse every SMILES string in ``dataset['Drug']`` with ``Chem.MolFromSmiles``.

    Returns a list holding one parse result per entry, in the same order as
    the ``'Drug'`` column.
    """
    parsed = []
    for smiles in dataset['Drug']:
        # NOTE(review): presumably MolFromSmiles yields None for SMILES it
        # cannot parse — confirm. Such entries are kept (not dropped) so the
        # list stays aligned with the dataset rows.
        parsed.append(Chem.MolFromSmiles(smiles))
    return parsed

# Parse both splits; each list keeps the row order of its source table,
# so train_mols[i] corresponds to row i of `train` (likewise for test).
train_mols = smiles2mol(train)
test_mols = smiles2mol(test)

In [None]:
# Function to convert Mol to RDKit Descriptor

def getMolDescriptors(mol, err=None):
    """Compute every descriptor listed in ``Descriptors._descList`` for one molecule.

    Parameters
    ----------
    mol :
        The molecule object handed to each descriptor function (in this
        notebook, an element of the list returned by ``smiles2mol``).
    err :
        Value stored for a descriptor whose function raises. Defaults to
        ``None``.

    Returns
    -------
    dict
        Maps each descriptor name to its computed value, or to ``err`` when
        the computation failed.
    """
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Best effort: a single failing descriptor must not abort the
            # others. Catch Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still stop a long-running cell.
            val = err
        res[nm] = val
    return res

In [None]:
# Function to convert Mol to RDKit Descriptor for all molecules in existing list

def rdkitdesc_conversion(mols, is_train = True):
    """Build a descriptor table for a list of molecules.

    Parameters
    ----------
    mols :
        Iterable of molecule objects; each one is passed to
        ``getMolDescriptors``.
    is_train :
        Accepted for API compatibility but not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per molecule, one column per descriptor name.
    """
    # One descriptor dict per molecule, in input order.
    per_mol_records = list(map(getMolDescriptors, mols))
    return pd.DataFrame(per_mol_records)

In [None]:
# Obtain the RDKit descriptor dataframe for all molecules in the train and test sets.
# Note: rdkitdesc_conversion ignores `is_train`, so both calls perform the
# same conversion; the flag only labels the intent of each call.

train_df = rdkitdesc_conversion(train_mols)
test_df = rdkitdesc_conversion(test_mols, is_train=False)

In [None]:
# Check whether the train/test descriptor dataframes contain any NaN values.
# The cell output is a (train_total, test_total) pair of NaN counts summed
# over every column.

(train_df.isna()).sum().sum(), (test_df.isna()).sum().sum()

In [None]:
# Replace NaN values with the column-wise mean (one mean per descriptor).
# The means are computed on the training set only and reused for the test
# set: imputing the test set with its own means would leak test-set
# statistics into preprocessing and make the two splits inconsistent.

train_means = train_df.mean()

train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

# Preview the first rows instead of dumping the whole frame.
train_df.head()

# Random Forest

In [None]:
# Your code here
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import numpy as np

# Hyperparameter grid for the Random Forest sweep.
criterions = ['gini', 'entropy']  # passed as RandomForestClassifier(criterion=...)
n_estimators = [1, 5, 10, 50, 100]  # passed as RandomForestClassifier(n_estimators=...)
random_state = 42  # shared seed; also reused by the XGBoost cells

In [None]:
print("Random Forest: criterions & estimators\n")

# For every split criterion, try each forest size and report the size that
# reaches the highest accuracy on the test split.
for criterion in criterions:

    accuracy_list_1 = []  # test accuracy per entry of n_estimators, in order

    for estimator in n_estimators:

        RF = RandomForestClassifier(criterion=criterion,
                                    n_estimators=estimator,
                                    random_state=random_state)
        RF.fit(train_df, y_train)  # train

        y_prediction = RF.predict(test_df)  # prediction

        # Accuracy = fraction of test labels predicted correctly.
        n_correct = np.sum(y_test == y_prediction)
        accuracy = n_correct / len(y_test)
        accuracy_list_1.append(accuracy)

    # Identify the best hyperparameter (the first maximum wins on ties).
    best_result_index = max(range(len(accuracy_list_1)),
                            key=accuracy_list_1.__getitem__)
    print(f"Best estimator <{criterion}> : {n_estimators[best_result_index]}")



# XGBoost

In [None]:
# Your code here
!pip install xgboost
from xgboost import XGBClassifier

In [None]:
print("XGBoost: lambda\n")

# Candidate values for XGBClassifier's reg_lambda parameter.
reg_lambdas = [0.01, 0.1, 0.5, 1, 1.5, 2]
accuracy_list_2_1 = []  # test accuracy per candidate, same order as reg_lambdas

for reg_lambda in reg_lambdas:

    # Fresh model per candidate; only reg_lambda varies between iterations.
    XGB = XGBClassifier(eval_metric='logloss',
                        random_state=random_state,
                        reg_lambda=reg_lambda)
    XGB.fit(train_df, y_train)  # train

    y_prediction = XGB.predict(test_df)

    # Accuracy = fraction of test labels predicted correctly.
    is_correct = (y_test == y_prediction)
    accuracy = np.sum(is_correct) / len(y_test)
    accuracy_list_2_1.append(accuracy)

    print(f"reg_lambda: {reg_lambda} ~ Accuracy = {accuracy:.3f}")

In [None]:
print("XGBoost: alpha\n")

# Candidate values for the alpha (L1 regularization) parameter.
alphas = [0, 0.01, 0.1, 0.5, 1, 1.5]
accuracy_list_2_2 = []  # test accuracy per candidate, same order as alphas

for alpha in alphas:

    # Fresh model per candidate; only alpha varies between iterations.
    XGB = XGBClassifier(eval_metric='logloss',
                        random_state=random_state,
                        alpha=alpha)
    XGB.fit(train_df, y_train)  # train

    y_prediction = XGB.predict(test_df)

    # Accuracy = fraction of test labels predicted correctly.
    is_correct = (y_test == y_prediction)
    accuracy = np.sum(is_correct) / len(y_test)
    accuracy_list_2_2.append(accuracy)

    print(f"alpha: {alpha} ~ Accuracy = {accuracy:.3f}")

### To tune the L1 regularization term, adjust the "alpha" hyperparameter.

In [None]:
print("L1 regularization")

# Highest accuracy from the alpha sweep, and the position where it first occurs.
best_accuracy_alpha = max(accuracy_list_2_2)
best_alpha_index = accuracy_list_2_2.index(best_accuracy_alpha)

# accuracy_list_2_2 and alphas share the same ordering, so the index maps
# straight back to the alpha value that produced the best accuracy.
best_alpha = alphas[best_alpha_index]

print(f"best alpha: {best_alpha}")
print(f"in this case, accuracy is {best_accuracy_alpha}")

# Q1. Random Forest

1.

- For criterion 'gini' -> best n_estimators = 10
- For criterion 'entropy' -> best n_estimators = 5

# Q2. XGBoost

1.
- reg_lambda = 0.01 -> Accuracy = 0.779
- reg_lambda = 0.1 -> Accuracy = 0.789 (best)
- reg_lambda = 0.5 -> Accuracy = 0.779
- reg_lambda = 1 -> Accuracy = 0.768
- reg_lambda = 1.5 -> Accuracy = 0.737
- reg_lambda = 2 -> Accuracy = 0.758

2.
- alpha = 0 -> Accuracy = 0.768
- alpha = 0.01 -> Accuracy = 0.779
- alpha = 0.1 -> Accuracy = 0.768
- alpha = 0.5 -> Accuracy = 0.758
- alpha = 1 -> Accuracy = 0.789 (best)
- alpha = 1.5 -> Accuracy = 0.768

3.

- Adjust the "alpha" hyperparameter to the value "1"