# Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from models import (find_best_params_rf,
                    find_best_params_xgb,
                    train_random_forest,
                    train_xgboost,
                    predict_and_evaluate)



# Import Data

In [4]:
# ---- "auto" dataset ----
# Only a random train/test split is loaded for this dataset.
train_random_auto, test_random_auto = (
    pd.read_csv('data/auto_random_split_train.csv'),
    pd.read_csv('data/auto_random_split_test.csv'),
)

# ---- "manual" dataset ----
# Random train/test split.
train_random_manual, test_random_manual = (
    pd.read_csv('data/manual_random_split_train.csv'),
    pd.read_csv('data/manual_random_split_test.csv'),
)
# "Realistic" train/test split.
train_realistic_manual, test_realistic_manual = (
    pd.read_csv('data/manual_realistic_split_train.csv'),
    pd.read_csv('data/manual_realistic_split_test.csv'),
)

# Auto Data - Random Split:

## Molecular Descriptors

In [5]:
# Column names used as model features. Each name must appear exactly once:
# selecting a DataFrame with a list that repeats a label returns that column
# repeatedly, which silently feeds redundant features to the model.
# (A run of fifteen 'fr_*' names, 'fr_NH1' ... 'fr_azo', was previously listed
# twice; the second copy has been removed and first-occurrence order is kept.)
#
# NOTE(review): names that differ only by case ('Kappa1' vs 'kappa1') or by a
# ' (#1)' suffix are distinct column labels and are intentionally kept as-is.
descriptors = [
    'MaxEStateIndex', 'MinEStateIndex', 'MinAbsEStateIndex', 'qed', 'HeavyAtomMolWt', 'NumValenceElectrons',
    'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1',
    'FpDensityMorgan2', 'FpDensityMorgan3', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n',
    'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Kappa1', 'Kappa2', 'Kappa3',
    'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2',
    'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1',
    'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA9', 'SlogP_VSA1',
    'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3', 'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6',
    'SlogP_VSA7', 'SlogP_VSA8', 'TPSA', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2', 'EState_VSA3',
    'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9', 'VSA_EState1',
    'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7',
    'VSA_EState8', 'VSA_EState9', 'FractionCSP3', 'HeavyAtomCount', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles', 'NumAromaticRings',
    'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles',
    'NumSaturatedRings', 'RingCount', 'MolLogP', 'MolMR', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO',
    'fr_Ar_N', 'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine', 'fr_NH0',
    'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_SH', 'fr_aldehyde', 'fr_alkyl_halide',
    'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline', 'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_benzene',
    'fr_bicyclic', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzone', 'fr_imidazole', 'fr_imide',
    'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro',
    'fr_nitro_arom', 'fr_nitro_arom_nonortho', 'fr_oxazole',
    'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene', 'fr_tetrazole', 'fr_thiazole',
    'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'SlogP', 'LabuteASA (#1)', 'AMW', 'ExactMW', 'NumLipinskiHBA', 'NumLipinskiHBD',
    'NumHBD', 'NumAmideBonds', 'NumAtoms', 'NumStereocenters', 'NumUnspecifiedStereocenters', 'NumSaturatedRings (#1)',
    'NumAliphaticRings (#1)', 'NumAromaticHeterocycles (#1)', 'NumSaturatedHeterocycles (#1)', 'NumAromaticCarbocycles (#1)',
    'Chi1v (#1)', 'Chi2v (#1)', 'Chi3v (#1)', 'Chi1n (#1)', 'Chi2n (#1)', 'HallKierAlpha (#1)', 'kappa1', 'kappa2', 'kappa3',
    'slogp_VSA1', 'slogp_VSA3', 'slogp_VSA10', 'slogp_VSA12', 'smr_VSA1', 'smr_VSA2', 'smr_VSA3', 'smr_VSA4', 'smr_VSA6',
    'smr_VSA9', 'smr_VSA10', 'peoe_VSA1', 'peoe_VSA3', 'peoe_VSA4', 'peoe_VSA5', 'peoe_VSA8', 'peoe_VSA9', 'peoe_VSA14',
    'MQN1', 'MQN2', 'MQN3', 'MQN4', 'MQN5', 'MQN6', 'MQN8', 'MQN9', 'MQN10', 'MQN11', 'MQN13', 'MQN14', 'MQN15', 'MQN16',
    'MQN17', 'MQN20', 'MQN21', 'MQN22', 'MQN23', 'MQN24', 'MQN25', 'MQN26', 'MQN27', 'MQN28', 'MQN29', 'MQN30', 'MQN31', 'MQN32',
    'MQN33', 'MQN34', 'MQN35', 'MQN36', 'MQN37', 'MQN38', 'MQN40', 'MQN41', 'MQN42',

    # Z1..Z5 columns for 19 numbered positions (name pattern: Z<k>_<index>_<residue>).
    # NOTE(review): these look like per-residue protein descriptors rather than
    # molecular ones — confirm against the data preparation step.
    'Z1_1_Y95',   'Z2_1_Y95',   'Z3_1_Y95',   'Z4_1_Y95',   'Z5_1_Y95',   'Z1_2_A96',   'Z2_2_A96',   'Z3_2_A96',   'Z4_2_A96',   'Z5_2_A96',
    'Z1_3_D98',   'Z2_3_D98',   'Z3_3_D98',   'Z4_3_D98',   'Z5_3_D98',   'Z1_4_A169',  'Z2_4_A169',  'Z3_4_A169',  'Z4_4_A169',  'Z5_4_A169',
    'Z1_5_I172',  'Z2_5_I172',  'Z3_5_I172',  'Z4_5_I172',  'Z5_5_I172',  'Z1_6_A173',  'Z2_6_A173',  'Z3_6_A173',  'Z4_6_A173',  'Z5_6_A173',
    'Z1_7_Y175',  'Z2_7_Y175',  'Z3_7_Y175',  'Z4_7_Y175',  'Z5_7_Y175',  'Z1_8_F335',  'Z2_8_F335',  'Z3_8_F335',  'Z4_8_F335',  'Z5_8_F335',
    'Z1_9_S336',  'Z2_9_S336',  'Z3_9_S336',  'Z4_9_S336',  'Z5_9_S336',  'Z1_10_L337', 'Z2_10_L337', 'Z3_10_L337', 'Z4_10_L337', 'Z5_10_L337',
    'Z1_11_G338', 'Z2_11_G338', 'Z3_11_G338', 'Z4_11_G338', 'Z5_11_G338', 'Z1_12_F341', 'Z2_12_F341', 'Z3_12_F341', 'Z4_12_F341', 'Z5_12_F341',
    'Z1_13_S438', 'Z2_13_S438', 'Z3_13_S438', 'Z4_13_S438', 'Z5_13_S438', 'Z1_14_T439', 'Z2_14_T439', 'Z3_14_T439', 'Z4_14_T439', 'Z5_14_T439',
    'Z1_15_G442', 'Z2_15_G442', 'Z3_15_G442', 'Z4_15_G442', 'Z5_15_G442', 'Z1_16_L443', 'Z2_16_L443', 'Z3_16_L443', 'Z4_16_L443', 'Z5_16_L443',
    'Z1_17_T497', 'Z2_17_T497', 'Z3_17_T497', 'Z4_17_T497', 'Z5_17_T497', 'Z1_18_G498', 'Z2_18_G498', 'Z3_18_G498', 'Z4_18_G498', 'Z5_18_G498',
    'Z1_19_V501', 'Z2_19_V501', 'Z3_19_V501', 'Z4_19_V501', 'Z5_19_V501',
]

In [6]:
# Build NumPy feature matrices (descriptor columns) and target vectors
# ('pchembl' column) from the "auto" random split.
X_train, y_train = (
    train_random_auto.loc[:, descriptors].values,
    train_random_auto.loc[:, 'pchembl'].values,
)
X_test, y_test = (
    test_random_auto.loc[:, descriptors].values,
    test_random_auto.loc[:, 'pchembl'].values,
)

## Random Forest (RF)

In [8]:
# Look up Random Forest hyperparameters using the training split only
# (the test split is not passed in).
# NOTE(review): find_best_params_rf lives in the local `models` module; its
# search strategy, runtime cost, and return type are not visible here —
# presumably a parameter mapping for train_random_forest; confirm.
rf_params = find_best_params_rf(X_train, y_train)