# Modules & Functions

In [2]:
import os

import chemprop
import pandas as pd
from sklearn.model_selection import train_test_split

def train_chemprop(ENDPOINT_S):
    """Train a chemprop regression model for one endpoint.

    The training table is read by chemprop from
    ``./data/ADMET/{ENDPOINT_S}_TRAIN.csv`` (SMILES in the ``SMILES`` column,
    target in ``Resp_MEDIAN``) and everything chemprop saves goes under
    ``./model/{ENDPOINT_S}/``, which is created if it does not exist.

    Parameters
    ----------
    ENDPOINT_S : str
        Endpoint name used to build the data and model paths (e.g. ``"LogP"``).

    Returns
    -------
    tuple of (str, str)
        ``(folder_model, path_y_file_test)``: the model directory and the path
        of the matching ``*_TEST.csv`` file. The test path is only built and
        returned here; it is not read during training.
    """
    path_y_file_train = f"./data/ADMET/{ENDPOINT_S}_TRAIN.csv"
    path_y_file_test = f"./data/ADMET/{ENDPOINT_S}_TEST.csv"
    folder_model = f"./model/{ENDPOINT_S}/"

    # chemprop is pointed at this directory via --save_dir, so make sure it exists.
    os.makedirs(folder_model, exist_ok=True)

    # Command-line style arguments, one flag per line; every value must be a string.
    arguments_train = [
        '--data_path', path_y_file_train,
        '--dataset_type', 'regression',
        '--save_dir', folder_model,
        '--smiles_columns', 'SMILES',
        '--metric', 'rmse',
        '--target_columns', 'Resp_MEDIAN',
        '--ffn_hidden_size', '300',
        '--ffn_num_layers', '2',
        # NOTE(review): 128 data-loading workers is hard-coded — confirm it suits the host.
        '--num_workers', '128',
        '--dropout', '0.2',
        '--batch_size', '64',
        # Applied by chemprop to the TRAIN csv itself (presumably train/val/test — confirm).
        '--split_sizes', '0.9', '0.05', '0.05',
        '--epochs', '100',
    ]

    args = chemprop.args.TrainArgs().parse_args(arguments_train)
    chemprop.train.cross_validate(args=args, train_func=chemprop.train.run_training)

    return folder_model, path_y_file_test


def test_chemprop(ENDPOINT_S, folder_model, path_y_file_test):
    path_from = folder_model + "/fold_0/model_0/model.pt"
    
    arguments_val = [
        '--test_path', path_y_file_test,'--checkpoint_dir', folder_model + "/fold_0/model_0/" , '--smiles_columns', 'SMILES','--num_workers', '128','--preds_path',path_y_file_test.replace(".csv","_Prediction")]
    
    args = chemprop.args.PredictArgs().parse_args(arguments_val)
    pred = chemprop.train.make_predictions(args=args)
    
    y_folder_test = pd.read_csv(path_y_file_test)
    path_val_output = f"./data/ADMET/{ENDPOINT_S}__PREDICTION.csv"
    y_folder_test["Prediction"]   = [i[0] for i in pred]
    y_folder_test["Experimental"] = y_folder_test["Resp_MEDIAN"].tolist()
    y_folder_test.to_csv(path_val_output, index = False)
    
    return(y_folder_test)

# Load Data

In [4]:
# Read one table per endpoint from ./data/ADMET/<endpoint>.csv.
LogP, LogD, Papp, Sapp, hERG = (
    pd.read_csv(f"./data/ADMET/{endpoint_file}.csv")
    for endpoint_file in ("LogP", "LogD", "Papp", "Sapp", "hERG")
)

# Split Data

In [5]:
# Hold out 20% of every endpoint table as a test set; the fixed seed keeps
# the split identical across re-runs.
(
    (train_LogP, test_LogP),
    (train_LogD, test_LogD),
    (train_Papp, test_Papp),
    (train_Sapp, test_Sapp),
    (train_hERG, test_hERG),
) = (
    train_test_split(endpoint_table, test_size=0.2, random_state=42)
    for endpoint_table in (LogP, LogD, Papp, Sapp, hERG)
)

# Write Data

In [6]:
# Persist every split as ./data/ADMET/<endpoint>_<TRAIN|TEST>.csv
# (all TRAIN files first, then all TEST files), without the index column.
for split_tag, frames_by_endpoint in (
    ("TRAIN", {"LogP": train_LogP, "LogD": train_LogD, "Papp": train_Papp,
               "Sapp": train_Sapp, "hERG": train_hERG}),
    ("TEST", {"LogP": test_LogP, "LogD": test_LogD, "Papp": test_Papp,
              "Sapp": test_Sapp, "hERG": test_hERG}),
):
    for endpoint_name, frame in frames_by_endpoint.items():
        frame.to_csv(f"./data/ADMET/{endpoint_name}_{split_tag}.csv", index=False)

# Train Model

- LogP

In [None]:
# Endpoint name: selects ./data/ADMET/LogP_TRAIN.csv / LogP_TEST.csv and ./model/LogP/.
ENDPOINT_S = "LogP"

# Train the model, then predict on the held-out test CSV.
# y_folder_test is the test table with "Prediction" and "Experimental" columns added;
# it is rebound by each endpoint cell, so inspect it before running the next one.
folder_model, path_y_file_test = train_chemprop(ENDPOINT_S)
y_folder_test = test_chemprop(ENDPOINT_S, folder_model, path_y_file_test)

- LogD

In [None]:
# Endpoint name: selects ./data/ADMET/LogD_TRAIN.csv / LogD_TEST.csv and ./model/LogD/.
ENDPOINT_S = "LogD"

# Train the model, then predict on the held-out test CSV.
# y_folder_test is the test table with "Prediction" and "Experimental" columns added;
# it is rebound by each endpoint cell, so inspect it before running the next one.
folder_model, path_y_file_test = train_chemprop(ENDPOINT_S)
y_folder_test = test_chemprop(ENDPOINT_S, folder_model, path_y_file_test)

- Papp

In [None]:
# Endpoint name: selects ./data/ADMET/Papp_TRAIN.csv / Papp_TEST.csv and ./model/Papp/.
ENDPOINT_S = "Papp"

# Train the model, then predict on the held-out test CSV.
# y_folder_test is the test table with "Prediction" and "Experimental" columns added;
# it is rebound by each endpoint cell, so inspect it before running the next one.
folder_model, path_y_file_test = train_chemprop(ENDPOINT_S)
y_folder_test = test_chemprop(ENDPOINT_S, folder_model, path_y_file_test)

- Sapp

In [None]:
# Endpoint name: selects ./data/ADMET/Sapp_TRAIN.csv / Sapp_TEST.csv and ./model/Sapp/.
ENDPOINT_S = "Sapp"

# Train the model, then predict on the held-out test CSV.
# y_folder_test is the test table with "Prediction" and "Experimental" columns added;
# it is rebound by each endpoint cell, so inspect it before running the next one.
folder_model, path_y_file_test = train_chemprop(ENDPOINT_S)
y_folder_test = test_chemprop(ENDPOINT_S, folder_model, path_y_file_test)

- hERG

In [None]:
# Endpoint name: selects ./data/ADMET/hERG_TRAIN.csv / hERG_TEST.csv and ./model/hERG/.
ENDPOINT_S = "hERG"

# Train the model, then predict on the held-out test CSV.
# y_folder_test is the test table with "Prediction" and "Experimental" columns added;
# it is rebound by each endpoint cell, so inspect it before running the next one.
folder_model, path_y_file_test = train_chemprop(ENDPOINT_S)
y_folder_test = test_chemprop(ENDPOINT_S, folder_model, path_y_file_test)