In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import os
import pickle
import pandas as pd

from src.utils.generate_morgan_fp import generate_fingerprint

In [27]:
# Folders holding the cleaned dataset splits and the trained simple models.
# Defined first so every listing below uses the same locations.
data_set_path='data_split_cleaned'
simple_models_path='Simple_Models'

# CSV files found in the dataset folder.
datasets=[i for i in os.listdir(data_set_path) if i.endswith('.csv')]

# Model sub-folders to process. Notebook checkpoints and the predictions output
# folder are skipped, and only directories are kept, because each entry is
# later listed with os.listdir to find its .pkl files.
model_paths=[
    i for i in os.listdir(simple_models_path)
    if i not in ('.ipynb_checkpoints','Predictions')
    and os.path.isdir(os.path.join(simple_models_path,i))
]

In [28]:
# Show the model folders that will be processed.
model_paths

['sol_del-cluster',
 'HIV-random',
 'bace-random',
 'HIV-cluster',
 'deepchem_Lipophilicity-cluster',
 'tox21-cluster',
 'deepchem_Lipophilicity-random',
 'tox21-random',
 'clintox-cluster',
 'clintox-random',
 'sol_del-random',
 'bace-cluster']

In [29]:
# Column names per dataset, as (dataset, target column, structure column).
# The 'structure' entry is the column whose values are passed to the
# fingerprint generator; 'target' presumably names the label column (it is not
# read in the cells visible here - confirm against the training notebook).
_dataset_columns = (
    ('HIV', 'HIV_active', 'smiles'),
    ('bace', 'active', 'mol'),
    ('tox21', 'NR-AhR', 'smiles'),
    ('clintox', 'CT_TOX', 'smiles'),
    ('sol_del', 'binned_sol', 'smiles'),
    ('deepchem_Lipophilicity', 'drug_like', 'smiles'),
)

data_map = {
    name: {'target': target_col, 'structure': structure_col}
    for name, target_col, structure_col in _dataset_columns
}


In [31]:
# For each model/dataset pair, run the predictions on the train and validate splits using predict_proba.
# Add the predictions to the dataframe and save it to the Simple_Models predictions folder.
# Load the dataset as a dataframe.
# Featurize the molecules.
# Load the model.
# Run predict_proba for each model.
# Save the predictions to the dataframe and write it to the Predictions folder in Simple_Models.

In [32]:
# Score every pickled model on the train and validate splits of its dataset and
# write the probabilities, next to the original rows, to the predictions folder.
predictions_dir=os.path.join(simple_models_path,'Predictions')
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(predictions_dir,exist_ok=True)

for path in model_paths:
    print(path)
    # Folder names have the form '<dataset>-<split_type>'.
    dataset=path.split('-')[0]
    split_type=path.split('-')[1]
    models=[i for i in os.listdir(os.path.join(simple_models_path,path)) if i.endswith('.pkl')]

    # Locate the train and validate CSVs for this dataset/split.
    val_path=os.path.join(data_set_path,f'{dataset}-{split_type}-validate.csv')
    train_path=os.path.join(data_set_path,f'{dataset}-{split_type}-train.csv')

    # Column holding the molecule structure for this dataset.
    smiles_col=data_map[dataset]['structure']
    val_df = pd.read_csv(val_path)
    train_df = pd.read_csv(train_path)

    # Featurize the molecules.
    # NOTE(review): 2 and 1024 are presumably the Morgan radius and bit length -
    # confirm against generate_fingerprint.
    val_df['fp']=val_df[smiles_col].apply(lambda x: generate_fingerprint(x,2,1024))
    train_df['fp']=train_df[smiles_col].apply(lambda x: generate_fingerprint(x,2,1024))
    # Rows with a missing fingerprint cannot be scored, so drop them.
    val_df.dropna(subset=['fp'],inplace=True)
    train_df.dropna(subset=['fp'],inplace=True)

    X_val=val_df['fp'].to_list()
    X_train=train_df['fp'].to_list()

    # Load each model and add one prediction column per model to both frames.
    for model in models:
        model_loc=os.path.join(simple_models_path,path,model)
        model_name=model.replace('_model.pkl','')
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # from a trusted source.
        with open(model_loc,'rb') as file:
            clf = pickle.load(file)
        # The file is closed before predicting; only the loaded model is needed.
        # Column 1 of predict_proba is stored (presumably the positive class -
        # confirm against clf.classes_).
        val_df[f'{model_name}_prediction']=np.asarray(clf.predict_proba(X_val))[:,1]
        train_df[f'{model_name}_prediction']=np.asarray(clf.predict_proba(X_train))[:,1]

    # Save the predictions for this dataset/split. The index is written as well
    # (to_csv default), which keeps the output layout unchanged.
    val_df.to_csv(os.path.join(predictions_dir,f'{dataset}-{split_type}-validate-pred.csv'))
    train_df.to_csv(os.path.join(predictions_dir,f'{dataset}-{split_type}-train-pred.csv'))
                  


sol_del-cluster
HIV-random




bace-random
HIV-cluster




deepchem_Lipophilicity-cluster
tox21-cluster




deepchem_Lipophilicity-random
tox21-random




clintox-cluster
clintox-random
sol_del-random
bace-cluster
