## This notebook combines and collects the training and test features for the hierarchical models

In [1]:
from utils import * 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import itertools
from pprint import pprint
import joblib

import statistics

# Models
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score, RandomizedSearchCV 
from sklearn.model_selection import cross_val_predict

from sklearn.pipeline import Pipeline

from sklearn.metrics import make_scorer

# regression metrics
from sklearn.metrics import mean_absolute_error , mean_squared_error, r2_score

#classification metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, matthews_corrcoef

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.base import clone
from sklearn.model_selection._split import check_cv

## Hierarchical Features for Training Set

In [11]:
import os

def Hfeature_collector(Hfiles_path):
    '''
    Combine the per-model prediction CSV files of a folder into one table.

    Hfiles_path: path to the folder holding the hierarchical feature files.
        A trailing path separator is optional.

    Returns a DataFrame indexed by CASRN. Its columns are the selected
    columns of every recognised file, taken in sorted file-name order.
    An empty DataFrame (index named 'CASRN') is returned when no file
    is recognised.

    Files are recognised by the part of their name before the first '_':
      * 'LD50'  - every column of the file is kept.
      * 'Toxic' - only the third CSV column is kept (the predicted prob
                  for the positive class).
      * 'EPA'   - the second to fourth CSV columns are kept (the predicted
                  probs for the first three classes).
    Any other entry is reported on stdout and skipped.

    This function is specific for this dataset,
    one need to revise it accordingly for other datasets.
    '''
    # Positional CSV columns to load per file-name prefix (column 0 is CASRN).
    # None means "load every column".
    usecols_by_prefix = {
        'LD50': None,
        'Toxic': [0, 2],
        'EPA': [0, 1, 2, 3],
    }

    Hfeature_files = os.listdir(Hfiles_path) # find all files
    Hfeature_files.sort() # sort the list by name
    print(f'{len(Hfeature_files)} files found')

    # Collect the frames first and concatenate once: growing a DataFrame
    # inside the loop re-copies all previous columns on every iteration.
    frames = []
    for f in Hfeature_files:
        prefix = f.split('_')[0]
        if prefix not in usecols_by_prefix:
            print(f'File name *{f}* may not a feature file and skipped')
            continue
        # os.path.join works whether or not Hfiles_path ends with a separator.
        df_f = pd.read_csv(
            os.path.join(Hfiles_path, f),
            usecols=usecols_by_prefix[prefix],
            index_col='CASRN',
        )
        frames.append(df_f)

    if frames:
        Hfeature_df = pd.concat(frames, axis=1, sort=False)
    else:
        # pd.concat raises on an empty list; keep the original empty result.
        Hfeature_df = pd.DataFrame()

    # set name of the index
    Hfeature_df.index.name = 'CASRN'
    return Hfeature_df

In [3]:
# Folder scanned for the training-set hierarchical feature files.
Hfeature_path = '../data/Hmodel_features/'
# Merge every recognised file in that folder into one table indexed by CASRN.
train_Hfeatures = Hfeature_collector(Hfeature_path)
# Display (n_rows, n_columns) of the combined table as a quick sanity check.
train_Hfeatures.shape

60 files found


(8221, 100)

In [4]:
# Show only the first row to inspect the column names and value ranges.
train_Hfeatures.head(1)

Unnamed: 0_level_0,EPA_RF_ecfp6bits-1,EPA_RF_ecfp6bits-2,EPA_RF_ecfp6bits-3,EPA_RF_ecfp6counts-1,EPA_RF_ecfp6counts-2,EPA_RF_ecfp6counts-3,EPA_RF_maccs-1,EPA_RF_maccs-2,EPA_RF_maccs-3,EPA_RF_mordred-1,...,Toxic_svm_ecfp6bits-1,Toxic_svm_ecfp6counts-1,Toxic_svm_maccs-1,Toxic_svm_mordred-1,Toxic_svm_rdkit2d-1,Toxic_xgboost_ecfp6bits-1,Toxic_xgboost_ecfp6counts-1,Toxic_xgboost_maccs-1,Toxic_xgboost_mordred-1,Toxic_xgboost_rdkit2d-1
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23233-88-7,0.133902,0.240154,0.502604,0.125821,0.246357,0.522198,0.104243,0.165346,0.515884,0.081167,...,0.582366,0.682633,0.275429,0.816182,0.489093,0.668234,0.759473,0.36906,0.470704,0.472904


In [6]:
# Persist the combined training features (CASRN index is written as the first column).
train_Hfeatures.to_csv('../data/Hmodel_features_combined/train_Hfeatures.csv')

## Hierarchical Features for Test Set

We use all the base models to make predictions on the test set and combine them together.

In [2]:
# Load the five test-set descriptor tables, each indexed by CASRN.
# The paths are listed in the same order as the names they are unpacked into.
(test_ecfp6bits,
 test_ecfp6counts,
 test_maccs,
 test_rdkit2d,
 test_mordred) = (
    pd.read_csv(csv_path, index_col='CASRN')
    for csv_path in (
        '../data/Bmodel_features/modeling_test_ecfp6_bits.csv',
        '../data/Bmodel_features/modeling_test_ecfp6_counts.csv',
        '../data/Bmodel_features/modeling_test_maccs.csv',
        '../data/Bmodel_features/modeling_test_rdkit2d.csv',
        '../data/Bmodel_features/modeling_test_mordred.csv',
    )
)

In [3]:
# Map each descriptor name to its test-set feature table.
feature_dict = dict(zip(
    ['ecfp6bits', 'ecfp6counts', 'maccs', 'rdkit2d', 'mordred'],
    [test_ecfp6bits, test_ecfp6counts, test_maccs, test_rdkit2d, test_mordred],
))

In [4]:
# Name components of the base models: each model/file is named
# '<endpoint>_<algorithm>_<descriptor>', giving 3 x 4 x 5 = 60 combinations.
endpoints = ['Toxic', 'EPA', 'LD50']
# Descriptor names; these are the keys of `feature_dict`.
descriptors = ['ecfp6bits', 'ecfp6counts', 'maccs', 'rdkit2d', 'mordred']
algorithms = ['knn', 'svm', 'RF', 'xgboost']

Loop over all the base models (60 in total) and make predictions on the test set data. The predictions are saved as a CSV file for each base model. The file names are the same as those of the training hierarchical features. All the results are saved in the `Hmodel_features_test` folder.

In [6]:
%%time

# Retained so that any other cell referencing `index` keeps working; the loop
# below labels rows with the index of the descriptor table actually used.
index = test_ecfp6bits.index

# Column-name suffixes of the predict_proba output for each classification
# endpoint ('Toxic' columns are numbered from 0, 'EPA' columns from 1).
proba_suffixes = {
    'Toxic': ['0', '1'],
    'EPA': ['1', '2', '3', '4'],
}

for e in endpoints:
    for d in descriptors:
        feature_table = feature_dict[d]
        # Converted once per descriptor instead of once per algorithm.
        feature = feature_table.values.astype('float32')
        # Label predictions with the rows they were computed from, so a
        # descriptor table in a different row order cannot be mislabelled.
        row_index = feature_table.index

        for a in algorithms:
            name = f'{e}_{a}_{d}'
            print(f'{name}: computing....')
            # NOTE: joblib.load unpickles the file and can execute arbitrary
            # code; only load model files from a trusted source.
            model = joblib.load(f'../models/Base_models/{name}.pkl')

            if e in proba_suffixes:
                # Classification endpoints: one probability column per class.
                predictions = model.predict_proba(feature)
                columns = [f'{name}-{suffix}' for suffix in proba_suffixes[e]]
            elif e == 'LD50':
                # Regression endpoint: a single predicted value per compound.
                predictions = model.predict(feature)
                columns = [f'{name}']
            else:
                # Fail loudly instead of silently writing nothing.
                raise ValueError(f'Unknown endpoint: {e}')

            df = pd.DataFrame(predictions, columns=columns, index=row_index)
            df.to_csv(f'../data/Hmodel_features_test/{name}.csv')
            print(f'{name}: saved')

Toxic_knn_ecfp6bits: computing....
Toxic_knn_ecfp6bits: saved
Toxic_svm_ecfp6bits: computing....
Toxic_svm_ecfp6bits: saved
Toxic_RF_ecfp6bits: computing....
Toxic_RF_ecfp6bits: saved
Toxic_xgboost_ecfp6bits: computing....
Toxic_xgboost_ecfp6bits: saved
Toxic_knn_ecfp6counts: computing....
Toxic_knn_ecfp6counts: saved
Toxic_svm_ecfp6counts: computing....
Toxic_svm_ecfp6counts: saved
Toxic_RF_ecfp6counts: computing....
Toxic_RF_ecfp6counts: saved
Toxic_xgboost_ecfp6counts: computing....
Toxic_xgboost_ecfp6counts: saved
Toxic_knn_maccs: computing....
Toxic_knn_maccs: saved
Toxic_svm_maccs: computing....
Toxic_svm_maccs: saved
Toxic_RF_maccs: computing....
Toxic_RF_maccs: saved
Toxic_xgboost_maccs: computing....
Toxic_xgboost_maccs: saved
Toxic_knn_rdkit2d: computing....
Toxic_knn_rdkit2d: saved
Toxic_svm_rdkit2d: computing....
Toxic_svm_rdkit2d: saved
Toxic_RF_rdkit2d: computing....
Toxic_RF_rdkit2d: saved
Toxic_xgboost_rdkit2d: computing....
Toxic_xgboost_rdkit2d: saved
Toxic_knn_mordre

In [12]:
# Folder the per-model test-set prediction CSVs were written to.
Hfeature_test_path = '../data/Hmodel_features_test/'
# Merge every recognised file in that folder into one table indexed by CASRN.
test_Hfeatures = Hfeature_collector(Hfeature_test_path)
# Display (n_rows, n_columns) of the combined table as a quick sanity check.
test_Hfeatures.shape

61 files found
File name *.ipynb_checkpoints* may not a feature file and skipped


(2849, 100)

In [13]:
# Show only the first row to inspect the column names and value ranges.
test_Hfeatures.head(1)

Unnamed: 0_level_0,EPA_RF_ecfp6bits-1,EPA_RF_ecfp6bits-2,EPA_RF_ecfp6bits-3,EPA_RF_ecfp6counts-1,EPA_RF_ecfp6counts-2,EPA_RF_ecfp6counts-3,EPA_RF_maccs-1,EPA_RF_maccs-2,EPA_RF_maccs-3,EPA_RF_mordred-1,...,Toxic_svm_ecfp6bits-1,Toxic_svm_ecfp6counts-1,Toxic_svm_maccs-1,Toxic_svm_mordred-1,Toxic_svm_rdkit2d-1,Toxic_xgboost_ecfp6bits-1,Toxic_xgboost_ecfp6counts-1,Toxic_xgboost_maccs-1,Toxic_xgboost_mordred-1,Toxic_xgboost_rdkit2d-1
CASRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130209-82-4,0.045211,0.198811,0.440143,0.04315,0.164965,0.440232,0.112043,0.195739,0.440197,0.111167,...,0.552823,0.200557,0.526833,0.548591,0.258444,0.215411,0.142432,0.530601,0.638196,0.493193


In [14]:
# Reload the combined training features from disk, indexed by CASRN.
train_Hfeatures = pd.read_csv('../data/Hmodel_features_combined/train_Hfeatures.csv', index_col = 'CASRN')

In [15]:
# Make sure the train and test tables have the same columns in the same order.
# Asserted (not just displayed) so a mismatch stops a Run-All execution
# instead of letting later cells run on mismatched tables.
columns_match = list(train_Hfeatures) == list(test_Hfeatures)
assert columns_match, 'train and test hierarchical feature columns differ'
columns_match

True

In [16]:
# Persist the combined test features (CASRN index is written as the first column).
test_Hfeatures.to_csv('../data/Hmodel_features_combined/test_Hfeatures.csv')