In [1]:
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from MIL_functions import data_splitting,model_building,data_encoding
from tpot import TPOTClassifier
from IPython.display import clear_output
import gc

# misvm is installed straight from GitHub rather than from PyPI, so a missing
# package is reported together with the command needed to install it.
try:
    import misvm
except ImportError:
    # Only a failed import triggers the install hint; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors
    # behind a misleading message.
    print("please use command to install MIL modelling package \n pip install -e git+https://github.com/garydoranjr/misvm.git#egg=misvm")
    quit()

# Aromatic amine models

MACCS keys version

In [2]:
# One entry per data-splitting strategy:
#   name          - label handed to build_test_ml_model as `splitting_name`
#   function      - splitting routine handed to data_splitting.split_data
#   internal_save - results file for the repeated cross-validation runs
#   external_save - results file for the held-out test-set runs
# The paths are raw strings: in a normal literal `\i`, `\M` and `\e` are
# invalid escape sequences (a warning today, a SyntaxError in future Python
# versions). The resulting string values are unchanged.
# NOTE(review): backslash separators are Windows-specific — confirm before
# running on another platform.
input_data = [
    {
        'name': 'random',
        'function': data_splitting.random_split,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_results.pk1',
        'external_save': r'model_results\external\ext_val_results.pk1',
    },
    {
        'name': 'scaffold',
        'function': data_splitting.scaffold_split,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_Scaffold.pk1',
        'external_save': r'model_results\external\ext_val_results_scaffold_stratified.pk1',
    },
    {
        'name': 'LSH',
        'function': data_splitting.LSH,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_LSH.pk1',
        'external_save': r'model_results\external\ext_val_results_LSH_stratified.pk1',
    },
    {
        'name': 'SEC',
        'function': data_splitting.SEC,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_SEC.pk1',
        'external_save': r'model_results\external\ext_val_results_SEC_stratified.pk1',
    },
]


data = data_encoding.load_compressed_pickle("data/encoded/encoded_data.dat")
for splitting_method in input_data:
    ########## Internal Validation
    for encoding in ["MACCS"]:
        training_data, test_data = data_splitting.split_data(data, splitting_method['function'])
        # 10 folds x 10 repeats, seeded so the fold assignment is repeatable.
        rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=6234794)
        for fold, (train_index, validation_index) in enumerate(rskf.split(training_data, training_data["Ames"])):
            train = training_data.iloc[train_index]
            validation = training_data.iloc[validation_index]
            # `fold` counts 0..99 across all repeats: fold % 10 is the fold
            # within the current repeat, fold // 10 is the repeat number.
            model_building.develop_models(
                training_data=train,
                testing_data=validation,
                encoding=encoding,
                suffix={"fold": fold % 10, "iteration": fold // 10},
                save_model=False,
                save_name=splitting_method['internal_save'],
            )
            # Drop the per-fold frames before the next split is materialised.
            del train; del validation; gc.collect()

    ########## External validation
    # training_data / test_data are the split produced in the loop above for
    # this splitting method.
    best_model = ["total_data_NSK_polynomial", misvm.NSK(kernel="polynomial", verbose=False)]
    encoding = "MACCS"
    tpot_model = TPOTClassifier(generations=10, population_size=500, cv=5, verbosity=1, n_jobs=8)
    model_building.build_test_mil_model(
        training_data=training_data,
        testing_data=test_data,
        encoding=encoding,
        suffix={"fold": "", "iteration": ""},
        save_model=False,
        save_name=splitting_method['external_save'],
        model_name=best_model[0],
        MIL=best_model[1],
    )
    model_building.build_test_ml_model(
        training_data=training_data,
        testing_data=test_data,
        encoding=encoding,
        suffix={"fold": "", "iteration": ""},
        save_model=False,
        save_name=splitting_method['external_save'],
        tpot=tpot_model,
        splitting_name=splitting_method['name'],
    )

Already tested   fold:     iteration:     model: TPOT    encoding: MACCS


Morgan version

In [2]:
# One entry per data-splitting strategy (Morgan result files):
#   name          - label handed to build_test_ml_model as `splitting_name`
#   function      - splitting routine handed to data_splitting.split_data
#   internal_save - results file for the repeated cross-validation runs
#   external_save - results file for the held-out test-set runs
# The paths are raw strings: in a normal literal `\i`, `\M` and `\e` are
# invalid escape sequences (a warning today, a SyntaxError in future Python
# versions). The resulting string values are unchanged.
# NOTE(review): backslash separators are Windows-specific — confirm before
# running on another platform.
input_data = [
    {
        'name': 'random',
        'function': data_splitting.random_split,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_results_MORGAN.pk1',
        'external_save': r'model_results\external\ext_val_results_MORGAN.pk1',
    },
    {
        'name': 'scaffold',
        'function': data_splitting.scaffold_split,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_Scaffold_MORGAN.pk1',
        'external_save': r'model_results\external\ext_val_results_scaffold_stratified_MORGAN.pk1',
    },
    {
        'name': 'LSH',
        'function': data_splitting.LSH,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_LSH_MORGAN.pk1',
        'external_save': r'model_results\external\ext_val_results_LSH_stratified_MORGAN.pk1',
    },
    {
        'name': 'SEC',
        'function': data_splitting.SEC,
        'internal_save': r'model_results\internal\MIL_aromatic_amine_cv_SEC_MORGAN.pk1',
        'external_save': r'model_results\external\ext_val_results_SEC_stratified_MORGAN.pk1',
    },
]


data = data_encoding.load_compressed_pickle("data/encoded/encoded_data.dat")
data = model_building.remove_zero_variance(data, encoding='Morgan')
for splitting_method in input_data:
    ########## Internal Validation
    for encoding in ["Morgan"]:
        training_data, test_data = data_splitting.split_data(data, splitting_method['function'])
        # 10 folds x 10 repeats, seeded so the fold assignment is repeatable.
        rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=6234794)
        for fold, (train_index, validation_index) in enumerate(rskf.split(training_data, training_data["Ames"])):
            train = training_data.iloc[train_index]
            validation = training_data.iloc[validation_index]
            # `fold` counts 0..99 across all repeats: fold % 10 is the fold
            # within the current repeat, fold // 10 is the repeat number.
            model_building.develop_models(
                training_data=train,
                testing_data=validation,
                encoding=encoding,
                suffix={"fold": fold % 10, "iteration": fold // 10},
                save_model=False,
                save_name=splitting_method['internal_save'],
            )
            # Drop the per-fold frames before the next split is materialised.
            del train; del validation; gc.collect()

    ########## External validation
    # training_data / test_data are the split produced in the loop above for
    # this splitting method.
    best_model = ["total_data_NSK_polynomial", misvm.NSK(kernel="polynomial", verbose=False)]
    encoding = "Morgan"
    tpot_model = TPOTClassifier(generations=10, population_size=500, cv=5, verbosity=3, n_jobs=-1)
    model_building.build_test_mil_model(
        training_data=training_data,
        testing_data=test_data,
        encoding=encoding,
        suffix={"fold": "", "iteration": ""},
        save_model=False,
        save_name=splitting_method['external_save'],
        model_name=best_model[0],
        MIL=best_model[1],
    )
    model_building.build_test_ml_model(
        training_data=training_data,
        testing_data=test_data,
        encoding=encoding,
        suffix={"fold": "", "iteration": ""},
        save_model=False,
        save_name=splitting_method['external_save'],
        tpot=tpot_model,
        splitting_name=splitting_method['name'],
    )

Building and testing:   fold:     Iteration:     model: TPOT    encoding: Morgan


  from pandas import MultiIndex, Int64Index


32 operators have been imported by TPOT.
Skipped pipeline #137 due to time out. Continuing to the next pipeline.          
_pre_test decorator: _random_mutation_operator: num_test=0 [11:21:18] C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:602: Check failed: mparam_.num_feature != 0 (0 vs. 0) : 0 feature is supplied.  Are you using raw Booster interface?.
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
_pre_test decorator: _random_mutation_operator: num_test=0 manhattan was provided as affinity. Ward can only work with euclidean distances..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
_pre_test decorator: _random_mutation_operator: num_test=0 Found array with 0 feature(s) (shape=(50, 0)) while a minimum of 1 is required..
_pre_test decorator: _random_mutation_operator: num_

# Hansen Models

In [5]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np

In [6]:
def remove_zero_variance(inp, column='Morgan_MIL'):
    """Return a copy of ``inp`` with constant features removed from ``column``.

    Every cell of ``column`` is iterated as a collection of feature vectors.
    All vectors from all rows are pooled, a ``VarianceThreshold(threshold=0)``
    filter is fitted on the pool, and each cell is then replaced by the
    filter's transform of that cell.

    Parameters
    ----------
    inp : pandas.DataFrame
        Frame holding ``column``. It is copied first; the filtered column is
        written to the copy only.
    column : str, default 'Morgan_MIL'
        Name of the column to filter.
        NOTE(review): assumes each cell is a non-empty 2-D collection of
        equal-length numeric vectors — confirm against the encoder.

    Returns
    -------
    pandas.DataFrame
        The copy of ``inp`` with ``column`` replaced by the transformed cells.
    """
    df = inp.copy()
    # Pool the vectors of every row so the variance is judged over the whole
    # data set rather than per row.
    all_data = [instance for bag in df[column].to_list() for instance in bag]
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(all_data)
    df[column] = df[column].apply(lambda bag: constant_filter.transform(bag))
    return df


In [12]:
# Both encodings start from the same pickle, read once per encoding. The
# Morgan copy is additionally passed through clean_data and
# remove_zero_variance.
data = {}
data['MACCS'] = pd.read_pickle("data/encoded/encoded_data_hansen.pk1")
data['Morgan'] = pd.read_pickle("data/encoded/encoded_data_hansen.pk1")
data['Morgan'] = model_building.clean_data(data['Morgan'])
data['Morgan'] = remove_zero_variance(data['Morgan'])

# A single repeat of 10 stratified folds, seeded so the split is repeatable.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=6234794)

for kernel in ['linear', 'polynomial']:
    # One NSK model object per kernel, passed to every encoding and fold below.
    mil = misvm.NSK(kernel=kernel, verbose=False)

    name = f"NSK {kernel}"
    for encoding in ["MACCS", 'Morgan']:
        dataset = data[encoding]
        splits = rskf.split(dataset, dataset["Ames"])
        for fold, (train_index, validation_index) in enumerate(splits):
            train = dataset.iloc[train_index]
            validation = dataset.iloc[validation_index]
            model_building.check_rank(train)
            model_building.check_rank(validation)
            model_building.build_test_mil_model(
                training_data=train,
                testing_data=validation,
                encoding=encoding,
                suffix={"fold": fold % 10, "iteration": fold // 10},
                MIL=mil,
                save_model=False,
                save_name='model_results/hansen/rscv_random_hansen_results.pk1',
                model_name=name,
            )
            # Release the per-fold frames before moving on to the next split.
            del train
            del validation
            gc.collect()

This will fail... 20
This will fail... 4
Already tested   fold: 9    iteration: 0    model: NSK polynomial    encoding: Morgan
