In [11]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    fbeta_score,
    roc_auc_score,
    average_precision_score,
    log_loss,
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    roc_curve,
)
from tpot import TPOTClassifier

In [2]:
SEED = 1990
MODEL_REPO = 'models/'

### Read Data

In [3]:
with open('classification_data.pkl', 'rb') as f:
    data = pickle.load(f)
data.keys()

dict_keys(['dataset', 'X_train', 'X_test', 'y_train', 'y_test', 'train', 'test'])

In [4]:
X_train, X_test, y_train, y_test = data['X_train'], data['X_test'], data['y_train'], data['y_test']

### Prepare Dataset

In [16]:
with open(f'{MODEL_REPO}prepro.pkl', 'rb') as f:
    prepro = pickle.load(f)

In [17]:
prepro

In [18]:
X_train_prp = prepro.fit_transform(X_train,y_train)
X_train_prp.shape

(750, 7)

### TPOT Classifier

* [TPOT default params grid](https://github.com/EpistasisLab/tpot/blob/master/tpot/config/classifier.py)

In [26]:
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
    "f2": make_scorer(fbeta_score, beta=2),
    "f0.5": make_scorer(fbeta_score, beta=0.5),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "roc_auc": make_scorer(roc_auc_score),
    "pr_auc": make_scorer(average_precision_score),
    "log_loss": make_scorer(log_loss),
}

In [40]:
tpot = TPOTClassifier(
    generations=100,
    population_size=100,
    verbosity=2,
    max_time_mins=5,
    max_eval_time_mins=0.04,
    offspring_size=None,
    mutation_rate=0.9,
    crossover_rate=0.1,
    scoring=scoring,
    cv=5,
    subsample=1.0,
    n_jobs=1,
    random_state=SEED
)

In [43]:
tpot.fit(X_train_prp, y_train)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8853333333333333

Generation 2 - Current best internal CV score: 0.8853333333333333

Generation 3 - Current best internal CV score: 0.8866666666666667

Generation 4 - Current best internal CV score: 0.8866666666666667

Generation 5 - Current best internal CV score: 0.8866666666666667

5.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: ExtraTreesClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=True, criterion=entropy, max_features=0.9500000000000001, min_samples_leaf=8, min_samples_split=5, n_estimators=100)


In [52]:
[x for x in dir(tpot) if not x.startswith('_')]

['arguments',
 'classification',
 'clean_pipeline_string',
 'config_dict',
 'crossover_rate',
 'cv',
 'default_config_dict',
 'disable_update_check',
 'early_stop',
 'evaluated_individuals_',
 'export',
 'fit',
 'fit_predict',
 'fitted_pipeline_',
 'generations',
 'get_params',
 'log_file',
 'log_file_',
 'max_eval_time_mins',
 'max_time_mins',
 'memory',
 'mutation_rate',
 'n_jobs',
 'offspring_size',
 'op_list',
 'operators',
 'operators_context',
 'pareto_front_fitted_pipelines_',
 'periodic_checkpoint_folder',
 'population_size',
 'predict',
 'predict_proba',
 'pretest_X',
 'pretest_y',
 'random_state',
 'regression',
 'ret_types',
 'score',
 'scoring',
 'scoring_function',
 'set_params',
 'subsample',
 'template',
 'tree_structure',
 'use_dask',
 'verbosity',
 'warm_start']

In [44]:
tpot.fitted_pipeline_

In [45]:
tpot.fitted_pipeline_.steps

[('featureunion',
  FeatureUnion(transformer_list=[('functiontransformer-1',
                                  FunctionTransformer(func=<function copy at 0x1040b5e50>)),
                                 ('functiontransformer-2',
                                  FunctionTransformer(func=<function copy at 0x1040b5e50>))])),
 ('extratreesclassifier',
  ExtraTreesClassifier(bootstrap=True, criterion='entropy',
                       max_features=0.9500000000000001, min_samples_leaf=8,
                       min_samples_split=5, random_state=1990))]

In [46]:
tpot.pareto_front_fitted_pipelines_

{'ExtraTreesClassifier(CombineDFs(input_matrix, input_matrix), ExtraTreesClassifier__bootstrap=True, ExtraTreesClassifier__criterion=entropy, ExtraTreesClassifier__max_features=0.9500000000000001, ExtraTreesClassifier__min_samples_leaf=8, ExtraTreesClassifier__min_samples_split=5, ExtraTreesClassifier__n_estimators=100)': Pipeline(steps=[('featureunion',
                  FeatureUnion(transformer_list=[('functiontransformer-1',
                                                  FunctionTransformer(func=<function copy at 0x1040b5e50>)),
                                                 ('functiontransformer-2',
                                                  FunctionTransformer(func=<function copy at 0x1040b5e50>))])),
                 ('extratreesclassifier',
                  ExtraTreesClassifier(bootstrap=True, criterion='entropy',
                                       max_features=0.9500000000000001,
                                       min_samples_leaf=8, min_samples_split=5,
  

In [47]:
tpot.evaluated_individuals_

{'MLPClassifier(BernoulliNB(input_matrix, BernoulliNB__alpha=0.1, BernoulliNB__fit_prior=False), MLPClassifier__alpha=0.0001, MLPClassifier__learning_rate_init=0.001)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': 0.8853333333333333},
 'MLPClassifier(RobustScaler(input_matrix), MLPClassifier__alpha=0.01, MLPClassifier__learning_rate_init=0.1)': {'generation': 0,
  'mutation_count': 0,
  'crossover_count': 0,
  'predecessor': ('ROOT',),
  'operator_count': 2,
  'internal_cv_score': 0.8559999999999999},
 'GradientBoostingClassifier(OneHotEncoder(input_matrix, OneHotEncoder__minimum_fraction=0.1, OneHotEncoder__sparse=False, OneHotEncoder__threshold=10), GradientBoostingClassifier__learning_rate=0.01, GradientBoostingClassifier__max_depth=7, GradientBoostingClassifier__max_features=0.9500000000000001, GradientBoostingClassifier__min_samples_leaf=11, GradientBoostingClassifier__min_samples_split=1

In [48]:
tpot.export('tpot_pipeline.py')

In [None]:
# %load tpot_pipeline.py
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=1990)

# Average CV score on the training set was: 0.8866666666666667
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    ExtraTreesClassifier(bootstrap=True, criterion="entropy", max_features=0.9500000000000001, min_samples_leaf=8, min_samples_split=5, n_estimators=100)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1990)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
