Dokumentacja TPOT: https://epistasislab.github.io/tpot/

Przykładowe konfiguracje: https://github.com/EpistasisLab/tpot/tree/master/tpot/config

%%bash
# --yes pre-confirms the install: a %%bash cell is non-interactive, so
# conda's "Proceed ([y]/n)?" prompt could never be answered from here.
conda install --yes -c conda-forge tpot

# Przygotowanie danych

In [11]:
from tpot import TPOTClassifier

import pandas as pd
import numpy as np

In [12]:
# Feature matrices: the "index" column of each CSV becomes the row index.
X_train = pd.read_csv("../output/X_train.csv", index_col="index")
X_test = pd.read_csv("../output/X_test.csv", index_col="index")

# Label files are read with explicitly supplied column names ("index", "klasa");
# "index" again becomes the row index, leaving the single label column "klasa".
opcje_etykiet = {"names": ["index", "klasa"], "index_col": "index"}
y_train = pd.read_csv("../output/y_train.csv", **opcje_etykiet)
y_test = pd.read_csv("../output/y_test.csv", **opcje_etykiet)

In [13]:
# Encode the class labels as integers: "Ł" -> 0, "Z" -> 1.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the selected column: that form is chained
# assignment, which pandas warns about and which leaves the DataFrame
# untouched once Copy-on-Write is active.
# The explicit astype makes the integer dtype deliberate (it does not depend on
# replace() downcasting) and fails loudly if any label other than "Ł"/"Z"
# remains. Re-running the cell is a no-op.
KODY_KLAS = {"Ł": 0, "Z": 1}

y_train["klasa"] = y_train["klasa"].replace(KODY_KLAS).astype("int64")
y_test["klasa"] = y_test["klasa"].replace(KODY_KLAS).astype("int64")

# Użycie znanych modeli

In [4]:
# Custom TPOT search space: maps the import path of each estimator to the
# hyper-parameter values that may be tried for it. Only a decision tree and
# a random forest are listed here.
konfiguracja_tpot = {
    'sklearn.tree.DecisionTreeClassifier': dict(
        criterion=["gini", "entropy"],
        max_depth=range(1, 11),
        min_samples_split=range(2, 21),
        min_samples_leaf=range(1, 21),
    ),
    'sklearn.ensemble.RandomForestClassifier': dict(
        n_estimators=[100],
        criterion=["gini", "entropy"],
        # 20 fractions from 0.05 up to 1.0 in steps of 0.05
        max_features=np.arange(0.05, 1.01, 0.05),
        min_samples_split=range(2, 21),
        min_samples_leaf=range(1, 21),
        bootstrap=[True, False],
    ),
}

In [5]:
# TPOT classifier limited to the search space in konfiguracja_tpot;
# random_state is fixed so the run can be repeated.
ustawienia_tpot = dict(
    config_dict=konfiguracja_tpot,
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)
klasyfikator = TPOTClassifier(**ustawienia_tpot)

In [6]:
# Flatten the one-column label frame into a 1-D array before fitting.
etykiety_train = y_train.values.ravel()
klasyfikator.fit(features=X_train, target=etykiety_train)

Optimization Progress:  33%|███▎      | 100/300 [04:46<16:59,  5.10s/pipeline]

Generation 1 - Current best internal CV score: 0.9626373626373628


Optimization Progress:  50%|█████     | 150/300 [08:35<07:44,  3.10s/pipeline]

Generation 2 - Current best internal CV score: 0.9626373626373628


Optimization Progress:  67%|██████▋   | 200/300 [12:58<12:24,  7.45s/pipeline]

Generation 3 - Current best internal CV score: 0.9626373626373628


Optimization Progress:  83%|████████▎ | 250/300 [18:17<06:03,  7.26s/pipeline]

Generation 4 - Current best internal CV score: 0.964835164835165


                                                                              

Generation 5 - Current best internal CV score: 0.9692307692307693

Best pipeline: RandomForestClassifier(CombineDFs(input_matrix, input_matrix), bootstrap=False, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=2, min_samples_split=18, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.tree.DecisionTreeClassifier': {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)}, 'sklearn.ensemble.RandomForestClassifier': {'n_estimators': [100], 'criterion': ['gini', 'entropy'], 'max_features':...), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21), 'bootstrap': [True, False]}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=50, periodic_checkpoint_folder=None,
        population_size=50, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

![Proces TPOT](../input/tpot-ml-pipeline.png)

# "Szybka" konfiguracja

In [7]:
# Second search, this time with the built-in "TPOT light" configuration.
# early_stop=10 ends the run once 10 generations in a row bring no
# improvement, so the 100-generation budget is only an upper bound.
ustawienia_light = dict(
    config_dict="TPOT light",
    generations=100,
    population_size=100,
    verbosity=2,
    random_state=42,
    early_stop=10,
)
klasyfikator = TPOTClassifier(**ustawienia_light)
klasyfikator.fit(features=X_train, target=y_train.values.ravel())

Optimization Progress:   2%|▏         | 200/10100 [00:20<15:07, 10.91pipeline/s]  

Generation 1 - Current best internal CV score: 0.9758241758241759


Optimization Progress:   3%|▎         | 300/10100 [00:37<11:39, 14.02pipeline/s]  

Generation 2 - Current best internal CV score: 0.9758241758241759


Optimization Progress:   4%|▍         | 400/10100 [00:53<13:59, 11.56pipeline/s]  

Generation 3 - Current best internal CV score: 0.9758241758241759


Optimization Progress:   5%|▍         | 500/10100 [01:28<58:20,  2.74pipeline/s]  

Generation 4 - Current best internal CV score: 0.9780219780219781


Optimization Progress:   6%|▌         | 600/10100 [02:07<42:41,  3.71pipeline/s]  

Generation 5 - Current best internal CV score: 0.9780219780219781


Optimization Progress:   7%|▋         | 700/10100 [02:52<1:39:00,  1.58pipeline/s]

Generation 6 - Current best internal CV score: 0.9780219780219781


Optimization Progress:   8%|▊         | 800/10100 [03:17<16:40,  9.29pipeline/s]  

Generation 7 - Current best internal CV score: 0.9802197802197803


Optimization Progress:   9%|▉         | 900/10100 [03:41<12:09, 12.61pipeline/s]  

Generation 8 - Current best internal CV score: 0.9802197802197803


Optimization Progress:  10%|▉         | 1000/10100 [04:19<23:59,  6.32pipeline/s] 

Generation 9 - Current best internal CV score: 0.9802197802197803


Optimization Progress:  11%|█         | 1100/10100 [05:00<43:00,  3.49pipeline/s]  

Generation 10 - Current best internal CV score: 0.9802197802197803


Optimization Progress:  12%|█▏        | 1200/10100 [05:27<25:00,  5.93pipeline/s]  

Generation 11 - Current best internal CV score: 0.9802197802197803


Optimization Progress:  13%|█▎        | 1300/10100 [05:57<15:51,  9.25pipeline/s]  

Generation 12 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  14%|█▍        | 1400/10100 [06:28<1:31:49,  1.58pipeline/s]

Generation 13 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  15%|█▍        | 1500/10100 [07:05<36:37,  3.91pipeline/s]  

Generation 14 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  16%|█▌        | 1600/10100 [07:45<1:49:57,  1.29pipeline/s]

Generation 15 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  17%|█▋        | 1700/10100 [08:44<6:27:08,  2.77s/pipeline]

Generation 16 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  18%|█▊        | 1800/10100 [09:33<52:11,  2.65pipeline/s]  

Generation 17 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  19%|█▉        | 1900/10100 [10:19<27:19,  5.00pipeline/s]  

Generation 18 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  20%|█▉        | 2000/10100 [10:58<3:52:12,  1.72s/pipeline]

Generation 19 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  21%|██        | 2100/10100 [12:09<1:16:13,  1.75pipeline/s]

Generation 20 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  22%|██▏       | 2200/10100 [12:57<37:14,  3.54pipeline/s]  

Generation 21 - Current best internal CV score: 0.9824175824175825


Optimization Progress:  23%|██▎       | 2300/10100 [13:33<51:37,  2.52pipeline/s]  

Generation 22 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  24%|██▍       | 2400/10100 [14:19<2:14:56,  1.05s/pipeline]

Generation 23 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  25%|██▍       | 2500/10100 [15:43<24:42,  5.13pipeline/s]  

Generation 24 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  26%|██▌       | 2600/10100 [16:37<21:22,  5.85pipeline/s]  

Generation 25 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  27%|██▋       | 2700/10100 [17:41<38:59,  3.16pipeline/s]  

Generation 26 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  28%|██▊       | 2800/10100 [19:02<1:00:31,  2.01pipeline/s]

Generation 27 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  29%|██▊       | 2900/10100 [21:24<1:48:28,  1.11pipeline/s]

Generation 28 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  30%|██▉       | 3000/10100 [22:39<37:30,  3.15pipeline/s]  

Generation 29 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  31%|███       | 3100/10100 [24:05<2:26:53,  1.26s/pipeline]

Generation 30 - Current best internal CV score: 0.9846153846153847


Optimization Progress:  32%|███▏      | 3200/10100 [25:25<3:11:07,  1.66s/pipeline]

Generation 31 - Current best internal CV score: 0.9846153846153847


                                                                                   

Generation 32 - Current best internal CV score: 0.9846153846153847

The optimized pipeline was not improved after evaluating 10 more generations. Will end the optimization process.

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: LogisticRegression(StandardScaler(LogisticRegression(input_matrix, C=0.0001, dual=False, penalty=l1)), C=0.1, dual=False, penalty=l2)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT...e_selection.VarianceThreshold': {'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=10, generations=100, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=100, periodic_checkpoint_folder=None,
        population_size=100, random_state=42, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)

In [16]:
# Score the best pipeline on the held-out test set.
# y_test is a one-column DataFrame; flatten it to a 1-D array so the labels
# have the same form as the target passed to fit().
klasyfikator.score(testing_features = X_test, testing_target = y_test.values.ravel())

0.9824561403508771

In [14]:
# Write the best pipeline found by TPOT out as a standalone Python script.
sciezka_eksportu = "../output/tpot.py"
klasyfikator.export(sciezka_eksportu)

True

In [15]:
%%bash
# Print the script that klasyfikator.export() wrote, to inspect the pipeline.
cat ../output/tpot.py

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9846153846153847
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LogisticRegression(C=0.0001, dual=False, penalty="l1")),
    StandardScaler(),
    LogisticRegression(C=0.1, dual=False, penalty="l2")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline