In [None]:
import pandas as pd
import autosklearn.classification
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    MinMaxScaler,
    MaxAbsScaler,
)
from matplotlib import pyplot as plt
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from autogluon.tabular import TabularDataset, TabularPredictor
import time
from IPython.display import display
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [None]:
# random seed reused by the data sampling and the estimators below so runs can be replicated
seed = 42

In [None]:
# Read the train/test CSVs and keep a small seeded random subset of each
train = pd.read_csv("./fashion-mnist_train.csv").sample(n=1000, random_state=seed)
test = pd.read_csv("./fashion-mnist_test.csv").sample(n=100, random_state=seed)
label = "label"

# Split each frame into its feature columns (X) and the target column (y)
X_train = train.drop(columns=[label])
X_test = test.drop(columns=[label])
y_train = train[label]
y_test = test[label]

In [None]:
# shapes of the four splits, printed space-separated on a single line
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
# visualize one example image for every category present in the training sample
target_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
y_labels = y_train.apply(lambda x: target_names[x])
# first row of each category, ordered alphabetically by category name
samples = X_train.loc[y_labels.drop_duplicates().sort_values().index]

# A single 2x5 grid: draw on the axes that plt.subplots returns instead of
# opening a separate (empty) figure and re-addressing panels via plt.subplot.
f, axarr = plt.subplots(2, 5)
for ax in axarr.flat:
    # hide ticks and frames on every panel, including any left unused
    ax.axis("off")
for ax, (idx, row) in zip(axarr.flat, samples.iterrows()):
    # each row is reshaped to a 28x28 grid for display
    pixels = np.array(row, dtype="float").reshape((28, 28))
    ax.imshow(pixels, cmap="gray")
    # put the category name on the image itself rather than printing it separately
    ax.set_title(y_labels.loc[idx], fontsize=8)
f.tight_layout()
plt.show()

# Exhaustive search

1. Define pipeline options
2. Run all pipelines
3. Return the best pipeline

In [None]:
# define pipelines
def mi(X, y):
    """Return mutual-information feature scores for (X, y), computed with the notebook seed.

    Thin wrapper around ``mutual_info_classif`` that fixes ``random_state`` to
    ``seed`` so it can be passed as a ``score_func`` taking only ``(X, y)``.
    """
    scores = mutual_info_classif(X, y, random_state=seed)
    return scores
# candidate scalers (stage 1)
preprocessing_options = [Scaler() for Scaler in (StandardScaler, MinMaxScaler)]

# candidate feature selectors (stage 2): keep the 50 best features under each scoring function
feature_selection_options = [
    SelectKBest(score_func=score_func, k=50) for score_func in (f_classif, mi)
]

# candidate classifiers (stage 3), all seeded identically
model_options = [
    Model(random_state=seed)
    for Model in (MLPClassifier, AdaBoostClassifier, DecisionTreeClassifier, SVC)
]

# step names, in the same order as the option lists above
stages = ["preprocessing", "feature selection", "models"]


def create_pipelines():
    """Build one unfitted Pipeline for every combination of the configured options.

    The cartesian product of ``preprocessing_options``,
    ``feature_selection_options`` and ``model_options`` is taken, and each
    combination becomes a ``Pipeline`` whose steps are named after ``stages``.

    Each step is cloned before it is placed in a pipeline. ``product`` hands
    out the very same option objects for every combination, so without cloning
    all pipelines would share estimator instances and fitting one pipeline
    would overwrite the fitted state of steps held by the others.

    Returns:
        list[Pipeline]: the pipelines, in ``itertools.product`` order.
    """
    # local import so this definition does not depend on another cell's imports
    from sklearn.base import clone

    return [
        Pipeline([(stage, clone(step)) for stage, step in zip(stages, combination)])
        for combination in product(
            preprocessing_options, feature_selection_options, model_options
        )
    ]

In [None]:
# Fit each candidate pipeline in turn and record its accuracy on the test split
results = []
for pp in create_pipelines():
    print(pp)
    pp.fit(X_train, y_train)
    predictions = pp.predict(X_test)
    # one record per pipeline: the text form of every step, followed by the score
    results.append(
        {
            **{key: str(val) for key, val in pp.steps},
            "score": accuracy_score(y_test.tolist(), predictions),
        }
    )
    print("============================================================")
exhaustive_search_result_df = pd.DataFrame(results)
display(exhaustive_search_result_df)

In [None]:
# pipeline with the highest accuracy (the "score" column holds accuracy_score, not r2_score)
print(exhaustive_search_result_df.loc[exhaustive_search_result_df["score"].idxmax()])

## Execute pipelines in parallel using joblib

1. Executing pipelines is an embarrassingly parallel job
2. Pipelines can be executed independently on all available cores
3. We use Python's joblib package for parallel execution

## Further optimizations

Execution time can be further improved by:
1. Using multiple machines
2. Sharing the results of subtasks that are common to several pipelines across cores and machines

In [None]:
from joblib import Parallel, delayed
from tqdm import tqdm


def workerfunc(args):
    """Fit one pipeline and return its accuracy on the test split.

    Args:
        args: a 5-item sequence ``(X_train, y_train, X_test, y_test, pp)``,
            bundled into one argument so a whole task can be handed to a worker.

    Returns:
        The ``accuracy_score`` of the fitted pipeline's predictions for ``X_test``.
    """
    train_features, train_target, test_features, test_target, pipeline = args
    pipeline.fit(train_features, train_target)
    predicted = pipeline.predict(test_features)
    return accuracy_score(test_target, predicted)


# Run every pipeline as its own joblib task, using all available cores (n_jobs=-1)
with Parallel(n_jobs=-1) as parallel:
    # each task carries the data splits plus the pipeline to evaluate (last item)
    tasks = [[X_train, y_train, X_test, y_test, pp] for pp in create_pipelines()]
    results_parallel = parallel(delayed(workerfunc)(task) for task in tqdm(tasks))
    # pair each pipeline with the score returned for it
    exhaustive_search_parallel_result_df = pd.DataFrame(
        list(zip((task[-1] for task in tasks), results_parallel)),
        columns=["pipeline", "score"],
    )
    display(exhaustive_search_parallel_result_df)

In [None]:
# pipeline with the best accuracy found by the parallel exhaustive search
# (the "score" column holds accuracy_score, not r2_score)
print(
    exhaustive_search_parallel_result_df.loc[
        exhaustive_search_parallel_result_df["score"].idxmax()
    ]
)

# Genetic programming
1. TPOT package (https://epistasislab.github.io/tpot/)
2. Initialized with number of generations, population size, maximum time

In [None]:
# tpot genetic algorithms
from tpot import TPOTClassifier

# Genetic-programming search with TPOT, capped at one minute of wall-clock time
tpot = TPOTClassifier(
    generations=15,
    population_size=10,
    verbosity=2,
    random_state=seed,  # use the notebook-wide seed instead of a second hard-coded 42
    max_time_mins=1,
    n_jobs=-1,
)
tpot.fit(X_train, y_train)
predictions = tpot.predict(X_test)
# accuracy of the TPOT predictions on the test split
display(accuracy_score(y_test.tolist(), predictions))

# Bayesian optimization

1. Auto-sklearn uses the same interface as scikit-learn: https://automl.github.io/auto-sklearn/master/#

In [None]:
# bayesian optimization
import autosklearn.classification
# 30-second auto-sklearn search on all cores with no memory cap; seeded like the
# other estimators in this notebook so the run is tied to the shared `seed`
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30,
    memory_limit=None,
    n_jobs=-1,
    seed=seed,
)
automl.fit(X_train, y_train)
predictions = automl.predict(X_test)

In [None]:
# accuracy (not r2_score) of the current `predictions` on the test split,
# followed by the models reported by auto-sklearn
print("accuracy score:", accuracy_score(y_test.tolist(), predictions))
print(automl.show_models())

# AutoGluon (https://auto.gluon.ai/stable/index.html#)

In [None]:
save_path = 'autogluon-fashion-mnist'  # specifies folder to store trained models
label = "label"
time_limit = 60  # for quick demonstration only, you should set this to longest time you are willing to wait (in seconds)
metric = 'accuracy'  # specify your evaluation metric here
# Pass `label` and `metric` through so the settings above actually drive the fit
# (previously `metric` was never used and the label name was repeated as a literal).
predictor = TabularPredictor(label=label, eval_metric=metric, path=save_path).fit(
    train, time_limit=time_limit, presets='best_quality'
)

In [None]:
y_test = test[label]  # values to predict
test_data_nolab = test.drop(columns=[label])  # delete label column to prove we're not cheating
# a bare `.head()` in the middle of a cell is never rendered, so display it explicitly
display(test_data_nolab.head())
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
# compare the predictions with the true labels, including auxiliary metrics
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

In [None]:
# leaderboard of the trained AutoGluon models for the `test` frame (label column included);
# as the cell's last expression, the returned table is what gets rendered
predictor.leaderboard(test, silent=True)

# Conclusions
1. AutoML tools automate the search for optimal ML pipelines
2. Tools differ in the way they explore the space of potential pipelines