# INCLUDE
Instructions for Colab users:
- Run the first cell to install auto-sklearn on Colab. It restarts the runtime automatically; afterwards, run the remaining cells.
- Run the second cell to mount Google Drive, which is used for dataset and result I/O. In this notebook, datasets can be loaded from pickle files and results can be dumped to pickle files.

Instructions for non-Colab users:
- Skip the first two cells.
- Change the paths in the "Set paths" section to match your machine.


## Run for colab

In [None]:
# For Colab, you need to install auto-sklearn every time
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    !pip install auto-sklearn # Downgrade scipy to 1.4.x
    #!pip install scipy # Upgrade scipy to 1.7.x

import os, signal
os.kill(os.getpid(), signal.SIGKILL) # Restart_runtime

Collecting auto-sklearn
  Downloading auto-sklearn-0.14.5.tar.gz (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 4.4 MB/s 
Collecting distro
  Downloading distro-1.6.0-py2.py3-none-any.whl (19 kB)
Collecting scipy>=1.7.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 58.8 MB/s 
Collecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.6 MB/s 
[?25hCollecting dask>=2021.12
  Downloading dask-2022.1.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.6 MB/s 
[?25hCollecting distributed>=2012.12
  Downloading distributed-2022.1.1-py3-none-any.whl (830 kB)
[K     |████████████████████████████████| 830 kB 41.3 MB/s 
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Collecting ConfigSpace<0.5,>=0.4.14
  Downloading ConfigSpa

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules

# Mount Google Drive only when running on Colab; elsewhere the
# `google.colab` package does not exist and importing it would fail.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

Mounted at /content/drive


## Run for everyone

In [2]:
# Dependencies
# Common
import os
import pickle
import sys
import time
from tqdm.autonotebook import tqdm
from datetime import datetime

import numpy as np
import pandas as pd

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# ML
import sklearn # Import sklearn before autosklearn, solve scipy version error
from sklearn.model_selection import train_test_split
import sklearn.datasets
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# AML
import autosklearn
import autosklearn.classification
from autosklearn.metrics import balanced_accuracy, precision, recall

# Wilcoxon test
from scipy.stats import wilcoxon

# Check machine
#import multiprocessing
#multiprocessing.cpu_count()

  import sys


## Set paths

In [3]:
# Set up paths

# Non-Colab users: point `project_folder` at your local project directory;
# the dataset and result folders below are derived from it.
project_folder = '/content/drive/My Drive/Colab Notebooks/course_AML_proj'

# Folders that store the pickles.
# Dataset pickles are named in the format openml_xxx.pkl
datasets_folder = os.path.join(project_folder, 'data')
results_folder = os.path.join(project_folder, 'results')

In [4]:
# List the entries of the results directory to see which result pickles
# (and sub-folders) are available before loading anything.
os.listdir(results_folder)

['openml_meta_features.pkl',
 'experiment_20220119-174007_ensemble_size_50.pkl',
 'experiment_20220119-192815_ensemble_size_50.pkl',
 'experiment_20220119-201052_ensemble_size_50.pkl',
 'experiment_20220119-205603_ensemble_size_50.pkl',
 'experiment_20220119-220334_ensemble_size_50.pkl',
 'experiment_20220119-225840_ensemble_size_50.pkl',
 'experiment_20220119-235856_ensemble_size_50.pkl',
 'experiment_20220120-010129_ensemble_size_50.pkl',
 'experiment_20220120-021706_ensemble_size_50.pkl',
 'experiment_20220120-090232_ensemble_size_50.pkl',
 'experiment_20220120-095056_ensemble_size_50.pkl',
 'experiment_20220120-194311_ensemble_size_50.pkl',
 'unknown',
 'luuk',
 'experiment_20220128-102503_ensemble_size_100.pkl',
 'experiment_20220128-115856_ensemble_size_100.pkl',
 'experiment_20220128-154520_ensemble_size_100.pkl',
 'experiment_20220128-172015_ensemble_size_100.pkl',
 'experiment_20220203-170623_ensemble_size_100.pkl']

# Functions

In [5]:
def Load_result_pkl(folder_path, result_name):
    """Load one pickled result file and return its content.

    Parameters
    ----------
    folder_path : str
        Directory that contains the pickle.
    result_name : str
        File name of the pickle inside `folder_path`.

    Returns
    -------
    object
        Whatever object was stored in the pickle.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    pickle.UnpicklingError
        If the file content is not a valid pickle.
    """
    fname = os.path.join(folder_path, result_name)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even when unpickling fails.
    with open(fname, "rb") as file_read:
        return pickle.load(file_read)

def Prep_res_df(res, meta_features_df):
    """Summarise per-dataset accuracies and attach the dataset meta features.

    Parameters
    ----------
    res : dict
        Maps a dataset key to a dict whose 'acc' entry holds the accuracy
        values for that dataset.
    meta_features_df : pandas.DataFrame
        Meta-feature table with a 'dataset' column to join on.

    Returns
    -------
    pandas.DataFrame
        One row per dataset present in both inputs (inner join), with the
        columns 'dataset', 'mean_acc', 'sd_acc' followed by the
        meta-feature columns.
    """
    summary = {
        dataset: {'mean_acc': np.mean(record['acc']), 'sd_acc': np.std(record['acc'])}
        for dataset, record in res.items()
    }
    # Datasets become rows; the former index is exposed as the join key.
    per_dataset = (
        pd.DataFrame(summary)
        .T
        .reset_index()
        .rename(columns={'index': 'dataset'})
    )
    return per_dataset.merge(meta_features_df, how='inner', on='dataset')

In [6]:
def Compare_acc_meta_feature(res_df, y1, y2, sortby='dataset'):
    """Plot two columns of `res_df` per dataset on a shared x axis.

    `y1` is drawn in red on the left y axis and `y2` in blue on a twin
    right y axis that uses a log scale.

    Parameters
    ----------
    res_df : pandas.DataFrame
        Table with a 'dataset' column plus the columns named by `y1`,
        `y2` and `sortby`.
    y1 : str
        Column plotted on the left axis (the axis label is fixed to
        'Balanced accuracy').
    y2 : str
        Column plotted on the right, log-scaled axis.
    sortby : str, default 'dataset'
        Column used to order the rows before plotting.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding both axes.
    """
    fig, ax1 = plt.subplots(figsize=(20,4))
    ax2 = ax1.twinx()

    # Both series are drawn from the same ordering of the rows.
    ordered = res_df.sort_values(sortby)
    for column, colour, axis in ((y1, 'r', ax1), (y2, 'b', ax2)):
        sns.scatterplot(x='dataset', y=column,
                        data=ordered,
                        color=colour,
                        ax=axis)

    ax2.set_yscale('log')

    # Dataset names are long; rotate them so they stay readable.
    ax1.tick_params(axis='x', rotation=90)
    ax1.set_xlabel('Dataset')
    ax1.set_ylabel('Balanced accuracy')
    ax2.set_ylabel(y2)

    ax1.grid(color='k', linestyle='--', alpha=0.2, linewidth=1)

    fig.legend([y1, y2], loc='lower right')
    fig.tight_layout()

    return fig

def Scatter_acc_meta_feature(res_df):
    """Scatter datasets by size, coloured by their mean accuracy.

    Each dataset is a point at (n_instance, n_feature) on log-log axes;
    the point colour encodes 'mean_acc' using the 'inferno' colormap.

    Parameters
    ----------
    res_df : pandas.DataFrame
        Table with the columns 'n_instance', 'n_feature' and 'mean_acc'.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the scatter plot and its colorbar.
    """
    x, y, z = res_df['n_instance'], res_df['n_feature'], res_df['mean_acc']

    fig, ax = plt.subplots()

    # Draw on the explicit axes and keep the returned artist, so that the
    # colorbar does not depend on pyplot's implicit "current image" state.
    points = ax.scatter(x=x, y=y, c=z,
                        cmap='inferno')

    ax.set(xscale='log', yscale='log')

    ax.set_xlabel('n_instance')
    ax.set_ylabel('n_feature')
    ax.set_facecolor('silver')

    fig.colorbar(points, ax=ax, label='mean_acc')

    fig.tight_layout()

    return fig

# Load meta features

In [7]:
folder_path = results_folder

# The meta features live next to the experiment results.
fname = os.path.join(folder_path, 'openml_meta_features.pkl')
# The context manager closes the file even if unpickling raises.
with open(fname, "rb") as file_read:
    meta_features = pickle.load(file_read) # Load pickle to data

In [8]:
# Turn the loaded meta features into a table with one row per dataset:
# transpose so datasets become rows, then expose the former index as a
# regular 'dataset' column (used later as the merge key).
meta_features_df = (
    pd.DataFrame(meta_features)
    .T
    .reset_index()
    .rename(columns={'index': 'dataset'})
)
meta_features_df.head()

Unnamed: 0,dataset,n_instance,n_feature,n_class
0,41168,83733,54,4
1,1596,581012,54,7
2,41150,130064,50,2
3,40668,67557,42,3
4,3,3196,36,2


# Experiment 1

## Process results

In [39]:
# Experiments are run in parts. Results are stored in separate pickles.
# Need to load all results into a list for further summary.
# os.listdir returns names in arbitrary order, and later cells overwrite
# per-dataset entries in list order, so sort the names to make the outcome
# reproducible (the timestamp in the file name makes this chronological).
all_exp_fnames = sorted(res_fname for res_fname in os.listdir(results_folder) if 'experiment_' in res_fname)

list_res = []

for load_result in all_exp_fnames:

    fname = os.path.join(results_folder, load_result)
    # The context manager closes the file even if unpickling raises.
    with open(fname, "rb") as file_read:
        res = pickle.load(file_read) # Load pickle to data
    list_res.append(res)

In [40]:
# Extract validation accuracy
# res_dict maps dataset -> {key of that dataset's 'val_acc' dict: mean score}.
# np.unique de-duplicates and sorts the dataset keys found across all parts.
dataset_ids = np.unique([name for part in list_res for name in part])
res_dict = {name: {} for name in dataset_ids}

for part in list_res:
    for dataset, record in part.items():
        for size, scores in record['val_acc'].items():
            # A later part overwrites an earlier one for the same (dataset, size).
            res_dict[dataset][size] = np.mean(scores)

In [41]:
# Validation results (mean validation accuracy)
# row - ensemble size
# col - datasets
res_val = pd.DataFrame(res_dict)
res_val.round(3)

Unnamed: 0,1111,12,1461,1468,1486,1489,3,40981,40984,41161,41164,41165,54
10,0.645,0.969,0.85,0.9,0.965,0.843,0.996,0.847,0.966,0.5,0.67,0.1,0.858
25,0.645,0.971,0.85,0.905,0.966,0.853,0.996,0.842,0.966,0.5,0.669,0.1,0.864
50,0.645,0.981,0.85,0.929,0.966,0.859,0.996,0.845,0.966,0.5,0.663,0.1,0.859
75,0.645,0.986,0.851,0.927,0.966,0.857,0.998,0.866,0.966,0.5,0.65,0.1,0.887
100,0.585,0.978,0.851,0.933,0.965,0.863,0.998,0.85,0.966,0.5,0.653,0.1,0.887


In [42]:
# Find hyperparameter that gives max val acc:
# for every dataset (column of res_val) take the row label with the highest
# value, then turn the result into a two-column table (dataset, best label).
best_per_dataset = res_val.idxmax()
res_opt_size = best_per_dataset.to_frame().reset_index()

In [43]:
# Extract test accuracy
# For every dataset, look up the result parts that contain the selected
# size in their 'val_acc' entry and record the mean of their 'acc' values.
dataset_ids = np.unique([name for part in list_res for name in part])
res_dict = {name: {} for name in dataset_ids}

for row in range(len(res_opt_size)):
    dataset, opt_size = res_opt_size.iloc[row, :]
    for part in list_res:
        for name, record in part.items():
            # Skip entries of other datasets or runs without the selected size.
            if name != dataset or opt_size not in record['val_acc']:
                continue
            # A later matching part overwrites an earlier one.
            res_dict[dataset].update(opt_size=opt_size, test_acc=np.mean(record['acc']))

res_test = pd.DataFrame(res_dict).T
res_test.round({'opt_size': 0, 'test_acc': 3})

Unnamed: 0,opt_size,test_acc
1111,75.0,0.664
12,75.0,0.993
1461,75.0,0.869
1468,100.0,0.944
1486,25.0,0.969
1489,100.0,0.871
3,75.0,0.99
40981,75.0,0.898
40984,50.0,0.984
41161,10.0,0.5


In [50]:
# Percentage difference between the test accuracy at the selected size
# (second column of res_test) and the mean of res_val per dataset.
test_acc_at_opt = res_test.iloc[:, 1]
mean_val_acc = res_val.mean()
((test_acc_at_opt / mean_val_acc - 1) * 100).round(3)

1111     4.823
12       1.645
1461     2.220
1468     2.767
1486     0.312
1489     1.900
3       -0.723
40981    5.643
40984    1.869
41161    0.000
41164    4.557
41165    0.000
54      -5.440
dtype: float64

# Random Search experiment on datasets 1461, 1486, 54, 1468

In [9]:
# Random-search results are stored in the 'luuk' sub-folder of the results folder.
results_folder2 = os.path.join(results_folder, 'luuk')

# os.listdir returns names in arbitrary order; sort them so that the load
# order (and any later overwrite of duplicate dataset keys) is reproducible.
all_exp_fnames = sorted(res_fname for res_fname in os.listdir(results_folder2) if 'experiment_' in res_fname)

# NOTE: this reuses the names `all_exp_fnames` and `list_res` from
# Experiment 1, so the cells of each experiment must be run together.
list_res = []

for load_result in all_exp_fnames:

    fname = os.path.join(results_folder2, load_result)
    # The context manager closes the file even if unpickling raises.
    with open(fname, "rb") as file_read:
        res = pickle.load(file_read) # Load pickle to data
    list_res.append(res)

In [15]:
# Build one table per dataset from its 'acc' entry.
# If a dataset appears in several parts, the last part in list_res wins.
res_table_luuk_all = {
    dataset: pd.DataFrame(part[dataset]['acc'])
    for part in list_res
    for dataset in part
}

In [38]:
# Per dataset: select the configuration with the best validation accuracy
# and report how its test accuracy compares with the mean test accuracy
# over all configurations.
for k, table in res_table_luuk_all.items():
    table.columns = ['val_acc', 'test_acc']
    val_scores, test_scores = table.iloc[:,0], table.iloc[:,1]
    maxind = np.argmax(val_scores)
    print(f'for dataset: {k}')
    print(table.round(3))
    print(f'max config: {maxind}')
    # This is the maximum over all configurations, which is not
    # necessarily the test accuracy of the selected configuration.
    print(f'max test acc: {test_scores.max():.3f}')
    test_improve_acc = (test_scores.iloc[maxind] / test_scores.mean() - 1) * 100
    print(f'% improvement: {test_improve_acc:.3f}\n')

for dataset: 1461
   val_acc  test_acc
0    0.626     0.630
1    0.675     0.700
2    0.622     0.634
3    0.570     0.582
4    0.574     0.585
5    0.701     0.728
6    0.680     0.706
7    0.620     0.639
8    0.533     0.541
9    0.636     0.644
max config: 5
max test acc: 0.728
% improvement: 13.894

for dataset: 1486
   val_acc  test_acc
0    0.936     0.937
1    0.946     0.943
2    0.955     0.955
3    0.500     0.500
4    0.956     0.948
5    0.920     0.917
6    0.919     0.917
7    0.933     0.934
8    0.911     0.906
9    0.961     0.960
max config: 9
max test acc: 0.960
% improvement: 7.675

for dataset: 54
   val_acc  test_acc
0    0.812     0.746
1    0.830     0.739
2    0.250     0.250
3    0.901     0.855
4    0.819     0.830
5    0.778     0.777
6    0.913     0.856
7    0.794     0.744
8    0.808     0.838
9    0.833     0.819
max config: 6
max test acc: 0.856
% improvement: 14.816

for dataset: 1468
   val_acc  test_acc
0    0.916     0.951
1    0.916     0.948
2   