In [None]:
# For Colab, you need to install auto-sklearn every time
import sys
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    !pip install auto-sklearn # Downgrade scipy to 1.4.x
    #!pip install scipy # Upgrade scipy to 1.7.x

import os, signal
os.kill(os.getpid(), signal.SIGKILL) # Restart_runtime



In [None]:
import sys
IN_COLAB = 'google.colab' in sys.modules

# google.colab only exists on Colab, so import and mount Drive only there;
# elsewhere this cell is a no-op instead of raising ModuleNotFoundError.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dependencies
# Common
import os
import pickle
import sys
import time
from tqdm.autonotebook import tqdm
from datetime import datetime

import numpy as np
import pandas as pd

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# ML
import sklearn # Import sklearn before autosklearn, solve scipy version error
from sklearn.model_selection import train_test_split
import sklearn.datasets
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# AML
import autosklearn
import autosklearn.classification
from autosklearn.metrics import balanced_accuracy, precision, recall

# Check machine: number of CPUs reported by multiprocessing for this runtime.
# It is the last expression of the cell, so the value is shown as cell output.
import multiprocessing
multiprocessing.cpu_count()

  import sys


2

In [None]:
# Path configuration
#
# The user points these at the folders that hold the pickles:
#   datasets_folder - dataset pickles, named in the format openml_xxx.pkl
#   results_folder  - outputs written by this notebook
_project_dir = '/content/drive/My Drive/Colab Notebooks/course_AML_proj'

datasets_folder = _project_dir + '/data'
results_folder = _project_dir + '/results'

In [None]:
## RUN ONCE
# Fetch OpenML datasets

# OpenML ids taken from Appendix D, Table 20 of the autosklearn2.0 paper.
# Every id listed here is meant to be downloaded; 41147 is left out because
# fetching it returned an error.
dataset_ids = [
    40981, 1464, 54, 31, 1468,
    40975, 12, 1067, 40984, 41143,
    3, 41146, 1489, 41142, 41164,
    41163, 41165, 41159, 41161, 4135,
    1486, 41027, 1461, 1590, 1111,
]

# User insert folder to store pickles
folder_path = datasets_folder

# The loop below is a disabled one-off download step: it is kept commented out,
# presumably so that re-running the notebook does not fetch everything again.
# When enabled it fetches each id in dataset_ids from OpenML as a dataframe,
# wraps features/target in a dict {'x': ..., 'y': ...} and pickles that dict to
# <folder_path>/openml_<id>.
# NOTE(review): the file name built here has no '.pkl' suffix, while other
# comments in this notebook describe the format as openml_xxx.pkl - confirm
# which naming the stored files actually use.

# # Fetch and save each dataset
# for dataset_id in dataset_ids:
#     print(f'Fetching data: {dataset_id}')
#     # Using fetch function from sklearn
#     x, y = sklearn.datasets.fetch_openml(data_id=dataset_id, 
#                                          return_X_y=True, 
#                                          as_frame=True) # Return dataframe
#     # Store as dict
#     data = {'x': x, 'y': y}
#     # Dump dict to pickle
#     file_to_dump = data

#     fname = os.path.join(folder_path, 'openml_' + str(dataset_id))
#     file_write = open(fname, 'wb')
#     # "wb" mode opens the file in binary format for writing
#     pickle.dump(file_to_dump, file_write)
#     file_write.close()

In [None]:
# Collect simple meta features for every dataset pickle in the datasets folder.
# Result: meta_features maps the dataset id (string) to a dict with the number
# of rows ('n_instance'), columns ('n_feature') and distinct target values
# ('n_class').
meta_features = {}

folder_path = datasets_folder
all_dataset_names = os.listdir(folder_path)

print(f'Total # datasets = {len(all_dataset_names)}')

for dataset_name in all_dataset_names:

    # Load data. The context manager closes the file even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code - only keep trusted files
    # in datasets_folder.
    fname = os.path.join(folder_path, dataset_name)
    with open(fname, "rb") as file_read:
        data = pickle.load(file_read) # Load pickle to data

    # dataset name in format openml_xxx.pkl
    # extract xxx as key
    dataset_id = dataset_name.replace('_', '.').split('.')[1]

    x, y = data['x'], data['y']
    print(f'\nCompute meta features of test dataset: {dataset_id}')
    print(f'meta feature: # instance {x.shape[0]}, # feature {x.shape[1]}')
    meta_features[str(dataset_id)] = {'n_instance': x.shape[0], # Row
                                      'n_feature': x.shape[1], # Col
                                      'n_class': len(np.unique(y))}

Total # datasets = 38

Compute meta features of test dataset: 41165
meta feature: # instance 10000, # feature 7200

Compute meta features of test dataset: 41161
meta feature: # instance 20000, # feature 4296

Compute meta features of test dataset: 41159
meta feature: # instance 20000, # feature 4296

Compute meta features of test dataset: 41163
meta feature: # instance 10000, # feature 2000

Compute meta features of test dataset: 41142
meta feature: # instance 5418, # feature 1636

Compute meta features of test dataset: 1468
meta feature: # instance 1080, # feature 856

Compute meta features of test dataset: 41164
meta feature: # instance 8237, # feature 800

Compute meta features of test dataset: 40996
meta feature: # instance 70000, # feature 784

Compute meta features of test dataset: 1111
meta feature: # instance 50000, # feature 230

Compute meta features of test dataset: 12
meta feature: # instance 2000, # feature 216

Compute meta features of test dataset: 41166
meta feature: # 

In [None]:
# Save meta features
folder_path = results_folder

# Make sure the results folder exists before writing into it
os.makedirs(folder_path, exist_ok=True)

# Dump dict to pickle
file_to_dump = meta_features
fname = os.path.join(folder_path, 'openml_meta_features.pkl')
# "wb" opens the file in binary mode for writing; the context manager closes
# it even if pickling raises.
with open(fname, 'wb') as file_write:
    pickle.dump(file_to_dump, file_write)

In [None]:
# Dataset

# User insert folder to store pickles
folder_path = datasets_folder
# Datasets D_new to evaluate
# All datasets located in the datasets_folder directory set up by user
all_dataset_names = os.listdir(folder_path)

# Auto-sklearn hyperparameters
# Time limit in seconds for the search of appropriate models.
time_left_for_this_task = 60 # Time budget
#per_run_time_limit = int(time_left_for_this_task / 10) # Default is budget/10. Longer the more training time.
# Memory limit in MB for the machine learning algorithm.
memory_limit = 3072 # Memory budget

# Repeat evaluation with different seeds
repeat = 100

# Set random seed generator
rng = np.random.default_rng(12345) # Fix seed for reproducibility
seeds = rng.integers(0, 2**32-1, repeat) # Max seed limited by autoskl

seed_split = 12345 # Fix seed for reproducibility on train test split


# Check estimated experiment duration before proceed.
# The experiment only trains on files whose id is listed in dataset_ids, so
# count those files (same id parsing as the experiment loop) instead of every
# file in the folder; otherwise the estimate is inflated.
n_selected_datasets = sum(
    int(name.replace('_', '.').split('.')[1]) in dataset_ids
    for name in all_dataset_names
)
print(f'Est. time required: {repeat * time_left_for_this_task * n_selected_datasets / 60} min')
print(f'seeds: {seeds}')

Est. time required: 3800.0 min
seeds: [3003105692  976400780 3387213021 1360466708  876933080 3424658561
 2760304911 2904491693 4245388044 1679802727 3605303106 1429424934
 2437408156 2569716528  915442270  802017219  984129068 2889465206
 2634853374 4045012504 3032918238 1066207224 3929519758 4075413513
 3145778132 2865763038  561033802  411878496 1143809828 1897686915
  310984977 3807402261 2035922533 2995539971  913046978 1402190273
  498139153 3152197458 3325497001  945472433 3070822591  350446006
 1681873724  686746376 3195502165 1460719170 2033282004 1997989380
 2042481486 1144269602 2396799522 3503732972 2141779427  830193079
  104832471  556065447  349892316  393697109  526061820 2570830042
 3466446890 3671088524 2806373941 2583943556 1424584135 4002859530
 2751328124 3112912241 3152106376 3696039763 3017305172 3991475463
 2325576896 2345851045 1078349150 4027274691 2374937433 2125957013
 1358653540 1175846864 2719779336 1940374772 2439679748 2856320425
 3861219000 1421165723 2

In [None]:
# Run experiment
#
# For every dataset pickle whose OpenML id is listed in dataset_ids: split it
# once into train/test, fit one AutoSklearnClassifier per seed, score each fit
# on the held-out split with balanced accuracy, and checkpoint everything
# collected so far to <results_folder>/<exp_id> after each dataset.

# Set unique identifier
timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
exp_id = "experiment_" + timestamp

# Tqdm bars (advanced manually with update(1) inside the loops)
outter_bar = tqdm(dataset_ids, desc='dataset', leave=False)
inner_bar = tqdm(seeds, desc='seed', leave=False)

# Store result: dataset id -> {'cls': fitted classifiers, 'acc': test scores}
res = {}

# Create the results folder up front so the first checkpoint cannot fail
# (and lose a whole dataset's worth of training) because it is missing.
os.makedirs(results_folder, exist_ok=True)

# Start experiment
print(f'\nStart running experiment id: {exp_id}')
start = time.perf_counter()

for dataset_name in all_dataset_names:
    # dataset name in format openml_xxx.pkl
    # extract xxx as key
    parsed_id = dataset_name.replace('_', '.').split('.')[1]

    # Skip files that were not selected for this experiment
    if int(parsed_id) not in dataset_ids:
        continue
    dataset_id = parsed_id

    outter_bar.update(1) # tqdm progress bar + 1

    # Load data. The context manager closes the file even if unpickling fails.
    fname = os.path.join(folder_path, dataset_name)
    with open(fname, "rb") as file_read:
        data = pickle.load(file_read) # Load pickle to data

    x, y = data['x'], data['y']
    print(f'\nStart training on dataset: {dataset_id}')
    print(f'meta feature: # instance = {x.shape[0]}, # feature = {x.shape[1]}')

    # Split data (same split for every seed, fixed by seed_split)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=seed_split)

    res_cls = []
    res_acc = []

    for seed in seeds:

        inner_bar.update(1) # tqdm progress bar + 1

        # Configure the AutoML run
        cls = autosklearn.classification.AutoSklearnClassifier(
            time_left_for_this_task=time_left_for_this_task,
            # per_run_time_limit=per_run_time_limit, # Default 1/10 of budget
            seed=int(seed), # Plain Python int instead of a numpy integer
            memory_limit=memory_limit,
            metric=balanced_accuracy, # For optim
            scoring_functions=[precision, recall] # Not for optim
        )
        # Feurer et al.(2021): single metric for binary clf, multiclass clf and unbalanced datasets

        # Train
        # NOTE(review): the test split is also handed to fit(); confirm that
        # auto-sklearn only uses it for monitoring and not for model selection.
        cls.fit(x_train, y_train, x_test, y_test,
                dataset_name=dataset_id)

        # Test on trained classifier
        predictions = cls.predict(x_test)
        test_acc = balanced_accuracy_score(y_test, predictions)

        # Store
        res_cls.append(cls)
        res_acc.append(test_acc)

    inner_bar.reset() # reset the seed progress bar for next dataset

    res[dataset_id] = {'cls': res_cls, 'acc': res_acc}

    # Save a checkpoint of all results so far (overwrites the previous one).
    file_dir = results_folder
    file_path = os.path.join(file_dir, exp_id)
    # "wb" opens the file in binary mode for writing
    with open(file_path, "wb") as file_write:
        pickle.dump(res, file_write)

# Finish experiment
end = time.perf_counter()

# Release the progress bars now that both loops are done
inner_bar.close()
outter_bar.close()

print(f'\nExperiment completed in: {(end-start)/60:.1f} min')

dataset:   0%|          | 0/1 [00:00<?, ?it/s]

seed:   0%|          | 0/100 [00:00<?, ?it/s]


Start running experiment id: experiment_20211213-123348

Start training on dataset: 1464
meta feature: # instance = 748, # feature = 4

Experiment completed in: 93.1 min


In [None]:
# Save result and trained model to pickle for persistence

# Save
file_dir = results_folder
os.makedirs(file_dir, exist_ok=True) # Make sure the target folder exists
file_path = os.path.join(file_dir, exp_id)
# "wb" opens the file in binary mode for writing; the context manager closes
# it even if pickling raises.
with open(file_path, "wb") as file_write:
    pickle.dump(res, file_write)