In [2]:
import pickle
import os
import pandas as pd
import numpy as np
import openml
import glob
from gama import GamaClassifier

from sklearn.cluster import KMeans

from jobs.batch_pipeline_evaluation import PipelineExecutor
from jobs.batch_automl import AutomlExecutor
from main.metadata import MetaDataSet
from main.clustering import MetaDataCluster
from main.portfolio import PortFolioBuilder

In [None]:
# Root directory that the cells below read their CSV inputs from.
# NOTE(review): "/src/data" is an absolute path. On POSIX, os.path.join discards
# every earlier component once a later one is absolute, so os.getcwd() has no
# effect here and this evaluates to "/src/data". If a directory relative to the
# working directory was intended, the leading slash should go — confirm.
data_directory = os.path.join(os.getcwd(), "/src/data")

## Construct the performance matrix

In [3]:
# Number of rows held out for testing, and the seed that makes the hold-out
# selection repeatable across kernel restarts.
N_TEST_DATASETS = 30
SPLIT_SEED = 42

# glob returns files in arbitrary order; sorting keeps the row order of the
# concatenated frame (and therefore the seeded sample below) deterministic.
csv_files = sorted(glob.glob(f"{data_directory}/batch_results/*.csv"))
if not csv_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"no batch result CSV files found in {data_directory}/batch_results")

# Stack all batch results (first CSV column is the row index) and keep only
# rows without any missing value.
df = pd.concat([pd.read_csv(f, index_col = 0) for f in csv_files]).dropna()

# Hold out a fixed-size random sample of rows; every other row label is training data.
test_indexes = df.sample(N_TEST_DATASETS, replace = False, random_state = SPLIT_SEED).index
train_indexes = df.drop(test_indexes, axis = 0).index

In [5]:
# Load the meta-data table; the first CSV column is used as the row index.
meta_dataframe_path = f"{data_directory}/meta_dataframe.csv"
meta_dataframe = pd.read_csv(meta_dataframe_path, index_col = 0)

In [6]:
def indexes_per_cluster(labels):
    """
    Group the positions of ``labels`` by cluster label.

    Parameters
    ----------
    labels : array-like
        Cluster label of every sample; flattened before grouping.

    Returns
    -------
    list of numpy.ndarray
        One integer array per distinct label, ordered by ascending label
        value. Each array holds, in ascending order, the positions in
        ``labels`` that carry that label. Empty input yields an empty list.
    """
    labels = np.asarray(labels).ravel()
    if labels.size == 0:
        # No samples means no clusters (rather than one phantom empty cluster).
        return []

    _, counts = np.unique(labels, return_counts=True)
    # A stable sort keeps positions ascending inside each label group, so the
    # sorted positions can simply be cut at the cumulative group sizes.
    order = np.argsort(labels, kind="stable")
    return np.split(order, np.cumsum(counts)[:-1])

In [7]:
# Create the clustering of the meta-data table.
n_clusters = 3
# init='random' draws the starting centroids at random; fixing random_state
# makes the resulting labels reproducible across kernel restarts.
clustering_obj = KMeans(n_clusters = n_clusters, init = 'random', n_init = 'auto', random_state = 42)
metadata_cluster = MetaDataCluster(meta_dataframe)
cluster = metadata_cluster.compute_clusters(clustering_obj)
# labels_ is read from whatever compute_clusters returns — presumably the
# fitted estimator, with one label per row of meta_dataframe (TODO confirm).
cluster_labels = cluster.labels_

In [8]:
# Group the positions within cluster_labels by cluster: one index array per
# distinct label, in ascending label order.
cluster_indexes = indexes_per_cluster(cluster_labels)

In [9]:
# Build one portfolio per cluster, using only training rows of the performance matrix.
builder = PortFolioBuilder(size = 15)
portfolio_dict = dict()
for label, c_index in enumerate(cluster_indexes):
    # c_index holds row POSITIONS in the clustered frame (meta_dataframe), so it
    # must not be applied positionally to train_indexes, which has a different
    # length and order. Translate positions to row labels first, then keep the
    # labels that belong to the training split.
    # NOTE(review): assumes meta_dataframe and df are both indexed by the same
    # dataset identifiers and that cluster_labels follows meta_dataframe's row
    # order — confirm against MetaDataCluster.
    cluster_dataset_ids = meta_dataframe.index[c_index]
    cluster_train_ids = train_indexes.intersection(cluster_dataset_ids)
    portfolio = builder.build_portfolio(df, cluster_train_ids)
    #store portfolio for reference (as dict)
    portfolio_dict[label] = portfolio.index

In [None]:
# Evaluate, for every held-out dataset, each pipeline of the portfolio that
# belongs to the cluster the dataset is assigned to.
evaluations = []          # one list of scores per test dataset (portfolio order)
evaluation_records = []   # same scores, keyed by pipeline string
pipeline_runner = PipelineExecutor()
for test_index in test_indexes:
    dataset = openml.datasets.get_dataset(test_index)

    # One-row frame of the dataset's qualities; values that are missing are
    # filled from the column means of meta_dataframe.
    meta_data = pd.DataFrame.from_dict(orient = "index", data = dataset.qualities).T
    meta_data = meta_data.combine_first(pd.DataFrame(meta_dataframe.mean(axis = 0)).T)
    # combine_first yields the UNION of both column sets, so extra qualities or
    # a different column order could reach predict(). Pin the layout to
    # meta_dataframe's columns.
    # NOTE(review): assumes the clusterer was fitted on meta_dataframe's columns
    # in their stored order — confirm against MetaDataCluster.
    meta_data = meta_data.reindex(columns = meta_dataframe.columns).values

    X, y, _, _ = dataset.get_data(dataset_format="dataframe", target = dataset.default_target_attribute)
    cluster_assignment = cluster.predict(meta_data)
    portfolio = portfolio_dict[cluster_assignment[0]]

    # The encoding depends only on X, so compute it once per dataset instead of
    # once per pipeline.
    x_enc, _ = pipeline_runner.basic_encoding(X, is_classification=True)

    evaluations_individual = []
    for pipeline in portfolio:
        # SECURITY: eval executes arbitrary code. The pipeline strings come from
        # the portfolio index; only run this on result files you trust.
        # NOTE(review): eval also needs every estimator name used in the string
        # to be importable in this namespace — those imports are not visible here.
        sklearn_pipeline = eval(pipeline)
        accuracy = pipeline_runner.evaluate_pipeline(sklearn_pipeline, x_enc, y)
        evaluations_individual.append(accuracy)
    evaluations.append(evaluations_individual)
    evaluation_records.append(dict(zip(portfolio, evaluations_individual)))

# Test datasets can fall into different clusters and therefore be scored on
# different portfolios. Building the frame from per-dataset records labels every
# score with the pipeline that produced it (NaN where a pipeline was not run),
# instead of reusing the last dataset's portfolio as the column labels for all rows.
portfolio_results = pd.DataFrame(evaluation_records, index = test_indexes)
# Kept for backward compatibility with the original cell. Rebinding `df`
# replaces the performance matrix, so the data-loading cell must be re-run
# before the portfolio-building cell can be executed again.
df = portfolio_results


In [None]:
# Baseline: run the GAMA-backed AutoML executor once per held-out dataset and
# keep the value each run returns.
accuracies_gama = list()
gamaclassifier = GamaClassifier(store = 'nothing', max_total_time = 150)
automl_instance = AutomlExecutor(gamaclassifier)
for test_index in test_indexes:
    accuracy = automl_instance.run_automl(test_index)
    accuracies_gama.append(accuracy)
    print(f"accuracy for run on {test_index} is {accuracy}")