## Analysis of D3M Experiments

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the leaderboard and round every numeric score to two decimals.
performances = pd.read_csv('resource/d3m_leaderboard.csv').round(2)
performances.head()

Unnamed: 0,Dataset,Task,Metric,AlphaD3M,AutonML,Ensemble,Aika,Distil AutoML,Autoflow,Axolotl,Drori
0,124_120_mnist_8747,image_classification,accuracy,0.98,0.94,0.46,0.18,0.94,0.11,,
1,124_138_cifar100_1858,image_classification,accuracy,0.67,0.48,0.42,0.12,0.48,0.01,,
2,124_16_fashion_mnist,image_classification,accuracy,0.9,0.83,0.84,0.12,0.85,0.1,,
3,124_174_cifar10_MIN_METADATA,image_classification,accuracy,0.88,0.82,0.84,0.27,0.8,0.1,,
4,124_188_usps_MIN_METADATA,image_classification,accuracy,0.96,0.95,0.94,0.26,0.92,0.18,0.11,


In [3]:
# Utils

# Metrics where a lower value is better. calculate_rank() ranks datasets scored
# with one of these in ascending order and every other metric in descending order.
error_based_metrics = ['mean_absolute_error', 'mean_squared_error', 'root_mean_squared_error']

def calculate_coverage(performances):
    """Count, per system and per task type, the datasets that have a score.

    Parameters
    ----------
    performances : pandas.DataFrame
        One row per dataset with the columns 'Dataset', 'Task' and 'Metric';
        every other column holds the scores of one system (NaN when the
        system has no score for that dataset).

    Returns
    -------
    pandas.DataFrame
        Indexed by system (the non-metadata columns of ``performances``), with
        one column per distinct 'Task' value holding the number of non-missing
        scores, plus a 'unique_task' column with the number of tasks for which
        that count is non-zero.
    """
    all_tasks_coverage = []

    for task in performances['Task'].unique():
        performances_by_task = performances[performances['Task'] == task]
        # Keep only the per-system score columns.
        scores_by_task = performances_by_task.drop(columns=['Dataset', 'Task', 'Metric'])
        # Missing scores must be detected with notna(): a comparison such as
        # `scores != np.nan` is True for every value, NaN included.
        coverage_by_task = scores_by_task.notna().sum(axis=0)
        all_tasks_coverage.append(coverage_by_task.to_frame(name=task))

    all_tasks_coverage = pd.concat(all_tasks_coverage, axis=1, join='inner')
    # Add a column with unique tasks supported (tasks with at least one scored dataset)
    all_tasks_coverage['unique_task'] = (all_tasks_coverage != 0).sum(axis=1)

    return all_tasks_coverage


def calculate_rank(performances):
    """Rank the systems on every dataset of ``performances``.

    For each dataset the score columns are ranked with ties sharing the lowest
    rank (``method='min'``). Datasets whose 'Metric' is listed in
    ``error_based_metrics`` are ranked ascending (lowest value gets rank 1);
    all others are ranked descending. Systems without a score receive the
    worst possible rank, i.e. the number of systems.

    Returns a DataFrame indexed by system with one column per dataset.
    """
    metadata_rows = ['Dataset', 'Task', 'Metric']

    by_dataset = performances.T
    by_dataset.columns = by_dataset.loc['Dataset'].tolist()  # Datasets become the column names

    ranked_columns = []
    for dataset_name in by_dataset.columns:
        dataset_column = by_dataset[dataset_name]
        lower_is_better = dataset_column['Metric'] in error_based_metrics
        # Keep only the per-system scores of this dataset
        scores = dataset_column.drop(metadata_rows).to_frame(name=dataset_name)
        dataset_ranks = scores.rank(ascending=lower_is_better, method='min')
        last_place = float(dataset_ranks.shape[0])  # Number of AutoML systems
        # Systems that didn't produce pipelines get the worst rank
        ranked_columns.append(dataset_ranks.fillna(last_place))

    return pd.concat(ranked_columns, axis=1, join='inner')

def calculate_rank_by_task(performances):
    """Average, per task type, the per-dataset ranks of every system.

    The rows of ``performances`` are split by their 'Task' value, ranked with
    ``calculate_rank`` and averaged across that task's datasets; each average
    is rounded to three decimals.

    Returns a DataFrame indexed by system with one column per task.
    """
    task_columns = []

    for task_name in performances['Task'].unique():
        task_rows = performances.loc[performances['Task'] == task_name]
        mean_rank = calculate_rank(task_rows).mean(axis=1)
        task_columns.append(mean_rank.round(3).to_frame(name=task_name))

    return pd.concat(task_columns, axis=1, join='inner')

def winner_pipelines_top_3(performances):
    """Count first-place finishes among the systems left after dropping five others.

    The columns 'Aika', 'Distil AutoML', 'Autoflow', 'Axolotl' and 'Drori' are
    removed, rows containing any missing value are discarded, the remaining
    systems are ranked with ``calculate_rank`` and the number of datasets on
    which each one holds rank 1.0 (ties included) is returned.

    Returns a DataFrame indexed by system with a single 'winner_pipelines' column.
    """
    excluded_systems = ['Aika', 'Distil AutoML', 'Autoflow', 'Axolotl', 'Drori']
    remaining = performances.drop(columns=excluded_systems)
    # Keep only the datasets where every remaining system has a pipeline
    complete_rows = remaining.dropna(axis=0, how='any')
    head_to_head_ranks = calculate_rank(complete_rows)
    first_places = head_to_head_ranks.eq(1.0).sum(axis=1)

    return first_places.to_frame(name='winner_pipelines')

### Calculating Coverage

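Number of datasets for which each system has a score, broken down by task type. The `unique_task` column is the number of task types each system covers with at least one dataset.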

In [4]:
# Rows are systems, columns are task types; 'unique_task' counts the task
# types with a non-zero dataset count.
coverage_all_tasks = calculate_coverage(performances)
coverage_all_tasks

Unnamed: 0,image_classification,tabular_classification,tabular_regression,image_regression,text_classification,audio_classification,graph_matching,time_series_forecasting,link_prediction,collaborative_filtering,time_series_classification,community_detection,video_classification,vertex_classification,object_detection,semisupervised_classification,lupi,unique_task
AlphaD3M,9,20,11,1,9,2,3,13,3,1,19,3,2,4,2,6,4,17
AutonML,8,19,11,1,9,2,3,13,3,0,19,3,2,4,2,6,4,16
Ensemble,9,18,11,1,9,2,3,13,3,1,19,0,2,4,0,6,4,15
Aika,9,20,8,1,9,2,3,13,3,1,17,2,2,4,1,3,4,17
Distil AutoML,7,18,9,1,8,1,2,2,2,0,19,2,0,4,1,6,4,15
Autoflow,7,17,6,1,8,2,2,12,2,1,15,1,2,4,0,4,4,16
Axolotl,2,13,5,1,9,2,2,10,2,0,19,0,2,4,0,3,4,14
Drori,0,20,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


### Calculating Winner Pipelines and Average Rank

In [5]:
# Rank of every system (rows) on every dataset (columns); 1.0 is the best rank
# and systems without a score get the worst one.
ranks = calculate_rank(performances)
ranks

Unnamed: 0,124_120_mnist_8747,124_138_cifar100_1858,124_16_fashion_mnist,124_174_cifar10_MIN_METADATA,124_188_usps_MIN_METADATA,124_214_coil20_MIN_METADATA,124_95_uc_merced_land_use_MIN_METADATA,1491_one_hundred_plants_margin_MIN_METADATA,1567_poker_hand_MIN_METADATA,185_baseball_MIN_METADATA,...,political_instability_MIN_METADATA,uu1_datasmash_MIN_METADATA,uu2_gp_hyperparameter_estimation_MIN_METADATA,uu3_world_development_indicators_MIN_METADATA,uu3_world_development_indicators_raw,uu4_SPECT_MIN_METADATA,uu5_heartstatlog_MIN_METADATA,uu6_hepatitis_MIN_METADATA,uu7_pima_diabetes_MIN_METADATA,uu_101_object_categories_MIN_METADATA
AlphaD3M,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,1.0,5.0,...,6.0,1.0,3.0,1.0,3.0,7.0,3.0,7.0,4.0,1.0
AutonML,2.0,2.0,4.0,3.0,2.0,1.0,8.0,7.0,2.0,1.0,...,1.0,1.0,2.0,4.0,2.0,1.0,4.0,2.0,5.0,2.0
Ensemble,4.0,4.0,3.0,2.0,3.0,1.0,2.0,2.0,4.0,7.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,3.0
Aika,5.0,5.0,5.0,5.0,5.0,5.0,3.0,1.0,3.0,2.0,...,1.0,1.0,3.0,8.0,8.0,3.0,6.0,4.0,5.0,4.0
Distil AutoML,2.0,2.0,2.0,4.0,4.0,4.0,8.0,8.0,7.0,2.0,...,1.0,7.0,8.0,3.0,8.0,5.0,7.0,6.0,2.0,8.0
Autoflow,6.0,6.0,6.0,6.0,6.0,8.0,5.0,3.0,6.0,4.0,...,8.0,1.0,8.0,8.0,8.0,3.0,1.0,5.0,1.0,5.0
Axolotl,8.0,8.0,8.0,8.0,7.0,8.0,4.0,5.0,8.0,5.0,...,5.0,1.0,8.0,8.0,8.0,6.0,5.0,3.0,5.0,8.0
Drori,8.0,8.0,8.0,8.0,8.0,8.0,8.0,3.0,5.0,8.0,...,8.0,8.0,3.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0


In [6]:
# Derive both statistics from the per-dataset rank columns only. Counting
# winners over a frame that already contains 'average_rank' would count an
# average of exactly 1.0 as a win, and re-running this cell would average over
# the summary columns as well.
dataset_ranks = ranks.drop(columns=['average_rank', 'winner_pipelines'], errors='ignore')
ranks['average_rank'] = dataset_ranks.mean(axis=1).round(3)  # Add a column with average rank
ranks['winner_pipelines'] = (dataset_ranks == 1.0).sum(axis=1)  # Add a column with winner pipelines
ranks[['winner_pipelines', 'average_rank']]

Unnamed: 0,winner_pipelines,average_rank
AlphaD3M,49,2.848
AutonML,39,2.893
Ensemble,30,2.902
Aika,21,3.991
Distil AutoML,20,4.679
Autoflow,11,5.321
Axolotl,10,5.732
Drori,7,6.848


### Calculating Average Rank by Task

In [7]:
# Mean rank of every system (rows) over the datasets of each task type (columns).
ranks_by_task = calculate_rank_by_task(performances)
ranks_by_task

Unnamed: 0,image_classification,tabular_classification,tabular_regression,image_regression,text_classification,audio_classification,graph_matching,time_series_forecasting,link_prediction,collaborative_filtering,time_series_classification,community_detection,video_classification,vertex_classification,object_detection,semisupervised_classification,lupi
AlphaD3M,1.111,3.75,2.273,4.0,2.556,1.5,1.0,3.385,3.333,3.0,3.263,1.0,2.5,1.0,1.5,3.5,5.25
AutonML,2.778,3.3,3.182,2.0,3.333,1.0,3.333,3.615,2.333,8.0,2.263,1.0,1.0,4.0,1.0,2.333,3.0
Ensemble,2.778,3.35,3.0,2.0,2.222,3.5,3.0,2.615,2.333,2.0,2.158,8.0,3.0,3.25,8.0,2.333,1.25
Aika,4.556,3.85,5.727,1.0,3.0,5.0,2.333,2.231,1.667,1.0,4.684,3.333,3.5,4.25,4.5,6.0,4.5
Distil AutoML,4.333,4.85,4.273,7.0,3.556,5.5,4.667,7.308,4.667,8.0,3.789,3.333,8.0,4.0,4.5,2.833,5.0
Autoflow,6.222,4.65,5.727,5.0,5.778,5.0,3.333,5.077,6.667,4.0,5.316,6.333,4.5,6.5,8.0,6.0,2.5
Axolotl,7.444,5.85,7.545,5.0,4.333,6.0,6.333,5.077,5.0,8.0,4.526,8.0,5.5,3.5,8.0,6.833,4.75
Drori,8.0,3.55,4.364,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0


### Calculating Winner Pipelines for the Top 3 Systems

In [8]:
# First-place counts when only AlphaD3M, AutonML and Ensemble are compared, on
# rows that have no missing value for them.
winner_pipelines_top_3(performances)

Unnamed: 0,winner_pipelines
AlphaD3M,48
AutonML,38
Ensemble,42
