In [6]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from ray import tune

# Results grid analysis notebook

Running the trials and experiments via Ray produces a number of files related to the "experiment" (the dataset together with all the strategies and all the repeated runs, each run being a "trial"). Each trial is stored individually, but all of them are read and collected into a `result_grid` object that can be used to analyse results at both the experiment and the trial level.

This notebook is intended to be used for all the experiments; adapt it as necessary to produce the relevant plots.

In [7]:
# Load result_grid
# Adapt as needed: `experiment_name` selects which experiment's results are loaded.
experiment_name = "breastcancer"
# Results are looked up relative to the current working directory, under
# "<cwd>/<experiment_name>_results/<experiment_name>".
storage_path = os.path.join(os.getcwd(), f"{experiment_name}_results")
experiment_path = os.path.join(storage_path, experiment_name)

print(f"Loading results from {experiment_path}...")

# Restore the tuner from the experiment directory and collect its results.
# NOTE(review): "trial" is passed as the trainable; presumably it has to match the
# trainable the experiment was originally run with - confirm.
restored_tuner = tune.Tuner.restore(experiment_path, trainable="trial")
result_grid = restored_tuner.get_results()


Loading results from c:\Users\morio\Documents\workspace\pyrelational\benchmarking\breastcancer_results\breastcancer...


In [8]:
# Housekeeping: sanity-check the restored results grid before analysing it.

# Report whether the grid recorded any trial errors.
print("One of the trials failed!" if result_grid.errors else "No errors!")

num_results = len(result_grid)
print("Number of results:", num_results)

# Walk through every trial: show its error if it has one, otherwise the
# "score" entry of its reported metrics.
for i, result in enumerate(result_grid):
    if not result.error:
        print(
            f"Trial #{i} finished successfully with a test metric of:",
            result.metrics["score"],
        )
    else:
        print(f"Trial #{i} had an error:", result.error)

No errors!
Number of results: 20
Trial #0 finished successfully with a test metric of: 253.2469973009446
Trial #1 finished successfully with a test metric of: 240.39271433805567
Trial #2 finished successfully with a test metric of: 270.2554488733984
Trial #3 finished successfully with a test metric of: 261.77553360064434
Trial #4 finished successfully with a test metric of: 269.5056949089558
Trial #5 finished successfully with a test metric of: 238.6668458706384
Trial #6 finished successfully with a test metric of: 269.5883102297523
Trial #7 finished successfully with a test metric of: 234.88558983135005
Trial #8 finished successfully with a test metric of: 254.33543406431764
Trial #9 finished successfully with a test metric of: 231.2103479473631
Trial #10 finished successfully with a test metric of: 262.94021352147513
Trial #11 finished successfully with a test metric of: 271.09669811210154
Trial #12 finished successfully with a test metric of: 234.29315536289542
Trial #13 finished su

In [9]:
# Collect the results grid into a DataFrame for tabular inspection
# (the displayed output below shows metric columns alongside `config/*` columns).
results_df = result_grid.get_dataframe()

In [10]:
# Display the results DataFrame as the cell output.
results_df

Unnamed: 0,score,iteration_metrics,timestamp,checkpoint_dir_name,done,training_iteration,trial_id,date,time_this_iter_s,time_total_s,pid,hostname,node_ip,time_since_restore,iterations_since_restore,config/seed,config/strategy,logdir
0,253.246997,"[0.8730769230769231, 0.8730769230769231, 0.873...",1725051832,,False,1,4d5cb_00016,2024-08-30_22-03-52,3.096578,3.096578,27962,nemo.cl.cam.ac.uk,128.232.64.154,3.096578,1,2,ratio_confidence,4d5cb_00016
1,240.392714,"[0.8287921348314606, 0.8287921348314606, 0.828...",1725051832,,False,1,4d5cb_00015,2024-08-30_22-03-52,3.073937,3.073937,27961,nemo.cl.cam.ac.uk,128.232.64.154,3.073937,1,1,ratio_confidence,4d5cb_00015
2,270.255449,"[0.9318877551020408, 0.9318877551020408, 0.931...",1725051819,,False,1,4d5cb_00000,2024-08-30_22-03-39,3.141029,3.141029,26933,nemo.cl.cam.ac.uk,128.232.64.154,3.141029,1,1,least_confidence,4d5cb_00000
3,261.775534,"[0.9026342975206612, 0.9026342975206612, 0.902...",1725051819,,False,1,4d5cb_00003,2024-08-30_22-03-39,3.184099,3.184099,26937,nemo.cl.cam.ac.uk,128.232.64.154,3.184099,1,4,least_confidence,4d5cb_00003
4,269.505695,"[0.9293075684380032, 0.9293075684380032, 0.929...",1725051832,,False,1,4d5cb_00013,2024-08-30_22-03-52,3.21637,3.21637,27959,nemo.cl.cam.ac.uk,128.232.64.154,3.21637,1,4,marginal_confidence,4d5cb_00013
5,238.666846,"[0.82275960170697, 0.82275960170697, 0.8227596...",1725051819,,False,1,4d5cb_00002,2024-08-30_22-03-39,3.255765,3.255765,26936,nemo.cl.cam.ac.uk,128.232.64.154,3.255765,1,3,least_confidence,4d5cb_00002
6,269.58831,"[0.9295918367346938, 0.9295918367346938, 0.929...",1725051832,,False,1,4d5cb_00014,2024-08-30_22-03-52,3.185276,3.185276,27960,nemo.cl.cam.ac.uk,128.232.64.154,3.185276,1,5,marginal_confidence,4d5cb_00014
7,234.88559,"[0.8098039215686275, 0.8098039215686275, 0.809...",1725051818,,False,1,4d5cb_00005,2024-08-30_22-03-38,3.114383,3.114383,26948,nemo.cl.cam.ac.uk,128.232.64.154,3.114383,1,1,entropy,4d5cb_00005
8,254.335434,"[0.8769230769230769, 0.8769230769230769, 0.876...",1725051830,,False,1,4d5cb_00010,2024-08-30_22-03-50,3.26504,3.26504,27889,nemo.cl.cam.ac.uk,128.232.64.154,3.26504,1,1,marginal_confidence,4d5cb_00010
9,231.210348,"[0.7970297029702971, 0.7970297029702971, 0.797...",1725051819,,False,1,4d5cb_00006,2024-08-30_22-03-39,3.226365,3.226365,26949,nemo.cl.cam.ac.uk,128.232.64.154,3.226365,1,2,entropy,4d5cb_00006


In [None]:
# Let's use the multiple trials based on seed number and compare the iteration_metrics performances between strategies (along with confidence intervals)

# Keep only the trials that did not raise an error (the filter is on
# `result.error`, not on whether a trial is marked as done).
finished_results = [result for result in result_grid if not result.error]

# Number of trials that completed without an error
num_trials = len(finished_results)
