# Run experiments with py_experimenter
### This involves specifying different active learning pipelines and evaluating them on different active learning problems. The results are then stored in a database. The exact experiments and parameters to run are specified in the file config/exp_config.yml and in the parameters passed to experimenter.fill_table_from_combination() below. In our case, the results are written to ALPBenchmark.db.
#### For further information we refer to the docs https://tornede.github.io/py_experimenter/

In [1]:
import numpy as np
from py_experimenter.exceptions import DatabaseConnectionError
from py_experimenter.experimenter import PyExperimenter, ResultProcessor
import types
from ALP.benchmark.BenchmarkConnector import DataFileBenchmarkConnector
from ALP.benchmark.BenchmarkSuite import TabZillaBenchmarkSuite
from ALP.evaluation.experimenter.DefaultSetup import ensure_default_setup
from ALP.evaluation.experimenter.LogTableObserver import LogTableObserver, SparseLogTableObserver
from ALP.pipeline.ActiveLearningPipeline import ActiveLearningPipeline
from ALP.pipeline.Oracle import Oracle
import sqlite3
import pandas as pd

### get ids of the tabzilla benchmark suite

In [2]:
# Instantiate the TabZilla benchmark suite and fetch its OpenML dataset ids;
# tabzilla_ids is sliced later when the experiment grid is filled.
tabzilla = TabZillaBenchmarkSuite()
tabzilla_ids = tabzilla.get_openml_dataset_ids()

### Set up the experiment runner, which loads the parameters from the grid and runs each active learning pipeline on every active learning problem

In [3]:
class ExperimentRunner:
    """Callback holder whose ``run_experiment`` is handed to ``PyExperimenter.execute``."""

    def __init__(self):
        pass

    def run_experiment(self, parameters: dict, result_processor: ResultProcessor, custom_config: dict):
        """Build and fit one active learning pipeline for a single experiment row.

        Args:
            parameters: Keyfield values of the experiment. The keys read here are
                ``openml_id``, ``setting_name``, ``test_split_seed``,
                ``train_split_seed``, ``seed``, ``query_strategy_name`` and
                ``learner_name``.
            result_processor: Passed on to the ``SparseLogTableObserver`` that is
                attached to the pipeline.
            custom_config: Not used by this method.

        Returns:
            None.
        """
        connector = DataFileBenchmarkConnector()

        # Read and convert every keyfield up front, before any connector lookup.
        openml_id = int(parameters["openml_id"])
        setting_name = parameters["setting_name"]
        test_split_seed, train_split_seed, seed = (
            int(parameters[key]) for key in ("test_split_seed", "train_split_seed", "seed")
        )

        # The setting supplies the scenario's setting id and the pipeline budget below.
        setting = connector.load_setting_by_name(setting_name)
        scenario = connector.load_or_create_scenario(
            openml_id=openml_id,
            test_split_seed=test_split_seed,
            train_split_seed=train_split_seed,
            seed=seed,
            setting_id=setting.get_setting_id(),
        )

        (
            X_labeled,
            y_labeled,
            X_unlabeled,
            y_unlabeled,
            X_test,
            y_test,
        ) = scenario.get_data_split()

        query_strategy = connector.load_query_strategy_by_name(parameters["query_strategy_name"])
        learner = connector.load_learner_by_name(parameters["learner_name"])

        observers = [SparseLogTableObserver(result_processor, X_test, y_test)]

        pipeline = ActiveLearningPipeline(
            learner=learner,
            query_strategy=query_strategy,
            observer_list=observers,
            num_iterations=setting.get_number_of_iterations(),
            num_queries_per_iteration=setting.get_number_of_queries(),
        )

        # The oracle is constructed from the unlabeled pool together with its labels.
        oracle = Oracle(X_unlabeled, y_unlabeled)
        pipeline.active_fit(X_labeled, y_labeled, X_unlabeled, oracle)

### Run the default setup and choose learning algorithms, query strategies and parameters (the chosen algorithms and their parameters are saved in alpbench/ in .json files)
#### To keep these experiments from running too long, we restrict ourselves to the first dataset id, 1 setting, 1 seed, 2 learning algorithms and 2 query strategies

In [4]:
def run(run_setup=False, reset_experiments=False):
    """Prepare the experiment table or execute the experiments stored in it.

    A ``PyExperimenter`` is always created from ``config/exp_config.yml``.

    Args:
        run_setup: If false, the experiments are executed through
            ``ExperimentRunner.run_experiment``. If true, the default setup is
            ensured and the experiment table is then either reset or filled.
        reset_experiments: Only consulted when ``run_setup`` is true. If true,
            ``reset_experiments("running", "failed")`` is called on the
            experimenter; otherwise the table is filled with the parameter grid
            defined below.
    """
    experimenter = PyExperimenter(experiment_configuration_file_path="config/exp_config.yml")

    # Execution mode: no setup, just run the experiments.
    if not run_setup:
        runner = ExperimentRunner()
        experimenter.execute(runner.run_experiment, -1)
        return

    connector = DataFileBenchmarkConnector()
    ensure_default_setup(dbbc=connector)
    connector.cleanup()

    fixed_combinations = [{"setting_name": "small"}]

    if reset_experiments:
        experimenter.reset_experiments("running", "failed")
        return

    # Further learner names left disabled here:
    # "xgb", "knn_3", "mlp", "tabpfn", "tabnet", "catboost"
    learner_names = ["rf_entropy", "svm_rbf"]

    # Further query strategy names left disabled here:
    # "core_set", "falcun", "least_confident", "entropy", "power_margin",
    # "bald", "power_bald", "max_entropy", "qbc_variance_ratio", "kmeans",
    # "cluster_margin", "typ_cluster", "weighted_cluster"
    query_strategy_names = ["margin", "random"]

    grid = {
        "learner_name": learner_names,
        "query_strategy_name": query_strategy_names,
        "test_split_seed": np.arange(1),
        "train_split_seed": np.arange(1),
        "seed": np.arange(1),
        "openml_id": tabzilla_ids[:1],
    }
    experimenter.fill_table_from_combination(
        parameters=grid,
        fixed_parameter_combinations=fixed_combinations,
    )

In [5]:
# Setup phase: ensure the default setup and fill the experiment table (nothing is executed yet).
run(run_setup=True, reset_experiments=False)

2024-06-12 16:40:10,114  | py-experimenter - INFO     | Found 7 keyfields
2024-06-12 16:40:10,121  | py-experimenter - INFO     | Found 2 logtables
2024-06-12 16:40:10,123  | py-experimenter - INFO     | Found logtable results__accuracy_log
2024-06-12 16:40:10,125  | py-experimenter - INFO     | Found logtable results__labeling_log
2024-06-12 16:40:10,129  | py-experimenter - INFO     | Initialized and connected to database
2024-06-12 16:40:10,224  | py-experimenter - INFO     | 4 rows successfully added to database. 0 rows were skipped.


## display (empty) tables

In [6]:
# Specify the path to your .db file
# Dump every table of the experiment database as a DataFrame.
db_path = "ALPBenchmark.db"

# Connect to the database
conn = sqlite3.connect(db_path)

try:
    # Create a cursor object to interact with the database
    cursor = conn.cursor()

    # Get the list of tables in the database
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Display the contents of each table (each fetched row is a 1-tuple holding the name)
    for (table_name,) in tables:
        print(f"Contents of table {table_name}:")
        # Quote the identifier so that table names containing special
        # characters or keywords cannot break the statement.
        quoted_name = table_name.replace('"', '""')
        query = f'SELECT * FROM "{quoted_name}"'
        df = pd.read_sql_query(query, conn)
        display(df)
        print("\n")
finally:
    # Close the connection even if a query or the display call fails
    conn.close()

Contents of table results:


Unnamed: 0,ID,setting_name,openml_id,learner_name,query_strategy_name,test_split_seed,train_split_seed,seed,creation_date,status,start_date,name,machine,end_date,error
0,1,small,11,rf_entropy,margin,0,0,0,2024-06-12 16:40:10,created,,,,,
1,2,small,11,svm_rbf,margin,0,0,0,2024-06-12 16:40:10,created,,,,,
2,3,small,11,rf_entropy,random,0,0,0,2024-06-12 16:40:10,created,,,,,
3,4,small,11,svm_rbf,random,0,0,0,2024-06-12 16:40:10,created,,,,,




Contents of table sqlite_sequence:


Unnamed: 0,name,seq
0,results,4




Contents of table results__accuracy_log:


Unnamed: 0,ID,experiment_id,timestamp,model_dict




Contents of table results__labeling_log:


Unnamed: 0,ID,experiment_id,timestamp,data_dict




Contents of table results_codecarbon:


Unnamed: 0,ID,experiment_id,codecarbon_timestamp,project_name,run_id,duration_seconds,emissions_kg,emissions_rate_kg_sec,cpu_power_watt,gpu_power_watt,...,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,power_usage_efficiency,offline_mode






In [7]:
# Execution phase: with run_setup=False, run() executes the experiments via ExperimentRunner.
run(run_setup=False, reset_experiments=False)

2024-06-12 16:40:12,349  | py-experimenter - INFO     | Found 7 keyfields
2024-06-12 16:40:12,355  | py-experimenter - INFO     | Found 2 logtables
2024-06-12 16:40:12,357  | py-experimenter - INFO     | Found logtable results__accuracy_log
2024-06-12 16:40:12,362  | py-experimenter - INFO     | Found logtable results__labeling_log
2024-06-12 16:40:12,373  | py-experimenter - INFO     | Initialized and connected to database
[codecarbon INFO @ 16:40:12] [setup] RAM Tracking...
[codecarbon INFO @ 16:40:12] [setup] GPU Tracking...
[codecarbon INFO @ 16:40:13] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:40:13] [setup] CPU Tracking...
[codecarbon INFO @ 16:40:15] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700H
[codecarbon INFO @ 16:40:15] >>> Tracker's metadata:
[codecarbon INFO @ 16:40:15]   Platform system: Linux-6.2.0-34-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 16:40:15]   Python version: 3.10.14
[codecarbon INFO @ 16:40:15]   CodeCarbon vers

[codecarbon INFO @ 16:40:31] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700H
[codecarbon INFO @ 16:40:31] >>> Tracker's metadata:
[codecarbon INFO @ 16:40:31]   Platform system: Linux-6.2.0-34-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 16:40:31]   Python version: 3.10.14
[codecarbon INFO @ 16:40:31]   CodeCarbon version: 2.3.1
[codecarbon INFO @ 16:40:31]   Available RAM : 31.014 GB
[codecarbon INFO @ 16:40:31]   CPU count: 20
[codecarbon INFO @ 16:40:31]   CPU model: 12th Gen Intel(R) Core(TM) i7-12700H
[codecarbon INFO @ 16:40:31]   GPU count: 1
[codecarbon INFO @ 16:40:31]   GPU model: 1 x NVIDIA GeForce RTX 3050 Ti Laptop GPU
2024-06-12 16:40:34,540  | py-experimenter - ERROR    | Traceback (most recent call last):
  File "/home/valentin/mambaforge/envs/alp_saltbench/lib/python3.10/site-packages/py_experimenter/experimenter.py", line 372, in _execute_experiment
    final_status = experiment_function(keyfield_values, result_processor, self.config.c

### display (filled) table

In [8]:
# Specify the path to your .db file
# Dump every table of the experiment database as a DataFrame.
db_path = "ALPBenchmark.db"

# Connect to the database
conn = sqlite3.connect(db_path)

try:
    # Create a cursor object to interact with the database
    cursor = conn.cursor()

    # Get the list of tables in the database
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Display the contents of each table (each fetched row is a 1-tuple holding the name)
    for (table_name,) in tables:
        print(f"Contents of table {table_name}:")
        # Quote the identifier so that table names containing special
        # characters or keywords cannot break the statement.
        quoted_name = table_name.replace('"', '""')
        query = f'SELECT * FROM "{quoted_name}"'
        df = pd.read_sql_query(query, conn)
        display(df)
        print("\n")
finally:
    # Close the connection even if a query or the display call fails
    conn.close()

Contents of table results:


Unnamed: 0,ID,setting_name,openml_id,learner_name,sampling_strategy_name,test_split_seed,train_split_seed,seed,creation_date,status,start_date,name,machine,end_date,error
0,1,small,11,rf_entropy,margin,0,0,0,2024-06-12 16:07:41,done,2024-06-12 16:07:46,PyExperimenter,valentin-XPS-15-9520,2024-06-12 16:07:56,
1,2,small,11,svm_rbf,margin,0,0,0,2024-06-12 16:07:41,done,2024-06-12 16:08:12,PyExperimenter,valentin-XPS-15-9520,2024-06-12 16:08:17,
2,3,small,11,rf_entropy,random,0,0,0,2024-06-12 16:07:41,done,2024-06-12 16:08:02,PyExperimenter,valentin-XPS-15-9520,2024-06-12 16:08:12,
3,4,small,11,svm_rbf,random,0,0,0,2024-06-12 16:07:41,done,2024-06-12 16:07:56,PyExperimenter,valentin-XPS-15-9520,2024-06-12 16:08:02,




Contents of table sqlite_sequence:


Unnamed: 0,name,seq
0,results,4
1,results__labeling_log,4
2,results__accuracy_log,4
3,results_codecarbon,4




Contents of table results__accuracy_log:


Unnamed: 0,ID,experiment_id,timestamp,model_dict
0,1,1,2024-06-12 16:07:56,"{""0"": {""iteration"": 0, ""test_f1"": 0.71644904, ..."
1,2,4,2024-06-12 16:08:02,"{""0"": {""iteration"": 0, ""test_f1"": 0.78834966, ..."
2,3,3,2024-06-12 16:08:12,"{""0"": {""iteration"": 0, ""test_f1"": 0.72995095, ..."
3,4,2,2024-06-12 16:08:17,"{""0"": {""iteration"": 0, ""test_f1"": 0.78834966, ..."




Contents of table results__labeling_log:


Unnamed: 0,ID,experiment_id,timestamp,data_dict
0,1,1,2024-06-12 16:07:56,"{""0"": {""iteration"": 0, ""len_X_sel"": 30, ""len_X..."
1,2,4,2024-06-12 16:08:02,"{""0"": {""iteration"": 0, ""len_X_sel"": 30, ""len_X..."
2,3,3,2024-06-12 16:08:11,"{""0"": {""iteration"": 0, ""len_X_sel"": 30, ""len_X..."
3,4,2,2024-06-12 16:08:17,"{""0"": {""iteration"": 0, ""len_X_sel"": 30, ""len_X..."




Contents of table results_codecarbon:


Unnamed: 0,ID,experiment_id,codecarbon_timestamp,project_name,run_id,duration_seconds,emissions_kg,emissions_rate_kg_sec,cpu_power_watt,gpu_power_watt,...,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud,power_usage_efficiency,offline_mode
0,1,1,2024-06-12T16:07:56,codecarbon,5c25055d-f500-49e1-892c-3ce78c3512b4,4.664417,3e-05,6e-06,42.5,9.981038,...,12th Gen Intel(R) Core(TM) i7-12700H,1.0,1 x NVIDIA GeForce RTX 3050 Ti Laptop GPU,11.5683,48.1663,31.013721,machine,N,1.0,0
1,2,4,2024-06-12T16:08:02,codecarbon,1937069e-b66f-4a5c-9e9c-a42699a0fda1,0.752235,7e-06,9e-06,42.5,40.89936,...,12th Gen Intel(R) Core(TM) i7-12700H,1.0,1 x NVIDIA GeForce RTX 3050 Ti Laptop GPU,11.5683,48.1663,31.013721,machine,N,1.0,0
2,3,3,2024-06-12T16:08:12,codecarbon,92ec1060-cc9a-490e-b5d9-f1ce6c71f746,4.493434,2.8e-05,6e-06,42.5,7.097738,...,12th Gen Intel(R) Core(TM) i7-12700H,1.0,1 x NVIDIA GeForce RTX 3050 Ti Laptop GPU,11.5683,48.1663,31.013721,machine,N,1.0,0
3,4,2,2024-06-12T16:08:17,codecarbon,8df713e8-9aef-4307-830a-a880034938e6,0.639189,8e-06,1.2e-05,42.5,65.162139,...,12th Gen Intel(R) Core(TM) i7-12700H,1.0,1 x NVIDIA GeForce RTX 3050 Ti Laptop GPU,11.5683,48.1663,31.013721,machine,N,1.0,0




