# Run experiments with py_experimenter
### This involves specifying different active learning pipelines and evaluating them on different active learning problems. The results are then stored in a database. The exact experiments and parameters to be run are specified in the file config/exp_config.yml, together with the parameters passed to experimenter.fill_table_from_combination() below. In our case, the results are written to ALPBenchmark.db
#### For further information we refer to the docs https://tornede.github.io/py_experimenter/

In [1]:
import numpy as np
from py_experimenter.exceptions import DatabaseConnectionError
from py_experimenter.experimenter import PyExperimenter, ResultProcessor
import types
from ALP.benchmark.BenchmarkConnector import DataFileBenchmarkConnector
from ALP.benchmark.BenchmarkSuite import TabZillaBenchmarkSuite
from ALP.evaluation.experimenter.DefaultSetup import ensure_default_setup
from ALP.evaluation.experimenter.LogTableObserver import LogTableObserver, SparseLogTableObserver
from ALP.pipeline.ActiveLearningPipeline import ActiveLearningPipeline
from ALP.pipeline.Oracle import Oracle
import sqlite3
import pandas as pd

### get ids of the tabzilla benchmark suite

In [2]:
# Instantiate the TabZilla benchmark suite and fetch its dataset ids.
# run() below slices `tabzilla_ids` to choose the `openml_id` values that are
# filled into the experiment table.
tabzilla = TabZillaBenchmarkSuite()
tabzilla_ids = tabzilla.get_openml_dataset_ids()

### set up an experiment runner that loads parameters from the grid and runs each active learning pipeline on every active learning problem

In [3]:
class ExperimentRunner:
    """Experiment function holder: runs one active learning pipeline per table row."""

    def __init__(self):
        """Nothing to initialise; the runner keeps no state between experiments."""

    def run_experiment(self, parameters: dict, result_processor: ResultProcessor, custom_config: dict):
        """Build and fit a single active learning pipeline.

        Args:
            parameters: One parameter combination. The keys read here are
                ``openml_id``, ``setting_name``, ``test_split_seed``,
                ``train_split_seed``, ``seed``, ``query_strategy_name`` and
                ``learner_name``.
            result_processor: Handed to the ``SparseLogTableObserver`` that is
                attached to the pipeline.
            custom_config: Accepted as part of the signature but not used here.
        """
        connector: DataFileBenchmarkConnector = DataFileBenchmarkConnector()

        # Numeric parameters are cast to int before use.
        openml_id = int(parameters["openml_id"])
        setting_name = parameters["setting_name"]
        test_split_seed = int(parameters["test_split_seed"])
        train_split_seed = int(parameters["train_split_seed"])
        seed = int(parameters["seed"])

        # Resolve the setting first; its id is needed to look up the scenario.
        setting = connector.load_setting_by_name(setting_name)
        scenario_kwargs = {
            "openml_id": openml_id,
            "test_split_seed": test_split_seed,
            "train_split_seed": train_split_seed,
            "seed": seed,
            "setting_id": setting.get_setting_id(),
        }
        scenario = connector.load_or_create_scenario(**scenario_kwargs)

        # presumably labeled pool, unlabeled pool and held-out test data
        # (judging by the names) — confirm against scenario.get_data_split()
        X_l, y_l, X_u, y_u, X_test, y_test = scenario.get_data_split()

        query_strategy = connector.load_query_strategy_by_name(parameters["query_strategy_name"])
        learner = connector.load_learner_by_name(parameters["learner_name"])

        pipeline = ActiveLearningPipeline(
            learner=learner,
            query_strategy=query_strategy,
            observer_list=[SparseLogTableObserver(result_processor, X_test, y_test)],
            num_iterations=setting.get_number_of_iterations(),
            num_queries_per_iteration=setting.get_number_of_queries(),
        )

        # The oracle is built from the unlabeled data and its labels.
        pipeline.active_fit(X_l, y_l, X_u, Oracle(X_u, y_u))

### run the default setup, then choose learning algorithms, query strategies and parameters (the chosen algorithms and their parameters are saved in alpbench/ in .json files)
#### to keep these experiments from running for too long, we restrict ourselves to the first dataset id, 1 setting, 1 seed, 2 learning algorithms and 2 query strategies

In [4]:
def run(run_setup=False, reset_experiments=False):
    """Set up, reset, or execute the experiments configured in config/exp_config.yml.

    Args:
        run_setup: When False, the open experiments are executed with
            ``ExperimentRunner.run_experiment``. When True, the default setup
            is ensured first and the experiment table is then either reset or
            filled, depending on ``reset_experiments``.
        reset_experiments: Only consulted when ``run_setup`` is True. If True,
            experiments in state "running" or "failed" are reset; otherwise
            the parameter grid below is filled into the table.
    """
    experimenter = PyExperimenter(experiment_configuration_file_path="config/exp_config.yml")

    if not run_setup:
        # Execution pass. NOTE(review): -1 is presumably "no limit on the
        # number of experiments" — confirm in the py_experimenter docs.
        runner = ExperimentRunner()
        experimenter.execute(runner.run_experiment, -1)
        return

    # Setup pass: ensure the default setup, then call cleanup() on the connector.
    benchmark_connector = DataFileBenchmarkConnector()
    ensure_default_setup(dbbc=benchmark_connector)
    benchmark_connector.cleanup()

    if reset_experiments:
        experimenter.reset_experiments("running", "failed")
        return

    # Deliberately small grid. Names that were left commented out next to the
    # active ones:
    #   learners: "xgb", "knn_3", "mlp", "tabpfn", "tabnet", "catboost"
    #   query strategies: "core_set", "falcun", "least_confident", "entropy",
    #       "power_margin", "bald", "power_bald", "max_entropy",
    #       "qbc_variance_ratio", "kmeans", "cluster_margin", "typ_cluster",
    #       "weighted_cluster"
    grid = {
        "learner_name": ["rf_entropy", "svm_rbf"],
        "query_strategy_name": ["margin", "random"],
        "test_split_seed": np.arange(1),
        "train_split_seed": np.arange(1),
        "seed": np.arange(1),
        "openml_id": tabzilla_ids[:1],
    }
    experimenter.fill_table_from_combination(
        parameters=grid,
        fixed_parameter_combinations=[{"setting_name": "small"}],
    )

In [5]:
# Setup pass: with run_setup=True and reset_experiments=False, run() ensures
# the default setup and fills the experiment table; nothing is executed yet.
run(run_setup=True, reset_experiments=False)

2024-06-12 16:51:25,058  | py-experimenter - INFO     | Found 7 keyfields
2024-06-12 16:51:25,061  | py-experimenter - INFO     | Found 2 logtables
2024-06-12 16:51:25,062  | py-experimenter - INFO     | Found logtable results__accuracy_log
2024-06-12 16:51:25,063  | py-experimenter - INFO     | Found logtable results__labeling_log
2024-06-12 16:51:25,068  | py-experimenter - INFO     | Initialized and connected to database
2024-06-12 16:51:25,163  | py-experimenter - INFO     | 4 rows successfully added to database. 0 rows were skipped.


## display (empty) tables

In [6]:
# Path to the SQLite .db file whose tables are shown below
db_path = "ALPBenchmark.db"

# Connect to the database. The try/finally makes sure the connection is
# closed even if a query or the rendering of a table raises.
conn = sqlite3.connect(db_path)
try:
    # Cursor used to list the tables
    cursor = conn.cursor()

    # Get the list of tables in the database
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Display the contents of each table; every fetched row is a 1-tuple
    for (table_name,) in tables:
        print(f"Contents of table {table_name}:")
        # Quote the identifier (doubling embedded double quotes) so table
        # names containing special characters or keywords remain valid SQL
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        query = f"SELECT * FROM {quoted_name}"
        df = pd.read_sql_query(query, conn)
        display(df)
        print("\n")
finally:
    # Close the connection
    conn.close()

Contents of table results:
   ID setting_name  openml_id learner_name query_strategy_name  \
0   1        small         11   rf_entropy              margin   
1   2        small         11      svm_rbf              margin   
2   3        small         11   rf_entropy              random   
3   4        small         11      svm_rbf              random   

   test_split_seed  train_split_seed  seed        creation_date   status  \
0                0                 0     0  2024-06-12 16:51:25  created   
1                0                 0     0  2024-06-12 16:51:25  created   
2                0                 0     0  2024-06-12 16:51:25  created   
3                0                 0     0  2024-06-12 16:51:25  created   

  start_date  name machine end_date error  
0       None  None    None     None  None  
1       None  None    None     None  None  
2       None  None    None     None  None  
3       None  None    None     None  None  


Contents of table sqlite_sequence:
    

In [7]:
# Execution pass: with run_setup=False, run() passes
# ExperimentRunner.run_experiment to experimenter.execute.
run(run_setup=False, reset_experiments=False)

2024-06-12 16:51:25,279  | py-experimenter - INFO     | Found 7 keyfields
2024-06-12 16:51:25,285  | py-experimenter - INFO     | Found 2 logtables
2024-06-12 16:51:25,288  | py-experimenter - INFO     | Found logtable results__accuracy_log
2024-06-12 16:51:25,291  | py-experimenter - INFO     | Found logtable results__labeling_log
2024-06-12 16:51:25,299  | py-experimenter - INFO     | Initialized and connected to database
[codecarbon INFO @ 16:51:25] [setup] RAM Tracking...
[codecarbon INFO @ 16:51:25] [setup] GPU Tracking...
[codecarbon INFO @ 16:51:25] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:51:25] [setup] CPU Tracking...
[codecarbon INFO @ 16:51:27] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700H
[codecarbon INFO @ 16:51:27] >>> Tracker's metadata:
[codecarbon INFO @ 16:51:27]   Platform system: Linux-6.2.0-34-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 16:51:27]   Python version: 3.10.14
[codecarbon INFO @ 16:51:27]   CodeCarbon vers

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[codecarbon INFO @ 16:51:31] Energy consumed for RAM : 0.000002 kWh. RAM Power : 11.63014554977417 W
[codecarbon INFO @ 16:51:31] Energy consumed for all GPUs : 0.000002 kWh. Total GPU Power : 12.654900355347309 W
[codecarbon INFO @ 16:51:31] Energy consumed for all CPUs : 0.000008 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:51:31] 0.000012 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:51:31] [setup] RAM Tracking...
[codecarbon INFO @ 16:51:31] [setup] GPU Tracking...
[codecarbon INFO @ 16:51:31] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:51:31] [setup] CPU Tracking...
[codecarbon INFO @ 16:51:33] CPU Model on constant consumption mode: 12th Gen Intel(R) Core(TM) i7-12700H
[codecarbon INFO @ 16:51:33] >>> Tracker's metadata:
[codeca

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[codecarbon INFO @ 16:51:58] Energy consumed for RAM : 0.000002 kWh. RAM Power : 11.63014554977417 W
[codecarbon INFO @ 16:51:58] Energy consumed for all GPUs : 0.000021 kWh. Total GPU Power : 109.66879598410394 W
[codecarbon INFO @ 16:51:58] Energy consumed for all CPUs : 0.000008 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:51:58] 0.000032 kWh of electricity used since the beginning.
2024-06-12 16:51:58,351  | py-experimenter - INFO     | All configured executions finished.


### display (filled) table

In [8]:
# Path to the SQLite .db file whose tables are shown below
db_path = "ALPBenchmark.db"

# Connect to the database. The try/finally makes sure the connection is
# closed even if a query or the rendering of a table raises.
conn = sqlite3.connect(db_path)
try:
    # Cursor used to list the tables
    cursor = conn.cursor()

    # Get the list of tables in the database
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()

    # Display the contents of each table; every fetched row is a 1-tuple
    for (table_name,) in tables:
        print(f"Contents of table {table_name}:")
        # Quote the identifier (doubling embedded double quotes) so table
        # names containing special characters or keywords remain valid SQL
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        query = f"SELECT * FROM {quoted_name}"
        df = pd.read_sql_query(query, conn)
        display(df)
        print("\n")
finally:
    # Close the connection
    conn.close()

Contents of table results:
   ID setting_name  openml_id learner_name query_strategy_name  \
0   1        small         11   rf_entropy              margin   
1   2        small         11      svm_rbf              margin   
2   3        small         11   rf_entropy              random   
3   4        small         11      svm_rbf              random   

   test_split_seed  train_split_seed  seed        creation_date status  \
0                0                 0     0  2024-06-12 16:51:25   done   
1                0                 0     0  2024-06-12 16:51:25   done   
2                0                 0     0  2024-06-12 16:51:25   done   
3                0                 0     0  2024-06-12 16:51:25   done   

            start_date            name               machine  \
0  2024-06-12 16:51:41  PyExperimenter  valentin-XPS-15-9520   
1  2024-06-12 16:51:52  PyExperimenter  valentin-XPS-15-9520   
2  2024-06-12 16:51:31  PyExperimenter  valentin-XPS-15-9520   
3  2024-06-12 1