In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import (load_diabetes, load_breast_cancer, 
                            load_digits, load_iris, load_wine)
from sklearn.model_selection import train_test_split
from train import TrainingSchool

def run_experiment(X, y, dataset_name, is_sequential=False):
    """
    Run AutoML pipeline on a dataset and print results
    """
    print(f"\n=== {dataset_name} ===")
    print(f"Dataset shape: {X.shape}")
    print(f"Number of unique target values: {len(np.unique(y))}")
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Initialize and fit pipeline
    trainer = TrainingSchool(search_method="simulated_annealing", max_iter=30)
    trainer.fit(X, y, sequence_data=False)
    
    # Make predictions
    predictions = trainer.predict(X_test)
    
    # Print example predictions
    print("\nExample Predictions (first 5 samples):")
    print(f"Actual:    {y_test[:5]}")
    print(f"Predicted: {predictions[:5]}")
    print("-" * 50)
    
    return trainer

# Load regression datasets
print("Loading regression datasets...")

# Diabetes dataset
diabetes = load_diabetes()
print("\nRunning AutoML on Diabetes Dataset (Regression)")
diabetes_pipeline = run_experiment(
    diabetes.data, diabetes.target,
    "Diabetes Disease Progression Dataset"
)

# Load classification datasets
print("\nLoading classification datasets...")

# Breast Cancer dataset
cancer = load_breast_cancer()
print("\nRunning AutoML on Breast Cancer Dataset (Binary Classification)")
cancer_pipeline = run_experiment(
    cancer.data, cancer.target,
    "Breast Cancer Classification Dataset"
)

# Digits dataset
digits = load_digits()
print("\nRunning AutoML on Digits Dataset (Multiclass Classification)")
digits_pipeline = run_experiment(
    digits.data, digits.target,
    "Handwritten Digits Classification Dataset"
)

# Iris dataset
iris = load_iris()
print("\nRunning AutoML on Iris Dataset (Multiclass Classification)")
iris_pipeline = run_experiment(
    iris.data, iris.target,
    "Iris Flower Classification Dataset"
)

# Wine dataset
wine = load_wine()
print("\nRunning AutoML on Wine Dataset (Multiclass Classification)")
wine_pipeline = run_experiment(
    wine.data, wine.target,
    "Wine Classification Dataset"
)

# Print summary of best models for each dataset
print("\n=== Summary of Best Models ===")
datasets = {
    "Diabetes": diabetes_pipeline,
    "Breast Cancer": cancer_pipeline,
    "Digits": digits_pipeline,
    "Iris": iris_pipeline,
    "Wine": wine_pipeline
}

for dataset_name, pipeline in datasets.items():
    best_model_info = pipeline.get_best_model()
    print(f"\n{dataset_name}:")
    print(f"Best Model: {best_model_info['model_name']}")
    print(f"Best Scaler: {type(best_model_info['scaler']).__name__}")
    print(f"Best Score: {best_model_info['score']:.4f}")

2025-01-08 00:27:53.051462: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-08 00:27:53.105709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736276273.166481   49917 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736276273.186245   49917 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 00:27:53.255098: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading regression datasets...

Running AutoML on Diabetes Dataset (Regression)

=== Diabetes Disease Progression Dataset ===
Dataset shape: (442, 10)
Number of unique target values: 214
Detected task type: regression





Chosen best config => ('robust', 'ridge'), Score: -54.9831

Example Predictions (first 5 samples):
Actual:    [219.  70. 202. 230. 111.]
Predicted: [146.8214184  174.9516059  144.53950946 285.1652829  123.96032718]
--------------------------------------------------

Loading classification datasets...

Running AutoML on Breast Cancer Dataset (Binary Classification)

=== Breast Cancer Classification Dataset ===
Dataset shape: (569, 30)
Number of unique target values: 2
Detected task type: classification


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Chosen best config => ('minmax', 'svc'), Score: 0.9789

Example Predictions (first 5 samples):
Actual:    [1 0 0 1 1]
Predicted: [1 0 0 1 1]
--------------------------------------------------

Running AutoML on Digits Dataset (Multiclass Classification)

=== Handwritten Digits Classification Dataset ===
Dataset shape: (1797, 64)
Number of unique target values: 10
Detected task type: classification


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Chosen best config => ('standard', 'svc'), Score: 0.9594

Example Predictions (first 5 samples):
Actual:    [6 9 3 7 2]
Predicted: [6 9 3 7 2]
--------------------------------------------------

Running AutoML on Iris Dataset (Multiclass Classification)

=== Iris Flower Classification Dataset ===
Dataset shape: (150, 4)
Number of unique target values: 3
Detected task type: classification


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Chosen best config => ('minmax', 'svc'), Score: 0.9733

Example Predictions (first 5 samples):
Actual:    [1 0 2 1 1]
Predicted: [1 0 2 1 1]
--------------------------------------------------

Running AutoML on Wine Dataset (Multiclass Classification)

=== Wine Classification Dataset ===
Dataset shape: (178, 13)
Number of unique target values: 3
Detected task type: classification


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Chosen best config => ('standard', 'svc'), Score: 0.9831

Example Predictions (first 5 samples):
Actual:    [0 0 2 0 1]
Predicted: [0 0 2 0 1]
--------------------------------------------------

=== Summary of Best Models ===

Diabetes:
Best Model: ('robust', 'ridge')
Best Scaler: RobustScaler
Best Score: -54.9831

Breast Cancer:
Best Model: ('minmax', 'svc')
Best Scaler: MinMaxScaler
Best Score: 0.9789

Digits:
Best Model: ('standard', 'svc')
Best Scaler: StandardScaler
Best Score: 0.9594

Iris:
Best Model: ('minmax', 'svc')
Best Scaler: MinMaxScaler
Best Score: 0.9733

Wine:
Best Model: ('standard', 'svc')
Best Scaler: StandardScaler
Best Score: 0.9831
