In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import (load_diabetes, load_breast_cancer, 
                            load_digits, load_iris, load_wine)
from sklearn.model_selection import train_test_split
from trainingSchool import TrainingSchool

def run_experiment(X, y, dataset_name, is_sequential=False,
                   test_size=0.2, random_state=42):
    """
    Run the AutoML pipeline on one dataset and print a short report.

    The data is split into train and test partitions, a new
    ``TrainingSchool`` is fitted on the training partition, and the first
    five test-set predictions are printed next to the true targets.

    Parameters
    ----------
    X : array-like
        Feature matrix. Must expose ``.shape`` (it is printed in the header).
    y : array-like
        Target values, one per row of ``X``.
    dataset_name : str
        Human-readable label printed in the report header.
    is_sequential : bool, default False
        Forwarded to ``TrainingSchool.fit`` as ``sequence_data``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; size of the held-out partition.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    TrainingSchool
        The pipeline object after ``fit`` has been called on the training
        partition.
    """
    print(f"\n=== {dataset_name} ===")
    print(f"Dataset shape: {X.shape}")
    print(f"Number of unique target values: {len(np.unique(y))}")

    # Hold out part of the data; the defaults reproduce the original
    # hard-coded 80/20 split with seed 42.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize and fit pipeline on the training partition only
    pipeline = TrainingSchool()
    pipeline.fit(X_train, y_train, sequence_data=is_sequential)

    # Predict on the held-out partition
    predictions = pipeline.predict(X_test)

    # Show a handful of predictions beside the ground truth as a sanity check
    print("\nExample Predictions (first 5 samples):")
    print(f"Actual:    {y_test[:5]}")
    print(f"Predicted: {predictions[:5]}")
    print("-" * 50)

    return pipeline

# ---- Regression benchmark ----
print("Loading regression datasets...")

diabetes = load_diabetes()
print("\nRunning AutoML on Diabetes Dataset (Regression)")
diabetes_pipeline = run_experiment(
    diabetes.data,
    diabetes.target,
    "Diabetes Disease Progression Dataset",
)

# ---- Classification benchmarks ----
print("\nLoading classification datasets...")

# One row per classification run:
# (summary key, loader function, banner printed before the run, report title)
classification_suite = [
    (
        "Breast Cancer",
        load_breast_cancer,
        "\nRunning AutoML on Breast Cancer Dataset (Binary Classification)",
        "Breast Cancer Classification Dataset",
    ),
    (
        "Digits",
        load_digits,
        "\nRunning AutoML on Digits Dataset (Multiclass Classification)",
        "Handwritten Digits Classification Dataset",
    ),
    (
        "Iris",
        load_iris,
        "\nRunning AutoML on Iris Dataset (Multiclass Classification)",
        "Iris Flower Classification Dataset",
    ),
    (
        "Wine",
        load_wine,
        "\nRunning AutoML on Wine Dataset (Multiclass Classification)",
        "Wine Classification Dataset",
    ),
]

# Fitted pipelines keyed by short dataset name; insertion order is the
# order used by the summary below.
datasets = {"Diabetes": diabetes_pipeline}
loaded_bunches = {}
for suite_key, loader, banner, report_title in classification_suite:
    # Load, announce, then run -- same order as doing each dataset by hand
    loaded_bunches[suite_key] = loader()
    print(banner)
    datasets[suite_key] = run_experiment(
        loaded_bunches[suite_key].data,
        loaded_bunches[suite_key].target,
        report_title,
    )

# Keep the individual notebook-level names available for later cells
cancer = loaded_bunches["Breast Cancer"]
digits = loaded_bunches["Digits"]
iris = loaded_bunches["Iris"]
wine = loaded_bunches["Wine"]
cancer_pipeline = datasets["Breast Cancer"]
digits_pipeline = datasets["Digits"]
iris_pipeline = datasets["Iris"]
wine_pipeline = datasets["Wine"]

# ---- Summary of the winning model per dataset ----
print("\n=== Summary of Best Models ===")
for dataset_name, pipeline in datasets.items():
    best_model_info = pipeline.get_best_model()
    summary_lines = (
        f"\n{dataset_name}:",
        f"Best Model: {best_model_info['model_name']}",
        f"Best Scaler: {type(best_model_info['scaler']).__name__}",
        f"Best Score: {best_model_info['score']:.4f}",
    )
    print("\n".join(summary_lines))

2025-01-08 00:17:33.971758: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-08 00:17:34.005335: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736275654.041370   44924 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736275654.052716   44924 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 00:17:34.090921: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Loading regression datasets...

Running AutoML on Diabetes Dataset (Regression)

=== Diabetes Disease Progression Dataset ===
Dataset shape: (442, 10)
Number of unique target values: 214
Detected task type: regression
Score for linear with standard scaler: 55.9721
Score for svr with standard scaler: 72.5375

Best model: svr
Best scaler: StandardScaler
Best score: 72.5375

Example Predictions (first 5 samples):
Actual:    [219.  70. 202. 230. 111.]
Predicted: [142.35294557 142.76997294 141.79521687 152.15746672 136.80688143]
--------------------------------------------------

Loading classification datasets...

Running AutoML on Breast Cancer Dataset (Binary Classification)

=== Breast Cancer Classification Dataset ===
Dataset shape: (569, 30)
Number of unique target values: 2
Detected task type: classification
Score for logistic with standard scaler: 0.9736
Score for rf with standard scaler: 0.9560
Score for svc with standard scaler: 0.9758
Score for xgb with standard scaler: 0.9626
Sc

# Custom Data

In [3]:
import pandas as pd
import numpy as np

# Build a small reproducible toy dataset for binary classification
np.random.seed(42)  # fixed seed so the generated CSV is the same on every run
num_samples = 200
X_synthetic = np.random.rand(num_samples, 5)  # uniform [0, 1) values, 5 features

# Label is 1 when the row's feature sum exceeds 2.5, otherwise 0
row_totals = X_synthetic.sum(axis=1)
y_synthetic = (row_totals > 2.5).astype(int)

# Name the columns feature_0 .. feature_4 and append the label as "target"
feature_names = [f"feature_{i}" for i in range(X_synthetic.shape[1])]
df = pd.DataFrame(X_synthetic, columns=feature_names).assign(target=y_synthetic)

# Persist without the index so the file holds only features + target
df.to_csv("example_data.csv", index=False)
print("example_data.csv created!")


example_data.csv created!


In [4]:
import pandas as pd
from trainingSchoolV2 import TrainingSchool

# Read the example CSV back in and report its dimensions
df_loaded = pd.read_csv("example_data.csv")
print(f"Loaded dataset shape: {df_loaded.shape}")

# "target" is the label column; every other column is a feature
y = df_loaded["target"].values
X = df_loaded.drop(columns="target").values

# Build a TrainingSchool with no constructor arguments and fit it
trainer = TrainingSchool()
trainer.fit(X, y, sequence_data=False)  # pass True instead for sequential data

# Show whatever the trainer reports as its best model
best_model_info = trainer.get_best_model()
print("\nBest Model Info:", best_model_info, sep="\n")


Loaded dataset shape: (200, 6)
Detected task type: classification
Score for logistic with standard scaler: 0.9800
Score for rf with standard scaler: 0.8650
Score for svc with standard scaler: 0.9250
Score for xgb with standard scaler: 0.8700
Score for mlp with standard scaler: 0.9700

Best model: logistic
Best scaler: StandardScaler
Best score: 0.9800

Best Model Info:
{'model': LogisticRegression(), 'scaler': StandardScaler(), 'score': 0.9800000000000001, 'model_name': 'logistic'}


In [3]:
import pandas as pd
from trainingSchool import TrainingSchool

# Read the example CSV back in and report its dimensions
df_loaded = pd.read_csv("example_data.csv")
print(f"Loaded dataset shape: {df_loaded.shape}")

# Separate the label column ("target") from the feature columns
feature_frame = df_loaded.drop(columns="target")
X = feature_frame.values
y = df_loaded["target"].values

# 'config.yaml' is passed as the only constructor argument
# NOTE(review): presumably a path to a configuration file -- confirm against TrainingSchool
school = TrainingSchool('config.yaml')

# Fit on the full loaded dataset
school.fit(X, y)

# Show whatever the school reports as its best model
best_model_info = school.get_best_model()
print("\nBest Model Info:")
print(best_model_info)

Loaded dataset shape: (200, 6)
Detected task type: classification
Score for logistic with standard scaler: 0.9800
Score for rf with standard scaler: 0.8800


2025-01-09 19:50:12.811692: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Score for LSTM with standard scaler: 0.8800
Score for Transformer with standard scaler: 0.5250

Best model: logistic
Best scaler: StandardScaler
Best score: 0.9800

Best Model Info:
{'model': LogisticRegression(max_iter=1000), 'scaler': StandardScaler(), 'score': 0.9800000000000001, 'model_name': 'logistic'}
