In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Clone the repository
!git clone https://github.com/hyperopt/hyperopt-sklearn.git

# Change to the directory
%cd hyperopt-sklearn

# Install the package in editable mode
!pip install -e .


Cloning into 'hyperopt-sklearn'...
remote: Enumerating objects: 3023, done.[K
remote: Counting objects: 100% (312/312), done.[K
remote: Compressing objects: 100% (114/114), done.[K
remote: Total 3023 (delta 212), reused 276 (delta 196), pack-reused 2711 (from 1)[K
Receiving objects: 100% (3023/3023), 2.31 MiB | 13.65 MiB/s, done.
Resolving deltas: 100% (1914/1914), done.
/kaggle/working/hyperopt-sklearn
Obtaining file:///kaggle/working/hyperopt-sklearn
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-learn>=1.3.0 (from hpsklearn==1.0.3)
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━

In [4]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting minio (from openml)
  Downloading minio-7.2.15-py3-none-any.whl.metadata (6.7 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading minio-7.2.15-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename

In [8]:
import openml
import time
import numpy as np
import pandas as pd
from hpsklearn import HyperoptEstimator, any_regressor
from hyperopt import tpe
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ✅ Define datasets to search (OpenML IDs)
dataset_ids = [ 8, 195,531,204]  # California Housing, Dataset 8, Dataset 531
results = []

# ✅ Iterate over datasets
for dataset_id in dataset_ids:
    print(f"\n🔍 Loading dataset {dataset_id} from OpenML...")
    dataset = openml.datasets.get_dataset(dataset_id)
    
    # Get data from OpenML dataset
    X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
    
    # ✅ Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # ✅ Define Hyperopt-Sklearn Estimator
    hyperopt_automl = HyperoptEstimator(
        regressor=any_regressor('reg'),  # Search all regressors
        algo=tpe.suggest,                # Use TPE (Tree-structured Parzen Estimator)
        max_evals=50,                     # Increase trials for better performance
        trial_timeout=300                  # Limit time per model evaluation
    )
    
    # ✅ Train the model
    print(f"🚀 Training Hyperopt-Sklearn on dataset {dataset_id}...")
    start_time = time.time()
    hyperopt_automl.fit(X_train.to_numpy(), y_train.to_numpy())  # Convert to NumPy arrays
    fit_time = time.time() - start_time
    
    # ✅ Make predictions
    y_pred = hyperopt_automl.predict(X_test.to_numpy())
    
    # ✅ Evaluate performance
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # ✅ Store results
    result = {
        "Dataset ID": dataset_id,
        "Hyperopt R² Score": r2,
        "Hyperopt MSE": mse,
        "Hyperopt Time": fit_time,
        "Hyperopt Best Models": hyperopt_automl.best_model()
    }
    results.append(result)
    
    print(f"✅ Completed dataset {dataset_id} -> MSE: {mse:.4f}, R²: {r2:.4f}")

# ✅ Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("hyperopt_results.csv", index=False)

print("\n📁 Results saved to hyperopt_results.csv!")



🔍 Loading dataset 8 from OpenML...
🚀 Training Hyperopt-Sklearn on dataset 8...
100%|██████████| 1/1 [00:00<00:00,  4.89trial/s, best loss: 0.8774762464269104]
100%|██████████| 2/2 [00:00<00:00, 10.28trial/s, best loss: 0.8774762464269104]
100%|██████████| 3/3 [05:00<00:00, 300.18s/trial, best loss: 0.8774762464269104]
100%|██████████| 4/4 [00:00<00:00, 10.46trial/s, best loss: 0.7913505582242394]
100%|██████████| 5/5 [00:00<00:00, 10.33trial/s, best loss: 0.7913505582242394]
100%|██████████| 6/6 [00:00<00:00,  2.89trial/s, best loss: 0.7913505582242394]
100%|██████████| 7/7 [00:00<00:00,  9.56trial/s, best loss: 0.7913505582242394]
100%|██████████| 8/8 [00:00<00:00,  1.56trial/s, best loss: 0.7913505582242394]
100%|██████████| 9/9 [00:00<00:00,  7.37trial/s, best loss: 0.7913505582242394]
100%|██████████| 10/10 [00:00<00:00,  8.30trial/s, best loss: 0.7913505582242394]
100%|██████████| 11/11 [00:00<00:00,  2.32trial/s, best loss: 0.7913505582242394]
100%|██████████| 12/12 [00:00<00:00

  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)



100%|██████████| 50/50 [00:01<00:00,  1.85s/trial, best loss: 0.6976104874059909]
✅ Completed dataset 8 -> MSE: 9.4551, R²: 0.1118

🔍 Loading dataset 195 from OpenML...
🚀 Training Hyperopt-Sklearn on dataset 195...
100%|██████████| 1/1 [00:00<00:00,  9.95trial/s, best loss: 0.42875907030665994]
100%|██████████| 2/2 [00:00<00:00,  9.67trial/s, best loss: 0.2720792758470941]
100%|██████████| 3/3 [00:00<00:00,  7.08trial/s, best loss: 0.2378487424083594]
100%|██████████| 4/4 [00:00<00:00,  9.38trial/s, best loss: 0.2378487424083594]
100%|██████████| 5/5 [00:00<00:00,  2.95trial/s, best loss: 0.2378487424083594]
100%|██████████| 6/6 [00:01<00:00,  1.94s/trial, best loss: 0.22269897983806042]
100%|██████████| 7/7 [00:00<00:00,  2.92trial/s, best loss: 0.22269897983806042]
100%|██████████| 8/8 [00:00<00:00,  9.98trial/s, best loss: 0.22269897983806042]
100%|██████████| 9/9 [00:00<00:00,  5.22trial/s, best loss: 0.22269897983806042]
100%|██████████| 10/10 [00:00<00:00,  2.43trial/s, best loss




100%|██████████| 45/45 [00:00<00:00,  2.30trial/s, best loss: 0.1783250173225669]
100%|██████████| 46/46 [00:00<00:00,  6.36trial/s, best loss: 0.1783250173225669]
100%|██████████| 47/47 [00:00<00:00,  2.58trial/s, best loss: 0.1783250173225669]
100%|██████████| 48/48 [00:00<00:00,  3.78trial/s, best loss: 0.1783250173225669]
100%|██████████| 49/49 [00:01<00:00,  1.04s/trial, best loss: 0.1783250173225669]
100%|██████████| 50/50 [00:02<00:00,  2.66s/trial, best loss: 0.1783250173225669]
✅ Completed dataset 195 -> MSE: 6224542.1875, R²: 0.6502

🔍 Loading dataset 531 from OpenML...
🚀 Training Hyperopt-Sklearn on dataset 531...
100%|██████████| 1/1 [05:00<00:00, 300.14s/trial, best loss=?]


AllTrialsFailed: 

In [6]:
import openml
import time
import numpy as np
import pandas as pd
from hpsklearn import HyperoptEstimator, any_regressor
from hyperopt import tpe
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ✅ Define datasets to search (OpenML IDs)
dataset_ids = [ 204]  # California Housing, Dataset 8, Dataset 531
results = []

# ✅ Iterate over datasets
for dataset_id in dataset_ids:
    print(f"\n🔍 Loading dataset {dataset_id} from OpenML...")
    dataset = openml.datasets.get_dataset(dataset_id)
    
    # Get data from OpenML dataset
    X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
    mask = X.isna().any(axis=1) | y.isna()

    # Drop those rows from both X and y
    X = X[~mask]
    y = y[~mask]
    
    # ✅ Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # ✅ Define Hyperopt-Sklearn Estimator
    hyperopt_automl = HyperoptEstimator(
        regressor=any_regressor('reg'),  # Search all regressors
        algo=tpe.suggest,                # Use TPE (Tree-structured Parzen Estimator)
        max_evals=50,                     # Increase trials for better performance
        trial_timeout=300                  # Limit time per model evaluation
    )
    
    # ✅ Train the model
    print(f"🚀 Training Hyperopt-Sklearn on dataset {dataset_id}...")
    start_time = time.time()
    hyperopt_automl.fit(X_train.to_numpy(), y_train.to_numpy())  # Convert to NumPy arrays
    fit_time = time.time() - start_time
    
    # ✅ Make predictions
    y_pred = hyperopt_automl.predict(X_test.to_numpy())
    
    # ✅ Evaluate performance
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # ✅ Store results
    result = {
        "Dataset ID": dataset_id,
        "Hyperopt R² Score": r2,
        "Hyperopt MSE": mse,
        "Hyperopt Time": fit_time,
        "Hyperopt Best Models": hyperopt_automl.best_model()
    }
    results.append(result)
    
    print(f"✅ Completed dataset {dataset_id} -> MSE: {mse:.4f}, R²: {r2:.4f}")

# ✅ Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("hyperopt_results.csv", index=False)

print("\n📁 Results saved to hyperopt_results.csv!")



🔍 Loading dataset 204 from OpenML...
🚀 Training Hyperopt-Sklearn on dataset 204...
100%|██████████| 1/1 [00:00<00:00,  4.48trial/s, best loss: 0.99110866924139]
100%|██████████| 2/2 [00:02<00:00,  2.20s/trial, best loss: 0.99110866924139]
100%|██████████| 3/3 [00:00<00:00,  2.80trial/s, best loss: 0.99110866924139]
100%|██████████| 4/4 [00:00<00:00,  8.83trial/s, best loss: 0.99110866924139]
100%|██████████| 5/5 [00:02<00:00,  2.58s/trial, best loss: 0.99110866924139]
100%|██████████| 6/6 [00:00<00:00,  1.71trial/s, best loss: 0.99110866924139]
100%|██████████| 7/7 [00:00<00:00, 10.55trial/s, best loss: 0.99110866924139]
100%|██████████| 8/8 [00:00<00:00,  1.52trial/s, best loss: 0.9788300089158094]
100%|██████████| 9/9 [00:00<00:00, 10.19trial/s, best loss: 0.9788300089158094]
100%|██████████| 10/10 [00:00<00:00,  5.36trial/s, best loss: 0.9788300089158094]
100%|██████████| 11/11 [00:00<00:00,  1.28trial/s, best loss: 0.9788300089158094]
100%|██████████| 12/12 [00:00<00:00,  9.39tria

In [7]:
result

{'Dataset ID': 204,
 'Hyperopt R² Score': -0.06479912778759478,
 'Hyperopt MSE': 2674.632252675257,
 'Hyperopt Time': 30.684821605682373,
 'Hyperopt Best Models': {'learner': ExtraTreeRegressor(criterion='friedman_mse', max_depth=3, max_features='log2',
                     max_leaf_nodes=15, random_state=3, splitter='best'),
  'preprocs': (Normalizer(),),
  'ex_preprocs': ()}}

In [1]:
import openml

ModuleNotFoundError: No module named 'openml'