In [14]:
import autosklearn.regression
import sklearn.model_selection
import sklearn.metrics
import openml
import numpy as np
import pandas as pd
import time
import json




In [2]:
import openml
import autosklearn.regression
import sklearn.model_selection
import sklearn.metrics
import numpy as np
import time
import pandas as pd

# ✅ Select Regression Datasets
dataset_ids = [195, 8, 531]  # Modify with your own dataset IDs if needed
results = []

for openml_id in dataset_ids:
    print(f"\n🚀 Running Auto-Sklearn on Dataset {openml_id}")
    
    # ✅ Load dataset
    dataset = openml.datasets.get_dataset(openml_id)
    X, y, _, feature_names = dataset.get_data(target=dataset.default_target_attribute)
    
    # ✅ Debug Target (y)
    print(f"🔍 Target (y) dtype: {y.dtype}")
    print(f"🔍 Unique values in y: {np.unique(y)}")
    print(f"🔍 Number of unique values in y: {len(np.unique(y))}")
    
    # ✅ Ensure y is a float for regression
    y = y.astype(np.float64)
    
    # ✅ Split Data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.33, random_state=42
    )
    
    # ✅ Auto-Sklearn Configuration
    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=1800,  
        per_run_time_limit=300,        
        ensemble_kwargs={'ensemble_size': 20},  
        n_jobs=-1,                      
        resampling_strategy='holdout',  
        initial_configurations_via_metalearning=10,  
    )

    # ✅ Train Auto-Sklearn
    start_time = time.time()
    automl.fit(X_train, y_train, dataset_name=f"Dataset-{openml_id}")
    fit_time = time.time() - start_time

    # ✅ Make Predictions
    y_pred = automl.predict(X_test)

    # ✅ Compute Metrics
    mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
    r2 = sklearn.metrics.r2_score(y_test, y_pred)

    # ✅ Store Results
    results.append({
        "Dataset ID": openml_id,
        "AutoSklearn R² Score": round(r2, 4),
        "AutoSklearn MSE": round(mse, 4),
        "AutoSklearn Time (s)": round(fit_time, 2),
    })
    
    print(f"✅ Auto-Sklearn - Dataset {openml_id}: R² Score {r2:.4f}, MSE {mse:.4f}, Time {fit_time:.2f}s")

# ✅ Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("autosklearn_results.csv", index=False)

print("\n📁 Results saved to 'autosklearn_results.csv'")
print("🎉 Regression Experiment Completed!")


🚀 Running Auto-Sklearn on Dataset 195
🔍 Target (y) dtype: float64
🔍 Unique values in y: [ 5118.  5151.  5195.  5348.  5389.  5399.  5499.  5572.  6095.  6189.
  6229.  6295.  6338.  6377.  6479.  6488.  6529.  6575.  6649.  6669.
  6692.  6695.  6795.  6849.  6855.  6918.  6938.  6989.  7053.  7099.
  7126.  7129.  7198.  7295.  7299.  7349.  7395.  7463.  7499.  7603.
  7609.  7689.  7738.  7775.  7788.  7799.  7895.  7898.  7957.  7975.
  7995.  7999.  8013.  8058.  8189.  8195.  8238.  8249.  8358.  8449.
  8495.  8499.  8778.  8845.  8921.  8948.  8949.  9095.  9233.  9258.
  9279.  9298.  9495.  9538.  9549.  9639.  9959.  9960.  9980.  9988.
  9989.  9995. 10198. 10245. 10295. 10345. 10595. 10698. 10898. 11199.
 11245. 11248. 11259. 11549. 11694. 11850. 11900. 12170. 12940. 12945.
 12964. 13200. 13415. 13499. 13950. 14399. 15040. 15510. 15580. 15690.
 15985. 15998. 16430. 16515. 16558. 16630. 16845. 16900. 16925. 17199.
 17450. 17669. 17710. 17950. 18150. 18280. 18399. 18420. 18

  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,
  numerator = (weight * (y_true - y_pred) ** 2).sum(axis=0,


✅ Auto-Sklearn - Dataset 8: R² Score 0.1295, MSE 7.8644, Time 1801.75s

🚀 Running Auto-Sklearn on Dataset 531
🔍 Target (y) dtype: float64
🔍 Unique values in y: [ 5.   5.6  6.3  7.   7.2  7.4  7.5  8.1  8.3  8.4  8.5  8.7  8.8  9.5
  9.6  9.7 10.2 10.4 10.5 10.8 10.9 11.  11.3 11.5 11.7 11.8 11.9 12.
 12.1 12.3 12.5 12.6 12.7 12.8 13.  13.1 13.2 13.3 13.4 13.5 13.6 13.8
 13.9 14.  14.1 14.2 14.3 14.4 14.5 14.6 14.8 14.9 15.  15.1 15.2 15.3
 15.4 15.6 15.7 16.  16.1 16.2 16.3 16.4 16.5 16.6 16.7 16.8 17.  17.1
 17.2 17.3 17.4 17.5 17.6 17.7 17.8 17.9 18.  18.1 18.2 18.3 18.4 18.5
 18.6 18.7 18.8 18.9 19.  19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9
 20.  20.1 20.2 20.3 20.4 20.5 20.6 20.7 20.8 20.9 21.  21.1 21.2 21.4
 21.5 21.6 21.7 21.8 21.9 22.  22.1 22.2 22.3 22.4 22.5 22.6 22.7 22.8
 22.9 23.  23.1 23.2 23.3 23.4 23.5 23.6 23.7 23.8 23.9 24.  24.1 24.2
 24.3 24.4 24.5 24.6 24.7 24.8 25.  25.1 25.2 25.3 26.2 26.4 26.5 26.6
 26.7 27.  27.1 27.5 27.9 28.  28.1 28.2 28.4 28.5 28.6 28.7

In [7]:


# ✅ Select Regression Datasets
dataset_ids = [204]  # Modify with your own dataset IDs if needed

for openml_id in dataset_ids:
    print(f"\n🚀 Running Auto-Sklearn on Dataset {openml_id}")
    
    # ✅ Load dataset
    dataset = openml.datasets.get_dataset(openml_id)
    X, y, _, feature_names = dataset.get_data(target=dataset.default_target_attribute)
    
    # ✅ Debug Target (y)
    print(f"🔍 Target (y) dtype: {y.dtype}")
    print(f"🔍 Unique values in y: {np.unique(y)}")
    print(f"🔍 Number of unique values in y: {len(np.unique(y))}")
    
    # ✅ Ensure y is a float for regression
    y = y.astype(np.float64)
    
    # ✅ Split Data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.33, random_state=42
    )
    
    # ✅ Auto-Sklearn Configuration
    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=1800,  
        per_run_time_limit=300,        
        ensemble_kwargs={'ensemble_size': 20},  
        n_jobs=-1,                      
        resampling_strategy='holdout',  
        initial_configurations_via_metalearning=10,  
    )

    # ✅ Train Auto-Sklearn
    start_time = time.time()
    automl.fit(X_train, y_train, dataset_name=f"Dataset-{openml_id}")
    fit_time = time.time() - start_time

    # ✅ Make Predictions
    y_pred = automl.predict(X_test)

    # ✅ Compute Metrics
    mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
    r2 = sklearn.metrics.r2_score(y_test, y_pred)

    # ✅ Store Results
    results.append({
        "Dataset ID": openml_id,
        "AutoSklearn R² Score": round(r2, 4),
        "AutoSklearn MSE": round(mse, 4),
        "AutoSklearn Time (s)": round(fit_time, 2),
    })
    
    print(f"✅ Auto-Sklearn - Dataset {openml_id}: R² Score {r2:.4f}, MSE {mse:.4f}, Time {fit_time:.2f}s")

# ✅ Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("autosklearn_results.csv", index=False)

print("\n📁 Results saved to 'autosklearn_results.csv'")
print("🎉 Regression Experiment Completed!")


🚀 Running Auto-Sklearn on Dataset 204
🔍 Target (y) dtype: float64
🔍 Unique values in y: [126. 131. 141. 149. 157. 160. 164. 166. 167. 168. 169. 172. 174. 175.
 176. 177. 178. 180. 182. 183. 184. 185. 186. 187. 188. 192. 193. 195.
 196. 197. 198. 199. 200. 201. 203. 204. 205. 206. 207. 208. 209. 210.
 211. 212. 213. 214. 215. 216. 217. 218. 219. 220. 221. 222. 223. 224.
 225. 226. 227. 228. 229. 230. 231. 232. 233. 234. 235. 236. 237. 239.
 240. 241. 242. 243. 244. 245. 246. 247. 248. 249. 250. 252. 253. 254.
 255. 256. 257. 258. 259. 260. 261. 262. 263. 264. 265. 266. 267. 268.
 269. 270. 271. 273. 274. 275. 276. 277. 278. 281. 282. 283. 284. 286.
 288. 289. 290. 293. 294. 295. 298. 299. 300. 302. 303. 304. 305. 306.
 307. 308. 309. 311. 313. 315. 318. 319. 321. 322. 325. 326. 327. 330.
 335. 340. 341. 342. 353. 354. 360. 394. 407. 409. 417. 564.]
🔍 Number of unique values in y: 152
✅ Auto-Sklearn - Dataset 204: R² Score 0.0336, MSE 3452.4542, Time 1803.62s

📁 Results saved to 'autosk

In [9]:
results

[{'Dataset ID': 195,
  'AutoSklearn R² Score': 0.8389,
  'AutoSklearn MSE': 4357469.1431,
  'AutoSklearn Time (s)': 1796.96},
 {'Dataset ID': 8,
  'AutoSklearn R² Score': 0.1295,
  'AutoSklearn MSE': 7.8644,
  'AutoSklearn Time (s)': 1801.75},
 {'Dataset ID': 531,
  'AutoSklearn R² Score': 0.8573,
  'AutoSklearn MSE': 10.8029,
  'AutoSklearn Time (s)': 1801.97},
 {'Dataset ID': 204,
  'AutoSklearn R² Score': 0.0336,
  'AutoSklearn MSE': 3452.4542,
  'AutoSklearn Time (s)': 1803.62}]

In [5]:
for openml_id in results:
    results[openml_id]["AutoSklearn Best Models"] = [
        (weight, str(model)) for weight, model in results[openml_id]["AutoSklearn Best Models"]
    ]


In [6]:
import json

with open("fast_evaluation_reg_results.json", "w") as f:
    json.dump(results, f, indent=4)

print("\n🎉 Fast AutoML Experiment Results Saved Successfully!")



🎉 Fast AutoML Experiment Results Saved Successfully!


In [16]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

life=pd.read_csv("Life Expectancy Data (1).csv")
life.columns=['Country', 'Year', 'Status', 'Life Expectancy', 'Adult Mortality',
       'Infant Deaths', 'Alcohol', 'Percentage Expenditure', 'Hepatitis B',
       'Measles', ' BMI', 'Under-Five Deaths', 'Polio', 'Total Expenditure',
       'Diphtheria','HIV/AIDS', 'GDP', 'Population',
       'Thinness  1-19 years', 'Thinness 5-9 years',
       'Income composition of resources', 'Schooling']

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imputedData=imp_median.fit_transform(life[['Life Expectancy','Alcohol','Hepatitis B','Polio','Diphtheria']])
life[['Life Expectancy','Alcohol','Hepatitis B','Polio','Diphtheria']]=imputedData
print(life.isna().sum())
X=life[['Alcohol', 'Hepatitis B','Measles','Polio','Diphtheria', 'HIV/AIDS']]
y=life['Life Expectancy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

for openml_id in dataset_ids:
    print(f"\n🚀 Running Auto-Sklearn on Dataset {openml_id}")
    
    # ✅ Load dataset
    # dataset = openml.datasets.get_dataset(openml_id)
    # X, y, _, feature_names = dataset.get_data(target=dataset.default_target_attribute)
    
    # ✅ Auto-Sklearn Configuration
    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=1800,  
        per_run_time_limit=300,        
        ensemble_kwargs={'ensemble_size': 20},  
        n_jobs=-1,                      
        resampling_strategy='holdout',  
        initial_configurations_via_metalearning=10,  
    )

    # ✅ Train Auto-Sklearn
    start_time = time.time()
    automl.fit(X_train, y_train, dataset_name=f"Dataset-LifeExpectancy")
    fit_time = time.time() - start_time

    # ✅ Make Predictions
    y_pred = automl.predict(X_test)

    # ✅ Compute Metrics
    mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
    r2 = sklearn.metrics.r2_score(y_test, y_pred)

    # ✅ Store Results
    results.append({
        "Dataset ID": 'Life',
        "AutoSklearn R² Score": round(r2, 4),
        "AutoSklearn MSE": round(mse, 4),
        "AutoSklearn Time (s)": round(fit_time, 2),
    })
    
    print(f"✅ Auto-Sklearn - Dataset {openml_id}: R² Score {r2:.4f}, MSE {mse:.4f}, Time {fit_time:.2f}s")

# ✅ Save Results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("autosklearn_results_life.csv", index=False)

print("\n📁 Results saved to 'autosklearn_results_life.csv'")
print("🎉 Regression Experiment Completed!")



Country                              0
Year                                 0
Status                               0
Life Expectancy                      0
Adult Mortality                     10
Infant Deaths                        0
Alcohol                              0
Percentage Expenditure               0
Hepatitis B                          0
Measles                              0
 BMI                                34
Under-Five Deaths                    0
Polio                                0
Total Expenditure                  226
Diphtheria                           0
HIV/AIDS                             0
GDP                                448
Population                         652
Thinness  1-19 years                34
Thinness 5-9 years                  34
Income composition of resources    167
Schooling                          163
dtype: int64

🚀 Running Auto-Sklearn on Dataset 204
✅ Auto-Sklearn - Dataset 204: R² Score 0.8528, MSE 13.7622, Time 1801.03s

📁 Results saved

In [18]:
results_df

Unnamed: 0,Dataset ID,AutoSklearn R² Score,AutoSklearn MSE,AutoSklearn Time (s)
0,195,0.8389,4357469.0,1796.96
1,8,0.1295,7.8644,1801.75
2,531,0.8573,10.8029,1801.97
3,204,0.0336,3452.454,1803.62
4,Life,0.8528,13.7622,1801.03


In [22]:
import openml

# Define the dataset ID
#8,531,204
dataset_id = 43505  # Example: California Housing dataset

# Fetch dataset details
dataset = openml.datasets.get_dataset(dataset_id)

# Print key details
print(f"Dataset Name: {dataset.name}")
print(f"Dataset Version: {dataset.version}")
print(f"Dataset URL: {dataset.url}")
print(f"Number of Instances: {dataset.qualities['NumberOfInstances']}")
print(f"Number of Features: {dataset.qualities['NumberOfFeatures']}")
print(f"Default Target Attribute: {dataset.default_target_attribute}")
print(f"Data Format: {dataset.format}")
print(f"Description: {dataset.description[:500]}...")  # Print first 500 characters


Dataset Name: WHO-national-life-expectancy
Dataset Version: 1
Dataset URL: https://api.openml.org/data/v1/download/22102330/WHO-national-life-expectancy.arff
Number of Instances: 3111.0
Number of Features: 32.0
Default Target Attribute: None
Data Format: arff
Description: Context
I am developing my data science skills in areas outside of my previous work. An interesting problem for me was to identify which factors influence life expectancy on a national level. There is an existing Kaggle data set that explored this, but that information was corrupted. Part of the problem solving process is to step back periodically and ask "does this make sense?" Without reasonable data, it is harder to notice mistakes in my analysis code (as opposed to unusual behavior due to th...
