In [2]:
import autosklearn 

In [3]:
import autosklearn.regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

In [4]:
# Load the dataset
df = pd.read_csv("combined_regression_normal.csv") ##dataset without scraping_time and Ad_time, leads to error

df = df.dropna(subset=['Price'])  # Remove rows with missing prices

# Separate features and target
X = df.drop(['Price'], axis=1)
y = df['Price']

# One-Hot Encode categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough')

X = preprocessor.fit_transform(X)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Initialize and train the AutoSklearnRegressor
#max time here 4h

automl_regressor = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task= 14400,
    per_run_time_limit= 600,
    n_jobs = 8, 
    memory_limit=8192  
)

In [5]:
automl_regressor.fit(X_train, y_train, dataset_name="housing_prices")




In [6]:
print(automl_regressor.leaderboard())

          rank  ensemble_weight         type      cost    duration
model_id                                                          
663          1             0.32  extra_trees  0.492763  244.025619
575          2             0.22  extra_trees  0.495521  597.142307
551          3             0.46  extra_trees  0.498173  531.738704


In [7]:
import sklearn.metrics as skm


y_pred = automl_regressor.predict(X_test)

print(f"R2:                         {skm.r2_score(y_test, y_pred)}")
print(f"Mean Squared Error:         {skm.mean_squared_error(y_test, y_pred)}")
print(f"Root Mean Squared Error:    {skm.mean_squared_error(y_test, y_pred, squared=False)}")
print(f"Mean Absolute Error:        {skm.mean_absolute_error(y_test, y_pred)}")
print(f"Explained Variance:         {skm.explained_variance_score(y_test, y_pred)}")
print(f"Max Error:                  {skm.max_error(y_test, y_pred)}")
print(f"Median Absolute Error       {skm.median_absolute_error(y_test, y_pred)}")

R2:                         0.46597024945984333
Mean Squared Error:         2656487008210.888
Root Mean Squared Error:    1629873.310478728
Mean Absolute Error:        795152.1126189835
Explained Variance:         0.46900142075510176
Max Error:                  48862843.03125
Median Absolute Error       366032.9921875


In [8]:
#leaderboard to df
leaderboard_df_normal_reg = automl_regressor.leaderboard(detailed=True)
leaderboard_df_normal_reg = pd.DataFrame(leaderboard_df_normal_reg)

print(leaderboard_df_normal_reg)

# download leaderboard
from IPython.display import FileLink
leaderboard_df_normal_reg.to_csv('autosklearn_regression_leaderboard_normal.csv', index=False)
FileLink('autosklearn_regression_leaderboard_normal.csv')

          rank  ensemble_weight         type      cost    duration  config_id  \
model_id                                                                        
663          1             0.32  extra_trees  0.492763  244.025619        662   
575          2             0.22  extra_trees  0.495521  597.142307        574   
551          3             0.46  extra_trees  0.498173  531.738704        550   

          train_loss  seed    start_time      end_time  budget  \
model_id                                                         
663         0.076552     0  1.724603e+09  1.724603e+09     0.0   
575         0.061555     0  1.724599e+09  1.724599e+09     0.0   
551         0.081645     0  1.724598e+09  1.724598e+09     0.0   

                      status data_preprocessors feature_preprocessors  \
model_id                                                                
663       StatusType.SUCCESS                 []          [polynomial]   
575       StatusType.SUCCESS                

In [9]:
metrics_dict = {
    'Metric': [
        'R2',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'Mean Absolute Error',
        'Explained Variance',
        'Max Error',
        'Median Absolute Error'
    ],
    'Value': [
        skm.r2_score(y_test, y_pred),
        skm.mean_squared_error(y_test, y_pred),
        skm.mean_squared_error(y_test, y_pred, squared=False),
        skm.mean_absolute_error(y_test, y_pred),
        skm.explained_variance_score(y_test, y_pred),
        skm.max_error(y_test, y_pred),
        skm.median_absolute_error(y_test, y_pred)
    ]
}

#to df
metrics_df_normal_reg = pd.DataFrame(metrics_dict)


print(metrics_df_normal_reg)

metrics_df_normal_reg.to_csv('autosklearn_regression_metrics_normal.csv', index=False)

                    Metric         Value
0                       R2  4.659702e-01
1       Mean Squared Error  2.656487e+12
2  Root Mean Squared Error  1.629873e+06
3      Mean Absolute Error  7.951521e+05
4       Explained Variance  4.690014e-01
5                Max Error  4.886284e+07
6    Median Absolute Error  3.660330e+05


In [10]:
from IPython.display import FileLink
FileLink('autosklearn_regression_metrics_normal.csv')