In [None]:
!pip install deap update_checker tqdm stopit tpot

Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting update_checker
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting scikit-learn>=1.4.1 (from tpot)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting nvidia-nccl-cu12 (from xgboost>=1.1.0->tpot)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hDow

In [None]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Load the dataset.
# NOTE(review): per the original remark, this CSV omits the scraping_time and ad_time
# columns - presumably because they led to an error; confirm.
df = pd.read_csv("combined_regression_normal.csv")

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=['Price'])

# Separate features and target
X_raw = df.drop(['Price'], axis=1)
y = df['Price']

# Split BEFORE fitting any preprocessing, so the encoder is learned from the training
# rows only and nothing about the test rows leaks into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, random_state=1)

# One-hot encode the object-dtype columns; every other column is passed through unchanged.
# handle_unknown='ignore' makes a category that occurs only in the test rows encode as
# all zeros instead of raising an error at transform time.
categorical_cols = X_raw.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough')

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Keep `X` available as the encoded full feature matrix (as before), now produced by the
# encoder that was fitted on the training rows.
X = preprocessor.transform(X_raw)


In [None]:
# Configure the TPOT search.
# - max_time_mins=240 caps the search at 4 hours (the default is None).
# - config_dict='TPOT sparse' is used because the features are one-hot encoded.
# - random_state seeds the stochastic search so repeated runs are comparable.
#   NOTE(review): with a wall-clock limit and n_jobs=-1, runs may still stop at
#   different points, so exact reproducibility is not guaranteed - confirm.
# - periodic_checkpoint_folder is where TPOT is asked to write intermediate results.
tpot = TPOTRegressor(generations=5, population_size=50,
                     cv=5, config_dict='TPOT sparse', verbosity=2, n_jobs=-1, max_time_mins=240,
                     random_state=1,
                     periodic_checkpoint_folder='/content/results')

In [None]:
# Run the TPOT pipeline search on the training split.
tpot.fit(features=X_train, target=y_train)

Optimization Progress:   0%|          | 0/5 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -3135226901462.7183

Generation 2 - Current best internal CV score: -3135226901462.7183

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=6, min_samples_split=20, n_estimators=100)


In [None]:
import sklearn.metrics as skm

# Predict on the held-out test split with the fitted TPOT object.
y_pred = tpot.predict(X_test)

# Compute the MSE once and take its square root with NumPy for the RMSE.
# The `squared=False` flag of `mean_squared_error` is deprecated since scikit-learn 1.4
# and removed in 1.6, so it is avoided here.
mse = skm.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RESULTS OF BEST MODEL:\n")

print(f"R2:                         {skm.r2_score(y_test, y_pred)}")
print(f"Mean Squared Error:         {mse}")
print(f"Root Mean Squared Error:    {rmse}")
print(f"Mean Absolute Error:        {skm.mean_absolute_error(y_test, y_pred)}")
print(f"Explained Variance:         {skm.explained_variance_score(y_test, y_pred)}")
print(f"Max Error:                  {skm.max_error(y_test, y_pred)}")
print(f"Median Absolute Error       {skm.median_absolute_error(y_test, y_pred)}")


RESULTS OF BEST MODEL:

R2:                         0.46008692573975585
Mean Squared Error:         2953343869441.299
Root Mean Squared Error:    1718529.5660655068
Mean Absolute Error:        954269.2799393244
Explained Variance:         0.4602554415852057
Max Error:                  38256056.77042082
Median Absolute Error       571874.9239247751




In [None]:
from google.colab import files

# Collect the test-set metrics in a two-column table (metric name, value).
# The MSE is computed once and the RMSE is its square root; the `squared=False` flag of
# `mean_squared_error` is deprecated since scikit-learn 1.4 and removed in 1.6.
mse = skm.mean_squared_error(y_test, y_pred)
metrics_dict = {
    'Metric': [
        'R2',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'Mean Absolute Error',
        'Explained Variance',
        'Max Error',
        'Median Absolute Error'
    ],
    'Value': [
        skm.r2_score(y_test, y_pred),
        mse,
        np.sqrt(mse),
        skm.mean_absolute_error(y_test, y_pred),
        skm.explained_variance_score(y_test, y_pred),
        skm.max_error(y_test, y_pred),
        skm.median_absolute_error(y_test, y_pred)
    ]
}

# Convert the dictionary to a DataFrame
metrics_df = pd.DataFrame(metrics_dict)


print(metrics_df)

# Write the table to CSV and pass the file to Colab's download helper.
metrics_df.to_csv('tpot_regression_metrics_normal.csv', index=False)
files.download('tpot_regression_metrics_normal.csv')

                    Metric         Value
0                       R2  4.600869e-01
1       Mean Squared Error  2.953344e+12
2  Root Mean Squared Error  1.718530e+06
3      Mean Absolute Error  9.542693e+05
4       Explained Variance  4.602554e-01
5                Max Error  3.825606e+07
6    Median Absolute Error  5.718749e+05




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Export the best pipeline found by the search to a file.
# NOTE(review): the file name has no extension; if the export is Python source,
# a `.py` suffix may be intended - confirm.
tpot.export(output_file_name='tpot_regression_model_normal')