In [2]:
import tpot




In [4]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Load the dataset
df = pd.read_csv("combined_regression_normal.csv") ##dataset without scraping_time and ad_time, leads to error

df = df.dropna(subset=['Price'])  # Remove rows with missing prices

# Separate features and target
X = df.drop(['Price'], axis=1)
y = df['Price']

# One-Hot Encode categorical variables
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough')

X = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# initializing tpot with parameters
#max_time_mins=None (per default) - here max time 4h
#config_dic due to one hot encoding
tpot = TPOTRegressor(generations=5, population_size=50,
                     cv=5, config_dict='TPOT sparse', verbosity=2, n_jobs=16, max_time_mins=240,
                     periodic_checkpoint_folder='/content/results')

In [15]:
#starting the training
tpot.fit(X_train, y_train)

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -2807169769292.6494

Generation 2 - Current best internal CV score: -2737018035638.432

Generation 3 - Current best internal CV score: -2737018035638.432

Generation 4 - Current best internal CV score: -2714621318969.518

Generation 5 - Current best internal CV score: -2667895165963.82

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.4, min_samples_leaf=2, min_samples_split=12, n_estimators=100)


In [16]:
import sklearn.metrics as skm

y_pred = tpot.predict(X_test)

print("RESULTS OF BEST MODEL:\n")

print(f"R2:                         {skm.r2_score(y_test, y_pred)}")
print(f"Mean Squared Error:         {skm.mean_squared_error(y_test, y_pred)}")
print(f"Root Mean Squared Error:    {skm.mean_squared_error(y_test, y_pred, squared=False)}")
print(f"Mean Absolute Error:        {skm.mean_absolute_error(y_test, y_pred)}")
print(f"Explained Variance:         {skm.explained_variance_score(y_test, y_pred)}")
print(f"Max Error:                  {skm.max_error(y_test, y_pred)}")
print(f"Median Absolute Error       {skm.median_absolute_error(y_test, y_pred)}")


RESULTS OF BEST MODEL:

R2:                         0.5573229736024927
Mean Squared Error:         2421459202196.3364
Root Mean Squared Error:    1556103.8532811159
Mean Absolute Error:        818699.5089797908
Explained Variance:         0.5574480498486267
Max Error:                  34354225.47986328
Median Absolute Error       420493.29575433116


In [17]:
#from IPython.display import FileLink
#output of values to dictionary
metrics_dict = {
    'Metric': [
        'R2',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'Mean Absolute Error',
        'Explained Variance',
        'Max Error',
        'Median Absolute Error'
    ],
    'Value': [
        skm.r2_score(y_test, y_pred),
        skm.mean_squared_error(y_test, y_pred),
        skm.mean_squared_error(y_test, y_pred, squared=False),
        skm.mean_absolute_error(y_test, y_pred),
        skm.explained_variance_score(y_test, y_pred),
        skm.max_error(y_test, y_pred),
        skm.median_absolute_error(y_test, y_pred)
    ]
}

#to df
metrics_df = pd.DataFrame(metrics_dict)


print(metrics_df)

metrics_df.to_csv('tpot_regression_metrics_normal.csv', index=False)

                    Metric         Value
0                       R2  5.573230e-01
1       Mean Squared Error  2.421459e+12
2  Root Mean Squared Error  1.556104e+06
3      Mean Absolute Error  8.186995e+05
4       Explained Variance  5.574480e-01
5                Max Error  3.435423e+07
6    Median Absolute Error  4.204933e+05


In [18]:
from IPython.display import FileLink
FileLink('tpot_regression_metrics_normal.csv')

In [19]:
#showing best models as there is no leaderboard
import pandas as pd

# Converting all evaluated models to a list of tuples
my_dict = list(tpot.evaluated_individuals_.items())

# empty list for dictionaries
model_list = []

for model in my_dict:
    model_name = model[0]
    model_info = model[1]
    cv_score = model_info.get('internal_cv_score')  # Pull out cv_score as a column (i.e., sortable)
    
    # Append dictionary to the list
    model_list.append({
        'model': model_name,
        'cv_score': cv_score,
        'model_info': model_info,
    })

# Convert list to a DataFrame
model_scores = pd.DataFrame(model_list)

# Sort the DataFrame by cv_score
model_scores = model_scores.sort_values('cv_score', ascending=False)

# output
model_scores

Unnamed: 0,model,cv_score,model_info
267,"RandomForestRegressor(input_matrix, RandomFore...",-2.667895e+12,"{'generation': 5, 'mutation_count': 2, 'crosso..."
210,"RandomForestRegressor(input_matrix, RandomFore...",-2.714621e+12,"{'generation': 4, 'mutation_count': 3, 'crosso..."
252,"RandomForestRegressor(input_matrix, RandomFore...",-2.723505e+12,"{'generation': 5, 'mutation_count': 4, 'crosso..."
241,"RandomForestRegressor(input_matrix, RandomFore...",-2.729536e+12,"{'generation': 5, 'mutation_count': 2, 'crosso..."
116,"RandomForestRegressor(input_matrix, RandomFore...",-2.737018e+12,"{'generation': 2, 'mutation_count': 1, 'crosso..."
...,...,...,...
89,"XGBRegressor(input_matrix, XGBRegressor__learn...",-8.395475e+12,"{'generation': 1, 'mutation_count': 1, 'crosso..."
30,"XGBRegressor(input_matrix, XGBRegressor__learn...",-8.541108e+12,"{'generation': 0, 'mutation_count': 0, 'crosso..."
55,RandomForestRegressor(SelectFromModel(input_ma...,-inf,"{'generation': 1, 'mutation_count': 1, 'crosso..."
11,RandomForestRegressor(SelectFromModel(input_ma...,-inf,"{'generation': 0, 'mutation_count': 0, 'crosso..."


In [23]:
model_scores.to_csv('TPOT_normal_regression_model_score.csv', index=False)

In [25]:
from IPython.display import FileLink
FileLink('TPOT_normal_regression_model_score.csv')

In [22]:
tpot.export('tpot_regression_model_normal')