In [5]:
import tpot


In [7]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor


# Name of the column that holds the regression target.
TARGET_COLUMN = "Price"

# Read the pre-split training and test sets from CSV files in the working directory.
df_train = pd.read_csv("train_reg_df.csv")
df_test = pd.read_csv("test_reg_df.csv")

# Features: every column except the target, as a 2D array (TPOT needs a 2D array).
X_train = df_train.drop(columns=[TARGET_COLUMN]).values
X_test = df_test.drop(columns=[TARGET_COLUMN]).values

# Target: the price column on its own, as a 1D array.
y_train = df_train[TARGET_COLUMN].values
y_test = df_test[TARGET_COLUMN].values

In [8]:
# Initialise the TPOT regressor.
# max_time_mins defaults to None; here the search is capped at 240 minutes (4 h).
# random_state seeds the evolutionary search so that a re-run can reproduce the
# pipelines it evaluates instead of producing a different result every time.
# NOTE(review): the wall-clock cap and n_jobs=16 may still let two runs diverge;
# confirm against the TPOT documentation if exact reproducibility matters.
# NOTE(review): the checkpoint folder is an absolute path under /content; adjust it
# when the notebook runs anywhere else.
# NOTE(review): the variable name `tpot` rebinds the name of the `tpot` module
# imported in the first cell; the module is no longer reachable under that name.

tpot = TPOTRegressor(generations=5, population_size=50,
                     cv=5, verbosity=2, n_jobs=16, max_time_mins=240,
                     random_state=42,
                     periodic_checkpoint_folder='/content/results/regPre')

In [9]:
# Start the training: fit the TPOT regressor on the training features (2D array)
# and the training target (1D array) prepared in the data-loading cell.
tpot.fit(X_train, y_train)

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: -2797534854083.995

Generation 2 - Current best internal CV score: -2797534854083.995

Generation 3 - Current best internal CV score: -2781879432235.2593

Generation 4 - Current best internal CV score: -2774453417321.6636

Generation 5 - Current best internal CV score: -2767308106249.8467

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=2, min_samples_split=9, n_estimators=100)


In [10]:
import sklearn.metrics as skm

# Predict on the held-out test features with the fitted TPOT object.
y_pred = tpot.predict(X_test)

# Compute the MSE once and derive the RMSE from it. The `squared=False` argument
# of mean_squared_error is deprecated in newer scikit-learn releases and later
# removed, so taking the square root keeps this cell working across versions.
mse = skm.mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("RESULTS OF BEST MODEL:\n")

print(f"R2:                         {skm.r2_score(y_test, y_pred)}")
print(f"Mean Squared Error:         {mse}")
print(f"Root Mean Squared Error:    {rmse}")
print(f"Mean Absolute Error:        {skm.mean_absolute_error(y_test, y_pred)}")
print(f"Explained Variance:         {skm.explained_variance_score(y_test, y_pred)}")
print(f"Max Error:                  {skm.max_error(y_test, y_pred)}")
print(f"Median Absolute Error       {skm.median_absolute_error(y_test, y_pred)}")


RESULTS OF BEST MODEL:

R2:                         0.3500521993390916
Mean Squared Error:         2667039282992.469
Root Mean Squared Error:    1633107.247853756
Mean Absolute Error:        1224222.8981310371
Explained Variance:         0.35010936386710845
Max Error:                  7738625.99884328
Median Absolute Error       928867.427698385


In [33]:
# Collect the test-set metrics in a table and write them to a CSV file.
#
# NOTE(review): the values this cell printed (R2 ~ 0.127) differ from the ones the
# earlier evaluation cell printed (R2 ~ 0.350) for the same expressions, and the
# execution counter jumps between the two cells. y_test / y_pred were presumably
# changed by cells run in between; re-run the notebook top to bottom on a fresh
# kernel before trusting the exported numbers.

# Compute the MSE once and derive the RMSE from it. The `squared=False` argument
# of mean_squared_error is deprecated in newer scikit-learn releases and later
# removed, so taking the square root keeps this cell working across versions.
mse = skm.mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

# Two parallel columns: the metric name and its value on the test set.
metrics_dict = {
    'Metric': [
        'R2',
        'Mean Squared Error',
        'Root Mean Squared Error',
        'Mean Absolute Error',
        'Explained Variance',
        'Max Error',
        'Median Absolute Error'
    ],
    'Value': [
        skm.r2_score(y_test, y_pred),
        mse,
        rmse,
        skm.mean_absolute_error(y_test, y_pred),
        skm.explained_variance_score(y_test, y_pred),
        skm.max_error(y_test, y_pred),
        skm.median_absolute_error(y_test, y_pred)
    ]
}

# Convert to a DataFrame for display and export.
metrics_df = pd.DataFrame(metrics_dict)


print(metrics_df)

# Write the table to the working directory without the DataFrame index.
metrics_df.to_csv('tpot_regression_metrics_preprocessed_TEST.csv', index=False)

                    Metric         Value
0                       R2  1.267643e-01
1       Mean Squared Error  3.507104e+12
2  Root Mean Squared Error  1.872726e+06
3      Mean Absolute Error  1.475839e+06
4       Explained Variance  1.268721e-01
5                Max Error  7.874222e+06
6    Median Absolute Error  1.269800e+06


In [34]:
from IPython.display import FileLink

# Render a clickable link to the exported metrics CSV as the cell output.
metrics_csv_link = FileLink('tpot_regression_metrics_preprocessed_TEST.csv')
metrics_csv_link

In [35]:
# There is no leaderboard to show, so build one from every pipeline TPOT evaluated.
import pandas as pd

# One row per evaluated pipeline: its key in evaluated_individuals_, its internal
# CV score (lifted into its own column so the table can be sorted by it), and the
# complete info dictionary stored for that pipeline.
leaderboard_rows = [
    {
        'model': pipeline_name,
        'cv_score': pipeline_info.get('internal_cv_score'),
        'model_info': pipeline_info,
    }
    for pipeline_name, pipeline_info in tpot.evaluated_individuals_.items()
]

# Build the table and order it by CV score, highest first.
model_scores = pd.DataFrame(leaderboard_rows).sort_values('cv_score', ascending=False)

# Show the table as the cell output.
model_scores

Unnamed: 0,model,cv_score,model_info
190,"ExtraTreesRegressor(RidgeCV(input_matrix), Ext...",-2.908306e+12,"{'generation': 3, 'mutation_count': 2, 'crosso..."
263,"ExtraTreesRegressor(RidgeCV(input_matrix), Ext...",-2.915362e+12,"{'generation': 5, 'mutation_count': 4, 'crosso..."
220,"ExtraTreesRegressor(RidgeCV(input_matrix), Ext...",-2.917025e+12,"{'generation': 4, 'mutation_count': 3, 'crosso..."
96,"ExtraTreesRegressor(RidgeCV(input_matrix), Ext...",-2.917473e+12,"{'generation': 2, 'mutation_count': 1, 'crosso..."
255,"ExtraTreesRegressor(RidgeCV(input_matrix), Ext...",-2.919992e+12,"{'generation': 5, 'mutation_count': 4, 'crosso..."
...,...,...,...
236,"RandomForestRegressor(input_matrix, RandomFore...",-inf,"{'generation': 4, 'mutation_count': 2, 'crosso..."
59,"XGBRegressor(PCA(input_matrix, PCA__iterated_p...",-inf,"{'generation': 1, 'mutation_count': 1, 'crosso..."
54,ExtraTreesRegressor(AdaBoostRegressor(input_ma...,-inf,"{'generation': 1, 'mutation_count': 1, 'crosso..."
53,"AdaBoostRegressor(input_matrix, AdaBoostRegres...",-inf,"{'generation': 1, 'mutation_count': 1, 'crosso..."


In [36]:
# Write the sorted leaderboard to a CSV file in the working directory,
# leaving out the DataFrame index.
model_scores_csv = 'TPOT_preprocessed_TEST_regression_model_score.csv'
model_scores.to_csv(model_scores_csv, index=False)

In [37]:
from IPython.display import FileLink

# Render a clickable link to the exported leaderboard CSV as the cell output.
model_scores_link = FileLink('TPOT_preprocessed_TEST_regression_model_score.csv')
model_scores_link

In [38]:
# Export the result of the fitted TPOT object to a file in the working directory
# (presumably the best pipeline as Python code; confirm against the TPOT docs).
# NOTE(review): the target file name has no extension; if the export is Python
# source, it presumably should end in ".py". Confirm before relying on the file.
tpot.export('tpot_regression_TEST_preprocessed_normal')