In [4]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
RANDOM_STATE = 42

In [6]:
data = pd.read_csv('get_around_pricing_project.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)
print(data.shape)
data.sample(5)

(4843, 14)


Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
1295,Renault,159844,135,diesel,black,estate,True,True,True,False,True,False,True,131
3022,Audi,195470,230,diesel,black,sedan,True,True,True,True,True,False,True,164
2660,Citroën,170879,122,diesel,grey,sedan,True,True,False,False,True,False,True,112
3393,Mercedes,139118,105,diesel,black,sedan,False,True,False,False,True,True,True,128
1744,Citroën,103095,110,diesel,silver,estate,False,True,False,False,False,False,True,57


In [7]:
# for col in data.columns:
#     if col == target:
#         continue

#     plt.figure(figsize=(5, 3))

#     if col in num_cols:
#         sns.scatterplot(x=data[col], y=data[target])
        
#     elif col in cat_cols:
#         sns.boxplot(data[[target, col]], x=col, y=target)
#         # sns.barplot(data[[col, target]].groupby(col).mean(), x=col, y=target)
#         plt.xticks(rotation=45)
    
#     plt.show()

In [8]:
data.isna().sum().sum()

0

We have lots of meaningful relations between our indepedent variables and our dependent variable it seems like.

In [9]:
target = 'rental_price_per_day'

x = data.drop(target, axis=1)
y = data[[target]]

num_cols = x.select_dtypes(include=np.number).columns.tolist()
cat_cols = [col for col in x.columns if col not in num_cols]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=RANDOM_STATE)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.4, random_state=RANDOM_STATE)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

categoric_transformer = Pipeline([
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categoric_transformer, cat_cols)
])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)



In [10]:
# for col in data.columns:
#     if col == target:
#         continue

#     plt.figure(figsize=(5, 3))

#     if col in num_cols:
#         sns.scatterplot(x=data[col], y=data[target])
        
#     elif col in cat_cols:
#         sns.boxplot(data[[target, col]], x=col, y=target)
#         # sns.barplot(data[[col, target]].groupby(col).mean(), x=col, y=target)
#         plt.xticks(rotation=45)
    
#     plt.show()

# Model

## Optimization

In [11]:
import optuna

class StopWhenNoProgress:
    def __init__(self, threshold: int):
        self.threshold = threshold
        self._consecutive_stall_count = 0
        self._last_reset_value = 0
        
    def __call__(self, study:optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        # If score is worse than best, count as stall
        if trial.value < study.best_value:
            self._consecutive_stall_count += 1

        # If score is better or equal to best, and hasn't been used to reset before, then reset
        elif (trial.value >= study.best_value) & (trial.value != self._last_reset_value):
            self._consecutive_stall_count = 0
            self._last_reset_value = study.best_value   

        # Stop once over threshold
        if self._consecutive_stall_count > self.threshold:
            study.stop()

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import r2_score, mean_absolute_error


def objective(trial):
    model_params = {
    'n_estimators': trial.suggest_int('n_estimators', 1e1, 1e2, log=True),
    'max_depth': trial.suggest_int('max_depth', 3, 10, log=True),
    'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
    'subsample': trial.suggest_float('subsample', 0.5, 1),
    'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
    'max_features': trial.suggest_int('max_features', 1, x_train.shape[1])
    }
    
    model = GradientBoostingRegressor(**model_params)

    model.fit(x_train, y_train.values.ravel())
    score = model.score(x_test, y_test)
    
    return score

study = optuna.create_study(direction='maximize')
study.optimize(
    objective,
    n_trials=1000,
    callbacks=[StopWhenNoProgress(50)]
)

[I 2024-10-13 10:03:40,371] A new study created in memory with name: no-name-e700614d-2387-4eb0-b398-ba0583f950ce
[I 2024-10-13 10:03:40,709] Trial 0 finished with value: 0.6507075177135794 and parameters: {'n_estimators': 34, 'max_depth': 6, 'learning_rate': 0.04830773082840886, 'subsample': 0.7960388215488805, 'min_samples_split': 76, 'max_features': 50}. Best is trial 0 with value: 0.6507075177135794.
[I 2024-10-13 10:03:40,835] Trial 1 finished with value: 0.4330590133424135 and parameters: {'n_estimators': 71, 'max_depth': 4, 'learning_rate': 0.017726365255916775, 'subsample': 0.7066632477531829, 'min_samples_split': 73, 'max_features': 6}. Best is trial 0 with value: 0.6507075177135794.
[I 2024-10-13 10:03:40,971] Trial 2 finished with value: 0.12286138866265661 and parameters: {'n_estimators': 11, 'max_depth': 6, 'learning_rate': 0.01013791962397521, 'subsample': 0.8324162272072535, 'min_samples_split': 4, 'max_features': 51}. Best is trial 0 with value: 0.6507075177135794.
[I 2

In [12]:
# Refit and check our best estimator
model = GradientBoostingRegressor(**study.best_params)

model.fit(x_train, y_train.values.ravel())
print(f'Our best test cross-validated test scores are: R2 of {r2_score(y_test, model.predict(x_test)):.3f} and MAE of ${mean_absolute_error(y_test, model.predict(x_test)):.2f}')

Our best test cross-validated test scores are: R2 of 0.749 and MAE of $10.65


In [13]:
from pickle import dump
with open('prediction_api/model.pkl', 'wb') as f:
    dump(model, f, protocol=5)
    
with open('prediction_api/preprocessor.pkl', 'wb') as f:
    dump(preprocessor, f, protocol=5)