## Import librairies

In [1]:
import numpy as np
import pandas as pd

import joblib

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor


pd.options.display.max_columns = None
pd.options.display.max_rows = None

import os 

import warnings
warnings.filterwarnings('ignore')



In [2]:
df = pd.read_csv(r"C:\Users\33760\Desktop\FULLSTACK\FULLSTACK\Projet\BLOC 5\Get around\api\data\clean_pricing_project.csv")
df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
2,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
3,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183
4,Citroën,152352,225,petrol,black,convertible,True,True,False,False,True,True,True,131


In [3]:
# Delete 'model_key_column' (high cardinality)
df = df.iloc[:,1:]
df.columns

Index(['mileage', 'engine_power', 'fuel', 'paint_color', 'car_type',
       'private_parking_available', 'has_gps', 'has_air_conditioning',
       'automatic_car', 'has_getaround_connect', 'has_speed_regulator',
       'winter_tires', 'rental_price_per_day'],
      dtype='object')

In [4]:
# Separate target variable Y from features X
print("Separating labels from features...")

features_list = [
    "mileage",
    "engine_power",
    "fuel",
    "paint_color", 
    "car_type",
    "private_parking_available", 
    "has_gps",
    "has_air_conditioning", 
    "automatic_car",
    "has_getaround_connect", 
    "has_speed_regulator",
    "winter_tires"
    ]

target_variable = "rental_price_per_day"

X = df.loc[:,features_list]
Y = df.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    106
1    101
2    158
3    183
4    131
Name: rental_price_per_day, dtype: int64

X :
   mileage  engine_power    fuel paint_color     car_type  \
0   140411           100  diesel       black  convertible   
1   183297           120  diesel       white  convertible   
2   128035           135  diesel         red  convertible   
3    97097           160  diesel      silver  convertible   
4   152352           225  petrol       black  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car  \
0                       True     True                 False          False   
1                      False    False                 False          False   
2                       True     True                 False          False   
3                       True     True                 False          False   
4                       True     True                 False          False   

   has_getaround_connec

In [5]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('\nFound numeric features ', numeric_features)
print('\nFound categorical features ', categorical_features)


Found numeric features  ['mileage', 'engine_power']

Found categorical features  ['fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


# Preprocessor

In [6]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Separate data (X_TRAIN / Y_TRAIN)

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=0)

print("Preprocessing X_train...")
print()
X_train = preprocessor.fit_transform(X_train)
print("...Done!")
print(X_train[0:5,:]) # X_train is now a numpy array
print()

# Test pipeline
print("Preprocessing X_test...")
print()
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print("...Done!")
print(X_test[0:5,:]) # X_test is now a numpy array
print()

Preprocessing X_train...

...Done!
  (0, 0)	0.6775767536894776
  (0, 1)	0.9644456050336777
  (0, 5)	1.0
  (0, 15)	1.0
  (0, 21)	1.0
  (0, 22)	1.0
  (0, 23)	1.0
  (0, 27)	1.0
  (1, 0)	0.6202090944865828
  (1, 1)	-0.7584179358043706
  (1, 5)	1.0
  (1, 16)	1.0
  (1, 22)	1.0
  (1, 25)	1.0
  (1, 27)	1.0
  (2, 0)	-0.725511557799086
  (2, 1)	-0.6148459740678665
  (2, 13)	1.0
  (2, 19)	1.0
  (2, 22)	1.0
  (2, 24)	1.0
  (2, 26)	1.0
  (2, 27)	1.0
  (3, 0)	0.4499389494402912
  (3, 1)	-0.47127401233136246
  (3, 13)	1.0
  (3, 19)	1.0
  (3, 21)	1.0
  (3, 22)	1.0
  (3, 25)	1.0
  (3, 27)	1.0
  (4, 0)	-0.5234967527721511
  (4, 1)	-0.7584179358043706
  (4, 9)	1.0
  (4, 19)	1.0
  (4, 21)	1.0
  (4, 22)	1.0
  (4, 25)	1.0
  (4, 26)	1.0
  (4, 27)	1.0

Preprocessing X_test...

...Done!
  (0, 0)	1.3261062275665474
  (0, 1)	1.5387334519796938
  (0, 13)	1.0
  (0, 17)	1.0
  (0, 21)	1.0
  (0, 22)	1.0
  (0, 25)	1.0
  (0, 27)	1.0
  (1, 0)	0.13082447200208006
  (1, 1)	-0.7584179358043706
  (1, 5)	1.0
  (1, 17)	1.0
  

## No free lunch theorem method for optimization

## Linear Regression

In [9]:
# Train model
print("Train model...")
linreg = LinearRegression()
linreg.fit(X_train, Y_train)
print("...Done.")

# Predictions on training set
print("Predictions on training set...")
Y_train_pred = linreg.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = linreg.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

Train model...
...Done.
Predictions on training set...
...Done.
[120.48294151 106.05062487 135.1621775  ...  98.95759523 141.38200537
  91.88165016]

Predictions on test set...
...Done.
[135.19605787 111.13323369 103.81900296  89.56512138 117.79891321
  99.15317263  80.41344838 103.66891913 125.67815836 132.73253934
  89.82393273 178.08345604 159.0340164  109.95922679 164.64546625
 133.62656205 105.60570317 128.82308263 111.62216248  89.28432737
 106.75470739 155.77016642 138.35711181 129.32841621 143.46363327
  97.33850071  95.86271908 116.58003159 128.24799205 156.57171883
 102.83865313 108.91753752 136.65260458 131.18256888  88.7863179
 122.39345868 104.98810984 123.46811521 123.14961899 156.95407722
 134.59762412 130.17380022 135.30919064 149.92964533 118.47618676
  78.48322629 126.32855242 128.85047501 123.22461965 123.78760494
 128.22328184 130.29673161 103.74956327 129.81511965 104.45139885
 132.55263865  87.53638341 119.70928242 106.00683945 145.55586193
 111.64649855  92.32624


The R2 score on the training set is 0.656, which means that the model explains approximately 65.6% of the observed variance in the training data. In other words, it reasonably captures the relationship between the independent variables and the target variable in the training dataset.

The R2 score on the test set is 0.695, indicating that our model generalizes its predictions to new data with an accuracy of 69.5%. This means that it explains approximately 69.5% of the variance in the test data. The score on the test set is slightly higher than the score on the training set, suggesting that your model is not overly overfit and is capable of generalizing its predictions to new data.

Let's go further with a Lasso regularization (feature selection model) and griedsearch !

## Lasso regularization

In [14]:
# create model
lasso = Lasso(random_state=0)

# Grid of values to be tested
params = {
    'alpha': [0.01, 0.018, 0.02]
}

gridsearch_lasso = GridSearchCV(lasso, param_grid = params, cv = 4) # cv : the number of folds to be used for CV
gridsearch_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch_lasso.best_params_)

# Print R^2 scores
print("R2 score on training set : ", gridsearch_lasso.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch_lasso.score(X_test, Y_test))
print("Best validation R2 : ", gridsearch_lasso.best_score_)

...Done.
Best hyperparameters :  {'alpha': 0.018}
R2 score on training set :  0.6550840393170925
R2 score on test set :  0.6906686689100536
Best validation R2 :  0.6474408210765583


Following the regularisation of Lasso and the griedsearch, we have not seen any real improvement in the model's performance.

## XG BOOST regressor

In [16]:
# Perform grid search
print("Grid search...")
xgb = XGBRegressor()

# Grid of values to be tested

params = {
    'max_depth': [5, 10, 15],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [150],
    'colsample_bytree': [0.3, 0.5, 0.6],
    'subsample': [0.5, 0.8, 0.9]
}

xgb_gridsearch = GridSearchCV(xgb, param_grid = params, cv = 4) # cv : the number of folds to be used for CV
xgb_gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", xgb_gridsearch.best_params_)

print("R2 XGBoost default train {}".format(xgb_gridsearch.score(X_train, Y_train)))
print("R2 XGBoost default test {}".format(xgb_gridsearch.score(X_test, Y_test)))
print("Best validation R2 : ", xgb_gridsearch.best_score_)

Grid search...


...Done.
Best hyperparameters :  {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 150, 'subsample': 0.8}
R2 XGBoost default train 0.9185012881989117
R2 XGBoost default test 0.7707262786265997
Best validation R2 :  0.7231961731388532


## Decision tree regressor

In [17]:
tree_regressor = DecisionTreeRegressor(max_depth=3)
tree_regressor.fit(X_train, Y_train)

print("R2 Tree max depth 3 train {}".format(tree_regressor.score(X_train, Y_train)))
print("R2 Tree max depth 3 test {}".format(tree_regressor.score(X_test, Y_test)))

R2 Tree max depth 3 train 0.49788605226304095
R2 Tree max depth 3 test 0.5289806676175105


## Decision tree regressor Griedsearch

In [34]:
# Decision tree
# Perform grid search
print("Grid search...")
dtg = DecisionTreeRegressor()

# Grid of values to be tested
params = {
    'max_depth': [7, 10, 15], 
    'min_samples_leaf': [10, 12, 14],
    'min_samples_split': [3, 4, 5]
}
dt_opt = GridSearchCV(dtg, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
dt_opt.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", dt_opt.best_params_)
print()
print("R2 on training set : ", dt_opt.score(X_train, Y_train))
print("R2 on test set : ", dt_opt.score(X_test, Y_test))
print("Best validation R2 : ", dt_opt.best_score_)


Grid search...
...Done.
Best hyperparameters :  {'max_depth': 10, 'min_samples_leaf': 12, 'min_samples_split': 4}

R2 on training set :  0.7321175728116128
R2 on test set :  0.6659107182117999
Best validation R2 :  0.608875587201032


## Random forest regressor

In [19]:
# Perform grid search
print("Random Forest with default hyperparameters...")
rf = RandomForestRegressor() # we must use a regressor here!
rf.fit(X_train, Y_train)
print("...Done.")
print()
# Print R^2 scores
print("R2 score on training set : ", rf.score(X_train, Y_train))
print("R2 score on test set : ", rf.score(X_test, Y_test))

Random Forest with default hyperparameters...
...Done.

R2 score on training set :  0.9583129336772048
R2 score on test set :  0.7423322484008744


## Random forest regressor gridsearch

In [43]:
# Perform grid search
print("Grid search...")
rfg = RandomForestRegressor()

# Grid of values to be tested
params = {
    'max_depth': [30, 35, 40],
    'min_samples_split': [4, 6, 8],
    'n_estimators': [250]
}
rfg_gridsearch = GridSearchCV(rfg, param_grid = params, cv = 5)
rfg_gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", rfg_gridsearch.best_params_)
print()
# Print R^2 scores
print("R2 score on training set : ", rfg_gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", rfg_gridsearch.score(X_test, Y_test))
print("Best validation R2 : ", rfg_gridsearch.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'max_depth': 35, 'min_samples_split': 6, 'n_estimators': 250}

R2 score on training set :  0.9177594705922483
R2 score on test set :  0.7438267506301783
Best validation R2 :  0.7048688789852193


## Adaboost regressor

In [45]:
# Perform grid search
print("Grid search...")
adaboost = AdaBoostRegressor()

# Grid of values to be tested
params = {
    'n_estimators':[80, 90, 100],
    "learning_rate":[0.01, 0.05, 1.0]
}
print(params)
ada_gridsearch = GridSearchCV(adaboost, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
ada_gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", ada_gridsearch.best_params_)

print()
print("R2 on training set : ", ada_gridsearch.score(X_train, Y_train))
print("R2 on test set : ", ada_gridsearch.score(X_test, Y_test))
print("Best validation R2 : ", ada_gridsearch.best_score_)

Grid search...
{'n_estimators': [50, 60, 70, 80, 90, 100], 'learning_rate': [0.01, 0.05, 1.0, 1.5, 2]}
...Done.
Best hyperparameters :  {'learning_rate': 0.05, 'n_estimators': 90}

R2 on training set :  0.5837760596969497
R2 on test set :  0.5994624203040251
Best validation R2 :  0.5612378769523838


## Display results

In [None]:
# Define model names
model_names = ["linear_regression", 
               "lasso", 
               "xgb_regressor", 
               "decision_tree_r", 
               "decision_tree_g",
               "random_forest_r",
               "random_forest_g",
               "adaboost_regressor_g"]

# Set R2 scores for test set
test_r2_scores = [r2_score(Y_test, Y_test_pred), 
                 gridsearch_lasso.score(X_test, Y_test), 
                 xgb_gridsearch.score(X_test, Y_test),
                 tree_regressor.score(X_test, Y_test),
                 dt_opt.score(X_test, Y_test),
                 rf.score(X_test, Y_test),
                 rfg_gridsearch.score(X_test, Y_test),
                 ada_gridsearch.score(X_test, Y_test)
]

# Set R2 scores for training set
train_r2_scores = [r2_score(Y_train, Y_train_pred), 
                   gridsearch_lasso.score(X_train, Y_train), 
                   xgb_gridsearch.score(X_train, Y_train),
                   tree_regressor.score(X_train, Y_train),
                   dt_opt.score(X_train, Y_train),
                   rf.score(X_train, Y_train),
                   rfg_gridsearch.score(X_train, Y_train),
                   ada_gridsearch.score(X_train, Y_train)
]

# Create the DataFrame
df_scores = pd.DataFrame({"Model": model_names,
                          "Train R2": train_r2_scores,
                          "Test R2": test_r2_scores})

df_sorted = df_scores.sort_values(by='Test R2', ascending=False)

df_sorted

Unnamed: 0,Model,Train R2,Test R2
2,xgb_regressor,0.918501,0.770726
6,random_forest_g,0.884417,0.744979
5,random_forest_r,0.958936,0.741947
0,linear_regression,0.656002,0.695278
1,lasso,0.655084,0.690669
4,decision_tree_g,0.676867,0.664391
7,adaboost_regressor_g,0.576523,0.592496
3,decision_tree_r,0.497886,0.528981


The best model is XGBBOOST Regressor gridsearch

## Voting

In [None]:
# Voting
voting = VotingRegressor(estimators=[("linear_regression", linreg), ("random_forest_grid", rfg), ('xgboost_r', xgb), ("decision_tree", dt_opt), ("adaboost", ada_gridsearch)])
voting.fit(X_train, Y_train)
print("R2 on training set : ", voting.score(X_train, Y_train))
print("R2 on test set : ", voting.score(X_test, Y_test))

R2 on training set :  0.8285923861125634
R2 on test set :  0.7429685498975729


# Export best model

In [None]:
import joblib

# Path
path = r'C:\Users\33760\Desktop\FULLSTACK\FULLSTACK\Projet\Get around\api\ml_best_model.pkl'

# Download model
joblib.dump(voting, path)

['C:\\Users\\33760\\Desktop\\FULLSTACK\\FULLSTACK\\Projet\\Get around\\api\\ml_best_model.pkl']