![snap](https://lever-client-logos.s3.amazonaws.com/2bd4cdf9-37f2-497f-9096-c2793296a75f-1568844229943.png)

# Web dashboard

Dashboard : https://terorra-gar-cdsd-analysis.hf.space/



# Machine Learning

## Libraries

In [30]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import  StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import joblib

## Data

In [31]:
# Read the pricing dataset directly from its public URL.
pricing_csv_url = 'https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_pricing_project.csv'
data_pricing = pd.read_csv(pricing_csv_url)

# Preview the first rows (the last expression of the cell is displayed).
data_pricing.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [33]:
# Print the column names, non-null counts and dtypes of the pricing frame.
data_pricing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4843 entries, 0 to 4842
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Unnamed: 0                 4843 non-null   int64 
 1   model_key                  4843 non-null   object
 2   mileage                    4843 non-null   int64 
 3   engine_power               4843 non-null   int64 
 4   fuel                       4843 non-null   object
 5   paint_color                4843 non-null   object
 6   car_type                   4843 non-null   object
 7   private_parking_available  4843 non-null   bool  
 8   has_gps                    4843 non-null   bool  
 9   has_air_conditioning       4843 non-null   bool  
 10  automatic_car              4843 non-null   bool  
 11  has_getaround_connect      4843 non-null   bool  
 12  has_speed_regulator        4843 non-null   bool  
 13  winter_tires               4843 non-null   bool  
 14  rental_price_per_day       4843 non-null   int64 

In [32]:
# Summary statistics for every column; include='all' also covers the object
# and bool columns, not only the numeric ones.
# NOTE(review): the displayed summary reports a minimum mileage of -64 and a
# minimum engine_power of 0. No later cell filters such rows before training;
# confirm whether they should be removed.
data_pricing.describe(include='all')

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
count,4843.0,4843,4843.0,4843.0,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843,4843.0
unique,,28,,,4,10,8,2,2,2,2,2,2,2,
top,,Citroën,,,diesel,black,estate,True,True,False,False,False,False,True,
freq,,969,,,4641,1633,1606,2662,3839,3865,3881,2613,3674,4514,
mean,2421.0,,140962.8,128.98823,,,,,,,,,,,121.214536
std,1398.198007,,60196.74,38.99336,,,,,,,,,,,33.568268
min,0.0,,-64.0,0.0,,,,,,,,,,,10.0
25%,1210.5,,102913.5,100.0,,,,,,,,,,,104.0
50%,2421.0,,141080.0,120.0,,,,,,,,,,,119.0
75%,3631.5,,175195.5,135.0,,,,,,,,,,,136.0


## Preprocessing

In [5]:
# Remove the leftover index column written into the CSV.
# errors='ignore' keeps this cell idempotent: the frame is reassigned to
# itself, so re-running the cell would otherwise raise KeyError because the
# column is already gone.
data_pricing = data_pricing.drop(columns='Unnamed: 0', errors='ignore')

# X, y split: every remaining column is a feature, the daily price is the target.
target_variable = "rental_price_per_day"
X = data_pricing.drop(columns=target_variable)
y = data_pricing[target_variable]

# Train / test split: 20 % held out, fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Preprocessing
# Categorical features: the object-typed and boolean columns.
categorical_features = X_train.select_dtypes(include=["object", "bool"]).columns

# The category list of each column is fixed up front from the sorted unique
# values found in the full feature frame X; the first level of every column
# is dropped by the encoder.
category_levels = [
    sorted(X[column_name].unique().tolist())
    for column_name in categorical_features
]
categorical_transformer = OneHotEncoder(categories=category_levels, drop='first')

# Numerical features: the int64 / float64 columns, standardised.
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
numerical_transformer = StandardScaler()

# Route each group of columns to its transformer.
column_steps = [
    ("categorical_transformer", categorical_transformer, categorical_features),
    ("numerical_transformer", numerical_transformer, numerical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Full model: preprocessing followed by a seeded XGBoost regressor.
regressor = XGBRegressor(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])

## Model Training

In [None]:
# Perform grid search
print("Grid search...")

# Hyper-parameter grid for the 'regressor' step of the pipeline.
# learning_rate (eta) is restricted to values within XGBoost's documented
# [0, 1] range; the former candidates 2 and 1.5 only added fits that could
# not be competitive (96 candidates x 3 folds of wasted training).
params = {
    'regressor__n_estimators': [10, 20, 30, 40, 50, 100, 150, 200, 250, 300, 350, 400],
    'regressor__learning_rate': [1.0, 0.5, 0.1, 0.05, 0.01],
    'regressor__max_depth': [2, 3, 4, 5],
}

print(params)
gridsearch = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring="r2",  # explicit; same metric as the regressor's default score
    cv=3,
)

# Fit every candidate with 3-fold cross-validation on the training set only;
# the best candidate is then refit on the whole training set.
gridsearch.fit(X_train, y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation R2 : ", gridsearch.best_score_)
print()
print("R2 on training set : ", gridsearch.score(X_train, y_train))
print("R2 on test set : ", gridsearch.score(X_test, y_test))

# Keep the refitted best pipeline for later use.
best_model = gridsearch.best_estimator_

# Final evaluation on the held-out test set.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print("\n--- Évaluation du modèle sur l'ensemble de test ---")
print(f"R² sur l'ensemble de test : {r2:.2f}")

## Saving model

In [None]:
filename = 'modele_GAR.joblib'

# Persist the fitted grid-search object so the message below is truthful.
# The dump call was previously commented out, so the cell announced a save
# that never happened. Re-running this cell overwrites the existing file.
joblib.dump(gridsearch, filename)

print(f"Modèle enregistré sous : {filename}")

Modèle enregistré sous : modele_GAR.joblib


# API Prediction

API : https://terorra-gar-cdsd-pred.hf.space/docs

## Test requests on API

In [None]:
import requests

# Example car sent to the prediction endpoint. The keys are the feature
# columns of the pricing dataset (every column except the dropped index and
# the rental_price_per_day target).
data = {"model_key": "Renault", 
        "mileage": 109839, 
        "engine_power": 135, 
        "fuel": "diesel", 
        "paint_color": "black", 
        "car_type": "sedan", 
        "private_parking_available": True, 
        "has_gps": True, 
        "has_air_conditioning": False, 
        "automatic_car": False, 
        "has_getaround_connect": True, 
        "has_speed_regulator": False, 
        "winter_tires": True 
        }

# A timeout prevents the cell from hanging forever if the hosted API is
# unreachable or slow to respond (requests has no default timeout).
response = requests.post(
    "https://terorra-gar-cdsd-pred.hf.space/predict",
    json=data,
    timeout=30,
)

# Raise on 4xx/5xx responses instead of trying to decode an error body as a
# prediction.
response.raise_for_status()

response.json()

{'prediction': 137.91529846191406}

In [34]:
# Read the delay-analysis workbook directly from its public URL.
delay_xlsx_url = "https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/get_around_delay_analysis.xlsx"
df_delay = pd.read_excel(delay_xlsx_url)

# Print column names, non-null counts and dtypes.
df_delay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21310 entries, 0 to 21309
Data columns (total 7 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   21310 non-null  int64  
 1   car_id                                      21310 non-null  int64  
 2   checkin_type                                21310 non-null  object 
 3   state                                       21310 non-null  object 
 4   delay_at_checkout_in_minutes                16346 non-null  float64
 5   previous_ended_rental_id                    1841 non-null   float64
 6   time_delta_with_previous_rental_in_minutes  1841 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 1.1+ MB


In [35]:
# Keep only the rentals whose gap to the previous rental is known and lies
# between 0 and 60 minutes, both bounds included. between() is inclusive on
# both ends by default and evaluates to False for missing values, so rows
# without a delta are excluded as well.
delta_minutes = df_delay['time_delta_with_previous_rental_in_minutes']
df_delay_delta = df_delay[delta_minutes.between(0, 60)]
df_delay_delta.info()

<class 'pandas.core.frame.DataFrame'>
Index: 584 entries, 23 to 21197
Data columns (total 7 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   rental_id                                   584 non-null    int64  
 1   car_id                                      584 non-null    int64  
 2   checkin_type                                584 non-null    object 
 3   state                                       584 non-null    object 
 4   delay_at_checkout_in_minutes                480 non-null    float64
 5   previous_ended_rental_id                    584 non-null    float64
 6   time_delta_with_previous_rental_in_minutes  584 non-null    float64
dtypes: float64(3), int64(2), object(2)
memory usage: 36.5+ KB


In [42]:
# One histogram per cumulative threshold (0, 30 and 60 minutes): rentals whose
# gap to the previous rental is at most the threshold, counted per state and
# coloured by check-in type.
for max_delta in (0, 30, 60):
    within_threshold = df_delay_delta['time_delta_with_previous_rental_in_minutes'] <= max_delta
    delay = df_delay_delta[within_threshold]
    fig = px.histogram(
        delay,
        x='state',
        color='checkin_type',
        text_auto=True,
    )
    fig.update_layout(title=f"for {max_delta} min")
    fig.show()