# Getaround - ML training for rental price prediction

In [11]:
import mlflow
import pandas as pd
import numpy as np
import math
from itertools import combinations

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, GridSearchCV
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from mlflow.models.signature import infer_signature

## 1. Load the data and run all preprocessing steps

In [12]:
# Load the pricing dataset; the first CSV column is used as the row index
dataset_path = "../00_Data/get_around_pricing_project.csv"
data = pd.read_csv(dataset_path, index_col=0)

# Preview the first rows
data.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [13]:
# Drop inconsistent records: keep only the cars whose mileage and engine power
# are both strictly positive, and report the row count before and after.
print(f"Number of cars registered: {len(data)}")
has_valid_mileage = data["mileage"] > 0
has_valid_power = data["engine_power"] > 0
data = data.loc[has_valid_mileage & has_valid_power]
print(f"Number of cars after removing inconsistencies : {len(data)}")

Number of cars registered: 4843
Number of cars after removing inconsistencies : 4841


In [14]:
# Merge rare labels: within each categorical column, every label carried by
# fewer than 0.5% of the rows is replaced by the catch-all label 'other'.
min_rows_per_label = 0.5 / 100 * len(data)
for feature in ["model_key", "fuel", "paint_color", "car_type"]:
    occurrences = data[feature].value_counts()
    rare_labels = occurrences.index[occurrences < min_rows_per_label]
    # One vectorised assignment per column instead of one per rare label
    data.loc[data[feature].isin(rare_labels), feature] = 'other'

In [15]:
# Split the table into the target vector Y and the feature matrix X
target_column = "rental_price_per_day"
Y = data[target_column]
X = data.drop(columns = target_column)

In [16]:
# Automatically detect feature category from each column's dtype.
# pandas' dtype predicates are used rather than matching the dtype name as
# text: name matching is case-sensitive (so nullable "Int64"/"Float64" columns
# would be treated as categorical) and would count any dtype whose name merely
# contains "int" (e.g. "interval") as numerical.
numerical_features = []
binary_features = []
categorical_features = []
for column_name, column_dtype in X.dtypes.items():
    # Booleans are tested first because is_numeric_dtype() is also True for them
    if pd.api.types.is_bool_dtype(column_dtype):
        binary_features.append(column_name)
    elif pd.api.types.is_numeric_dtype(column_dtype):
        numerical_features.append(column_name)
    else:
        # Everything else (strings/objects, ...) is handled as categorical
        categorical_features.append(column_name)

print(f"Numerical features: {numerical_features}")
print(f"Binary_features: {binary_features}")
print(f"Categorical features: {categorical_features}")

Numerical features: ['mileage', 'engine_power']
Binary_features: ['private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']
Categorical features: ['model_key', 'fuel', 'paint_color', 'car_type']


In [17]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
train_test_parts = train_test_split(X, Y, test_size = 0.2, random_state = 0)
X_train, X_test, Y_train, Y_test = train_test_parts

In [18]:
# Features preprocessing pipeline
#   - categorical columns: one-hot encoded (first level dropped, dense output)
#   - numerical columns:   standardised
#   - binary columns:      passed through unchanged
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new spelling first and fall back to the
# old one so the notebook runs on both older and current releases.
# NOTE(review): the encoder keeps its default handling of unknown categories,
# so a label never seen during fit will make transform/predict raise — confirm
# this is the behaviour wanted for the served model.
try:
    categorical_transformer = OneHotEncoder(drop='first', sparse_output = False)
except TypeError:
    categorical_transformer = OneHotEncoder(drop='first', sparse = False)
numerical_transformer = StandardScaler()
binary_transformer = FunctionTransformer(None, feature_names_out = 'one-to-one')
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical_transformer", categorical_transformer, categorical_features),
        ("numerical_transformer", numerical_transformer, numerical_features),
        ("binary_transformer", binary_transformer, binary_features)
    ]
)

In [19]:
# Setup MLflow for model and performances tracking

EXPERIMENT_NAME = "getaround_pricing_predictor"
TRACKING_URI = "https://ojo-getaround-mlflow-7508b1c17441.herokuapp.com/"

# Point the client at the tracking server, then select the experiment by name
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Keep a handle on the experiment object (its id is used when starting runs)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

In [20]:
# First model tracking -- Linear Regression --

# Autolog is enabled without model logging: the model is logged explicitly at
# the end of the run, after it has been refitted on the whole dataset.
mlflow.sklearn.autolog(log_models = False)

with mlflow.start_run(experiment_id = experiment.experiment_id) as run:

    # Set tags for model
    mlflow.set_tag("user", "Ophélie")
    mlflow.set_tag("model", "linear_regression")
    mlflow.set_tag("description", "lr")

    # Instantiate and train the linear regression model
    model = LinearRegression()
    processor = Pipeline(steps = [
        ('Features_preprocessing', preprocessor),
        ("model", model)
    ])
    processor.fit(X_train, Y_train)

    # Make predictions
    Y_train_pred = processor.predict(X_train)
    Y_test_pred = processor.predict(X_test)

    # Log MAE and R2 score on train set.
    # The value is a mean absolute error, so the metric is named MAE
    # (it was previously logged under a misleading "MSE" key).
    mlflow.log_metric("linear_regression_MAE_train", mean_absolute_error(Y_train, Y_train_pred))
    mlflow.log_metric("linear_regression_R2_train", processor.score(X_train, Y_train))

    # Log MAE and R2 score for test set
    mlflow.log_metric("linear_regression_MAE_test", mean_absolute_error(Y_test, Y_test_pred))
    mlflow.log_metric("linear_regression_R2_test", processor.score(X_test, Y_test))

    # End mlflow autolog for retraining model on whole dataset (train + test)
    mlflow.sklearn.autolog(disable = True)
    processor.fit(X, Y)

    # Log model separately.
    # The signature's output side is inferred from the model's own predictions
    # rather than from the integer target column, so that the recorded output
    # schema describes what the model actually returns.
    mlflow.sklearn.log_model(
        sk_model = processor,
        artifact_path = "car_rental_price_predictor",
        registered_model_name = "LinearModel_car_rental_price_predictor",
        signature = infer_signature(X, processor.predict(X))
    )



                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                     ...`
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                                 FunctionTransfor...`
  inputs = _infer_schema(model_input)
  outputs = _infer_schema(model_output) if model_output is not None else None
Registered model 'LinearMode

In [21]:
# Second model tracking -- Ridge Regularization --

# Autolog is enabled without model logging: the model is logged explicitly at
# the end of the run, after it has been refitted on the whole dataset.
mlflow.sklearn.autolog(log_models = False)

with mlflow.start_run(experiment_id = experiment.experiment_id) as run:

    # Set tags for model
    mlflow.set_tag("user", "Ophélie")
    mlflow.set_tag("model", "ridge")
    mlflow.set_tag("description", "ridge")

    # Instantiate and train the ridge model; the regularization strength alpha
    # is chosen by a 10-fold cross-validated grid search.
    # NOTE(review): the grid search sits after the preprocessing step, so the
    # preprocessor is fitted on all training rows before the folds are made;
    # searching over the whole pipeline would keep validation folds unseen.
    model = Ridge()
    params = {
        "alpha": [0.0, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
    }
    gridsearchridge = GridSearchCV(model, param_grid = params, cv = 10)
    processor = Pipeline(steps = [
        ('Features_preprocessing', preprocessor),
        ("model", gridsearchridge)
    ])
    processor.fit(X_train, Y_train)

    # Make predictions
    Y_train_pred = processor.predict(X_train)
    Y_test_pred = processor.predict(X_test)

    # Log MAE and R2 score on train set.
    # The value is a mean absolute error, so the metric is named MAE
    # (it was previously logged under a misleading "MSE" key).
    mlflow.log_metric("ridge_MAE_train", mean_absolute_error(Y_train, Y_train_pred))
    mlflow.log_metric("ridge_R2_train", processor.score(X_train, Y_train))

    # Log MAE and R2 score for test set
    mlflow.log_metric("ridge_MAE_test", mean_absolute_error(Y_test, Y_test_pred))
    mlflow.log_metric("ridge_R2_test", processor.score(X_test, Y_test))

    # End mlflow autolog for retraining model on whole dataset (train + test).
    # This re-runs the grid search on all rows.
    mlflow.sklearn.autolog(disable = True)
    processor.fit(X, Y)

    # Log model separately.
    # The signature's output side is inferred from the model's own predictions
    # rather than from the integer target column, so that the recorded output
    # schema describes what the model actually returns.
    mlflow.sklearn.log_model(
        sk_model = processor,
        artifact_path = "car_rental_price_predictor",
        registered_model_name = "Ridge_car_rental_price_predictor",
        signature = infer_signature(X, processor.predict(X))
    )



                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                     ...`
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                                 FunctionTransfor...`
  inputs = _infer_schema(model_input)
  outputs = _infer_schema(model_output) if model_output is not None else None
Registered model 'Ridge_car_

In [22]:
# Third model tracking -- Random Forest Regressor --

# Autolog is enabled without model logging: the model is logged explicitly at
# the end of the run, after it has been refitted on the whole dataset.
mlflow.sklearn.autolog(log_models = False)

with mlflow.start_run(experiment_id = experiment.experiment_id) as run:

    # Set tags for model
    mlflow.set_tag("user", "Ophélie")
    mlflow.set_tag("model", "random forest regressor")
    mlflow.set_tag("description", "rfr")

    # Instantiate and train the random forest regressor; its hyperparameters
    # are chosen by a 10-fold cross-validated grid search.
    # The forest is seeded (same seed as the train/test split) so that repeated
    # runs of this cell select and log the same model.
    # NOTE(review): the grid search sits after the preprocessing step, so the
    # preprocessor is fitted on all training rows before the folds are made;
    # searching over the whole pipeline would keep validation folds unseen.
    model = RandomForestRegressor(random_state = 0)
    params = {
        'n_estimators': [10, 20, 40, 60, 80, 100],
        'max_depth': [2, 4, 6, 8, 10],
        'min_samples_leaf': [1, 2, 5],
        'min_samples_split': [2, 4, 8]
    }
    gridsearchRF = GridSearchCV(model, param_grid = params, cv = 10)
    processor = Pipeline(steps = [
        ('Features_preprocessing', preprocessor),
        ("model", gridsearchRF)
    ])
    processor.fit(X_train, Y_train)

    # Make predictions
    Y_train_pred = processor.predict(X_train)
    Y_test_pred = processor.predict(X_test)

    # Log MAE and R2 score on train set.
    # The value is a mean absolute error, so the metric is named MAE
    # (it was previously logged under a misleading "MSE" key).
    mlflow.log_metric("random_forest_regressor_MAE_train", mean_absolute_error(Y_train, Y_train_pred))
    mlflow.log_metric("random_forest_regressor_R2_train", processor.score(X_train, Y_train))

    # Log MAE and R2 score for test set
    mlflow.log_metric("random_forest_regressor_MAE_test", mean_absolute_error(Y_test, Y_test_pred))
    mlflow.log_metric("random_forest_regressor_R2_test", processor.score(X_test, Y_test))

    # End mlflow autolog for retraining model on whole dataset (train + test).
    # This re-runs the full grid search on all rows.
    mlflow.sklearn.autolog(disable = True)
    processor.fit(X, Y)

    # Log model separately.
    # The signature's output side is inferred from the model's own predictions
    # rather than from the integer target column, so that the recorded output
    # schema describes what the model actually returns.
    mlflow.sklearn.log_model(
        sk_model = processor,
        artifact_path = "car_rental_price_predictor",
        registered_model_name = "Random_forest_regressor_car_rental_price_predictor",
        signature = infer_signature(X, processor.predict(X))
    )



                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                     ...`
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['model_key', 'fuel', 'paint_color',
                                  'car_type']),
                                ('numerical_transformer', StandardScaler(),
                                 ['mileage', 'engine_power']),
                                ('binary_transformer',
                                 FunctionTransfor...`
  inputs = _infer_schema(model_input)
  outputs = _infer_schema(model_output) if model_output is not None else None
Registered model 'Random_for