In [2]:
# Chargement données
df = pd.read_csv('../src/get_around_pricing_project_clean.csv')

In [3]:
# Features numériques (incluant binaires)
numeric_features = ['mileage', 'engine_power',
                  'private_parking_available', 'has_gps', 
                  'has_air_conditioning', 'automatic_car',
                  'has_getaround_connect', 'has_speed_regulator', 
                  'winter_tires']

# Features catégorielles
categorical_features = ['model_key', 'fuel', 'paint_color', 'car_type']

print("Features numériques:", len(numeric_features))
print("Features catégorielles:", len(categorical_features))

# Vérification des types
print("\nTypes de données:")
for col in numeric_features:
   print(f"{col}: {df[col].dtype}")
for col in categorical_features:
    print(f"{col}: {df[col].dtype}")

Features numériques: 9
Features catégorielles: 4

Types de données:
mileage: int64
engine_power: int64
private_parking_available: bool
has_gps: bool
has_air_conditioning: bool
automatic_car: bool
has_getaround_connect: bool
has_speed_regulator: bool
winter_tires: bool
model_key: object
fuel: object
paint_color: object
car_type: object


In [4]:
# Séparation features/target
X = df.drop(['rental_price_per_day', 'Unnamed: 0'], axis=1)
y = df['rental_price_per_day']

# Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train set:", X_train.shape)
print("Test set:", X_test.shape)

Train set: (3872, 13)
Test set: (968, 13)


In [5]:
def create_pipeline():    
    # Preprocessing numérique
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    
    # Preprocessing catégoriel
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])
    
    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

In [6]:
# Import des dépendances manquantes
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Création du preprocessing pipeline
preprocessor = create_pipeline()

# Fit et transform sur les données d'entraînement
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Vérification des dimensions
print("X_train transformé:", X_train_processed.shape)
print("X_test transformé:", X_test_processed.shape)

X_train transformé: (3872, 54)
X_test transformé: (968, 54)




---
## BASELINE
---

In [7]:
# Imports
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Pipeline avec Linear Regression
lr_pipeline = Pipeline([
   ('regressor', LinearRegression())
])

# Fit
lr_pipeline.fit(X_train_processed, y_train)

# Prédictions
y_pred = lr_pipeline.predict(X_test_processed)

# Métriques
print("Linear Regression performances:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

Linear Regression performances:
RMSE: 18.41
MAE: 12.41
R²: 0.70


In [8]:
# Pipeline avec GridSearch pour Ridge et Lasso
param_grid = {
    'regressor': [Ridge(), Lasso()],
    'regressor__alpha': [0.1, 1.0, 10.0]
}

grid_pipeline = GridSearchCV(
    Pipeline([('regressor', Ridge())]),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error'
)

# Fit
grid_pipeline.fit(X_train_processed, y_train)

# Résultats
print("Meilleur modèle:", grid_pipeline.best_params_)
y_pred = grid_pipeline.predict(X_test_processed)
print("\nMeilleures performances:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

Meilleur modèle: {'regressor': Ridge(), 'regressor__alpha': 1.0}

Meilleures performances:
RMSE: 18.23
MAE: 12.38
R²: 0.71


Ridge avec alpha=1.0 améliore légèrement les performances :
- RMSE : 18.41 → 18.23
- MAE : 12.41 → 12.38
- R² : 0.70 → 0.71

C'est une bonne baseline avec :
- Erreur moyenne d'environ 18$
- 71% de variance expliquée
- Modèle simple et interprétable


---
## BASELINE AVEC FONCTION
---

In [9]:
def create_baseline_model():
    # Création du preprocessor
    preprocessor = create_pipeline()
    
    # Pipeline avec GridSearch pour Ridge et Lasso
    param_grid = {
        'regressor': [Ridge(), Lasso()],
        'regressor__alpha': [0.1, 1.0, 10.0]
    }
    
    base_model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge())
    ])
    
    # GridSearch
    model = GridSearchCV(
        base_model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error'
    )
    
    return model

def train_evaluate_model(model, X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    metrics = {
        'RMSE': root_mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }
    
    return model, metrics

# Test
baseline_model = create_baseline_model()
trained_baseline, baseline_metrics = train_evaluate_model(baseline_model, X, y)

print("Baseline performances:")
print("Best params:", baseline_model.best_params_)
for metric, value in baseline_metrics.items():
    print(f"{metric}: {value:.2f}")



Baseline performances:
Best params: {'regressor': Ridge(), 'regressor__alpha': 1.0}
RMSE: 18.23
MAE: 12.38
R2: 0.71


---
## XGBOOST
---

In [10]:
# Imports
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_processed, y_train)
y_pred = xgb_model.predict(X_test_processed)

# Métriques
print("XGBoost performances:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

XGBoost performances:
RMSE: 16.61
MAE: 10.80
R²: 0.76


In [11]:
# Imports
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


# Définir le modèle initial
xgb_model = XGBRegressor(random_state=42)

# Entraînement initial pour vérifier que le modèle fonctionne
xgb_model.fit(X_train_processed, y_train)
y_pred = xgb_model.predict(X_test_processed)

# Afficher les métriques du modèle initial
print("Performances XGBoost initiales:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

# Définir les paramètres pour GridSearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}

# Configuration de GridSearchCV
grid_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring= 'neg_mean_squared_error',
    verbose=3  # Affiche les détails des étapes
)

# Lancer GridSearchCV
grid_xgb.fit(X_train_processed, y_train)

# Afficher les meilleurs paramètres trouvés
print("Meilleurs paramètres:", grid_xgb.best_params_)

# Faire des prédictions avec le meilleur modèle
y_pred = grid_xgb.best_estimator_.predict(X_test_processed)

# Afficher les métriques finales
print("\nPerformances du meilleur modèle XGBoost:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")


Performances XGBoost initiales:
RMSE: 16.61
MAE: 10.80
R²: 0.76
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2491.307 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2659.432 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2461.657 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2497.689 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2567.850 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-663.861 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-830.719 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-652.453 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-627.21

In [12]:
# Fit GridSearch
grid_xgb.fit(X_train_processed, y_train)


# Affichage meilleurs paramètres
print("Meilleurs paramètres:", grid_xgb.best_params_)

# Prédictions avec meilleur modèle
y_pred = grid_xgb.predict(X_test_processed)

# Métriques finales
print("\nMeilleures performances XGBoost:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2491.307 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2659.432 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2461.657 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2497.689 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-2567.850 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-663.861 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-830.719 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-652.453 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-627.218 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=

--- 
## Grid auto
---

In [13]:

# GridSearch pour XGBoost
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}

# créatio un nouveau XGBRegressor
xgb_model = XGBRegressor(random_state=42)

# GridSearchCV
grid_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

---
## entrainement avec gridauto
---

In [14]:
# Fit GridSearch
grid_xgb.fit(X_train_processed, y_train)


# Affichage meilleurs paramètres
print("Meilleurs paramètres:", grid_xgb.best_params_)

# Prédictions avec meilleur modèle
y_pred = grid_xgb.predict(X_test_processed)

# Métriques finales
print("\nMeilleures performances XGBoost:")
print(f"RMSE: {root_mean_squared_error(y_test, y_pred):.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"R²: {r2_score(y_test, y_pred):.2f}")

Meilleurs paramètres: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}

Meilleures performances XGBoost:
RMSE: 16.15
MAE: 10.51
R²: 0.77


Comparaison finale avec Ridge :
1. Ridge (baseline) :
   - RMSE : 18.23
   - MAE : 12.38
   - R² : 0.71

2. XGBoost (meilleur modèle) :
   - RMSE : 16.15
   - MAE : 10.51
   - R² : 0.77


---
## VERSION XGBOOST AVEC FONCTION
---

In [15]:
def create_model(model_type='xgboost', best_params=None):
    # Création du preprocessor
    preprocessor = create_pipeline()
    
    # Choix du modèle
    if model_type == 'xgboost':
        if best_params:
            regressor = XGBRegressor(**best_params, random_state=42)
        else:
            regressor = XGBRegressor(
                learning_rate=0.1, 
                max_depth=5, 
                n_estimators=200,
                random_state=42
            )
    elif model_type == 'ridge':
        regressor = Ridge(alpha=1.0)
    
    # Création du pipeline complet
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])
    
    return model

def train_evaluate_model(model, X, y):
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    metrics = {
        'RMSE': root_mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }
    
    return model, metrics

In [16]:
# Test avec XGBoost
xgb_model = create_model('xgboost')
trained_xgb, xgb_metrics = train_evaluate_model(xgb_model, X, y)

print("XGBoost performances:")
for metric, value in xgb_metrics.items():
   print(f"{metric}: {value:.2f}")

# Test avec Ridge
ridge_model = create_model('ridge')
trained_ridge, ridge_metrics = train_evaluate_model(ridge_model, X, y)

print("\nRidge performances:")
for metric, value in ridge_metrics.items():
   print(f"{metric}: {value:.2f}")

XGBoost performances:
RMSE: 16.08
MAE: 10.45
R2: 0.77

Ridge performances:
RMSE: 18.23
MAE: 12.38
R2: 0.71




# Conclusion

## Comparaison des Modèles

1. Baseline (Ridge) :
   - RMSE : 18.23$
   - MAE : 12.38$
   - R² : 0.71
   - Avantages : Simple, interprétable
   - Limitations : Performances limitées

2. XGBoost (optimisé) :
   - RMSE : 16.08$
   - MAE : 10.45$
   - R² : 0.77
   - Avantages : Meilleures performances
   - Limitations : Plus complexe

## Recommandation
→ XGBoost comme modèle final car :
- Meilleures performances globales
- Bonne gestion des relations non-linéaires
- Compromis acceptable complexité/performance