In [2]:
import pandas as pd

from scripts.feature_engineering import get_feature_transformer

from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import cross_validate, train_test_split, HalvingGridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

In [3]:
INPUT_DATA = 'data/data_processed.csv'

In [4]:
df = pd.read_csv(INPUT_DATA)

In [5]:
target = "Ewltp (g/km)"
features = ["m (kg)", "Ft", "ec (cm3)", "ep (KW)", "age_months"] # "Country", "Mk", 

X = df[features]
y = df[target]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# models = [
#     ("LinearRegression", LinearRegression()),
#     ("Ridge", Ridge(alpha=1.0)),
#     ("RandomForest", RandomForestRegressor(n_estimators=100, max_depth=None)),
#     ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3))
# ]

In [7]:
feature_transformer = get_feature_transformer()

scoring = {
    "neg_mse": make_scorer(mean_squared_error, greater_is_better=False),
    "r2": "r2",
}

In [None]:
# for name, model in models:
#     pipeline = Pipeline([
#         ('features', feature_transformer),
#         ('model', model)
#     ])
#     cv_results = cross_validate(
#         pipeline, X_train, y_train,
#         cv=5,
#         scoring=scoring,
#         return_train_score=True,
#         n_jobs=-1
#     )
    
#     print(f"{name}:")
#     print(f"  Train MSE: {cv_results['train_neg_mse'].mean():.4f}")
#     print(f"  Train R2: {cv_results['train_r2'].mean():.4f}")
#     print(f"  Val MSE: {cv_results['test_neg_mse'].mean():.4f}")
#     print(f"  Val R2: {cv_results['test_r2'].mean():.4f}")

LinearRegression:
  Train MSE: -232.4599
  Train R2: 0.8646
  Val MSE: -232.4629
  Val R2: 0.8646
Ridge:
  Train MSE: -232.4599
  Train R2: 0.8646
  Val MSE: -232.4629
  Val R2: 0.8646
RandomForest:
  Train MSE: -12.4578
  Train R2: 0.9927
  Val MSE: -15.3424
  Val R2: 0.9911
GradientBoosting:
  Train MSE: -118.2623
  Train R2: 0.9311
  Val MSE: -118.3361
  Val R2: 0.9311


In [9]:
param_grid = {
    'model__n_estimators': [100, 200],  
    'model__max_depth': [None, 20],    
    'model__min_samples_split': [2, 5], 
    'model__min_samples_leaf': [1, 2],   
    'model__bootstrap': [True]           
}

In [10]:
# On utilise Halving Grid Search pour utiliser moins de ressources
pipeline = Pipeline([
    ('features', feature_transformer),
    ('model', RandomForestRegressor())
])

halving_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    factor=2,  # Pour élminer rapidement les combinaisons non prometteuses
    cv=5,
    scoring='r2',
    refit='r2',
    n_jobs=-1,
    verbose=2
)
halving_search.fit(X_train, y_train)

# Afficher les meilleurs hyperparamètres et les résultats
print("Best hyperparameters:", halving_search.best_params_)
print("Best R2 score:", halving_search.best_score_)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 36246
max_resources_: 579941
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 16
n_resources: 36246
Fitting 5 folds for each of 16 candidates, totalling 80 fits
----------
iter: 1
n_candidates: 8
n_resources: 72492
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 2
n_candidates: 4
n_resources: 144984
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 3
n_candidates: 2
n_resources: 289968
Fitting 5 folds for each of 2 candidates, totalling 10 fits
----------
iter: 4
n_candidates: 1
n_resources: 579936
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best hyperparameters: {'model__bootstrap': True, 'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best R2 score: 0.9763973593684877


In [11]:
# Évaluer le meilleur modèle sur l'ensemble de validation
best_model = halving_search.best_estimator_
cv_results = cross_validate(
    best_model, X_train, y_train,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    n_jobs=-1
)

print("Best Model Evaluation:")
print(f"  Train MSE: {cv_results['train_neg_mse'].mean():.4f}")
print(f"  Train R2: {cv_results['train_r2'].mean():.4f}")
print(f"  Val MSE: {cv_results['test_neg_mse'].mean():.4f}")
print(f"  Val R2: {cv_results['test_r2'].mean():.4f}")

Best Model Evaluation:
  Train MSE: -26.8933
  Train R2: 0.9840
  Val MSE: -39.6550
  Val R2: 0.9764
