In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform
import joblib

In [2]:
# Location of the semicolon-separated source dataset.
input_file = 'C:/Users/Christian/Desktop/PA/Dataset/valeurs_foncieres_with_mean_price_neighborhood.csv'
# Administrative codes are read as text rather than numbers
# (presumably to preserve leading zeros -- confirm against the data).
code_column_dtypes = dict.fromkeys(['code_postal', 'code_commune', 'code_departement'], str)
df = pd.read_csv(input_file, sep=';', dtype=code_column_dtypes)

In [4]:
# Add a 'prix_m2' column (price per square metre): the regression target.
# A built surface of 0 would produce +/-inf (or NaN for 0/0), so non-positive
# surfaces are masked to NaN before dividing.
surface_bati = df['surface_reelle_bati'].where(df['surface_reelle_bati'] > 0)
df['prix_m2'] = df['valeur_fonciere'] / surface_bati
# Rows without a usable target cannot be used for training or evaluation:
# the feature imputer only fills X, and the regressor rejects NaN/inf in y.
df = df.dropna(subset=['prix_m2'])

In [5]:
# One-hot encode the categorical variables. The first level of each variable
# is dropped (drop_first=True), so it acts as the implicit baseline.
# Note: this overwrites `df`, so the cell cannot be re-run without reloading
# the data (the original categorical columns no longer exist afterwards).
categorical_columns = ['type_local', 'region_name']
df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)

In [6]:
# Base feature columns used as-is from the dataset.
base_features = [
    'surface_reelle_bati',
    'longitude',
    'latitude',
    'distance_moyen_voisinage',
    'prix_moyen_voisinage',
]
# Columns whose name carries one of the one-hot prefixes
# (str.startswith accepts a tuple of prefixes).
dummy_prefixes = ('type_local_', 'region_name_')
dummy_features = [name for name in df.columns if name.startswith(dummy_prefixes)]

features = base_features + dummy_features
# Design matrix and target (price per square metre).
X = df[features]
y = df['prix_m2']

In [7]:
# Split the data into training (80%) and test (20%) sets.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Impute missing feature values with the per-column mean.
# The imputer is fitted on the training split only, then the same learned
# means are applied to both splits, so no test-set statistics leak in.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [9]:
# Hyperparameter search space sampled by the randomized search.
# scipy's randint(low, high) draws integers uniformly from [low, high).
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    # 1.0 (use all features) replaces the legacy 'auto' alias: for forest
    # regressors 'auto' meant exactly that, but it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it makes the fit fail.
    # The list keeps three entries, so the search still picks among three options.
    'max_features': [1.0, 'sqrt', 'log2']
}

In [10]:
# Base random-forest regressor handed to the hyperparameter search;
# the fixed random_state makes the forest construction repeatable.
rf_model = RandomForestRegressor(random_state=42)

In [11]:
# Randomized hyperparameter search: 100 sampled candidates, each scored with
# 3-fold cross-validation (300 fits in total), using all available CPU cores.
# No `scoring` is given, so the estimator's own default score is used.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

In [12]:
# Run the search on the imputed training features. This is the expensive
# step of the notebook (every sampled candidate is fitted once per CV fold).
random_search.fit(X_train_imputed, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
# Save the fitted search object to disk.
# Note: the whole RandomizedSearchCV object is persisted (not only the best
# estimator), and the fitted imputer is NOT part of it.
model_filename = 'src/loaded_model/random_forest_rs_model.pkl'
# joblib.dump does not create missing directories; make sure the target
# folder exists so a long search is not lost to a FileNotFoundError.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(random_search, model_filename)
print(f"Model saved to {model_filename}")

In [13]:
# Hyperparameter combination that achieved the best cross-validation score
# during the randomized search.
best_params = random_search.best_params_
print(f"Meilleurs hyperparamètres trouvés: {best_params}")

Model saved to random_forest_model.pkl


In [None]:
# Use the best model found by the search to predict on the held-out test set.
# The test features must be the imputed ones, matching what the model was trained on.
best_rf_model = random_search.best_estimator_
y_pred = best_rf_model.predict(X_test_imputed)

In [None]:
# Compute the RMSE of the tuned model on the test set:
# the square root of the mean squared error, in the same unit as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) with tuned model: {rmse:.2f}")