In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV

In [None]:
# Load the semicolon-separated input CSV into a DataFrame.
# NOTE: this is an absolute, machine-specific path.
input_file = 'C:/Users/Christian/Desktop/PA/Dataset/valeurs_foncieres_with_mean_price_neighborhood.csv'
# The postal, commune and department codes are read as str instead of
# letting pandas infer a dtype for them.
df = pd.read_csv(
    input_file,
    sep=';',
    dtype=dict.fromkeys(['code_postal', 'code_commune', 'code_departement'], str),
)

In [None]:
# Add a 'prix_m2' column (price per square metre): the regression target.
df['prix_m2'] = df['valeur_fonciere'] / df['surface_reelle_bati']
# A built surface of 0 produces +/-inf, and a missing price or surface
# produces NaN. The imputation step only covers the feature matrix, so a
# non-finite target would make the regressor's fit() raise. Turn inf into NaN
# and drop every row that has no usable target.
df['prix_m2'] = df['prix_m2'].replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=['prix_m2'])

In [None]:
# One-hot encode the categorical columns. With drop_first=True the first
# level of each encoded column gets no indicator column of its own.
df = pd.get_dummies(
    df,
    columns=['type_local', 'region_name'],
    drop_first=True,
)

In [None]:
# Fixed predictors, followed by every column whose name starts with one of
# the dummy prefixes (kept in the order they appear in df.columns).
base_features = [
    'surface_reelle_bati',
    'longitude',
    'latitude',
    'distance_moyen_voisinage',
    'prix_moyen_voisinage',
]
dummy_prefixes = ('type_local_', 'region_name_')
features = base_features + [col for col in df.columns if col.startswith(dummy_prefixes)]

# Feature matrix and target vector (price per square metre).
X = df[features]
y = df['prix_m2']

In [None]:
# Split the data into training and test sets: 20% of the rows are held out
# for testing, and the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

In [None]:
# Impute missing feature values with per-column means. The means are learned
# from the training split only and then applied unchanged to both splits.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [None]:
# Train a RandomForestRegressor (100 trees, fixed seed) on the imputed
# training features.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train_imputed, y_train)

In [None]:
# Predict on the held-out test data.
y_pred = model.predict(X_test_imputed)

# Score the predictions against the held-out targets so the quality of the
# model is known before it is saved. RMSE is in the same unit as the target
# (price per square metre).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Test MSE: {mse:.2f} | Test RMSE: {rmse:.2f}")

In [None]:
# Save the trained model to disk.
# NOTE(review): only the forest is saved; the fitted `imputer` is not
# persisted here — confirm that whatever loads this model imputes the same way.
model_filename = 'src/loaded_model/random_forest_model.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists first (no-op when it is already there).
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
joblib.dump(model, model_filename)
print(f"Model saved to {model_filename}")