In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.preprocessing import StandardScaler


In [7]:
# Load Grenoble apartment CSV from project folder
from pathlib import Path
candidates = [
    Path('../DataPreprocessing/PreprocessedData/df_grenoble_vente.csv'),
    Path('DataPreprocessing/PreprocessedData/df_grenoble_vente.csv'),
    Path('projet_dvf/DataPreprocessing/PreprocessedData/df_grenoble_vente.csv'),
]
csv_path = next((p for p in candidates if p.exists()), None)
if csv_path is None:
    tried = [str((Path.cwd()/p).resolve()) if not p.is_absolute() else str(p) for p in candidates]
    raise FileNotFoundError(f'Missing file. Tried: {tried}')
df = pd.read_csv(csv_path)
print(f'Loaded {len(df):,} rows and {len(df.columns)} columns from {csv_path}')
df.head()

Loaded 1,288 rows and 7 columns from ../DataPreprocessing/PreprocessedData/df_grenoble_vente.csv


Unnamed: 0,price,type_local,surface_bati,surface_terrain,date,nb_pieces,type_local_1234
0,112560.0,Appartement,45.0,1.0,10,2,1
1,65000.0,Appartement,29.0,1.0,8,1,1
2,9000.0,Local industriel. commercial ou assimilé,12.0,1.0,9,0,3
3,133000.0,Appartement,49.0,1.0,7,2,1
4,53000.0,Appartement,14.0,1.0,3,1,1


In [8]:
# Keep only rows with positive price and surface_bati for valid logs
df_copy = df.copy()

# Robust appartement subset (works even if 'type_local' is missing)
if 'type_local' in df_copy.columns:
    m_app = df_copy['type_local'].astype(str).str.strip().str.lower().eq('appartement')
    df_appartement = df_copy.loc[m_app].copy()
else:
    df_appartement = df_copy.copy()  # fallback: use all rows

# Drop non-numeric identifiers if present
df_copy = df_copy.drop(columns=['type_local', 'type_local_1234'], errors='ignore')
df_appartement = df_appartement.drop(columns=['type_local', 'type_local_1234'], errors='ignore')

In [9]:
# according to litterature review log(price) ~ log(surface_bati) + log(nb_piece_principale) + date+other variables
# and better if for each category of property type we have a different model
def compute_log_vars(df_in):
    # Keep strictly positive values for log transform
    mask = (df_in['price'] > 0) & (df_in['surface_bati'] > 0) & (df_in['surface_terrain'] > 0)
    df_out = df_in.loc[mask].copy()
    df_out['log_price'] = np.log(df_out['price'])
    df_out['log_surface_bati'] = np.log(df_out['surface_bati'])
    df_out['log_surface_terrain'] = np.log(df_out['surface_terrain'])
    # Also keep a duplicate name for any legacy cells using 'log_price'
    drop_cols = ['price', 'surface_bati', 'surface_terrain', 'type_local']
    df_out = df_out.drop(columns=drop_cols, errors='ignore')

    return df_out


def standard_scale_df(df, return_scaler=False):
    """
    Returns a copy of df with all numeric columns standard-scaled (mean=0, std=1).

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    return_scaler : bool, optional (default=False)
        If True, also returns the fitted StandardScaler object.

    Returns
    -------
    df_scaled : pd.DataFrame
        DataFrame with same columns, numeric ones scaled, non-numeric unchanged.
    (optional) scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler, useful for transforming new data later.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        print("No numeric columns found; returning original DataFrame.")
        return (df.copy(), None) if return_scaler else df.copy()

    scaler = StandardScaler()
    df_scaled = df.copy()
    df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])

    return (df_scaled, scaler) if return_scaler else df_scaled


In [10]:
def standard_scale_df(df, return_scaler=False):
    """
    Returns a copy of df with all numeric columns standard-scaled (mean=0, std=1).

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame.
    return_scaler : bool, optional (default=False)
        If True, also returns the fitted StandardScaler object.

    Returns
    -------
    df_scaled : pd.DataFrame
        DataFrame with same columns, numeric ones scaled, non-numeric unchanged.
    (optional) scaler : sklearn.preprocessing.StandardScaler
        Fitted scaler, useful for transforming new data later.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not num_cols:
        print("No numeric columns found; returning original DataFrame.")
        return (df.copy(), None) if return_scaler else df.copy()

    scaler = StandardScaler()
    df_scaled = df.copy()
    df_scaled[num_cols] = scaler.fit_transform(df_scaled[num_cols])

    return (df_scaled, scaler) if return_scaler else df_scaled


In [14]:
# Scale df_appartement using the standard_scale_df function
df_appartementScaled = standard_scale_df(df_appartement)

# Compute log-transformed version of df_appartement
df_appartement_log = compute_log_vars(df_appartement)

# Ensure all required variables are defined and processed
df_copy_log = compute_log_vars(df_copy)
df_appartementScaled_log = compute_log_vars(df_appartementScaled)
df_copyScaled = standard_scale_df(df_copy)
df_copyScaled_log = compute_log_vars(df_copyScaled)

dataframe_list = [df_appartement, df_appartement_log, df_copy, df_copy_log,
                  df_appartementScaled, df_appartementScaled_log, df_copyScaled, df_copyScaled_log]
#scaling hurts RF in theory(most of the time) but better verify empirically


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np

def fit_random_forest(df, target_col=None, test_size=0.2, random_state=42,
                      n_estimators=500, max_depth=None, verbose=True):
    """
    Fits a RandomForestRegressor on a DataFrame and returns the model, R², and RMSE.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    target_col : str, optional
        Target column name. If None, tries 'log_price' or 'price'.
    test_size : float
        Fraction of data used for testing (default 0.2).
    random_state : int
        Random seed for reproducibility.
    n_estimators : int
        Number of trees (default 500).
    max_depth : int or None
        Max depth of trees (None = fully grown).
    verbose : bool
        Whether to print metrics.

    Returns
    -------
    model : RandomForestRegressor
        Trained model.
    r2 : float
        R² score on test data.
    rmse : float
        Root Mean Squared Error on test data.
    """

    df = df.copy()

    # Auto-detect target
    if target_col is None:
        if 'log_price' in df.columns:
            target_col = 'log_price'
        elif 'price' in df.columns:
            target_col = 'price'
        else:
            raise ValueError("No 'price' or 'log_price' column found and target_col not provided.")

    # Drop rows with missing target
    df = df.dropna(subset=[target_col])

    # Select features (exclude target)
    X = df.drop(columns=[target_col])

    # Keep only numeric columns for simplicity
    X = X.select_dtypes(include=[np.number]).copy()

    if X.empty:
        raise ValueError("No numeric predictors found in the DataFrame.")

    y = df[target_col]

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=random_state)

    # Fit model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)

    if verbose:
        print(f"Random Forest — target: {target_col}")
        print(f"R²: {r2:.4f}")
        print(f"RMSE: {rmse:.4f}")

    return model, r2, rmse


In [13]:
models = []
for i, df in enumerate(dataframe_list):
    print(f"\n--- Fitting model for DataFrame #{i+1} ---")
    model, r2, rmse = fit_random_forest(df, verbose=True)
    models.append((model, r2, rmse))


--- Fitting model for DataFrame #1 ---
Random Forest — target: price
R²: 0.8261
RMSE: 5192843465.3001

--- Fitting model for DataFrame #2 ---
Random Forest — target: log_price
R²: 0.5351
RMSE: 0.2321

--- Fitting model for DataFrame #3 ---
Random Forest — target: price
R²: 0.7250
RMSE: 5893697915.4187

--- Fitting model for DataFrame #4 ---
Random Forest — target: log_price
R²: 0.2413
RMSE: 0.3871

--- Fitting model for DataFrame #5 ---
Random Forest — target: price
R²: 0.8254
RMSE: 0.2136

--- Fitting model for DataFrame #6 ---
Random Forest — target: log_price
R²: -0.1632
RMSE: 0.1787

--- Fitting model for DataFrame #7 ---
Random Forest — target: price
R²: 0.7262
RMSE: 0.0924

--- Fitting model for DataFrame #8 ---
Random Forest — target: log_price
R²: -0.0362
RMSE: 0.8048
