# House 

In [38]:
import numpy as np
import pandas as pd
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from ydata_profiling import ProfileReport
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
import optuna
import os

# Désactive les erreurs Ray parasites
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['Id']

In [39]:
y= train["SalePrice"]
train = train.drop(columns=["SalePrice","Id"])
test = test.drop(columns=["Id"])

# MSZoning - replace NaN with mode (RL)
train["MSZoning"] = train["MSZoning"].replace(np.nan, "RL")
test["MSZoning"] = test["MSZoning"].replace(np.nan, "RL")


# LotFrontage - replace NaN with median of neighborhood + log transform
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)
test["LotFrontage"] = test.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)
train["LotFrontage"] = np.log1p(train["LotFrontage"])  # log transform for skew
test["LotFrontage"] = np.log1p(test["LotFrontage"])  # log transform for skew

# Age_Built and Age_RemodAdd - create new features
train["Age_Built"] = train["YrSold"] - train["YearBuilt"]
train["Age_RemodAdd"] = train["YrSold"] - train["YearRemodAdd"]
test["Age_Built"] = test["YrSold"] - test["YearBuilt"]
test["Age_RemodAdd"] = test["YrSold"] - test["YearRemodAdd"]

# LotArea - use log transformation to reduce skewness
train["LotArea"] = np.log1p(train["LotArea"])
test["LotArea"] = np.log1p(test["LotArea"])



# MasVnrType - replace NaN with None
train["MasVnrType"] = train["MasVnrType"].replace(np.nan, "None")
test["MasVnrType"] = test["MasVnrType"].replace(np.nan, "None")

# MasVnrArea - replace NaN with 0 + log transform
train["MasVnrArea"] = train["MasVnrArea"].replace(np.nan, 0)
test["MasVnrArea"] = test["MasVnrArea"].replace(np.nan, 0)
train["MasVnrArea"] = np.log1p(train["MasVnrArea"])  # log transform for skew
test["MasVnrArea"] = np.log1p(test["MasVnrArea"])  # log transform for skew

# ExterCond - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterCond"] = train["ExterCond"].map(mapping)
test["ExterCond"] = test["ExterCond"].map(mapping)
# ExterQual - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterQual"] = train["ExterQual"].map(mapping)
test["ExterQual"] = test["ExterQual"].map(mapping)

#BstmQual - replace NaN with None + mapping
train["BsmtQual"] = train["BsmtQual"].replace(np.nan, "None")
test["BsmtQual"] = test["BsmtQual"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtQual"] = train["BsmtQual"].map(mapping)
test["BsmtQual"] = test["BsmtQual"].map(mapping)

# BsmtCOnd - replace NaN with None + mapping
train["BsmtCond"] = train["BsmtCond"].replace(np.nan, "None")
test["BsmtCond"] = test["BsmtCond"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtCond"] = train["BsmtCond"].map(mapping)
test["BsmtCond"] = test["BsmtCond"].map(mapping)

# BsmtHalfBath - replace NaN with 0
train["BsmtHalfBath"] = train["BsmtHalfBath"].replace(np.nan, 0)
test["BsmtHalfBath"] = test["BsmtHalfBath"].replace(np.nan, 0)

# BsmtFullBath - replace NaN with 0
train["BsmtFullBath"] = train["BsmtFullBath"].replace(np.nan, 0)
test["BsmtFullBath"] = test["BsmtFullBath"].replace(np.nan, 0)

# BsmtExposure - replace NaN with None + mapping
train["BsmtExposure"] = train["BsmtExposure"].replace(np.nan, "None")
test["BsmtExposure"] = test["BsmtExposure"].replace(np.nan, "None")
mapping = {"None": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}
train["BsmtExposure"] = train["BsmtExposure"].map(mapping)
test["BsmtExposure"] = test["BsmtExposure"].map(mapping)

#BsmtfinType1 - replace NaN with None + mapping
train["BsmtFinType1"] = train["BsmtFinType1"].replace(np.nan, "None")
test["BsmtFinType1"] = test["BsmtFinType1"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType1"] = train["BsmtFinType1"].map(mapping)
test["BsmtFinType1"] = test["BsmtFinType1"].map(mapping)

# BsmtFinSF2 - replace NaN with 0 + log transform + binary indicator
train["BsmtFinSF2"] = train["BsmtFinSF2"].replace(np.nan, 0)
test["BsmtFinSF2"] = test["BsmtFinSF2"].replace(np.nan, 0)
train["HasBsmtFinSF2"] = (train["BsmtFinSF2"] > 0).astype(int)
test["HasBsmtFinSF2"] = (test["BsmtFinSF2"] > 0).astype(int)
train["BsmtFinSF2"] = np.log1p(train["BsmtFinSF2"])  # log transform for skew
test["BsmtFinSF2"] = np.log1p(test["BsmtFinSF2"])  # log transform for skew

# BsmtFinSF1 - replace NaN with 0 + log transform + binary indicator
train["BsmtFinSF1"] = train["BsmtFinSF1"].replace(np.nan, 0)
test["BsmtFinSF1"] = test["BsmtFinSF1"].replace(np.nan, 0)
train["HasBsmtFinSF1"] = (train["BsmtFinSF1"] > 0).astype(int)
test["HasBsmtFinSF1"] = (test["BsmtFinSF1"] > 0).astype(int)
train["BsmtFinSF1"] = np.log1p(train["BsmtFinSF1"])  # log transform for skew
test["BsmtFinSF1"] = np.log1p(test["BsmtFinSF1"])  # log transform for skew

# BsmtFinType2 - replace NaN with None + mapping
train["BsmtFinType2"] = train["BsmtFinType2"].replace(np.nan, "None")
test["BsmtFinType2"] = test["BsmtFinType2"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType2"] = train["BsmtFinType2"].map(mapping)
test["BsmtFinType2"] = test["BsmtFinType2"].map(mapping)

# BsmtUnfSF - replace NaN with 0 + log transform
train["BsmtUnfSF"] = train["BsmtUnfSF"].replace(np.nan, 0)
test["BsmtUnfSF"] = test["BsmtUnfSF"].replace(np.nan, 0)
train["BsmtUnfSF"] = np.log1p(train["BsmtUnfSF"])  # log transform for skew
test["BsmtUnfSF"] = np.log1p(test["BsmtUnfSF"])  # log transform for skew

# TotalBsmtSF - use log transformation to reduce skewness
# replace NaN with 0 (no basement)
train["TotalBsmtSF"] = train["TotalBsmtSF"].replace(np.nan, 0)
test["TotalBsmtSF"] = test["TotalBsmtSF"].replace(np.nan, 0)
train["TotalBsmtSF"] = np.log1p(train["TotalBsmtSF"])
test["TotalBsmtSF"] = np.log1p(test["TotalBsmtSF"])

# HeatingQC 
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["HeatingQC_num"] = train["HeatingQC"].map(mapping)
test["HeatingQC_num"] = test["HeatingQC"].map(mapping)

# Electrical - replace NaN with mode (SBrkr)
train["Electrical"] = train["Electrical"].replace(np.nan, "SBrkr")
test["Electrical"] = test["Electrical"].replace(np.nan, "SBrkr")

# Exterior1st - replace NaN with mode (VinylSd)
train["Exterior1st"] = train["Exterior1st"].replace(np.nan, "VinylSd")
test["Exterior1st"] = test["Exterior1st"].replace(np.nan, "VinylSd")

# Exterior2nd - replace NaN with mode (VinylSd)
train["Exterior2nd"] = train["Exterior2nd"].replace(np.nan, "VinylSd")
test["Exterior2nd"] = test["Exterior2nd"].replace(np.nan, "VinylSd")

# First Floor SF - use log transformation to reduce skewness
train["1stFlrSF"] = np.log1p(train["1stFlrSF"])
test["1stFlrSF"] = np.log1p(test["1stFlrSF"])

# 2nd Flr SF - create binary indicator + log transform
train["Has2ndFlr"] = (train["2ndFlrSF"] > 0).astype(int)
test["Has2ndFlr"] = (test["2ndFlrSF"] > 0).astype(int)
train["2ndFlrSF"] = np.log1p(train["2ndFlrSF"])  # log transform for skew
test["2ndFlrSF"] = np.log1p(test["2ndFlrSF"])  # log transform for skew

# Low Qualfin SF - binomial indicator
train["HasLowQualFinSF"] = (train["LowQualFinSF"] > 0).astype(int)
test["HasLowQualFinSF"] = (test["LowQualFinSF"] > 0).astype(int)

# GrLivArea - use log transformation to reduce skewness
train["GrLivArea"] = np.log1p(train["GrLivArea"])
test["GrLivArea"] = np.log1p(test["GrLivArea"])

# Kitchen Qual - replace NaN with mode (TA) + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["KitchenQual"] = train["KitchenQual"].replace(np.nan, "TA")
test["KitchenQual"] = test["KitchenQual"].replace(np.nan, "TA")
train["KitchenQual"] = train["KitchenQual"].map(mapping)
test["KitchenQual"] = test["KitchenQual"].map(mapping)

#FireplaceQu - replace NaN with None + mapping
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}  
train["FireplaceQu"] = train["FireplaceQu"].replace(np.nan, "None")
test["FireplaceQu"] = test["FireplaceQu"].replace(np.nan, "None")
train["FireplaceQu"] = train["FireplaceQu"].map(mapping)
test["FireplaceQu"] = test["FireplaceQu"].map(mapping)

# GarageYrBlt - create new feature GarageAge + replace NaN with maxvalue 
max_year = max(train["YearBuilt"].max(), train["YearRemodAdd"].max(), train["YrSold"].max()) + 1
train["GarageYrBlt"] = train["GarageYrBlt"].replace(np.nan, max_year)
test["GarageYrBlt"] = test["GarageYrBlt"].replace(np.nan, max_year)
train["GarageAge"] = train["YrSold"] - train["GarageYrBlt"]
test["GarageAge"] = test["YrSold"] - test["GarageYrBlt"]

# GarageFinish - replace NaN with None + mapping
train["GarageFinish"] = train["GarageFinish"].replace(np.nan, "None")
test["GarageFinish"] = test["GarageFinish"].replace(np.nan, "None")
mapping = {"Fin": 1, "RFn": 2, "Unf": 3, "None": 0}
train["GarageFinish"] = train["GarageFinish"].map(mapping)
test["GarageFinish"] = test["GarageFinish"].map(mapping)

# GarageCond - mapping + replace NaN with None
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageCond"] = train["GarageCond"].replace(np.nan, "None")
test["GarageCond"] = test["GarageCond"].replace(np.nan, "None")
train["GarageCond"] = train["GarageCond"].map(mapping)
test["GarageCond"] = test["GarageCond"].map(mapping)

# GarageArea - create binary indicator and log transform + replace NaN with 0
train["GarageArea"] = train["GarageArea"].fillna(0)
test["GarageArea"] = test["GarageArea"].fillna(0)
train["HasGarage"] = (train["GarageArea"] > 0).astype(int)
test["HasGarage"] = (test["GarageArea"] > 0).astype(int)
train["GarageArea_log"] = np.log1p(train["GarageArea"])  # log transform for skew
test["GarageArea_log"] = np.log1p(test["GarageArea"])

# GarageType - replace NaN with None
train["GarageType"] = train["GarageType"].replace(np.nan, "None")
test["GarageType"] = test["GarageType"].replace(np.nan, "None")

# GarageQual - replace NaN with None + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageQual"] = train["GarageQual"].replace(np.nan, "None")
test["GarageQual"] = test["GarageQual"].replace(np.nan, "None")
train["GarageQual"] = train["GarageQual"].map(mapping)
test["GarageQual"] = test["GarageQual"].map(mapping)

#Same with WoodDeckSF
train["HasWoodDeck"] = (train["WoodDeckSF"] > 0).astype(int)
test["HasWoodDeck"] = (test["WoodDeckSF"] > 0).astype(int)
train["WoodDeckSF_log"] = np.log1p(train["WoodDeckSF"])  # log transform for skew
test["WoodDeckSF_log"] = np.log1p(test["WoodDeckSF"])

# Binary indicator
train["HasOpenPorch"] = (train["OpenPorchSF"] > 0).astype(int)
test["HasOpenPorch"] = (test["OpenPorchSF"] > 0).astype(int)

# Optionally keep raw or transform it
train["OpenPorchSF_log"] = np.log1p(train["OpenPorchSF"])  # log transform for skew
test["OpenPorchSF_log"] = np.log1p(test["OpenPorchSF"])

train["HasScreenPorch"] = (train["ScreenPorch"] > 0).astype(int)
train["ScreenPorchLog"] = np.log1p(train["ScreenPorch"])
test["HasScreenPorch"] = (test["ScreenPorch"] > 0).astype(int)
test["ScreenPorchLog"] = np.log1p(test["ScreenPorch"])

train["GotPool"] = train["PoolQC"].notnull().astype(int)
test["GotPool"] = test["PoolQC"].notnull().astype(int)

# Fence - replace NaN with None + mapping with quality and good wood
# Fence - mapping with quality and good wood
# Fence_wo: 2 if GdWo, 1 if MnWw, else 0
# Fence_Prv: 2 if GdPrv, 1 if MnPrv, else 0
def fence_wo(val):
    if val == "GdWo":
        return 2
    elif val == "MnWw":
        return 1
    else:
        return 0
def fence_prv(val):
    if val == "GdPrv":
        return 2
    elif val == "MnPrv":
        return 1
    else:
        return 0
train["Fence_wo"] = train["Fence"].apply(fence_wo)
test["Fence_wo"] = test["Fence"].apply(fence_wo)
train["Fence_Prv"] = train["Fence"].apply(fence_prv)
test["Fence_Prv"] = test["Fence"].apply(fence_prv)

mapping = {"GdWo": 2, "MnPrv": 1, "GdPrv": 2, "MnWw": 1, "None": 0}
train["Fence"] = train["Fence"].map(mapping)
test["Fence"] = test["Fence"].map(mapping)

train["MoSold_sin"] = np.sin(2 * np.pi * train["MoSold"] / 12)
train["MoSold_cos"] = np.cos(2 * np.pi * train["MoSold"] / 12)
test["MoSold_sin"] = np.sin(2 * np.pi * test["MoSold"] / 12)
test["MoSold_cos"] = np.cos(2 * np.pi * test["MoSold"] / 12)

# Sale type - replace NaN with mode (WD)
train["SaleType"] = train["SaleType"].replace(np.nan, "WD")
test["SaleType"] = test["SaleType"].replace(np.nan, "WD")

train = train.drop(columns=["BsmtFinType2","ExterCond","PoolArea","OpenPorchSF","WoodDeckSF","LowQualFinSF","YearRemodAdd","YearBuilt","GarageYrBlt","PoolQC","Fence","Functional","GarageCars","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])
test = test.drop(columns=["BsmtFinType2","ExterCond","PoolArea","OpenPorchSF","WoodDeckSF","LowQualFinSF","YearRemodAdd","YearBuilt","GarageYrBlt","PoolQC","Fence","Functional","GarageCars","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])

print(test.columns)


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageArea',
       'GarageQual', 'GarageCond', 'PavedDrive', 'MoSold', 'YrSold',
       'SaleType', 'SaleCondition', 'Age_Built', 'Age_RemodAdd',
       'HasBsmtFinSF2', 'HasBsmtFinSF1', 'HeatingQC_num', 'Has2ndFlr',
       'HasLowQualFinSF', 'GarageAge', 'Has

In [40]:
#je garde à contre coeur : LandSlope, Screeporch PavedDrive?
# truc potentiellement intéréssant : MiscVal kitchenAbove
# truc ou je me suis permis des dinguerie : PoolQC (et donc poolarea) 
num_cols = ["Fence_wo","Fence_Prv","GarageQual","KitchenQual","BsmtFinType1","BsmtFinSF2","MasVnrArea","ExterQual","BsmtQual","BsmtCond","FireplaceQu","GarageFinish","BsmtExposure","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","GarageCond","HeatingQC_num","1stFlrSF","2ndFlrSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageAge","GarageArea_log","WoodDeckSF_log","HasOpenPorch","HasScreenPorch","MoSold_sin","MoSold_cos","Age_Built","Age_RemodAdd","LotFrontage","LotArea","OverallQual","OverallCond"]
cat_cols = ["MasVnrType","Foundation","HasBsmtFinSF1","Heating","CentralAir","Electrical","HasBsmtFinSF2","Has2ndFlr","HasLowQualFinSF","GarageType","HasGarage","PavedDrive","HasWoodDeck","HasOpenPorch","HasScreenPorch","GotPool","YrSold","SaleType","SaleCondition","Exterior1st","Exterior2nd","RoofStyle","MSSubClass","HouseStyle","Neighborhood","BldgType","MSSubClass","LotShape","LandContour","LotConfig","LandSlope","Condition1"]

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot categorical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Pass-through numerical features
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop"  # drop unused raw columns like Name, Ticket, Cabin, etc.
)

In [41]:
X_train = train
X_train2= X_train.copy()
y_train = y
y_train2 = y_train.copy()
X_test = test
X_test2 = X_test.copy()


print("X_train columns:", X_train.columns.tolist())
print("X_test columns:", X_test.columns.tolist())

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

missing_in_test = set(X_train.columns) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(X_train.columns)
print("Colonnes manquantes dans test:", missing_in_test)
print("Colonnes manquantes dans train:", missing_in_train)

models = {
    "RandomForest": RandomForestRegressor(n_estimators=500, max_depth=None, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.0005, max_iter=5000),
    "SVR": SVR(kernel="rbf", C=20, epsilon=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "XGB": xgb.XGBRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, subsample=0.7, colsample_bytree=0.7, random_state=42),
    "LGBM": lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05, max_depth=-1, subsample=0.7, colsample_bytree=0.7, random_state=42),
}

model = RandomForestRegressor(n_estimators=200, random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 5-fold CV
cv = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=cv, scoring="neg_root_mean_squared_error"
)

print("RMSE par fold:", -scores)
print("RMSE moyen:", -np.mean(scores))

X_train columns: ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'Age_Built', 'Age_RemodAdd', 'HasBsmtFinSF2', 'HasBsmtFinSF1', 'HeatingQC_num', 'Has2ndFlr', 'HasLowQualFinSF', 'GarageAge', 'HasGarage', 'GarageArea_log', 'HasWoodDeck', 'WoodDeckSF_log', 'HasOpenPorch', 'Ope

In [42]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [43]:
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": y_pred
})
submission.to_csv("submission.csv", index=False)

In [44]:
if True:
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in models.items():
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])
        
        # --- Cross-validation ---
        scores = cross_val_score(
            pipeline, X_train2, y_train2,
            cv=cv, scoring="neg_root_mean_squared_error", n_jobs=4
        )
        
        rmse_scores = -scores
        mean_rmse, std_rmse = rmse_scores.mean(), rmse_scores.std()
        print(f"{name}: RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")
        
        # --- Train full model and predict on X_test ---
        pipeline.fit(X_train2, y_train2)
        y_pred = pipeline.predict(X_test2)
        
        submission = pd.DataFrame({
            "Id": test_ids,        # Assumes you have test_ids defined
            "SalePrice": y_pred
        })
        
        filename = f"submission_{name}.csv"
        submission.to_csv(filename, index=False)
        print(f"Prédictions sauvegardées dans {filename}")


RandomForest: RMSE = 30455.9508 ± 7513.0722
Prédictions sauvegardées dans submission_RandomForest.csv
GradientBoosting: RMSE = 29858.8303 ± 9892.7035
Prédictions sauvegardées dans submission_GradientBoosting.csv
Ridge: RMSE = 34174.6778 ± 5466.3735
Prédictions sauvegardées dans submission_Ridge.csv
Lasso: RMSE = 35033.6601 ± 5636.8305


  model = cd_fast.enet_coordinate_descent(


Prédictions sauvegardées dans submission_Lasso.csv
SVR: RMSE = 80403.1131 ± 5321.8403
Prédictions sauvegardées dans submission_SVR.csv
KNN: RMSE = 36632.2109 ± 4798.5303
Prédictions sauvegardées dans submission_KNN.csv
XGB: RMSE = 28627.7023 ± 10045.3755
Prédictions sauvegardées dans submission_XGB.csv
LGBM: RMSE = 31645.9249 ± 8553.8557
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3282
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 180
[LightGBM] [Info] Start training from score 180921.195890
Prédictions sauvegardées dans submission_LGBM.csv




# Autogluon

In [45]:
# Calculer la matrice de corrélation pour les variables numériques
corr_matrix = X_train.corr(numeric_only=True).abs()

# On ne garde que la partie supérieure de la matrice (pour éviter les doublons)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Trouver les paires avec la plus forte corrélation (hors diagonale)
most_correlated = (
    upper.stack()
    .sort_values(ascending=False)
    .head(10)
)

print(most_correlated)

2ndFlrSF        Has2ndFlr          0.997334
HasScreenPorch  ScreenPorchLog     0.996544
BsmtFinSF2      HasBsmtFinSF2      0.988754
HasWoodDeck     WoodDeckSF_log     0.987795
BsmtFinSF1      HasBsmtFinSF1      0.969787
HasOpenPorch    OpenPorchSF_log    0.969147
HasGarage       GarageArea_log     0.967719
GarageQual      GarageCond         0.959172
GarageCond      HasGarage          0.946245
GarageQual      HasGarage          0.942499
dtype: float64


In [46]:
test_data = X_test
train_data = X_train
train_data['SalePrice'] = y_train

label = "SalePrice"

train_ag = TabularDataset(train_data)


# Création du prédicteur
predictor = TabularPredictor(
        label=label,
        eval_metric="root_mean_squared_error",
        verbosity=2
    ).fit(
        train_data=train_ag,
        time_limit=1000,   # 10 minutes d'entraînement (tu peux augmenter)
        presets="best_quality",  # meilleur modèle possible (un peu plus lent)
        num_cpus=1,       # évite Ray multi-process
        num_gpus=0        # désactive GPU si instable
    )

    # Leaderboard (tous les modèles testés et leurs scores)
predictor.leaderboard(silent=False)

# Prédictions sur le test set
test_ag = TabularDataset(test_data)

preds = predictor.predict(test_ag)

# Sauvegarde en CSV au format Kaggle
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": preds
})
submission.to_csv("submission_autogluon.csv", index=False)

print("✅ Fichier submission_autogluon.csv généré !")

No path specified. Models will be saved in: "AutogluonModels\ag-20250928_093925"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.5
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Memory Avail:       12.44 GB / 31.92 GB (39.0%)
Disk Space Avail:   848.66 GB / 1861.39 GB (45.6%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then

                          model     score_val              eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L3 -23446.770467  root_mean_squared_error       2.267843  194.351807                0.000000           0.013003            3       True         56
1           WeightedEnsemble_L2 -23497.936190  root_mean_squared_error       0.932161  105.373841                0.000000           0.014003            2       True         33
2                XGBoost_BAG_L2 -24211.632594  root_mean_squared_error       2.094514  177.896679                0.075707           3.580279            2       True         40
3         ExtraTrees_r42_BAG_L2 -24593.519415  root_mean_squared_error       2.165391  174.881223                0.146584           0.564824            2       True         50
4          ExtraTreesMSE_BAG_L2 -24840.132868  root_mean_squared_error       2.189027  174.922446                0.17022

#  Optuna

In [None]:
if True:  # mettre à True pour activer l'optimisation
    def objective(trial):
        # Choix du modèle
        model_name = trial.suggest_categorical("model", ["xgb", "lgbm", "gbr"])
        
        if model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
                "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
                "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
                "random_state": 42,
                "n_jobs": 4,
            }
            model = xgb.XGBRegressor(**params)
        
        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "num_leaves": trial.suggest_int("num_leaves", 20, 100),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
                "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
                "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
                "random_state": 42,
                "n_jobs": 4
            }
            model = lgb.LGBMRegressor(**params)
        
        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
                "max_depth": trial.suggest_int("max_depth", 5, 50),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "random_state": 42,
                "n_jobs": 4
            }
            model = RandomForestRegressor(**params)
        
        elif model_name == "gbr":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "random_state": 42
            }
            model = GradientBoostingRegressor(**params)
        
        # Pipeline avec preprocessing
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])
        
        # Validation croisée
        cv = KFold(n_splits=3, shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline, X_train, y_train,
            cv=cv, scoring="neg_root_mean_squared_error", n_jobs=4
        )
        
        rmse = -scores.mean()
        return rmse



    study = optuna.create_study(
        direction="minimize",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1)
        )  # on veut minimiser le RMSE
    study.optimize(objective, n_trials=500, n_jobs=2)  # 50 essais, en parallèle

    print("Best trial:")
    print("  RMSE:", study.best_value)
    print("  Params:", study.best_params)

    best_params = study.best_params
    best_model_name = best_params["model"]

    # enlever la clé "model"
    model_params = {k: v for k, v in best_params.items() if k != "model"}

    if best_model_name == "xgb":
        model = xgb.XGBRegressor(**model_params)
    elif best_model_name == "lgbm":
        model = lgb.LGBMRegressor(**model_params)
    elif best_model_name == "rf":
        model = RandomForestRegressor(**model_params)
    elif best_model_name == "gbr":
        model = GradientBoostingRegressor(**model_params)

    best_model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # fit + prédiction
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    submission = pd.DataFrame({
        "Id": test_ids,  
        "SalePrice": y_pred
    })
    submission.to_csv("submission_optuna.csv", index=False)

    print("Submission saved as submission_optuna.csv")

[I 2025-09-28 11:56:07,279] A new study created in memory with name: no-name-21db261b-5699-48a4-ba8b-bb44911c2c3c
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
[I 2025-09-28 11:56:14,613] Trial 0 finished with value: 38020.361979166664 and parameters: {'model': 'xgb', 'n_estimators': 1096, 'max_depth': 6, 'learning_rate': 0.0011642753825667589, 'subsample': 0.6468788204107296, 'colsample_bytree': 0.5957564484284796, 'reg_alpha': 2.7463220175185498e-06, 'reg_lambda': 2.8452341932917924e-08}. Best is trial 0 with value: 38020.361979166664.
  "learning_rate": trial.

Best trial:
  RMSE: 28667.032552083332
  Params: {'model': 'xgb', 'n_estimators': 693, 'max_depth': 5, 'learning_rate': 0.022667426157108015, 'subsample': 0.8076080163549024, 'colsample_bytree': 0.6096128397576875, 'reg_alpha': 1.1719534986465034e-05, 'reg_lambda': 5.332035292141814e-08}
Submission saved as submission_optuna.csv
