# House 

In [2]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
import optuna
import os

# Désactive les erreurs Ray parasites
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['Id']

In [3]:
y= train["SalePrice"]
train = train.drop(columns=["SalePrice","Id"])
test = test.drop(columns=["Id"])

# MSZoning - replace NaN with mode (RL)
train["MSZoning"] = train["MSZoning"].replace(np.nan, "RL")
test["MSZoning"] = test["MSZoning"].replace(np.nan, "RL")


# LotFrontage - replace NaN with median of neighborhood + log transform
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)
test["LotFrontage"] = test.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)
train["LotFrontage"] = np.log1p(train["LotFrontage"])  # log transform for skew
test["LotFrontage"] = np.log1p(test["LotFrontage"])  # log transform for skew

# Age_Built and Age_RemodAdd - create new features
train["Age_Built"] = train["YrSold"] - train["YearBuilt"]
train["Age_RemodAdd"] = train["YrSold"] - train["YearRemodAdd"]
test["Age_Built"] = test["YrSold"] - test["YearBuilt"]
test["Age_RemodAdd"] = test["YrSold"] - test["YearRemodAdd"]

# LotArea - use log transformation to reduce skewness
train["LotArea"] = np.log1p(train["LotArea"])
test["LotArea"] = np.log1p(test["LotArea"])



# MasVnrType - replace NaN with None
train["MasVnrType"] = train["MasVnrType"].replace(np.nan, "None")
test["MasVnrType"] = test["MasVnrType"].replace(np.nan, "None")

# MasVnrArea - replace NaN with 0 + log transform
train["MasVnrArea"] = train["MasVnrArea"].replace(np.nan, 0)
test["MasVnrArea"] = test["MasVnrArea"].replace(np.nan, 0)
train["MasVnrArea"] = np.log1p(train["MasVnrArea"])  # log transform for skew
test["MasVnrArea"] = np.log1p(test["MasVnrArea"])  # log transform for skew

# ExterCond - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterCond"] = train["ExterCond"].map(mapping)
test["ExterCond"] = test["ExterCond"].map(mapping)
# ExterQual - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterQual"] = train["ExterQual"].map(mapping)
test["ExterQual"] = test["ExterQual"].map(mapping)

#BstmQual - replace NaN with None + mapping
train["BsmtQual"] = train["BsmtQual"].replace(np.nan, "None")
test["BsmtQual"] = test["BsmtQual"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtQual"] = train["BsmtQual"].map(mapping)
test["BsmtQual"] = test["BsmtQual"].map(mapping)

# BsmtCOnd - replace NaN with None + mapping
train["BsmtCond"] = train["BsmtCond"].replace(np.nan, "None")
test["BsmtCond"] = test["BsmtCond"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtCond"] = train["BsmtCond"].map(mapping)
test["BsmtCond"] = test["BsmtCond"].map(mapping)

# BsmtHalfBath - replace NaN with 0
train["BsmtHalfBath"] = train["BsmtHalfBath"].replace(np.nan, 0)
test["BsmtHalfBath"] = test["BsmtHalfBath"].replace(np.nan, 0)

# BsmtFullBath - replace NaN with 0
train["BsmtFullBath"] = train["BsmtFullBath"].replace(np.nan, 0)
test["BsmtFullBath"] = test["BsmtFullBath"].replace(np.nan, 0)

# BsmtExposure - replace NaN with None + mapping
train["BsmtExposure"] = train["BsmtExposure"].replace(np.nan, "None")
test["BsmtExposure"] = test["BsmtExposure"].replace(np.nan, "None")
mapping = {"None": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}
train["BsmtExposure"] = train["BsmtExposure"].map(mapping)
test["BsmtExposure"] = test["BsmtExposure"].map(mapping)

#BsmtfinType1 - replace NaN with None + mapping
train["BsmtFinType1"] = train["BsmtFinType1"].replace(np.nan, "None")
test["BsmtFinType1"] = test["BsmtFinType1"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType1"] = train["BsmtFinType1"].map(mapping)
test["BsmtFinType1"] = test["BsmtFinType1"].map(mapping)

# BsmtFinSF2 - replace NaN with 0 + log transform + binary indicator
train["BsmtFinSF2"] = train["BsmtFinSF2"].replace(np.nan, 0)
test["BsmtFinSF2"] = test["BsmtFinSF2"].replace(np.nan, 0)
train["HasBsmtFinSF2"] = (train["BsmtFinSF2"] > 0).astype(int)
test["HasBsmtFinSF2"] = (test["BsmtFinSF2"] > 0).astype(int)
train["BsmtFinSF2"] = np.log1p(train["BsmtFinSF2"])  # log transform for skew
test["BsmtFinSF2"] = np.log1p(test["BsmtFinSF2"])  # log transform for skew

# BsmtFinSF1 - replace NaN with 0 + log transform + binary indicator
train["BsmtFinSF1"] = train["BsmtFinSF1"].replace(np.nan, 0)
test["BsmtFinSF1"] = test["BsmtFinSF1"].replace(np.nan, 0)
train["HasBsmtFinSF1"] = (train["BsmtFinSF1"] > 0).astype(int)
test["HasBsmtFinSF1"] = (test["BsmtFinSF1"] > 0).astype(int)
train["BsmtFinSF1"] = np.log1p(train["BsmtFinSF1"])  # log transform for skew
test["BsmtFinSF1"] = np.log1p(test["BsmtFinSF1"])  # log transform for skew

# BsmtFinType2 - replace NaN with None + mapping
train["BsmtFinType2"] = train["BsmtFinType2"].replace(np.nan, "None")
test["BsmtFinType2"] = test["BsmtFinType2"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType2"] = train["BsmtFinType2"].map(mapping)
test["BsmtFinType2"] = test["BsmtFinType2"].map(mapping)

# BsmtUnfSF - replace NaN with 0 + log transform
train["BsmtUnfSF"] = train["BsmtUnfSF"].replace(np.nan, 0)
test["BsmtUnfSF"] = test["BsmtUnfSF"].replace(np.nan, 0)
train["BsmtUnfSF"] = np.log1p(train["BsmtUnfSF"])  # log transform for skew
test["BsmtUnfSF"] = np.log1p(test["BsmtUnfSF"])  # log transform for skew

# TotalBsmtSF - use log transformation to reduce skewness
# replace NaN with 0 (no basement)
train["TotalBsmtSF"] = train["TotalBsmtSF"].replace(np.nan, 0)
test["TotalBsmtSF"] = test["TotalBsmtSF"].replace(np.nan, 0)
train["TotalBsmtSF"] = np.log1p(train["TotalBsmtSF"])
test["TotalBsmtSF"] = np.log1p(test["TotalBsmtSF"])

# HeatingQC 
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["HeatingQC_num"] = train["HeatingQC"].map(mapping)
test["HeatingQC_num"] = test["HeatingQC"].map(mapping)

# Electrical - replace NaN with mode (SBrkr)
train["Electrical"] = train["Electrical"].replace(np.nan, "SBrkr")
test["Electrical"] = test["Electrical"].replace(np.nan, "SBrkr")

# Exterior1st - replace NaN with mode (VinylSd)
train["Exterior1st"] = train["Exterior1st"].replace(np.nan, "VinylSd")
test["Exterior1st"] = test["Exterior1st"].replace(np.nan, "VinylSd")

# Exterior2nd - replace NaN with mode (VinylSd)
train["Exterior2nd"] = train["Exterior2nd"].replace(np.nan, "VinylSd")
test["Exterior2nd"] = test["Exterior2nd"].replace(np.nan, "VinylSd")

# First Floor SF - use log transformation to reduce skewness
train["1stFlrSF"] = np.log1p(train["1stFlrSF"])
test["1stFlrSF"] = np.log1p(test["1stFlrSF"])

# 2nd Flr SF - create binary indicator + log transform
train["2ndFlrSF"] = np.log1p(train["2ndFlrSF"])  # log transform for skew
test["2ndFlrSF"] = np.log1p(test["2ndFlrSF"])  # log transform for skew

# Low Qualfin SF - binomial indicator
train["HasLowQualFinSF"] = (train["LowQualFinSF"] > 0).astype(int)
test["HasLowQualFinSF"] = (test["LowQualFinSF"] > 0).astype(int)

# GrLivArea - use log transformation to reduce skewness
train["GrLivArea"] = np.log1p(train["GrLivArea"])
test["GrLivArea"] = np.log1p(test["GrLivArea"])

# Kitchen Qual - replace NaN with mode (TA) + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["KitchenQual"] = train["KitchenQual"].replace(np.nan, "TA")
test["KitchenQual"] = test["KitchenQual"].replace(np.nan, "TA")
train["KitchenQual"] = train["KitchenQual"].map(mapping)
test["KitchenQual"] = test["KitchenQual"].map(mapping)

#FireplaceQu - replace NaN with None + mapping
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}  
train["FireplaceQu"] = train["FireplaceQu"].replace(np.nan, "None")
test["FireplaceQu"] = test["FireplaceQu"].replace(np.nan, "None")
train["FireplaceQu"] = train["FireplaceQu"].map(mapping)
test["FireplaceQu"] = test["FireplaceQu"].map(mapping)

# GarageYrBlt - create new feature GarageAge + replace NaN with maxvalue 
max_year = max(train["YearBuilt"].max(), train["YearRemodAdd"].max(), train["YrSold"].max()) + 1
train["GarageYrBlt"] = train["GarageYrBlt"].replace(np.nan, max_year)
test["GarageYrBlt"] = test["GarageYrBlt"].replace(np.nan, max_year)
train["GarageAge"] = train["YrSold"] - train["GarageYrBlt"]
test["GarageAge"] = test["YrSold"] - test["GarageYrBlt"]

# GarageFinish - replace NaN with None + mapping
train["GarageFinish"] = train["GarageFinish"].replace(np.nan, "None")
test["GarageFinish"] = test["GarageFinish"].replace(np.nan, "None")
mapping = {"Fin": 1, "RFn": 2, "Unf": 3, "None": 0}
train["GarageFinish"] = train["GarageFinish"].map(mapping)
test["GarageFinish"] = test["GarageFinish"].map(mapping)

# GarageCond - mapping + replace NaN with None
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageCond"] = train["GarageCond"].replace(np.nan, "None")
test["GarageCond"] = test["GarageCond"].replace(np.nan, "None")
train["GarageCond"] = train["GarageCond"].map(mapping)
test["GarageCond"] = test["GarageCond"].map(mapping)

# GarageCars - create binary indicator and log transform + replace NaN with 0
train["GarageCars"] = train["GarageCars"].fillna(0)
test["GarageCars"] = test["GarageCars"].fillna(0)


# GarageType - replace NaN with None
train["GarageType"] = train["GarageType"].replace(np.nan, "None")
test["GarageType"] = test["GarageType"].replace(np.nan, "None")

# GarageQual - replace NaN with None + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageQual"] = train["GarageQual"].replace(np.nan, "None")
test["GarageQual"] = test["GarageQual"].replace(np.nan, "None")
train["GarageQual"] = train["GarageQual"].map(mapping)
test["GarageQual"] = test["GarageQual"].map(mapping)

#Same with WoodDeckSF
train["WoodDeckSF_log"] = np.log1p(train["WoodDeckSF"])  # log transform for skew
test["WoodDeckSF_log"] = np.log1p(test["WoodDeckSF"])

# Optionally keep raw or transform it
train["OpenPorchSF_log"] = np.log1p(train["OpenPorchSF"])  # log transform for skew
test["OpenPorchSF_log"] = np.log1p(test["OpenPorchSF"])

train["HasScreenPorch"] = (train["ScreenPorch"] > 0).astype(int)
train["ScreenPorchLog"] = np.log1p(train["ScreenPorch"])
test["HasScreenPorch"] = (test["ScreenPorch"] > 0).astype(int)
test["ScreenPorchLog"] = np.log1p(test["ScreenPorch"])

train["GotPool"] = train["PoolQC"].notnull().astype(int)
test["GotPool"] = test["PoolQC"].notnull().astype(int)


# Fence - replace NaN with None + mapping with quality and good wood
# Fence - mapping with quality and good wood
# Fence_wo: 2 if GdWo, 1 if MnWw, else 0
# Fence_Prv: 2 if GdPrv, 1 if MnPrv, else 0
def fence_wo(val):
    if val == "GdWo":
        return 2
    elif val == "MnWw":
        return 1
    else:
        return 0
def fence_prv(val):
    if val == "GdPrv":
        return 2
    elif val == "MnPrv":
        return 1
    else:
        return 0
train["Fence_wo"] = train["Fence"].apply(fence_wo)
test["Fence_wo"] = test["Fence"].apply(fence_wo)
train["Fence_Prv"] = train["Fence"].apply(fence_prv)
test["Fence_Prv"] = test["Fence"].apply(fence_prv)

mapping = {"GdWo": 2, "MnPrv": 1, "GdPrv": 2, "MnWw": 1, "None": 0}
train["Fence"] = train["Fence"].map(mapping)
test["Fence"] = test["Fence"].map(mapping)

train["MoSold_sin"] = np.sin(2 * np.pi * train["MoSold"] / 12)
train["MoSold_cos"] = np.cos(2 * np.pi * train["MoSold"] / 12)
test["MoSold_sin"] = np.sin(2 * np.pi * test["MoSold"] / 12)
test["MoSold_cos"] = np.cos(2 * np.pi * test["MoSold"] / 12)





#Feature engineering 
train["TotalArea"] = train["GrLivArea"] + train["TotalBsmtSF"] + train["GarageArea"]
train["BathPerRoom"] = (train["FullBath"] + train["HalfBath"]) / (train["TotRmsAbvGrd"] + 1)

test["TotalArea"] = test["GrLivArea"] + test["TotalBsmtSF"] + test["GarageArea"]
test["BathPerRoom"] = (test["FullBath"] + test["HalfBath"]) / (test["TotRmsAbvGrd"] + 1)

train["OverallQual_GrLiv"] = train["OverallQual"] * train["GrLivArea"]
train["Qual_x_Bath"] = train["OverallQual"] * (train["FullBath"] + train["HalfBath"])

test["OverallQual_GrLiv"] = test["OverallQual"] * test["GrLivArea"]
test["Qual_x_Bath"] = test["OverallQual"] * (test["FullBath"] + test["HalfBath"])

neighborhood_means = pd.concat([train, y], axis=1).groupby("Neighborhood")["SalePrice"].mean()
train["Neighborhood_mean"] = train["Neighborhood"].map(neighborhood_means)
test["Neighborhood_mean"] = test["Neighborhood"].map(neighborhood_means)




# Sale type - replace NaN with mode (WD)
train["SaleType"] = train["SaleType"].replace(np.nan, "WD")
test["SaleType"] = test["SaleType"].replace(np.nan, "WD")

train = train.drop(columns=["BsmtFinType2","ExterCond","PoolArea","OpenPorchSF","WoodDeckSF","LowQualFinSF","YearRemodAdd","YearBuilt","GarageYrBlt","PoolQC","Fence","Functional","GarageArea","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])
test = test.drop(columns=["BsmtFinType2","ExterCond","PoolArea","OpenPorchSF","WoodDeckSF","LowQualFinSF","YearRemodAdd","YearBuilt","GarageYrBlt","PoolQC","Fence","Functional","GarageArea","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])

print(test.columns)


Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars',
       'GarageQual', 'GarageCond', 'PavedDrive', 'MoSold', 'YrSold',
       'SaleType', 'SaleCondition', 'Age_Built', 'Age_RemodAdd',
       'HasBsmtFinSF2', 'HasBsmtFinSF1', 'HeatingQC_num', 'HasLowQualFinSF',
       'GarageAge', 'WoodDeckSF_log',

In [4]:
#je garde à contre coeur : LandSlope, Screeporch PavedDrive?
# truc potentiellement intéréssant : MiscVal kitchenAbove
# truc ou je me suis permis des dinguerie : PoolQC (et donc poolarea) 
num_cols = ["Neighborhood_mean","Qual_x_Bath","OverallQual_GrLiv","BathPerRoom","TotalArea","Fence_wo","Fence_Prv","GarageQual","KitchenQual","BsmtFinType1","BsmtFinSF2","MasVnrArea","ExterQual","YrSold","BsmtQual","BsmtCond","FireplaceQu","GarageFinish","BsmtExposure","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","GarageCond","HeatingQC_num","1stFlrSF","2ndFlrSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageAge","GarageCars","WoodDeckSF_log","MoSold_sin","MoSold_cos","Age_Built","Age_RemodAdd","LotFrontage","LotArea","OverallQual","OverallCond"]
cat_cols = ["MasVnrType","Foundation","HasBsmtFinSF1","Heating","CentralAir","Electrical","HasBsmtFinSF2","HasLowQualFinSF","GarageType","PavedDrive","HasScreenPorch","GotPool","SaleType","SaleCondition","Exterior1st","Exterior2nd","RoofStyle","MSSubClass","HouseStyle","BldgType","MSSubClass","LotShape","LandContour","LotConfig","LandSlope","Condition1"]

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot categorical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Pass-through numerical features
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop"  # drop unused raw columns like Name, Ticket, Cabin, etc.
)

In [5]:
X_train = train
X_train2= X_train.copy()
y_train = y
y_train2 = y_train.copy()
X_test = test
X_test2 = X_test.copy()


print("X_train columns:", X_train.columns.tolist())
print("X_test columns:", X_test.columns.tolist())

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

missing_in_test = set(X_train.columns) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(X_train.columns)
print("Colonnes manquantes dans test:", missing_in_test)
print("Colonnes manquantes dans train:", missing_in_train)

models = {
    "RandomForest": RandomForestRegressor(n_estimators=500, max_depth=None, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=3, random_state=42),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.0005, max_iter=5000),
    "SVR": SVR(kernel="rbf", C=20, epsilon=0.1),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "XGB": xgb.XGBRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, subsample=0.7, colsample_bytree=0.7, random_state=42),
    "LGBM": lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05, max_depth=-1, subsample=0.7, colsample_bytree=0.7, random_state=42),
}

model = RandomForestRegressor(n_estimators=200, random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 5-fold CV
cv = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=cv, scoring="neg_root_mean_squared_error"
)

print("RMSE par fold:", -scores)
print("RMSE moyen:", -np.mean(scores))

X_train columns: ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'Age_Built', 'Age_RemodAdd', 'HasBsmtFinSF2', 'HasBsmtFinSF1', 'HeatingQC_num', 'HasLowQualFinSF', 'GarageAge', 'WoodDeckSF_log', 'OpenPorchSF_log', 'HasScreenPorch', 'ScreenPorchLog', 'GotPool', 'Fence_wo', '

In [6]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [7]:
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": y_pred
})
submission.to_csv("submission.csv", index=False)

In [8]:
if False:   # Change to True to run all models and save their predictions
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in models.items():
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])
        
        # --- Cross-validation ---
        scores = cross_val_score(
            pipeline, X_train2, y_train2,
            cv=cv, scoring="neg_root_mean_squared_error", n_jobs=4
        )
        
        rmse_scores = -scores
        mean_rmse, std_rmse = rmse_scores.mean(), rmse_scores.std()
        print(f"{name}: RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")
        
        # --- Train full model and predict on X_test ---
        pipeline.fit(X_train2, y_train2)
        y_pred = pipeline.predict(X_test2)
        
        submission = pd.DataFrame({
            "Id": test_ids,        # Assumes you have test_ids defined
            "SalePrice": y_pred
        })
        
        filename = f"submission_{name}.csv"
        submission.to_csv(filename, index=False)
        print(f"Prédictions sauvegardées dans {filename}")


# Autogluon

In [9]:
# Calculer la matrice de corrélation pour les variables numériques
corr_matrix = X_train.corr(numeric_only=True).abs()

# On ne garde que la partie supérieure de la matrice (pour éviter les doublons)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Trouver les paires avec la plus forte corrélation (hors diagonale)
most_correlated = (
    upper.stack()
    .sort_values(ascending=False)
    .head(10)
)

print(most_correlated)

HasScreenPorch     ScreenPorchLog       0.996544
OverallQual        OverallQual_GrLiv    0.989678
BsmtFinSF2         HasBsmtFinSF2        0.988754
BsmtFinSF1         HasBsmtFinSF1        0.969787
GarageQual         GarageCond           0.959172
GarageCars         TotalArea            0.882630
Fireplaces         FireplaceQu          0.863241
BsmtFinType1       HasBsmtFinSF1        0.854392
                   BsmtFinSF1           0.852712
OverallQual_GrLiv  Qual_x_Bath          0.838232
dtype: float64


In [10]:

test_data = X_test.copy()
train_data = X_train.copy()
train_data['SalePrice'] = y_train

label = "SalePrice"

# Création du dataset AG
train_ag = TabularDataset(train_data)

# Création et entraînement du prédicteur
predictor = TabularPredictor(
        label=label,
        eval_metric="root_mean_squared_error",
        verbosity=2
    ).fit(
        train_data=train_ag,
        time_limit=1500,   # temps max en secondes
        presets="best_quality",  # meilleure qualité
        num_cpus=1,       # pas de Ray
        num_gpus=0
    )

# Leaderboard complet
lb = predictor.leaderboard(silent=False)

# On garde les 3 meilleurs modèles
top3_models = lb.sort_values("score_val").head(3)["model"].tolist()

# Test dataset en format AG
test_ag = TabularDataset(test_data)

# Génération des prédictions pour chaque modèle du top 3
for rank, model_name in enumerate(top3_models, start=1):
    preds = predictor.predict(test_ag, model=model_name)
    
    # Sauvegarde avec nom dynamique
    filename = f"submission_{rank}_{model_name}.csv"
    submission = pd.DataFrame({
        "Id": test_ids,
        "SalePrice": preds
    })
    submission.to_csv(filename, index=False)
    print(f"Fichier {filename} généré !")

No path specified. Models will be saved in: "AutogluonModels\ag-20250928_141948"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.5
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          16
Memory Avail:       11.14 GB / 31.92 GB (34.9%)
Disk Space Avail:   795.91 GB / 1861.39 GB (42.8%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then

[36m(_ray_fit pid=9532)[0m [1000]	valid_set's rmse: 28433.2


[36m(_dystack pid=16636)[0m 	-24898.3526	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	6.04s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.21s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 228.73s of the 352.09s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus=0, memory=0.20%)
[36m(_dystack pid=16636)[0m 	-27021.6016	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	1.4s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.06s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 224.47s of the 347.83s of remaining time.
[36m(_dystack pid=16636)[0m 	-28614.1102	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	0.99s	 = Training   runt

[36m(_ray_fit pid=31200)[0m [1000]	valid_set's rmse: 38060.9[32m [repeated 10x across cluster][0m


[36m(_dystack pid=16636)[0m 	-30441.3345	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	6.21s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.1s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: CatBoost_r177_BAG_L1 ... Training model for up to 136.92s of the 260.28s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus=0, memory=4.27%)
[36m(_ray_fit pid=17072)[0m 	Ran out of time, early stopping on iteration 320.
[36m(_dystack pid=16636)[0m 	-24332.5509	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	14.62s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.1s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: NeuralNetTorch_r79_BAG_L1 ... Training model for up to 119.80s of the 243.16s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F

[36m(_ray_fit pid=23624)[0m [1000]	valid_set's rmse: 21993.6


[36m(_dystack pid=16636)[0m 	-26747.4786	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	3.93s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.12s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: NeuralNetFastAI_r191_BAG_L1 ... Training model for up to 93.04s of the 216.40s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus=0, memory=0.06%)
[36m(_ray_fit pid=16252)[0m 	Ran out of time, stopping training early. (Stopping on epoch 47)[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=16072)[0m 	Ran out of time, stopping training early. (Stopping on epoch 35)
[36m(_ray_fit pid=9776)[0m 	Ran out of time, stopping training early. (Stopping on epoch 33)
[36m(_dystack pid=16636)[0m 	-27266.1363	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	14.64s	 = Training   runtime
[36m(_

[36m(_ray_fit pid=20324)[0m [1000]	valid_set's rmse: 21368.8


[36m(_dystack pid=16636)[0m 	-25276.1545	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	7.04s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.31s	 = Validation runtime
[36m(_ray_fit pid=4240)[0m 	Ran out of time, early stopping on iteration 39.[32m [repeated 7x across cluster][0m
[36m(_dystack pid=16636)[0m Fitting model: NeuralNetTorch_r22_BAG_L1 ... Training model for up to 53.65s of the 177.01s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus=0, memory=0.03%)
[36m(_dystack pid=16636)[0m 	Time limit exceeded... Skipping NeuralNetTorch_r22_BAG_L1.
[36m(_dystack pid=16636)[0m Fitting model: XGBoost_r33_BAG_L1 ... Training model for up to 44.58s of the 167.94s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus

[36m(_ray_fit pid=30744)[0m [1000]	valid_set's rmse: 28703.3[32m [repeated 20x across cluster][0m


[36m(_ray_fit pid=17788)[0m 	Ran out of time, early stopping on iteration 2343. Best iteration is:
[36m(_ray_fit pid=17788)[0m 	[2093]	valid_set's rmse: 19001.1
[36m(_ray_fit pid=26292)[0m 	Ran out of time, early stopping on iteration 5.[32m [repeated 7x across cluster][0m
[36m(_dystack pid=16636)[0m 	-26999.4661	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=16636)[0m 	2.66s	 = Training   runtime
[36m(_dystack pid=16636)[0m 	0.12s	 = Validation runtime
[36m(_dystack pid=16636)[0m Fitting model: NeuralNetTorch_r22_BAG_L2 ... Training model for up to 7.99s of the 7.92s of remaining time.
[36m(_dystack pid=16636)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (1 workers, per: cpus=1, gpus=0, memory=0.05%)
[36m(_dystack pid=16636)[0m 	Time limit exceeded... Skipping NeuralNetTorch_r22_BAG_L2.
[36m(_dystack pid=16636)[0m Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.00s of the -0.13s o

                          model     score_val              eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L3 -22839.150083  root_mean_squared_error       2.741976  275.192782                0.000000           0.014096            3       True         65
1           WeightedEnsemble_L2 -23037.971240  root_mean_squared_error       1.348478  160.101720                0.001072           0.017421            2       True         39
2            XGBoost_r89_BAG_L2 -23635.360087  root_mean_squared_error       2.441849  249.908161                0.089715           3.191523            2       True         63
3                XGBoost_BAG_L2 -23835.516389  root_mean_squared_error       2.424677  250.696927                0.072543           3.980289            2       True         46
4            XGBoost_r33_BAG_L2 -23938.030794  root_mean_squared_error       2.532021  258.767847                0.17988

#  Optuna

In [11]:
if False:  # mettre à True pour activer l'optimisation
    def objective(trial):
        # Choix du modèle
        model_name = trial.suggest_categorical("model", ["xgb", "lgbm", "gbr"])
        
        if model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
                "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
                "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
                "random_state": 42,
                "n_jobs": 4,
            }
            model = xgb.XGBRegressor(**params)
        
        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "num_leaves": trial.suggest_int("num_leaves", 20, 100),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
                "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
                "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
                "random_state": 42,
                "n_jobs": 4
            }
            model = lgb.LGBMRegressor(**params)
        
        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
                "max_depth": trial.suggest_int("max_depth", 5, 50),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "random_state": 42,
                "n_jobs": 4
            }
            model = RandomForestRegressor(**params)
        
        elif model_name == "gbr":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
                "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "random_state": 42
            }
            model = GradientBoostingRegressor(**params)
        
        # Pipeline avec preprocessing
        pipeline = Pipeline(steps=[
            ("preprocessor", preprocessor),
            ("model", model)
        ])
        
        # Validation croisée
        cv = KFold(n_splits=3, shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline, X_train, y_train,
            cv=cv, scoring="neg_root_mean_squared_error", n_jobs=4
        )
        
        rmse = -scores.mean()
        return rmse

    study = optuna.create_study(
        direction="minimize",
        pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1)
        )  # on veut minimiser le RMSE
    study.optimize(objective, n_trials=500, timeout=1500, n_jobs=2)  # 500 essais, en parallèle, 1500 secondes max

    print("Best trial:")
    print("  RMSE:", study.best_value)
    print("  Params:", study.best_params)

    best_params = study.best_params
    best_model_name = best_params["model"]

    # enlever la clé "model"
    model_params = {k: v for k, v in best_params.items() if k != "model"}

    if best_model_name == "xgb":
        model = xgb.XGBRegressor(**model_params)
    elif best_model_name == "lgbm":
        model = lgb.LGBMRegressor(**model_params)
    elif best_model_name == "rf":
        model = RandomForestRegressor(**model_params)
    elif best_model_name == "gbr":
        model = GradientBoostingRegressor(**model_params)

    best_model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # fit + prédiction
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    submission = pd.DataFrame({
        "Id": test_ids,  
        "SalePrice": y_pred
    })
    submission.to_csv("submission_optuna.csv", index=False)

    print("Submission saved as submission_optuna.csv")