# House 

In [26]:
import numpy as np
import pandas as pd
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from ydata_profiling import ProfileReport

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['Id']

In [27]:
y= train["SalePrice"]
train = train.drop(columns=["SalePrice","Id"])
test = test.drop(columns=["Id"])

# MSZoning - replace NaN with mode (RL)
train["MSZoning"] = train["MSZoning"].replace(np.nan, "RL")
test["MSZoning"] = test["MSZoning"].replace(np.nan, "RL")


# LotFrontage - replace NaN with median of neighborhood
train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)
test["LotFrontage"] = test.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

# Age_Built and Age_RemodAdd - create new features
train["Age_Built"] = train["YrSold"] - train["YearBuilt"]
train["Age_RemodAdd"] = train["YrSold"] - train["YearRemodAdd"]
test["Age_Built"] = test["YrSold"] - test["YearBuilt"]
test["Age_RemodAdd"] = test["YrSold"] - test["YearRemodAdd"]

# LotArea - use log transformation to reduce skewness
train["LotArea"] = np.log1p(train["LotArea"]+1)
test["LotArea"] = np.log1p(test["LotArea"]+1)



# MasVnrType - replace NaN with None
train["MasVnrType"] = train["MasVnrType"].replace(np.nan, "None")
test["MasVnrType"] = test["MasVnrType"].replace(np.nan, "None")

# MasVnrArea - replace NaN with 0 + log transform
train["MasVnrArea"] = train["MasVnrArea"].replace(np.nan, 0)
test["MasVnrArea"] = test["MasVnrArea"].replace(np.nan, 0)
train["MasVnrArea"] = np.log1p(train["MasVnrArea"]+1)  # log transform for skew
test["MasVnrArea"] = np.log1p(test["MasVnrArea"]+1)  # log transform for skew

# ExterCond - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterCond"] = train["ExterCond"].map(mapping)
test["ExterCond"] = test["ExterCond"].map(mapping)
# ExterQual - mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["ExterQual"] = train["ExterQual"].map(mapping)
test["ExterQual"] = test["ExterQual"].map(mapping)

#BstmQual - replace NaN with None + mapping
train["BsmtQual"] = train["BsmtQual"].replace(np.nan, "None")
test["BsmtQual"] = test["BsmtQual"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtQual"] = train["BsmtQual"].map(mapping)
test["BsmtQual"] = test["BsmtQual"].map(mapping)

# BsmtCOnd - replace NaN with None + mapping
train["BsmtCond"] = train["BsmtCond"].replace(np.nan, "None")
test["BsmtCond"] = test["BsmtCond"].replace(np.nan, "None")
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["BsmtCond"] = train["BsmtCond"].map(mapping)
test["BsmtCond"] = test["BsmtCond"].map(mapping)

# BsmtExposure - replace NaN with None + mapping
train["BsmtExposure"] = train["BsmtExposure"].replace(np.nan, "None")
test["BsmtExposure"] = test["BsmtExposure"].replace(np.nan, "None")
mapping = {"None": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4}
train["BsmtExposure"] = train["BsmtExposure"].map(mapping)
test["BsmtExposure"] = test["BsmtExposure"].map(mapping)

#BsmtfinType1 - replace NaN with None + mapping
train["BsmtFinType1"] = train["BsmtFinType1"].replace(np.nan, "None")
test["BsmtFinType1"] = test["BsmtFinType1"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType1"] = train["BsmtFinType1"].map(mapping)
test["BsmtFinType1"] = test["BsmtFinType1"].map(mapping)

# BsmtFinSF1 - replace NaN with 0 + log transform + binary indicator
train["BsmtFinSF1"] = train["BsmtFinSF1"].replace(np.nan, 0)
test["BsmtFinSF1"] = test["BsmtFinSF1"].replace(np.nan, 0)
train["HasBsmtFinSF1"] = (train["BsmtFinSF1"] > 0).astype(int)
test["HasBsmtFinSF1"] = (test["BsmtFinSF1"] > 0).astype(int)
train["BsmtFinSF1"] = np.log1p(train["BsmtFinSF1"]+1)  # log transform for skew
test["BsmtFinSF1"] = np.log1p(test["BsmtFinSF1"]+1)  # log transform for skew

# BsmtFinType2 - replace NaN with None + mapping
train["BsmtFinType2"] = train["BsmtFinType2"].replace(np.nan, "None")
test["BsmtFinType2"] = test["BsmtFinType2"].replace(np.nan, "None")
mapping = {"None": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6}
train["BsmtFinType2"] = train["BsmtFinType2"].map(mapping)
test["BsmtFinType2"] = test["BsmtFinType2"].map(mapping)

# BsmtUnfSF - replace NaN with 0 + log transform
train["BsmtUnfSF"] = train["BsmtUnfSF"].replace(np.nan, 0)
test["BsmtUnfSF"] = test["BsmtUnfSF"].replace(np.nan, 0)
train["BsmtUnfSF"] = np.log1p(train["BsmtUnfSF"]+1)  # log transform for skew
test["BsmtUnfSF"] = np.log1p(test["BsmtUnfSF"]+1)  # log transform for skew

# TotalBsmtSF - use log transformation to reduce skewness
# replace NaN with 0 (no basement)
train["TotalBsmtSF"] = train["TotalBsmtSF"].replace(np.nan, 0)
test["TotalBsmtSF"] = test["TotalBsmtSF"].replace(np.nan, 0)
train["TotalBsmtSF"] = np.log1p(train["TotalBsmtSF"]+1)
test["TotalBsmtSF"] = np.log1p(test["TotalBsmtSF"]+1)

# HeatingQC 
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["HeatingQC_num"] = train["HeatingQC"].map(mapping)
test["HeatingQC_num"] = test["HeatingQC"].map(mapping)

# Electrical - replace NaN with mode (SBrkr)
train["Electrical"] = train["Electrical"].replace(np.nan, "SBrkr")
test["Electrical"] = test["Electrical"].replace(np.nan, "SBrkr")

# First Floor SF - use log transformation to reduce skewness
train["1stFlrSF"] = np.log1p(train["1stFlrSF"]+1)
test["1stFlrSF"] = np.log1p(test["1stFlrSF"]+1)

# 2nd Flr SF - create binary indicator + log transform
train["Has2ndFlr"] = (train["2ndFlrSF"] > 0).astype(int)
test["Has2ndFlr"] = (test["2ndFlrSF"] > 0).astype(int)
train["2ndFlrSF"] = np.log1p(train["2ndFlrSF"]+1)  # log transform for skew
test["2ndFlrSF"] = np.log1p(test["2ndFlrSF"]+1)  # log transform for skew

# Low Qualfin SF - binomial indicator
train["HasLowQualFinSF"] = (train["LowQualFinSF"] > 0).astype(int)
test["HasLowQualFinSF"] = (test["LowQualFinSF"] > 0).astype(int)

# GrLivArea - use log transformation to reduce skewness
train["GrLivArea"] = np.log1p(train["GrLivArea"]+1)
test["GrLivArea"] = np.log1p(test["GrLivArea"]+1)

# Kitchen Qual - replace NaN with mode (TA) + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
train["KitchenQual"] = train["KitchenQual"].replace(np.nan, "TA")
test["KitchenQual"] = test["KitchenQual"].replace(np.nan, "TA")
train["KitchenQual"] = train["KitchenQual"].map(mapping)
test["KitchenQual"] = test["KitchenQual"].map(mapping)

#FireplaceQu - replace NaN with None + mapping
mapping = {"None": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}  
train["FireplaceQu"] = train["FireplaceQu"].replace(np.nan, "None")
test["FireplaceQu"] = test["FireplaceQu"].replace(np.nan, "None")
train["FireplaceQu"] = train["FireplaceQu"].map(mapping)
test["FireplaceQu"] = test["FireplaceQu"].map(mapping)

# GarageYrBlt - create new feature
train["GarageAge"] = train["YrSold"] - train["GarageYrBlt"]
test["GarageAge"] = test["YrSold"] - test["GarageYrBlt"]

# GarageFinish - replace NaN with None + mapping
train["GarageFinish"] = train["GarageFinish"].replace(np.nan, "None")
test["GarageFinish"] = test["GarageFinish"].replace(np.nan, "None")
mapping = {"Fin": 1, "RFn": 2, "Unf": 3, "None": 0}
train["GarageFinish"] = train["GarageFinish"].map(mapping)
test["GarageFinish"] = test["GarageFinish"].map(mapping)

# GarageCond - mapping + replace NaN with None
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageCond"] = train["GarageCond"].replace(np.nan, "None")
test["GarageCond"] = test["GarageCond"].replace(np.nan, "None")
train["GarageCond"] = train["GarageCond"].map(mapping)
test["GarageCond"] = test["GarageCond"].map(mapping)

# GarageArea - create binary indicator and log transform
train["HasGarage"] = (train["GarageArea"] > 0).astype(int)
test["HasGarage"] = (test["GarageArea"] > 0).astype(int)
train["GarageArea_log"] = np.log1p(train["GarageArea"]+1)  # log transform for skew
test["GarageArea_log"] = np.log1p(test["GarageArea"]+1)

# GarageQual - replace NaN with None + mapping
mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5, "None": 0}
train["GarageQual"] = train["GarageQual"].replace(np.nan, "None")
test["GarageQual"] = test["GarageQual"].replace(np.nan, "None")
train["GarageQual"] = train["GarageQual"].map(mapping)
test["GarageQual"] = test["GarageQual"].map(mapping)

#Same with WoodDeckSF
train["HasWoodDeck"] = (train["WoodDeckSF"] > 0).astype(int)
test["HasWoodDeck"] = (test["WoodDeckSF"] > 0).astype(int)
train["WoodDeckSF_log"] = np.log1p(train["WoodDeckSF"]+1)  # log transform for skew
test["WoodDeckSF_log"] = np.log1p(test["WoodDeckSF"]+1)

# Binary indicator
train["HasOpenPorch"] = (train["OpenPorchSF"] > 0).astype(int)
test["HasOpenPorch"] = (test["OpenPorchSF"] > 0).astype(int)

# Optionally keep raw or transform it
train["OpenPorchSF_log"] = np.log1p(train["OpenPorchSF"]+1)  # log transform for skew
test["OpenPorchSF_log"] = np.log1p(test["OpenPorchSF"]+1)

train["HasScreenPorch"] = (train["ScreenPorch"] > 0).astype(int)
train["ScreenPorchLog"] = np.log1p(train["ScreenPorch"]+1)
test["HasScreenPorch"] = (test["ScreenPorch"] > 0).astype(int)
test["ScreenPorchLog"] = np.log1p(test["ScreenPorch"]+1)

train["GotPool"] = train["PoolQC"].notnull().astype(int)
test["GotPool"] = test["PoolQC"].notnull().astype(int)

# Fence - replace NaN with None + mapping with quality and good wood
# Fence - mapping with quality and good wood
# Fence_wo: 2 if GdWo, 1 if MnWw, else 0
# Fence_Prv: 2 if GdPrv, 1 if MnPrv, else 0
def fence_wo(val):
    if val == "GdWo":
        return 2
    elif val == "MnWw":
        return 1
    else:
        return 0
def fence_prv(val):
    if val == "GdPrv":
        return 2
    elif val == "MnPrv":
        return 1
    else:
        return 0
train["Fence_wo"] = train["Fence"].apply(fence_wo)
test["Fence_wo"] = test["Fence"].apply(fence_wo)
train["Fence_Prv"] = train["Fence"].apply(fence_prv)
test["Fence_Prv"] = test["Fence"].apply(fence_prv)

mapping = {"GdWo": 2, "MnPrv": 1, "GdPrv": 2, "MnWw": 1, "None": 0}
train["Fence"] = train["Fence"].map(mapping)
test["Fence"] = test["Fence"].map(mapping)

train["MoSold_sin"] = np.sin(2 * np.pi * train["MoSold"] / 12)
train["MoSold_cos"] = np.cos(2 * np.pi * train["MoSold"] / 12)
test["MoSold_sin"] = np.sin(2 * np.pi * test["MoSold"] / 12)
test["MoSold_cos"] = np.cos(2 * np.pi * test["MoSold"] / 12)

# Sale type - replace NaN with mode (WD)
train["SaleType"] = train["SaleType"].replace(np.nan, "WD")
test["SaleType"] = test["SaleType"].replace(np.nan, "WD")

train = train.drop(columns=["Functional","GarageCars","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])
test = test.drop(columns=["Functional","GarageCars","EnclosedPorch","3SsnPorch","ScreenPorch","MiscFeature","MiscVal","RoofMatl","Condition2","Alley", "Street", "Utilities", "MiscFeature"])


In [28]:
#je garde à contre coeur : LandSlope, Screeporch PavedDrive?
# truc potentiellement intéréssant : MiscVal kitchenAbove
# truc ou je me suis permis des dinguerie : PoolQC (et donc poolarea) 
num_cols = ["Fence_wo","Fence_Prv","GarageQual","KitchenQual","BsmtFinType2","BsmtFinType1","MasVnrArea","ExterQual","ExterCond","BsmtQual","BsmtCond","FireplaceQu","GarageFinish","BsmtExposure","BsmtFinSF1","BsmtUnfSF","TotalBsmtSF","GarageCond","HeatingQC_num","1stFlrSF","2ndFlrSF","GrLivArea","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageAge","GarageArea_log","WoodDeckSF_log","HasOpenPorch","HasScreenPorch","MoSold_sin","MoSold_cos","Age_Built","Age_RemodAdd","LotFrontage","LotArea","OverallQual","OverallCond"]
cat_cols = ["MasVnrType","Foundation","HasBsmtFinSF1","Heating","CentralAir","Electrical","Has2ndFlr","HasLowQualFinSF","GarageType","HasGarage","PavedDrive","HasWoodDeck","HasOpenPorch","HasScreenPorch","GotPool","YrSold","SaleType","SaleCondition","Exterior1st","Exterior2nd","RoofStyle","MSSubClass","HouseStyle","Neighborhood","BldgType","MSSubClass","LotShape","LandContour","LotConfig","LandSlope","Condition1"]

preprocessor = ColumnTransformer(
    transformers=[
        # One-hot categorical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        # Pass-through numerical features
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop"  # drop unused raw columns like Name, Ticket, Cabin, etc.
)

In [29]:
X_train = train
y_train = y
X_test = test

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

missing_in_test = set(X_train.columns) - set(X_test.columns)
missing_in_train = set(X_test.columns) - set(X_train.columns)
print("Colonnes manquantes dans test:", missing_in_test)
print("Colonnes manquantes dans train:", missing_in_train)



model = RandomForestRegressor(n_estimators=200, random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# 5-fold CV
cv = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    pipeline, X_train, y_train,
    cv=cv, scoring="neg_root_mean_squared_error"
)

print("RMSE par fold:", -scores)
print("RMSE moyen:", -np.mean(scores))

X_train shape: (1460, 87)
y_train shape: (1460,)
X_test shape: (1459, 87)
Colonnes manquantes dans test: set()
Colonnes manquantes dans train: set()
RMSE par fold: [28505.138272   26921.22664202 45664.71179955 28073.75117136
 24546.28372522]
RMSE moyen: 30742.22232203175


In [30]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [31]:
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": y_pred
})
submission.to_csv("submission.csv", index=False)