In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# 1. Chargement des données
df = pd.read_csv("ames.csv")

# 2. Préparation des données
# Extraction de la cible
df["LogSalePrice"] = np.log1p(df["SalePrice"])
y = df.pop("LogSalePrice")

# Suppression de la cible originale
df.drop("SalePrice", axis=1, inplace=True)

# 3. Création de fonctionnalités personnalisées
def create_custom_features(df):
    df = df.copy()
    df["TotalSF"] = df["TotalBsmtSF"] + df["FirstFlrSF"] + df["SecondFlrSF"]
    df["Age"] = 2025 - df["YearBuilt"]
    df["RemodelAge"] = 2025 - df["YearRemodAdd"]
    df["TotalBath"] = df["FullBath"] + (0.5 * df["HalfBath"])
    return df

df = create_custom_features(df)

# 4. Mise à l'échelle des caractéristiques
features_to_scale = [
    "LotFrontage", "LotArea", "TotalSF", "Age", "RemodelAge", "TotalBath"
]
scaler = StandardScaler()
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# 5. Encodage des variables catégorielles
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# 6. Suppression des caractéristiques avec un MI Score de 0
def calculate_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    return pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

mi_scores = calculate_mi_scores(df_encoded, y)
drop_features = mi_scores[mi_scores == 0].index
df_encoded = df_encoded.drop(columns=drop_features)

# 7. Ajout des distances aux clusters
kmeans = KMeans(n_clusters=10, random_state=0)
distance_features = kmeans.fit_transform(df_encoded)
for i in range(distance_features.shape[1]):
    df_encoded[f"Distance_to_Cluster_{i}"] = distance_features[:, i]

# 8. Modélisation
X = df_encoded.copy()

def score_dataset(X, y, model=XGBRegressor()):
    score = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_log_error")
    score = -1 * score.mean()
    return np.sqrt(score)

# Optimisation des hyperparamètres
model_optimized = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=0
)

final_score = score_dataset(X, y, model=model_optimized)
print(f"RMSLE du modèle optimisé : {final_score}")


RMSLE du modèle optimisé : 0.010666736598226052
