In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score


import warnings
# Install a blanket "ignore" filter: no warning of any category is shown after this line.
# NOTE(review): this also hides deprecation and numerical warnings raised by the
# libraries used in later cells — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

In [5]:
# Location of the training table, relative to the working directory.
data_path_train = "data/df_train.csv"

# Columns removed from the training frame right after loading.
columns_to_drop = [
    'BsmtHalfBath',
    'PoolQC_Fa',
    'Utilities_NoSeWa',
    'PoolQC_Gd',
    'BsmtFinSF2',
]

# Read and prune in one expression so re-running the cell always starts from the file.
df_train = pd.read_csv(data_path_train).drop(columns=columns_to_drop)

In [10]:
# Separate the regression target from the predictor columns.
target_column = 'SalePrice'
X = df_train.drop(columns=[target_column])
y = df_train.loc[:, target_column]
print(f"Target column '{target_column}' found. Shape: X={X.shape}, y={y.shape}")

# Labels of the predictor columns whose dtype is numeric.
numeric_frame = X.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()

Target column 'SalePrice' found. Shape: X=(1460, 93), y=(1460,)


In [11]:
# Shuffled 80/20 hold-out split with a fixed seed so the partition is repeatable.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [2]:
# Final estimator: histogram-based gradient-boosted trees with a squared-error objective.
# NOTE(review): n_jobs=-1 asks the booster for every core; if this pipeline is fitted
# inside an outer parallel search the two levels may compete for CPUs — confirm.
xgb_regressor = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    tree_method="hist",
    n_jobs=-1,
)

# Standardise -> keep the k best features by univariate F-test -> boosted regressor.
# The value of k is left at the selector's default here.
pipeline_xgb = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("feature_selection", SelectKBest(score_func=f_regression)),
        ("model", xgb_regressor),
    ]
)

In [3]:
# Hyper-parameter candidates; keys use the "<step>__<parameter>" form so they can be
# routed to the matching pipeline step.
param_grid_xgb = {
    # Number of features retained by the selector step.
    "feature_selection__k": [30, 40, 50],
    # Booster size, depth and step size.
    "model__n_estimators": [500, 800],
    "model__max_depth": [3, 4, 6],
    "model__learning_rate": [0.03, 0.05],
    # Row / column subsampling fractions.
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0],
    # L1 / L2 regularisation strengths.
    "model__reg_alpha": [0.0, 0.1],
    "model__reg_lambda": [1.0, 1.5],
}

In [12]:
# Predictor column labels as a plain Python list.
feature_names = X.columns.to_list()

In [13]:
# Registry of outcomes; the fitted search is filed under the experiment label `name`.
results = {}
name = 'k_best_xgb'

# Grid search over param_grid_xgb, 5 CV splits per candidate, ranked by R².
search_settings = dict(cv=5, scoring='r2', n_jobs=-1, verbose=0)
grid_search = GridSearchCV(pipeline_xgb, param_grid_xgb, **search_settings)
grid_search.fit(X_train, y_train)

# Keep the winning pipeline, its parameters and score, plus the search object itself.
results[name] = dict(
    model=grid_search.best_estimator_,
    best_params=grid_search.best_params_,
    best_cv_score=grid_search.best_score_,
    grid_search=grid_search,
)


# Predictions from the fitted search on both sides of the hold-out split.
y_pred_train = grid_search.predict(X_train)
y_pred_test = grid_search.predict(X_test)

# The same three regression scores are computed for each split, in this key order.
metric_functions = {
    'MAE': mean_absolute_error,
    'RMSE': root_mean_squared_error,
    'R2': r2_score,
}
train_metrics = {
    label: score(y_train, y_pred_train)
    for label, score in metric_functions.items()
}
test_metrics = {
    label: score(y_test, y_pred_test)
    for label, score in metric_functions.items()
}

# Stored at the top level of `results`, next to the per-model entry.
results['train_metrics'] = train_metrics
results['test_metrics'] = test_metrics

# Short report: cross-validated score, hold-out score and the selected parameters.
print(f"  Best CV R²: {grid_search.best_score_:.4f}")
print(f"  Test R²:    {test_metrics['R2']:.4f}")
print(f"  Best params: {grid_search.best_params_}")

  Best CV R²: 0.8905
  Test R²:    0.9103
  Best params: {'feature_selection__k': 40, 'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 500, 'model__reg_alpha': 0.1, 'model__reg_lambda': 1.0, 'model__subsample': 0.8}


In [21]:
# Location of the unlabeled competition table, relative to the working directory.
data_path_test_kaggle = "data/df_test.csv"
df_test_kaggle = pd.read_csv(data_path_test_kaggle)

# Take exactly the training predictor columns, in the same order, then replace
# missing values with 0.
# NOTE(review): the 0 is inserted before the pipeline's scaler sees the data, so it is
# a raw-scale zero rather than a column mean — confirm this imputation is intended.
X_test_kaggle = df_test_kaggle[X.columns].fillna(0)

In [23]:
# Predict sale prices for the competition rows with the fitted grid search.
y_test_kaggle = grid_search.predict(X_test_kaggle)

In [30]:
# Attach the predictions to the loaded competition frame as a new 'SalePrice' column
# (modifies df_test_kaggle in place).
df_test_kaggle['SalePrice'] = y_test_kaggle

In [32]:
# Write only the identifier and the predicted price, without the row index.
submission_columns = ['Id', 'SalePrice']
df_test_kaggle[submission_columns].to_csv('./data/submission.csv', index=False)

In [36]:
# Pull the fitted regressor out of the stored best pipeline and read its
# per-feature importance scores.
best_pipeline = results[name]['model']
fitted_steps = best_pipeline.named_steps
xgb_model = fitted_steps['model']
importances = xgb_model.feature_importances_

In [38]:
# The regressor only saw the columns kept by the selector step, so label the
# importances with the columns flagged in the selector's support mask.
selector = best_pipeline.named_steps['feature_selection']
selected_mask = selector.get_support()
feature_names = X.columns[selected_mask]

# Rank features from most to least important.
feat_imp_series = pd.Series(data=importances, index=feature_names)
feat_imp_series = feat_imp_series.sort_values(ascending=False)

print(feat_imp_series)

OverallQual         0.311927
KitchenQual_TA      0.105489
Neighborhood        0.103014
GrLivArea           0.084454
CentralAir_Y        0.035669
GarageType          0.028442
Fireplaces          0.027851
TotalBsmtSF         0.026982
GarageArea          0.024796
1stFlrSF            0.023122
BsmtFinSF1          0.021554
2ndFlrSF            0.021521
YearRemodAdd        0.017802
FullBath            0.015917
BsmtQual_Gd         0.013521
BsmtExposure_Gd     0.012722
FireplaceQu         0.011873
OpenPorchSF         0.011656
YearBuilt           0.009528
KitchenQual_Gd      0.009497
LotArea             0.008856
BsmtExposure_No     0.008042
SaleType            0.007979
GarageFinish_Unf    0.007243
BsmtQual_TA         0.005770
WoodDeckSF          0.004468
LotShape_Reg        0.004125
ExterQual_Gd        0.003861
HalfBath            0.003497
HeatingQC           0.003415
ExterQual_TA        0.003191
Exterior1st         0.003173
MasVnrArea          0.002868
MasVnrType_Stone    0.002764
MSZoning      