In [1]:
# Import XGBoost to demonstrate native handling of missing values
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Load the Ames housing dataset
Ames = pd.read_csv("Ames.csv")

# Keep only the numeric columns that contain at least one missing value.
# They are handed to XGBoost as-is: no imputation step is applied here.
cols_with_missing = Ames.isnull().any()
X = Ames.loc[:, cols_with_missing].select_dtypes(include=["int", "float"])
y = Ames["SalePrice"]

# Report how many missing cells remain in the feature matrix
total_missing_values = X.isna().sum().sum()
print(f"Total number of missing values: {total_missing_values}")

# Default XGBoost regressor with a fixed seed for reproducibility.
# The seed is set through `random_state`, the estimator's declared
# scikit-learn-style parameter, so it is a regular hyperparameter when
# cross_val_score clones the model for each fold.
xgb_model = xgb.XGBRegressor(random_state=42)

# 5-fold cross-validation, scored by R^2
scores = cross_val_score(xgb_model, X, y, cv=5, scoring="r2")

# Average the per-fold scores and display the result
mean_r2 = scores.mean()
print(f"XGB with native imputing, average R^2 score: {mean_r2:.4f}")

Total number of missing values: 829
XGB with native imputing, average R^2 score: 0.7547


In [1]:
# Demonstrate native handling of categorical features
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Load the Ames housing dataset
Ames = pd.read_csv("Ames.csv")

# Cast the chosen columns to pandas "category" dtype in one call so that
# XGBoost (with enable_categorical=True below) can consume them directly.
categorical_cols = ["Neighborhood", "BldgType", "HouseStyle"]
Ames = Ames.astype({col: "category" for col in categorical_cols})

# Feature matrix: five numeric predictors plus the three categorical ones
selected_features = [
    "OverallQual", "GrLivArea", "YearBuilt", "TotalBsmtSF", "1stFlrSF",
    "Neighborhood", "BldgType", "HouseStyle"
]
X = Ames[selected_features]
y = Ames["SalePrice"]

# XGBoost regressor with native categorical support switched on.
# The seed is set through `random_state`, the estimator's declared
# scikit-learn-style parameter, so it is a regular hyperparameter when
# cross_val_score clones the model for each fold.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    enable_categorical=True
)

# 5-fold cross-validation, scored by R^2
scores = cross_val_score(xgb_model, X, y, cv=5, scoring="r2")

# Average the per-fold scores and display the result
mean_r2 = scores.mean()
print(f"Average model R^2 score with selected categorical features: {mean_r2:.4f}")

Average model R^2 score with selected categorical features: 0.8543


In [3]:
# Perform Cross-Validated Recursive Feature Elimination for XGB
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score

# Load the Ames housing dataset
Ames = pd.read_csv("Ames.csv")

# These three columns are cast to "object" so that the next step encodes
# them together with the other object-typed columns.
for col in ["MSSubClass", "YrSold", "MoSold"]:
    Ames[col] = Ames[col].astype("object")

# Replace every object-typed column with its integer category codes.
# Note: pandas assigns code -1 to missing values, so after this step a
# missing category is an ordinary integer value rather than NaN.
categorical_features = Ames.select_dtypes(include=["object"]).columns
for col in categorical_features:
    Ames[col] = Ames[col].astype("category").cat.codes

# Features are everything except the target and the PID column
X = Ames.drop(columns=["SalePrice", "PID"])
y = Ames["SalePrice"]

# XGBoost regressor used as the ranking estimator inside RFECV.
# The seed is set through `random_state`, the estimator's declared
# scikit-learn-style parameter, so it is a regular hyperparameter when
# RFECV clones the model.
# NOTE(review): no "category"-dtype columns are left in X after the
# encoding above, so enable_categorical presumably has no effect here.
xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)

# Recursive feature elimination: drop one feature per step and pick the
# subset size with the best 5-fold cross-validated R^2.
rfecv = RFECV(estimator=xgb_model, step=1, cv=5, scoring="r2", min_features_to_select=1)

# Run the elimination on the full feature matrix
rfecv.fit(X, y)

# Print the optimal number of features and their names
print("Optimal number of features: ", rfecv.n_features_)
print("Best features: ", X.columns[rfecv.support_])

Optimal number of features:  36
Best features:  Index(['GrLivArea', 'MSZoning', 'LotArea', 'Neighborhood', 'Condition1',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'ExterQual', 'BsmtQual', 'BsmtFinType1', 'BsmtFinSF1', 'TotalBsmtSF',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath',
       'KitchenQual', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 'ScreenPorch',
       'MoSold', 'SaleType', 'SaleCondition', 'GeoRefNo', 'Latitude',
       'Longitude'],
      dtype='object')


In [6]:
# RFECV on an XGBoost regressor, followed by cross-validation of a fresh
# model restricted to the selected features.
# This cell relies on `Ames`, `xgb`, `RFECV` and `cross_val_score` already
# being defined by an earlier cell; it neither imports nor loads anything.
X = Ames.drop(columns=["SalePrice", "PID"])
y = Ames["SalePrice"]

# Base estimator. The seed is set through `random_state`, the estimator's
# declared scikit-learn-style parameter, so it is a regular hyperparameter
# when RFECV clones the model.
xgb_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)

# RFECV with 5-fold cross-validation, removing one feature per step
rfecv = RFECV(
    estimator=xgb_model,
    step=1,
    cv=5,
    scoring="r2",
    min_features_to_select=1
)

# Fit on the full data to select the best-performing feature subset
rfecv.fit(X, y)

# Validate a fresh model using only the selected features.
# `rfecv.support_` is a boolean mask over the columns of X.
# Note: the features were selected using all rows of X, and the same rows
# are scored here, so this estimate is optimistic relative to a setup where
# selection is repeated inside each fold.
final_model = xgb.XGBRegressor(random_state=42, enable_categorical=True)
cv_scores = cross_val_score(final_model, X.iloc[:, rfecv.support_], y, cv=5, scoring="r2")

# Average the per-fold R^2 scores and display the result
mean_r2 = cv_scores.mean()
print(f"Average Cross-validated R^2 score with remaining features: {mean_r2:.4f}")

Average Cross-validated R^2 score with remaining features: 0.8970
