In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

# Columns removed from both tables up front (flagged by the author as
# irrelevant or dominated by missing values).
cols_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu']

# Read the raw tables and discard the unwanted columns in one step.
train_df = pd.read_csv("train.csv").drop(columns=cols_to_drop)
test_df = pd.read_csv("test.csv").drop(columns=cols_to_drop)

# Keep the identifiers aside before stripping them from the feature frames;
# test_ids is what the submission file is keyed on.
train_ids = train_df["Id"]
test_ids = test_df["Id"]
train_df = train_df.drop(columns=["Id"])
test_df = test_df.drop(columns=["Id"])

# The model is trained on log(1 + SalePrice); predictions are mapped back
# with expm1 when the submission is built.
train_df = train_df.assign(SalePrice=lambda frame: np.log1p(frame['SalePrice']))

# Split the training table into target and features.
y = train_df["SalePrice"]
X = train_df.drop(columns="SalePrice")

# Stack train and test features row-wise so column dtypes can be inspected
# over both tables at once.
X_all = pd.concat([X, test_df], axis=0)

In [None]:
# Route columns by dtype as seen on the stacked train+test frame:
# object-typed columns go down the categorical branch, everything else is
# handled as numerical.
categorical_cols = list(X_all.select_dtypes(include=['object']).columns)
numerical_cols = list(X_all.select_dtypes(exclude=['object']).columns)

# Numerical branch: fill gaps with the column median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array. Levels not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Apply each branch to its own column subset and concatenate the outputs
# (numerical block first, categorical block second).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

In [None]:
# Seed for the booster. The grid below samples rows and columns
# (subsample / colsample_bytree < 1), so pin the seed explicitly instead of
# relying on the library default.
SEED = 42

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    verbosity=0,
    random_state=SEED,
)

# Preprocessing lives inside the pipeline so it is re-fit on the training
# portion of every CV fold.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# 2 * 3 * 3 * 2 * 2 = 72 candidate settings, addressed through the
# 'regressor' step of the pipeline.
param_grid = {
    'regressor__n_estimators': [300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.7, 1.0],
    'regressor__colsample_bytree': [0.7, 1.0]
}


def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes the scorer return the negated RMSE so that
# "higher is better" holds for model selection; negate again when reporting.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring=rmse_scorer, verbose=1)
grid_search.fit(X, y)

# y is log1p(SalePrice), so the reported RMSE is on the log scale.
print(f"Best RMSE: {-grid_search.best_score_:.4f}")
print(f"Best Params: {grid_search.best_params_}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
# Pull the two fitted stages out of the tuned pipeline.
best_pipeline = grid_search.best_estimator_
fitted_preprocessor = best_pipeline.named_steps['preprocessor']
fitted_regressor = best_pipeline.named_steps['regressor']

# Training features after imputation / scaling / one-hot encoding.
X_processed = fitted_preprocessor.transform(X)

# Reuse the already-fitted regressor (prefit=True) and keep the features
# whose importance is at or above the median importance.
feature_selector = SelectFromModel(fitted_regressor, threshold='median', prefit=True)
X_selected = feature_selector.transform(X_processed)
# NOTE(review): X_selected is not consumed by any later cell visible in this
# notebook - confirm whether a reduced-feature model was intended.

In [None]:
# Re-score the tuned pipeline with 5-fold CV on the same X, y that were used
# for the grid search.
# NOTE(review): the hyper-parameters were chosen on this same data, so these
# scores are likely optimistic as a generalisation estimate.
cv_rmse_scores = cross_val_score(
    grid_search.best_estimator_,
    X,
    y,
    scoring=rmse_scorer,
    cv=5,
)

# The scorer yields negated RMSE values; flip the sign for display.
fold_rmse = -cv_rmse_scores
print("Cross-Validation RMSEs:", fold_rmse)
print("Mean CV RMSE:", fold_rmse.mean())

In [None]:
# Work on a copy so the test frame itself is left untouched.
X_test = test_df.copy()

# Predictions come out on the log1p scale the model was trained on; expm1
# maps them back to prices.
final_preds_log = grid_search.predict(X_test)
final_preds = np.expm1(final_preds_log)

# Two-column table: the saved test Ids next to the predicted prices.
submission_columns = {'Id': test_ids, 'SalePrice': final_preds}
submission = pd.DataFrame(submission_columns)

submission.to_csv('submission.csv', index=False)
print("submission.csv created")