In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Step 1: Read the raw train and test tables (paths are relative to the working directory)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# Step 2: Drop columns with too much missing data, plus the 'Id' column
# (an identifier that is only needed again when the submission file is written).
cols_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'Id']

# Reassign instead of dropping in place, and tolerate already-missing columns,
# so re-running this cell does not raise KeyError after its first execution.
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [5]:
# Step 3: Separate features from the target and log-transform the target
# Features: every training column except the target.
X = train_df.drop(columns=["SalePrice"])
# Target on the log1p scale; predictions are mapped back with expm1 further down.
y_log = train_df["SalePrice"].pipe(np.log1p)
# Independent copy of the test features, so test_df itself is left untouched.
X_test = test_df.copy(deep=True)

In [6]:
# Stack train and test features row-wise so column dtypes can be inspected across both sets
X_full = pd.concat([X, X_test], axis="index")

In [7]:
# Step 4: Split the feature names into numerical and categorical groups by dtype
# NOTE(review): a column whose dtype is none of int64 / float64 / object ends up
# in neither list — confirm the input data contains no such columns.
numerical_cols = list(X_full.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X_full.select_dtypes(include=["object"]).columns)

In [8]:
# Step 5: Preprocessing — one sub-pipeline per column group, joined by a ColumnTransformer

# Categorical features: fill gaps with the most frequent level, then one-hot encode.
# Categories not seen during fit are ignored at transform time instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numerical features: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route each column group to its sub-pipeline (numerical block first, then categorical).
preprocessor = ColumnTransformer([
    ("num", numerical_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

In [9]:
# Step 6: Hold out 20% of the training rows as a validation set (fixed seed for repeatability)
(X_train, X_val,
 y_train_log, y_val_log) = train_test_split(X, y_log, random_state=42, test_size=0.2)

In [10]:
# Step 7: Full model — preprocessing followed by an XGBoost regressor.
# The step names ("preprocessor", "regressor") are the prefixes used to address
# hyperparameters as "<step>__<param>".
xgb_model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", xgb.XGBRegressor(random_state=42)),
])


In [11]:
# Step 8: Search space for GridSearchCV.
# Keys use the "<pipeline step>__<parameter>" form; 2 * 2 * 3 * 2 * 2 = 48 combinations.
param_grid = {
    "regressor__n_estimators": [200, 500],
    "regressor__learning_rate": [0.01, 0.05],
    "regressor__max_depth": [3, 4, 5],
    "regressor__subsample": [0.8, 1.0],
    "regressor__colsample_bytree": [0.7, 1.0],
}

In [12]:
# Step 9: Exhaustive grid search with 5-fold cross-validation on the training split
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",  # MSE on the log-transformed target, negated
    cv=5,
    n_jobs=-1,  # run fits in parallel on all available cores
    verbose=1,
)
grid_search.fit(X_train, y_train_log)

# Pipeline configured with the best-scoring parameter combination
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [13]:
# Step 10: Evaluate on Validation Set

# Error on the log1p scale the model was trained on.
y_pred_log = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val_log, y_pred_log))

# Undo the log1p transform on both truth and predictions to get an error in price units.
y_val_original = np.expm1(y_val_log)
y_pred_original = np.expm1(y_pred_log)
rmse_original = np.sqrt(mean_squared_error(y_val_original, y_pred_original))

# Report.
print(f"Validation RMSE (log scale): {rmse:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Validation RMSE (original scale): {rmse_original:.2f}")

Validation RMSE (log scale): 0.1255
Best Parameters: {'regressor__colsample_bytree': 0.7, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 500, 'regressor__subsample': 0.8}
Validation RMSE (original scale): 25068.40


In [14]:
# Step 11: Cross-Validation Analysis
# Score the selected pipeline with 5 unshuffled folds over the full training set.
# NOTE(review): the hyperparameters were chosen on X_train, a subset of X, so this
# estimate may be somewhat optimistic — confirm that is acceptable.
cv_scores = cross_val_score(
    best_model, X, y_log,
    cv=KFold(n_splits=5),
    scoring="neg_mean_squared_error",
)
# Scores are negated MSE values; flip the sign before taking the root.
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")

Cross-Validation RMSE: 0.1204 ± 0.0115


In [16]:
# Step 12: Predict on Test Set
# 'Id' was dropped from test_df during preprocessing, so recover it from the CSV;
# usecols avoids parsing every other column just to get the ids.
test_ids = pd.read_csv("test.csv", usecols=["Id"])["Id"]

# The model predicts log1p(SalePrice); expm1 maps the predictions back to prices.
final_predictions_log = best_model.predict(X_test)
final_predictions = np.expm1(final_predictions_log)

# Create Submission File
submission = pd.DataFrame({
    "Id": test_ids,
    "SalePrice": final_predictions
})

submission.to_csv("submission3.csv", index=False)
print("submission3.csv created")

submission3.csv created
