# Housing Price Prediction
## Imports

In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [18]:
# Read the training and test CSVs from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Data preprocessing

In [19]:
# Columns excluded from modelling: judged irrelevant or dominated by missing values.
cols_to_drop = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'Id']

# Rebind the names instead of mutating in place, and tolerate columns that are
# already absent, so re-running this cell on a live kernel is a no-op rather
# than a KeyError.
# NOTE: errors='ignore' also hides typos in cols_to_drop, so keep the list in
# sync with the CSV headers.
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

In [20]:
# Separate the features from the target column.
X = train_df.drop(columns=["SalePrice"])

# Train on log(1 + SalePrice); predictions are mapped back with expm1 later on.
y_log = train_df["SalePrice"].pipe(np.log1p)

# Independent copy of the test features.
X_test = test_df.copy()

# Train and test features stacked row-wise (used for dtype-based column selection).
X_full = pd.concat([X, X_test])

### Pipeline creation

In [21]:
# Partition feature names by dtype: object columns are handled as categorical,
# int64/float64 columns as numerical.
numerical_cols = list(X_full.select_dtypes(include=['int64', 'float64']).columns)
categorical_cols = list(X_full.select_dtypes(include=['object']).columns)

In [22]:
# Numerical features: fill missing values with the column median, then standardise.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

## Train test split

In [23]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X,
    y_log,
    random_state=42,
    test_size=0.2,
)

## Model Creation

In [24]:
# End-to-end estimator: preprocessing followed by an XGBoost regressor.
# The step names ('preprocessor', 'regressor') are the prefixes used when
# addressing hyperparameters in the search grid.
xgb_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(random_state=42)),
])


## Hyperparameter Grid for GridSearchCV

In [25]:
# Search space for the 'regressor' pipeline step: 2 * 2 * 3 * 2 * 2 = 48 combinations.
param_grid = dict(
    regressor__n_estimators=[200, 500],
    regressor__learning_rate=[0.01, 0.05],
    regressor__max_depth=[3, 4, 5],
    regressor__subsample=[0.8, 1.0],
    regressor__colsample_bytree=[0.7, 1.0],
)

In [26]:
# Exhaustive 5-fold search over param_grid, fitted on the training split only.
# Scoring is negated MSE (higher is better); n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train_log)

# Pipeline with the best-scoring parameters (GridSearchCV refits it by default).
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits


### Evaluation on the validation set

In [27]:
# Predict the held-out validation rows; outputs are still in log1p space.
y_pred_log = best_model.predict(X_val)

# Undo the log1p transform so the error can also be read on the original scale.
y_pred_original = np.expm1(y_pred_log)
y_val_original = np.expm1(y_val_log)

# RMSE on both scales.
rmse = np.sqrt(mean_squared_error(y_val_log, y_pred_log))
rmse_original = np.sqrt(mean_squared_error(y_val_original, y_pred_original))

print(f"Validation RMSE (log scale): {rmse:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Validation RMSE (original scale): {rmse_original:.2f}")

Validation RMSE (log scale): 0.1255
Best Parameters: {'regressor__colsample_bytree': 0.7, 'regressor__learning_rate': 0.05, 'regressor__max_depth': 3, 'regressor__n_estimators': 500, 'regressor__subsample': 0.8}
Validation RMSE (original scale): 25068.40


In [28]:
# 5-fold cross-validation of the tuned pipeline on all labelled rows.
cv_scores = cross_val_score(
    best_model,
    X,
    y_log,
    scoring='neg_mean_squared_error',
    cv=KFold(n_splits=5),
)

# Scores are negated MSE, so flip the sign before taking the square root.
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-Validation RMSE: {cv_rmse.mean():.4f} ± {cv_rmse.std():.4f}")

Cross-Validation RMSE: 0.1204 ± 0.0115


## Predict on Test Set

In [29]:
# 'Id' was dropped from test_df during preprocessing, so read it again from the CSV.
test_ids = pd.read_csv("test.csv")["Id"]

# Predict in log1p space, then convert back to the original SalePrice scale.
final_predictions_log = best_model.predict(X_test)
final_predictions = np.expm1(final_predictions_log)

# Two-column submission table, written without the index.
submission = pd.DataFrame({"Id": test_ids, "SalePrice": final_predictions})
submission.to_csv("submission3.csv", index=False)