# House Prices - Advanced Regression Techniques
Predict sales prices and practice feature engineering, Random Forests, and Gradient Boosting

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
# Read the train and test CSVs from the working directory and report their shapes.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
print(f"Train shape: {train.shape}", f"Test shape: {test.shape}", sep="\n")

Train shape: (1460, 81)
Test shape: (1459, 80)


In [3]:
# Target variable
# Pull out the target and the Id columns, then build the combined feature
# frame WITHOUT mutating `train` / `test`. The previous in-place drops made
# this cell fail with a KeyError when re-run, because 'SalePrice' and 'Id'
# were already gone from `train`. `train` and `test` now keep all their
# original columns; only `all_data` has 'Id' / 'SalePrice' removed.
y = train['SalePrice']
train_ID = train['Id']
test_ID = test['Id']
train_features = train.drop(columns=['Id', 'SalePrice'])
test_features = test.drop(columns=['Id'])
# Training rows come first, test rows after; the index is reset so row
# positions 0..len(train)-1 are exactly the training rows.
all_data = pd.concat([train_features, test_features], axis=0, sort=False).reset_index(drop=True)
print(f"Combined data shape: {all_data.shape}")

Combined data shape: (2919, 79)


In [4]:
# Missing values
# Count NaNs per column, order the counts from largest to smallest, and show
# the first 20 columns that have at least one missing value.
missing = all_data.isna().sum().sort_values(ascending=False)
print(missing.loc[missing.gt(0)].head(20))

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
MasVnrType      1766
FireplaceQu     1420
LotFrontage      486
GarageCond       159
GarageFinish     159
GarageYrBlt      159
GarageQual       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
dtype: int64


In [5]:
# Fill missing values
# Numeric columns are filled with their median; every other column is filled
# with its most frequent value.
# The branch is chosen with is_numeric_dtype rather than `dtype == 'object'`,
# because text columns are not always stored as `object` (e.g. pandas'
# dedicated string dtype or `category`), and calling .median() on those raises.
# NOTE(review): the fill statistics are computed over train and test rows
# combined, and a missing categorical value is replaced by the mode even when
# most of the column is missing (see the counts printed above) — confirm both
# are intended.
for col in all_data.columns:
    column = all_data[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # Every value is missing: there is no mode to fill with, so leave
            # the column untouched instead of raising on modes[0].
            continue
        fill_value = modes.iloc[0]
    all_data[col] = column.fillna(fill_value)

In [6]:
# Encode categorical features
# Every object-dtype column is cast to str and replaced by the integer codes
# produced by its own, freshly fitted LabelEncoder.
categorical_cols = all_data.select_dtypes(include='object').columns
for col in categorical_cols:
    lbl = LabelEncoder()
    as_text = all_data[col].astype(str)
    all_data[col] = lbl.fit_transform(as_text)

In [7]:
# Feature engineering
# TotalSF is the row-wise sum of the basement, first-floor and second-floor
# areas. skipna=False makes a missing input propagate as NaN, matching `+`.
floor_area_cols = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']
all_data['TotalSF'] = all_data[floor_area_cols].sum(axis=1, skipna=False)

In [8]:
# Standardize numeric features
# The scaler is fitted on the training rows only (the first train.shape[0]
# rows of all_data) and then applied to every row. Fitting on train and test
# together would let test-set means/variances leak into preprocessing.
# Columns are selected by "is numeric" rather than "is not object", so a
# non-numeric column that is not stored as `object` is never handed to the scaler.
numeric_feats = all_data.select_dtypes(include='number').columns
scaler = StandardScaler()
scaler.fit(all_data.iloc[:train.shape[0]][numeric_feats])
all_data[numeric_feats] = scaler.transform(all_data[numeric_feats])

In [9]:
# Split back
# Rows up to len(train) go to X_train; the remaining rows go to X_test.
n_train = len(train)
X_train, X_test = all_data.iloc[:n_train], all_data.iloc[n_train:]

In [10]:
# Random Forest
# Score the forest with 5-fold cross-validation (RMSE), then fit it on the
# full training set and predict the test rows. cross_val_score works on
# clones of `rf`, so scoring before or after the final fit gives the same result.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
cv_scores = cross_val_score(
    rf,
    X_train,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
rf.fit(X_train, y)
rf_pred = rf.predict(X_test)
print(f"Random Forest CV RMSE: {-cv_scores.mean():.4f}")

Random Forest CV RMSE: 29117.9279


In [11]:
# XGBoost
# Score the booster with 5-fold cross-validation (RMSE), then fit it on the
# full training set and predict the test rows. cross_val_score works on
# clones of `xgbr`, so scoring before or after the final fit gives the same result.
xgbr = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
cv_scores = cross_val_score(
    xgbr,
    X_train,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
xgbr.fit(X_train, y)
xgb_pred = xgbr.predict(X_test)
print(f"XGBoost CV RMSE: {-cv_scores.mean():.4f}")

XGBoost CV RMSE: 24767.7570


In [12]:
# Ensemble
# Equal-weight blend: element-wise mean of the two models' test predictions.
ensemble_pred = np.mean([rf_pred, xgb_pred], axis=0)

In [13]:
# Submission
# Pair each test Id with its blended prediction and write the two columns
# to submission.csv without the index.
submission = test_ID.to_frame(name="Id").assign(SalePrice=ensemble_pred)
submission.to_csv("submission.csv", index=False)
print("Submission file created!")

Submission file created!
