#  House Price Prediction with Preprocessing and Feature Engineering

In [4]:

import pandas as pd
import numpy as np

# Read the competition CSVs from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Cell output: (rows, columns) of each frame.
(train.shape, test.shape)


((1460, 81), (1459, 80))

In [5]:

# Keep the Id columns aside; test_ID is needed again when the submission
# file is written.
train_ID = train["Id"]
test_ID = test["Id"]

# Number of training rows, used later to split the combined frame back apart.
ntrain = train.shape[0]

# Target on the log1p scale (predictions are mapped back with expm1 later).
# It is derived from `train` without overwriting the column, so re-running
# this cell cannot apply the transform twice.
y_train = np.log1p(train["SalePrice"])

# Stack train and test features so preprocessing is applied to both at once.
# Non-mutating drops leave `train` / `test` untouched, which keeps this cell
# re-runnable (an in-place drop of "Id" raises KeyError on a second run).
all_data = pd.concat(
    (train.drop(columns=["Id", "SalePrice"]), test.drop(columns=["Id"]))
).reset_index(drop=True)

all_data.shape


(2919, 79)

In [6]:

# Handling missing data
def handle_missing(df):
    """Return a copy of ``df`` with the known missing-value columns filled.

    Filling rules:
      * categorical columns in ``none_cols``: missing -> the string "None"
      * numeric columns in ``zero_cols``: missing -> 0
      * categorical columns in ``mode_cols``: missing -> the column's most
        frequent value (left unchanged if the column has no observed value)
      * ``LotFrontage``: missing -> median of the same ``Neighborhood``;
        if that neighbourhood has no observed value, the overall median

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named below plus ``Neighborhood`` and
        ``LotFrontage``; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    none_cols = ['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','GarageType',
                 'GarageFinish','GarageQual','GarageCond','BsmtQual','BsmtCond',
                 'BsmtExposure','BsmtFinType1','BsmtFinType2','MasVnrType']
    zero_cols = ['GarageYrBlt','GarageArea','GarageCars','BsmtFinSF1','BsmtFinSF2',
                 'BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','MasVnrArea']
    mode_cols = ['MSZoning', 'Functional', 'KitchenQual', 'Electrical', 'SaleType', 'Exterior1st', 'Exterior2nd']

    # Work on a copy so the caller's frame keeps its original values and the
    # function can be re-applied to the raw data at any time.
    filled = df.copy()

    for col in none_cols:
        filled[col] = filled[col].fillna("None")

    for col in zero_cols:
        filled[col] = filled[col].fillna(0)

    for col in mode_cols:
        modes = filled[col].mode()
        # mode() is empty when the column has no observed value; nothing
        # sensible to fill with in that case, so leave the column as is.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    # Overall median is taken before the per-neighbourhood fill so it reflects
    # observed values only.
    overall_median = filled["LotFrontage"].median()
    by_neighborhood = filled.groupby("Neighborhood")["LotFrontage"].transform(
        lambda s: s.fillna(s.median())
    )
    # A neighbourhood with no observed LotFrontage has a NaN median and would
    # stay NaN; fall back to the overall median for those rows.
    filled["LotFrontage"] = by_neighborhood.fillna(overall_median)
    return filled

# Run the imputation over the combined train+test frame and rebind the name.
all_data = all_data.pipe(handle_missing)
"Missing values handled."


'Missing values handled.'

In [7]:

# Cast code-like numeric columns to text, then integer-encode selected categoricals.
from sklearn.preprocessing import LabelEncoder

# These columns hold numbers but are treated as categories from here on.
for nominal_col in ("MSSubClass", "OverallCond", "YrSold", "MoSold"):
    all_data[nominal_col] = all_data[nominal_col].astype(str)

label_cols = [
    'FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual',
    'BsmtFinType1', 'BsmtFinType2', 'Functional', 'BsmtExposure',
    'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street',
    'Alley', 'CentralAir', 'Utilities',
]

# A fresh encoder per column. NOTE: LabelEncoder numbers the classes in sorted
# string order, so the integer codes do not follow any quality ranking.
# Values are cast to str first, which also turns any remaining NaN into "nan".
for col in label_cols:
    as_text = all_data[col].astype(str)
    all_data[col] = LabelEncoder().fit_transform(as_text)
"Label encoding completed."


'Label encoding completed.'

In [8]:

# Derived features built from existing columns.

# Total floor area: basement plus first and second floors.
all_data["TotalSF"] = (
    all_data['TotalBsmtSF']
    + all_data['1stFlrSF']
    + all_data['2ndFlrSF']
)

# Years between construction / last remodel and the sale.
# YrSold is cast back to int for the subtraction.
all_data['Age'] = all_data['YrSold'].astype(int) - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'].astype(int) - all_data['YearRemodAdd']

# Bathroom count with half baths weighted 0.5, above and below grade.
all_data['TotalBath'] = (
    all_data['FullBath']
    + all_data['HalfBath'] * 0.5
    + all_data['BsmtFullBath']
    + all_data['BsmtHalfBath'] * 0.5
)

# Combined area of the four porch columns.
all_data['TotalPorchSF'] = (
    all_data['OpenPorchSF']
    + all_data['EnclosedPorch']
    + all_data['3SsnPorch']
    + all_data['ScreenPorch']
)
"Feature engineering completed."


'Feature engineering completed.'

In [9]:

# Apply log1p to numeric columns whose skewness magnitude exceeds 0.75.
from scipy.stats import skew

# Every column whose dtype is not "object" counts as numeric here, which
# includes any column that was integer-encoded earlier.
non_object_mask = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[non_object_mask].index

# Per-column sample skewness (NaNs ignored), largest first.
skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
skewness = pd.DataFrame({'Skew': skewed_feats})

# The threshold is on the absolute value, so left- and right-skewed columns
# are both selected.
is_skewed = abs(skewness["Skew"]) > 0.75
skewed_features = skewness[is_skewed].index

# NOTE: this overwrites the columns in place; running the cell again would
# transform the already-transformed values.
all_data[skewed_features] = np.log1p(all_data[skewed_features])
"Skewed features transformed."


'Skewed features transformed.'

In [10]:

# Expand the remaining categorical columns into indicator columns.
# Done on the combined frame so train and test end up with identical columns.
all_data = pd.get_dummies(data=all_data)
"One-hot encoding completed."


'One-hot encoding completed.'

In [11]:

# Undo the earlier concatenation: the first `ntrain` rows are the training
# set, everything after them is the test set.
X_train = all_data.iloc[:ntrain]
X_test = all_data.iloc[ntrain:]

(X_train.shape, X_test.shape)


((1460, 268), (1459, 268))

In [12]:

# Fit the XGBoost regressor and estimate its error with 5-fold CV.
import xgboost as xgb
from sklearn.model_selection import cross_val_score

xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,  # fixed seed for repeatable runs
)
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the full training set; this fitted model is what makes the
# test-set predictions later on.
xgb_model.fit(X_train, y_train)

# y_train is on the log1p scale, so the reported RMSE is in log units.
# The scorer returns negated RMSE, hence the sign flip when displaying.
scores = cross_val_score(
    xgb_model,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
f"Cross-validated RMSE: {-scores.mean():.4f}"


'Cross-validated RMSE: 0.1271'

In [13]:

# Predict on the test set and write the submission CSV.

# The model was trained on log1p(SalePrice); expm1 maps predictions back
# to the original price scale.
log_scale_preds = xgb_model.predict(X_test)
preds = np.expm1(log_scale_preds)

# Pair each prediction with the Id saved before the column was dropped.
submission = pd.DataFrame({"Id": test_ID, "SalePrice": preds})
submission.to_csv("house_price_submission.csv", index=False)
"Submission file created: house_price_submission.csv"


'Submission file created: house_price_submission.csv'