In [70]:
# Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import confusion_matrix as cm
import sklearn.metrics as skm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

from xgboost import XGBRegressor
import xgboost as xgb

In [52]:
# Display setting: never truncate the column list when a DataFrame is shown
pd.options.display.max_columns = None

### Load Data

In [77]:
# Read the train and test CSV files (paths are relative to the working directory)
train_path, test_path = "Data/train.csv", "Data/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Quick sanity check: print (rows, columns) of each frame
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


### Combine Data

In [78]:
# Split the identifier and target columns off the loaded frames, then stack the
# remaining train and test columns into a single frame so that every later
# preprocessing step is applied to both sets at once.
#
# `pop` removes the column from the frame and returns it in one step. The
# membership guards make the cell safe to re-run: once the columns have been
# removed, a second execution skips the split instead of raising KeyError and
# keeps the previously extracted train_ID / test_ID / y_train. Re-run the
# "Load Data" cell first to start again from the raw files.
if 'Id' in train_df.columns:
    train_ID = train_df.pop('Id')
if 'Id' in test_df.columns:
    test_ID = test_df.pop('Id')
if 'SalePrice' in train_df.columns:
    y_train = train_df.pop('SalePrice')

# Train rows come first, test rows follow; ignore_index renumbers the rows 0..n-1
all_data = pd.concat([train_df, test_df], ignore_index=True)
print("Combined data shape:", all_data.shape)


Combined data shape: (2919, 79)


### Data Pre-processing

##### 1. Handling Missing Values

In [79]:
# Categorical columns: replace missing entries with the literal category 'None'
none_fill_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                  'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                  'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                  'MasVnrType']
all_data[none_fill_cols] = all_data[none_fill_cols].fillna('None')

# Numerical columns: replace missing entries with 0
zero_fill_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
                  'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
all_data[zero_fill_cols] = all_data[zero_fill_cols].fillna(0)

# Electrical: fill with the most frequent value
electrical_mode = all_data['Electrical'].mode()[0]
all_data['Electrical'] = all_data['Electrical'].fillna(electrical_mode)

# LotFrontage: fill with the median of the row's neighborhood.
# Only the missing entries are touched, so existing values are never overwritten.
# If a neighborhood has no observed LotFrontage at all its median is NaN, so the
# overall median is used as a fallback to avoid leaving NaN behind.
# NOTE: the medians are computed over the combined train + test rows.
neighborhood_median = all_data.groupby('Neighborhood')['LotFrontage'].transform('median')
overall_median = all_data['LotFrontage'].median()
all_data['LotFrontage'] = (
    all_data['LotFrontage']
    .fillna(neighborhood_median)
    .fillna(overall_median)
)


##### 2. Feature Engineering

In [80]:
# Total square footage: basement + first floor + second floor
all_data['TotalSF'] = (
    all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
)

# Total bathrooms: each half bath counts as 0.5, basement baths included
all_data['TotalBath'] = (
    all_data['FullBath'] + 0.5 * all_data['HalfBath']
    + all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath']
)

# Total porch area: open + enclosed + three-season + screen porch
all_data['TotalPorchSF'] = (
    all_data['OpenPorchSF'] + all_data['EnclosedPorch']
    + all_data['3SsnPorch'] + all_data['ScreenPorch']
)

# Convert the ordinal quality/condition labels to integers 0-5.
# Any value that is not a key of quality_map (including NaN) becomes 0.
quality_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
quality_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
for col in quality_cols:
    # Re-run guard: once a column is numeric, none of its values match a key of
    # quality_map, so mapping it again would turn every entry into NaN and the
    # fillna(0) would silently zero the whole column. Skip converted columns.
    if not pd.api.types.is_numeric_dtype(all_data[col]):
        all_data[col] = all_data[col].map(quality_map).fillna(0)


##### 3. Handling Skewness

In [81]:
# Select numeric columns by dtype family rather than by "not object":
# the `!= "object"` test also lets through category, boolean and dedicated
# string dtypes, for which the skew computation is not meaningful and can raise.
numeric_feats = all_data.select_dtypes(include=np.number).columns

# Per-column sample skewness (NaN values are skipped), largest first
skewed_feats = all_data[numeric_feats].skew().sort_values(ascending=False)

# Features with skewness above this threshold are log-transformed
skew_threshold = 0.75
skewed = skewed_feats[skewed_feats > skew_threshold].index

# log1p(x) = log(1 + x), so zero values stay finite
all_data[skewed] = np.log1p(all_data[skewed])

# Log-transform target variable too
# NOTE: this cell is not idempotent. Running it again applies log1p a second
# time to y_train and to any feature that is still above the threshold.
y_train = np.log1p(y_train)

##### 4. Encoding Categoricals

In [82]:
# One-hot encode the remaining categorical columns. With no `columns` argument
# pandas chooses the columns to encode itself and passes the others through.
all_data = pd.get_dummies(data=all_data)

n_rows, n_cols = all_data.shape
print("Final all_data shape:", (n_rows, n_cols))

Final all_data shape: (2919, 268)
