In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import io
from google.colab import files


In [2]:
uploaded = files.upload()


Saving house-prices-advanced-regression-techniques.zip to house-prices-advanced-regression-techniques.zip


In [3]:
with zipfile.ZipFile(io.BytesIO(uploaded['house-prices-advanced-regression-techniques.zip']), 'r') as zip_ref:
    zip_ref.extractall()


In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()


Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
missing = train.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)
print("🔍 Columns with missing values:\n", missing)


🔍 Columns with missing values:
 PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


In [6]:
cat_cols = ['MasVnrType', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in cat_cols:
    train[col].fillna(train[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)


In [7]:
num_cols = ['MasVnrArea', 'GarageYrBlt']
for col in num_cols:
    train[col].fillna(train[col].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)


In [8]:
drop_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
train.drop(columns=drop_cols, inplace=True)


In [9]:
# Total Square Footage = Basement + 1st Floor + 2nd Floor
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']


In [12]:
# Add source column to keep track
train['source'] = 'train'
test['source'] = 'test'

# Combine both datasets
combined = pd.concat([train, test], ignore_index=True)

# Apply one-hot encoding (do NOT drop first)
combined = pd.get_dummies(combined, drop_first=False)

# Now split them back safely
train = combined[combined['source_train'] == 1].drop(['source_train', 'source_test'], axis=1)
test = combined[combined['source_test'] == 1].drop(['SalePrice', 'source_train', 'source_test'], axis=1)


In [13]:
from sklearn.preprocessing import StandardScaler

# Select all numeric columns except target variable
numeric_features = train.select_dtypes(include=[np.number]).columns.drop('SalePrice')

scaler = StandardScaler()
train[numeric_features] = scaler.fit_transform(train[numeric_features])
test[numeric_features] = scaler.transform(test[numeric_features])


In [14]:
print("✅ Final Training Set Shape:", train.shape)
print("✅ Final Test Set Shape:", test.shape)
train.head()


✅ Final Training Set Shape: (1460, 288)
✅ Final Test Set Shape: (1459, 287)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Alley_Pave,PoolQC_Ex,PoolQC_Gd,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,MiscFeature_Gar2,MiscFeature_Othr,MiscFeature_Shed
0,-1.730865,0.073375,-0.208034,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,...,False,False,False,False,False,False,False,False,False,False
1,-1.728492,-0.872563,0.409895,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,...,False,False,False,False,False,False,False,False,False,False
2,-1.72612,0.073375,-0.084449,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,...,False,False,False,False,False,False,False,False,False,False
3,-1.723747,0.309859,-0.414011,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,...,False,False,False,False,False,False,False,False,False,False
4,-1.721374,0.073375,0.574676,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,...,False,False,False,False,False,False,False,False,False,False
