# 📁 Step 1: Load Cleaned Data

In [1]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('cleaned_house_data.csv')

# Preview
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


# 🕳️ Step 2: Check for Missing Values

In [2]:
# Check missing values
df.isnull().sum().sum()

0

# 🔢 Step 3: Encode Categorical Features

3.1. Identify categorical columns

In [3]:
cat_cols = df.select_dtypes(include='object').columns
print(cat_cols)

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


3.2. Use Label Encoding or One-Hot Encoding

In [26]:
# One-Hot Encoding (best for models)
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
print(df)

        Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
0        1          60         65.0     8450            7            5   
1        2          20         80.0     9600            6            8   
2        3          60         68.0    11250            7            5   
3        4          70         60.0     9550            7            5   
4        5          60         84.0    14260            8            5   
...    ...         ...          ...      ...          ...          ...   
1455  1456          60         62.0     7917            6            5   
1456  1457          20         85.0    13175            6            6   
1457  1458          70         66.0     9042            7            9   
1458  1459          20         68.0     9717            5            6   
1459  1460          20         75.0     9937            5            6   

      YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLI  \
0          2003          2003    

In [28]:
# Recalculate categorical columns based on current df
cat_cols = df.select_dtypes(include='object').columns

# Only apply get_dummies if categorical columns exist
if len(cat_cols) > 0:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


# 📐 Step 4: Feature Scaling (Optional for tree models, recommended for linear models)

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Scale only numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).drop(columns=['SalePrice']).columns

df[num_cols] = scaler.fit_transform(df[num_cols])

# 🛠️ Step 5: Create New Features 

In [32]:
# Total Bathrooms
df['TotalBathrooms'] = df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath'])

# Total House Size
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

# 📦 Step 6: Save Final Dataset

In [33]:
df.to_csv('final_model_ready_data.csv', index=False)
print("✅ Final preprocessed data saved.")

✅ Final preprocessed data saved.
