# 02 - Data Cleaning and Preprocessing

## Objectives
- Handle missing values appropriately
- Remove sparse features
- Encode categorical variables
- Prepare data for machine learning

In [1]:
import os
# Record the directory the kernel is currently running in. The bare name on
# the last line makes the notebook display the path as the cell's output.
current_dir = os.getcwd()
current_dir

'/Users/sararosati/Desktop/vscode-projects/Heritage-Housing/Heritage-housing/jupyter_notebooks'

In [2]:
# Move the working directory up one level so the relative paths used later in
# the notebook (e.g. 'inputs/datasets/...') resolve.
#
# The move is guarded so the cell is idempotent: unconditionally calling
# os.chdir(os.path.dirname(current_dir)) climbs one more level every time the
# cell is re-run, because current_dir is reassigned to the new location below.
NOTEBOOK_DIR_NAME = "jupyter_notebooks"  # folder this notebook lives in

if os.path.basename(os.getcwd()) == NOTEBOOK_DIR_NAME:
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    # Already outside the notebook folder (e.g. the cell was run before).
    print("Working directory left unchanged")

current_dir = os.getcwd()
current_dir

You set a new current directory


'/Users/sararosati/Desktop/vscode-projects/Heritage-Housing/Heritage-housing'

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Notebook-wide plotting defaults: seaborn's white grid theme and a 12x6
# default figure size for every matplotlib figure.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Read both CSV files; paths are relative to the current working directory.
records_path = 'inputs/datasets/house_prices_records.csv'
inherited_path = 'inputs/datasets/inherited_houses.csv'

df = pd.read_csv(records_path)
inherited_df = pd.read_csv(inherited_path)

# Report the (rows, columns) of each frame as a quick sanity check.
main_shape = df.shape
inherited_shape = inherited_df.shape
print(f"Main dataset shape: {main_shape}")
print(f"Inherited houses shape: {inherited_shape}")
print("\nDataset loaded successfully!")

Main dataset shape: (1460, 24)
Inherited houses shape: (4, 23)

Dataset loaded successfully!


## Step 1: Handle Missing Values — Drop Sparse Features

In [5]:
# Step 1: remove every column in which more than 80% of the rows are null.
print("Features with >80% missing values:")

# Per-column share of null entries, expressed as a percentage of all rows.
missing_percentage = df.isnull().sum() / len(df) * 100
sparse_features = missing_percentage.loc[missing_percentage.gt(80)]
print(sparse_features)

# Build a new frame without the sparse columns; `df` itself is not modified.
removed_columns = list(sparse_features.index)
df_cleaned = df.drop(columns=sparse_features.index)

print(f"\nDataset shape after dropping sparse features: {df_cleaned.shape}")
print(f"Features removed: {removed_columns}")

Features with >80% missing values:
EnclosedPorch    90.684932
WoodDeckSF       89.383562
dtype: float64

Dataset shape after dropping sparse features: (1460, 22)
Features removed: ['EnclosedPorch', 'WoodDeckSF']


## Step 2: Handle Remaining Missing Values

In [6]:
# Report which columns still contain nulls, then impute them:
#   - numeric columns     -> column median
#   - categorical columns -> column mode (most frequent value)
#
# NOTE(review): for columns such as GarageFinish / BsmtExposure / GarageYrBlt
# a null may mean "the house has no garage/basement" rather than "unknown";
# mode/median imputation would hide that — confirm against the data dictionary.
print("Remaining missing values:")
remaining_missing = df_cleaned.isnull().sum()
remaining_missing = remaining_missing[remaining_missing > 0]
print(remaining_missing)

print("\n" + "="*50)

# Fill numeric missing values with median.
# The filled Series is assigned back to the frame instead of calling
# `df_cleaned[col].fillna(..., inplace=True)`: that chained in-place form acts
# on a temporary object, raises a FutureWarning on pandas 2.x and leaves the
# frame untouched when Copy-on-Write is enabled.
numeric_cols_missing = df_cleaned.select_dtypes(include=[np.number]).columns
for col in numeric_cols_missing:
    if df_cleaned[col].isnull().any():
        median_value = df_cleaned[col].median()
        df_cleaned[col] = df_cleaned[col].fillna(median_value)
        print(f"Filled {col} with median: {median_value}")

# Fill categorical missing values with mode (same assign-back pattern).
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_cleaned[col].isnull().any():
        mode_value = df_cleaned[col].mode()[0]
        df_cleaned[col] = df_cleaned[col].fillna(mode_value)
        print(f"Filled {col} with mode: {mode_value}")

print("\n" + "="*50)
print("Verification - remaining missing values:")
total_missing = df_cleaned.isnull().sum().sum()
print(total_missing)  # Should be 0

# Fail loudly if imputation did not take effect, so later cells never run on
# data that still contains nulls.
assert total_missing == 0, f"{total_missing} missing values remain after imputation"

Remaining missing values:
2ndFlrSF         86
BedroomAbvGr     99
BsmtExposure     38
BsmtFinType1    145
GarageFinish    235
GarageYrBlt      81
LotFrontage     259
MasVnrArea        8
dtype: int64

Filled 2ndFlrSF with median: 0.0
Filled BedroomAbvGr with median: 3.0
Filled GarageYrBlt with median: 1980.0
Filled LotFrontage with median: 69.0
Filled MasVnrArea with median: 0.0
Filled BsmtExposure with mode: No
Filled BsmtFinType1 with mode: Unf
Filled GarageFinish with mode: Unf

Verification - remaining missing values:
0


## Step 3: Encode Categorical Variables

In [7]:
# Collect every object-dtype column; these are the ones that get encoded.
categorical_cols = list(df_cleaned.select_dtypes(include=['object']).columns)
print(f"Categorical columns to encode: {categorical_cols}")

# Replace each categorical column with integer codes and keep the fitted
# encoder, keyed by column name, so the mapping can be reused later.
#
# Because the columns are overwritten in place, re-running this cell finds no
# object columns and resets `label_encoders` to an empty dict.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the codes do
# not follow any quality ranking (presumably KitchenQual etc. are ordinal —
# confirm whether an explicit ordinal mapping is wanted).
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df_cleaned[col])
    df_cleaned[col] = encoded_values
    label_encoders[col] = encoder
    class_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"Encoded {col}: {class_to_code}")

print("\n" + "=" * 50)
print("All categorical variables successfully encoded!")
print(f"Dataset shape: {df_cleaned.shape}")

Categorical columns to encode: ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
Encoded BsmtExposure: {'Av': 0, 'Gd': 1, 'Mn': 2, 'No': 3}
Encoded BsmtFinType1: {'ALQ': 0, 'BLQ': 1, 'GLQ': 2, 'LwQ': 3, 'Rec': 4, 'Unf': 5}
Encoded GarageFinish: {'Fin': 0, 'RFn': 1, 'Unf': 2}
Encoded KitchenQual: {'Ex': 0, 'Fa': 1, 'Gd': 2, 'TA': 3}

All categorical variables successfully encoded!
Dataset shape: (1460, 22)


## Step 4: Prepare Data for Machine Learning

In [8]:
# Split the cleaned frame into the target column (y) and everything else (X).
TARGET = 'SalePrice'
y = df_cleaned[TARGET]
X = df_cleaned.drop(columns=[TARGET])

# Summarise what was produced.
feature_names = list(X.columns)
print("Data Preparation Summary:")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns ({X.shape[1]}):")
print(feature_names)
print("\nTarget variable statistics:")
print(y.describe())

print("\n" + "=" * 50)
print("Data is ready for model training!")

Data Preparation Summary:
Features shape: (1460, 21)
Target shape: (1460,)

Feature columns (21):
['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinType1', 'BsmtUnfSF', 'GarageArea', 'GarageFinish', 'GarageYrBlt', 'GrLivArea', 'KitchenQual', 'LotArea', 'LotFrontage', 'MasVnrArea', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'TotalBsmtSF', 'YearBuilt', 'YearRemodAdd']

Target variable statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Data is ready for model training!


## Step 5: Save Cleaned Data

In [10]:
# Persist the cleaned data as CSV files under 'outputs/'.
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here.

# Create the outputs directory if it doesn't exist. exist_ok=True makes the
# call safe to repeat and avoids a separate check-then-create step; the
# prior isdir() lookup is only used to decide whether to print the message.
output_dir_existed = os.path.isdir('outputs')
os.makedirs('outputs', exist_ok=True)
if not output_dir_existed:
    print("Created 'outputs' directory")

# Full cleaned frame (features and target together), without the index column.
df_cleaned.to_csv('outputs/X_y_cleaned.csv', index=False)
print("Cleaned dataset saved to: outputs/X_y_cleaned.csv")

# Features and target are also written as separate files.
X.to_csv('outputs/X.csv', index=False)
y.to_csv('outputs/y.csv', index=False)

print("\nAll files saved successfully!")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Created 'outputs' directory
Cleaned dataset saved to: outputs/X_y_cleaned.csv

All files saved successfully!
X shape: (1460, 21)
y shape: (1460,)
