# 10. Mini-Project: Melbourne Housing Data Preprocessing 🏡

This notebook provides a complete, working solution for preprocessing the `melb_data.csv` dataset. It uses both Pandas for data loading/manipulation and Scikit-learn for encoding, covering both of your assignments.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### 1. Load the Data (Using Pandas)

In [None]:
# Read the raw Melbourne housing CSV into the working DataFrame. The path is
# relative, so the file must sit in the kernel's current working directory.
df = pd.read_csv('melb_data.csv')

# Confirm the load and report the frame's dimensions as (rows, columns).
print("Dataset loaded successfully!")
print("Original shape:", df.shape)

# Rich preview of the first five records.
display(df.head(n=5))

### 2. Handle Missing Data (Using Pandas)

In [None]:
print("--- Handling Missing Data ---")
print("Missing values before handling:\n", df.isnull().sum().sort_values(ascending=False).head())

# Strategy: fill numeric columns with the median and all other columns with the mode.
#
# Each filled column is assigned back with `df[col] = ...` rather than calling
# `df[col].fillna(..., inplace=True)`. The in-place form operates on an
# intermediate Series: pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves `df` unchanged.
columns_with_missing = df.columns[df.isnull().any()]
for col in columns_with_missing:
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
    else:
        # mode() returns an empty Series when every value in the column is missing.
        modes = df[col].mode()
        fill_value = modes.iloc[0] if not modes.empty else None

    # An all-missing column has no usable statistic. Leave it untouched so the
    # total printed below exposes it, instead of raising here.
    if fill_value is None or pd.isna(fill_value):
        continue

    df[col] = df[col].fillna(fill_value)

print("\nTotal missing values after handling:", df.isnull().sum().sum())

### 3. Handle Categorical Data (Using a mix of Pandas and Scikit-learn)

In [None]:
print("--- Handling Categorical Data ---")
# Work on a copy so the imputed frame `df` stays available for comparison.
df_processed = df.copy()

# Text columns with fewer distinct values than this are one-hot encoded; the
# remaining text columns are label encoded to keep the frame from growing very wide.
ONE_HOT_MAX_UNIQUE = 10

# Using Pandas get_dummies for columns with low cardinality.
# drop_first=True drops one indicator per column to avoid redundant columns.
low_cardinality_cols = [
    col
    for col in df_processed.select_dtypes(include='object').columns
    if df_processed[col].nunique() < ONE_HOT_MAX_UNIQUE
]
df_processed = pd.get_dummies(df_processed, columns=low_cardinality_cols, drop_first=True)
print(f"Applied One-Hot Encoding on: {low_cardinality_cols}")

# Using Scikit-learn's LabelEncoder for columns with high cardinality.
# The low-cardinality text columns were replaced by indicator columns above, so
# every text column still present is a candidate here.
high_cardinality_cols = [
    col
    for col in df_processed.select_dtypes(include='object').columns
    if df_processed[col].nunique() >= ONE_HOT_MAX_UNIQUE
]

# One encoder per column, kept in `label_encoders`. Re-fitting a single shared
# encoder would overwrite its learned classes on every iteration, leaving no way
# to inverse-transform (or re-apply the mapping to new data) for any column
# except the last one.
# NOTE: the integer codes follow the sorted order of the labels and carry no
# real ordinal meaning; scikit-learn documents LabelEncoder as a target encoder.
label_encoders = {}
for col in high_cardinality_cols:
    encoder = LabelEncoder()
    df_processed[col] = encoder.fit_transform(df_processed[col])
    label_encoders[col] = encoder
print(f"Applied Label Encoding on: {high_cardinality_cols}")

### Preprocessing Complete ✅

In [None]:
# Report the dimensions of the fully encoded frame as (rows, columns).
print("Final shape of the processed dataset:", df_processed.shape)

# Label, then richly render, the first five processed records.
print("First 5 rows of the processed dataset:")
display(df_processed.head(n=5))