In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- LOAD DATA ---
# Raw house-sales table; everything below reshapes it into a model-ready frame.
df = pd.read_csv('data/kc_house_data.csv')

# --- STEP 0: HANDLE OUTLIERS ---
# Outliers are removed first so that every later step (feature creation,
# encoding, scaling) only ever sees the cleaned rows.
print("--- 0. HANDLING OUTLIERS ---")
print(f"Number of rows before cleaning: {len(df)}")

# Tukey fences on 'sqft_living': keep values within 1.5 * IQR of the quartiles.
Q1, Q3 = df['sqft_living'].quantile([0.25, 0.75])
IQR = Q3 - Q1
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# `between` is inclusive on both ends by default, so boundary rows are kept.
# The explicit .copy() makes the result an independent frame and avoids
# Pandas warnings when columns are added to it below.
within_fences = df['sqft_living'].between(lower_bound, upper_bound)
df = df.loc[within_fences].copy()

print(f"Number of rows after cleaning: {len(df)}\n")

# --- STEP 1: FEATURE CREATION ---
# Operates on the cleaned rows only.
print("--- 1. CREATING NEW FEATURES ---")
sale_dates = pd.to_datetime(df['date'])
# NOTE: a 'price_per_sqft' feature is deliberately NOT created here: it is
# derived from the target 'price' and would leak it into the inputs.
# 'date' has served its purpose once the year is extracted, and 'id' carries
# no information for the model, so both are dropped.
df = df.assign(
    sale_year=sale_dates.dt.year,
    house_age=lambda frame: frame['sale_year'] - frame['yr_built'],
).drop(columns=['date', 'id'])
print("New features created.\n")

# --- STEP 2: ENCODING CATEGORICAL VARIABLES ---
print("--- 2. ENCODING CATEGORICAL FEATURES ---")
# One-hot encode 'zipcode'; the generated columns are named 'zip_<code>'.
df_processed = pd.get_dummies(df, prefix='zip', columns=['zipcode'])
print(f"Table shape changed from {df.shape} to {df_processed.shape}\n")

# --- STEP 3: DATA NORMALIZATION (SCALING) ---
print("--- 3. NORMALIZING NUMERICAL FEATURES ---")
# Scale everything except the target 'price' and the one-hot 'zip_*'
# indicator columns. The original column order is preserved so the fitted
# scaler's feature order matches the frame.
cols_to_scale = [
    col
    for col in df_processed.columns
    if col != 'price' and not col.startswith('zip_')
]
scaler = StandardScaler()
df_processed[cols_to_scale] = scaler.fit_transform(df_processed[cols_to_scale])
print("Normalization complete.\n")

# --- RESULT ---
# df_final is an alias of df_processed (same object, not a copy).
df_final = df_processed
print("Final view of data ready for modeling:")
# `display` is not imported in this cell; it is expected to be supplied by
# the notebook session.
display(df_final.head())

# --- SAVE ---
df_final.to_pickle('data/02_data_prepared.pkl')
print("\nPrepared data saved to 'data/02_data_prepared.pkl'")