In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load Data
# Raw spreadsheets are read from paths relative to the current working directory.
train_path = "../data/raw/train(1).xlsx"
test_path = "../data/raw/test2.xlsx"
train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)

# 2. Link Images
# Create a column that stores the local path to the image
def get_image_path(prop_id):
    """Return the satellite-image path for ``prop_id``, or ``None`` if absent.

    The existence check is done one directory above the current working
    directory (``../data/satellite_images/<prop_id>.jpg``), while the value
    returned omits that ``../`` prefix.
    """
    relative_path = f"data/satellite_images/{prop_id}.jpg"
    if os.path.exists(f"../{relative_path}"):
        return relative_path
    return None

# Attach the image path (or None when the file is missing) to every row of
# both frames.
train_df['image_path'] = train_df['id'].apply(get_image_path)
test_df['image_path'] = test_df['id'].apply(get_image_path)

# Drop training rows where images failed to download. Test rows are kept, so
# their image_path may still be None.
# .copy() makes the filtered frame independent of the original, so later
# column assignments on train_df do not raise SettingWithCopyWarning.
train_df = train_df.dropna(subset=['image_path']).copy()

# 3. Feature Engineering
# Create the binary 'is_renovated' flag (1 when yr_renovated > 0, else 0) on
# BOTH frames so the test set carries the same engineered column as the
# training set.
# NOTE(review): assumes test_df also has a 'yr_renovated' column — confirm.
# The vectorized comparison maps NaN to 0 (NaN > 0 is False), matching what a
# per-row `1 if x > 0 else 0` would produce.
train_df['is_renovated'] = (train_df['yr_renovated'] > 0).astype(int)
test_df['is_renovated'] = (test_df['yr_renovated'] > 0).astype(int)

# 4. Scaling Numerical Features
# The scaler's statistics are learned from the training frame only and then
# reused unchanged on the test frame.
features_to_scale = [
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
]
scaler = StandardScaler()
scaler.fit(train_df[features_to_scale])
train_df[features_to_scale] = scaler.transform(train_df[features_to_scale])
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

# 5. Save Processed Data
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs("../data/processed", exist_ok=True)
train_df.to_csv("../data/processed/train_cleaned.csv", index=False)
test_df.to_csv("../data/processed/test_cleaned.csv", index=False)
print("Preprocessing complete. Files saved to data/processed/")