# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import joblib

# Configuration: hard-coded input locations under /kaggle/input/new-dataset.
# IMAGE_DIR is the directory searched for "<id>.jpg" / "<id>.png" files.
IMAGE_DIR = '/kaggle/input/new-dataset/images'
# CSV files loaded with pandas.read_csv by preprocess_data().
TRAIN_PATH = '/kaggle/input/new-dataset/train.csv'
TEST_PATH = '/kaggle/input/new-dataset/test.csv'

def preprocess_data():
    """Build model-ready train/test tables and write them to disk.

    Reads ``TRAIN_PATH`` and ``TEST_PATH`` and then, in order:

    1. parses ``date`` and adds ``date_int`` (whole seconds since the Unix
       epoch) to both frames;
    2. adds ``log_price = log1p(price)`` to the training frame;
    3. replaces the skewed ``sqft_*`` columns with ``log1p`` of their values
       (missing values are treated as 0) in both frames;
    4. standardises every training column except ``id``, ``date``, ``price``
       and ``log_price`` with a ``StandardScaler`` that is fitted on the
       training frame only and then applied unchanged to the test frame;
    5. adds an ``image_path`` column holding the path of ``<id>.jpg`` or
       ``<id>.png`` under ``IMAGE_DIR`` (``None`` when neither exists).

    Side effects: writes ``tabular_scaler.pkl``, ``processed_train.csv`` and
    ``processed_test.csv`` to the current working directory and prints
    progress messages.

    Returns:
        None.

    Raises:
        KeyError: if a column the pipeline relies on is absent from either
            CSV file.
    """
    print("Loading data...")
    train_df = pd.read_csv(TRAIN_PATH)
    test_df = pd.read_csv(TEST_PATH)
    frames = (train_df, test_df)

    # 1. Handle Date
    for df in frames:
        df['date'] = pd.to_datetime(df['date'])
        df['date_int'] = _epoch_seconds(df['date'])

    # 2. Log Transform Target (Price)
    train_df['log_price'] = np.log1p(train_df['price'])

    # 3. Log Transform Skewed Input Features (Crucial for NN performance)
    skewed_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement',
                   'sqft_living15', 'sqft_lot15']

    for col in skewed_cols:
        for df in frames:
            # Treat missing values as 0 so log1p maps them to 0.
            df[col] = np.log1p(df[col].fillna(0))
        print(f"Log-transformed {col}")

    # 4. Feature Selection
    # Identifiers, the raw date and both forms of the target are excluded so
    # they are neither scaled nor treated as model inputs. image_path is
    # added only after scaling, so it is never part of feature_cols.
    drop_cols = ['id', 'date', 'price', 'log_price']
    feature_cols = [c for c in train_df.columns if c not in drop_cols]

    print(f"Selected {len(feature_cols)} tabular features.")

    # 5. Scaling
    # Fit on train only; the test frame reuses the training statistics.
    scaler = StandardScaler()
    train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])
    test_df[feature_cols] = scaler.transform(test_df[feature_cols])

    # Persist the fitted scaler alongside the processed tables.
    joblib.dump(scaler, 'tabular_scaler.pkl')

    # 6. Image Path Mapping
    for label, df in (('train', train_df), ('test', test_df)):
        df['image_path'] = df['id'].apply(_find_image_path)
        # Rows without an image are written with an empty image_path cell;
        # report how many so the gap does not go unnoticed.
        missing = int(df['image_path'].isna().sum())
        if missing:
            print(f"Warning: {missing} of {len(df)} {label} rows "
                  f"have no image file.")

    print("Saving processed data...")
    train_df.to_csv('processed_train.csv', index=False)
    test_df.to_csv('processed_test.csv', index=False)
    print("Done. Files saved.")


def _epoch_seconds(dates):
    """Return whole seconds since 1970-01-01 for a datetime Series.

    Subtracting the epoch and floor-dividing by one second gives the same
    answer whatever unit the datetimes are stored in, unlike reinterpreting
    the values as int64 and dividing by 10**9.
    """
    if dates.dt.tz is not None:
        # Express aware timestamps in UTC so they can be compared with the
        # naive epoch below.
        dates = dates.dt.tz_convert('UTC').dt.tz_localize(None)
    return (dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)


def _find_image_path(house_id):
    """Return the image file for *house_id* under IMAGE_DIR, or None.

    ``.jpg`` is preferred over ``.png`` when both exist.
    """
    for ext in ('.jpg', '.png'):
        candidate = os.path.join(IMAGE_DIR, f"{house_id}{ext}")
        if os.path.exists(candidate):
            return candidate
    return None

# Run the preprocessing pipeline only when this file is executed as a script;
# importing it as a module does not trigger it.
if __name__ == "__main__":
    preprocess_data()