In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- 1. SETUP: Learn from TRAIN Data (Scaler & Target Map) ---
# Load the TRAIN features and labels and join them on 'building_id' so that
# every row carries both its features and its 'damage_grade'.
print("STEP 1: Learning parameters from TRAIN data...")
try:
    # Only the reads can raise FileNotFoundError, so only they are guarded.
    train_values = pd.read_csv('train_values.csv')
    train_labels = pd.read_csv('train_labels.csv')
except FileNotFoundError:
    print("❌ ERROR: 'train_values.csv' or 'train_labels.csv' not found. These are needed to learn encoding maps.")
    # Stop with a non-zero status. The `exit()` helper is injected by `site`
    # for interactive sessions, reports success (status 0) and, inside a
    # Jupyter kernel, requests a kernel shutdown instead of just halting.
    raise SystemExit(1)

train_df = pd.merge(train_values, train_labels, on='building_id')

# Split TRAIN into the target and the feature matrix used for fitting.
y_fit = train_df['damage_grade']
X_fit = train_df.drop(columns=['damage_grade', 'building_id'])
global_mean = y_fit.mean()  # fallback value for NaNs in Target Encoding

# Feature Groups
CONT_FEATURES = ['age', 'area_percentage', 'height_percentage']
ORDINAL_FEATURES = ['count_floors_pre_eq', 'count_families']
OHE_FEATURES = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]
TARGET_ENCODE_FEATURES = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

# A. Fit Scaler
# 'age' is replaced by its log1p transform; the new 'age_log' column is
# appended at the end of the frame and the raw 'age' column is removed.
X_fit = X_fit.assign(age_log=np.log1p(X_fit['age'])).drop(columns='age')

# Scale the continuous + ordinal features, with 'age_log' standing in for
# 'age' and placed last in the list.
FEATURES_TO_SCALE = [
    name for name in CONT_FEATURES + ORDINAL_FEATURES if name != 'age'
] + ['age_log']

scaler = StandardScaler().fit(X_fit[FEATURES_TO_SCALE])

# B. Fit Target Encoding Maps
# For each geo column: {category value -> mean damage_grade in TRAIN}.
target_maps = {
    col: y_fit.groupby(X_fit[col]).mean().to_dict()
    for col in TARGET_ENCODE_FEATURES
}

# C. Record the One-Hot column layout produced on TRAIN so the TEST frame
#    can later be given the same dummy columns.
X_fit_ohe = pd.get_dummies(X_fit[OHE_FEATURES], drop_first=True)
ohe_train_columns = list(X_fit_ohe.columns)  # Keep this list!


# --- 2. APPLY TO TEST DATA (Transform Only) ---
# Load the TEST features; nothing is fitted on this data.
print("\nSTEP 2: Applying preprocessing to TEST data...")
try:
    test_values = pd.read_csv('test_values.csv')
except FileNotFoundError:
    print("❌ ERROR: 'test_values.csv' not found.")
    # Stop with a non-zero status. The `exit()` helper is injected by `site`
    # for interactive sessions, reports success (status 0) and, inside a
    # Jupyter kernel, requests a kernel shutdown instead of just halting.
    raise SystemExit(1)

# Set the IDs aside; they are not a feature.
test_ids = test_values['building_id']
X_test = test_values.drop(columns='building_id')

# A. Log Transform & Scaling (Use fitted scaler)
# Same 'age' -> 'age_log' replacement as on TRAIN, then transform only
# (no re-fit) with the scaler learned from TRAIN.
X_test = X_test.assign(age_log=np.log1p(X_test['age'])).drop(columns='age')
scaled_values = scaler.transform(X_test[FEATURES_TO_SCALE])
X_test[FEATURES_TO_SCALE] = scaled_values

# B. One-Hot Encoding & Column Alignment
print("-> Applying One-Hot Encoding...")

# Encode every category present in TEST, mirroring the TRAIN call but WITHOUT
# drop_first: which level is "first" depends on the categories seen in the
# frame being encoded, so dropping it here could remove a different baseline
# than the one removed on TRAIN and leave a real category silently zeroed.
test_ohe = pd.get_dummies(X_test[OHE_FEATURES])

# Align with the TRAIN dummy layout in a single step:
#   - TRAIN columns missing from TEST are added and filled with 0,
#   - TEST-only columns (the TRAIN baseline level, unseen categories) are dropped,
#   - column order follows ohe_train_columns.
test_ohe = test_ohe.reindex(columns=ohe_train_columns, fill_value=0)

# Swap the raw categorical columns for the aligned dummies; remaining columns
# keep their position in front, as pd.get_dummies(..., columns=...) would do.
X_test = pd.concat([X_test.drop(columns=OHE_FEATURES), test_ohe], axis=1)

# C. Dimensionality Reduction (Drop redundant cols found in Train analysis)
REDUNDANT_COLS_TO_DROP = ['land_surface_condition_t', 'roof_type_q', 'position_t']
X_test = X_test.drop(columns=REDUNDANT_COLS_TO_DROP, errors='ignore')

# D. Target Encoding (Use learned maps)
print("-> Applying Target Encoding (using Train maps)...")
for col in TARGET_ENCODE_FEATURES:
    # Look each value up in the TRAIN dictionary; values with no entry come
    # back as NaN and are replaced by the TRAIN global mean.
    encoded = X_test[col].map(target_maps[col]).fillna(global_mean)
    X_test[f'{col}_target_enc'] = encoded

# The raw geo columns are no longer needed once their encodings exist.
X_test = X_test.drop(columns=TARGET_ENCODE_FEATURES)

# E. Boolean to Integer
# Cast every bool column to int (an empty selection leaves the frame as is).
bool_cols = X_test.select_dtypes(include=['bool']).columns
X_test = X_test.astype(dict.fromkeys(bool_cols, int))

# F. Re-insert ID as the first column
X_test.insert(0, 'building_id', test_ids)

# Save the preprocessed TEST frame without the pandas index.
output_filename = 'preprocessed_test_values_final.csv'
X_test.to_csv(output_filename, index=False)

row_count = len(X_test)
print(f"\n✅ Successfully created '{output_filename}' with {row_count} rows.")
print("   (Use this file to generate predictions for submission)")

STEP 1: Learning parameters from TRAIN data...

STEP 2: Applying preprocessing to TEST data...
-> Applying One-Hot Encoding...
-> Applying Target Encoding (using Train maps)...

✅ Successfully created 'preprocessed_test_values_final.csv' with 86868 rows.
   (Use this file to generate predictions for submission)
