# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

# --- 1. Load Data and Create Split Files ---
# Joins the raw feature and label tables on `building_id`, derives stratified
# 60/20/20 train/validation/test partitions (20% held out as test, then 25% of
# the remaining 80% as validation) and writes the training partition to disk.
print("STEP 1: Starting Data Split File Creation")
try:
    # Only the two reads can raise FileNotFoundError, so only they are guarded.
    train_values = pd.read_csv('train_values.csv')
    train_labels = pd.read_csv('train_labels.csv')
except FileNotFoundError:
    print("❌ ERROR: Original files not found. Check 'train_values.csv' and 'train_labels.csv'.")
    # Abort with a non-zero status so callers (shell, CI, schedulers) see the
    # failure; the interactive `exit()` helper would report success (status 0).
    raise SystemExit(1)

train_df_full = pd.merge(train_values, train_labels, on='building_id')

X = train_df_full.drop('damage_grade', axis=1)
y = train_df_full['damage_grade']

# Create Stratified Splits (fixed seed keeps the partitions reproducible)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# Persist the training rows together with their target column.
# NOTE: only the training partition is written here; the validation and test
# partitions (X_val/y_val, X_test/y_test) exist in memory only.
X_train_df = train_df_full.loc[X_train.index].copy()
X_train_df.to_csv('final_train_split.csv', index=False)
print("✅ Split files created (final_train_split.csv).")
print("-" * 50)


# --- 2. PREPROCESSING PIPELINE (with Dimensionality Reduction) ---
print("STEP 2: Applying Preprocessing Pipeline with Dimensionality Reduction")

# Reload the persisted training split and work on an independent copy whose
# index is a clean 0..n-1 range (needed for the KFold step further down).
df_train_raw = pd.read_csv('final_train_split.csv')
df_train = df_train_raw.copy().reset_index(drop=True)

# Predictors: everything except the target and the `building_id` column.
X_train = df_train.drop(columns=['damage_grade', 'building_id'])
y_train = df_train['damage_grade']

# Define Feature Groups
CONT_FEATURES = ['age', 'area_percentage', 'height_percentage']
ORDINAL_FEATURES = ['count_floors_pre_eq', 'count_families']
OHE_FEATURES = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]
TARGET_ENCODE_FEATURES = [f'geo_level_{level}_id' for level in (1, 2, 3)]
# A fresh list, so later edits to it leave the two source groups untouched.
FEATURES_TO_SCALE = [*CONT_FEATURES, *ORDINAL_FEATURES]


# --- 3. Log Transformation and Standard Scaling ---
# Swap the raw `age` column for its log1p transform; the new `age_log`
# column is appended as the last column of the frame.
X_train = X_train.assign(age_log=np.log1p(X_train['age'])).drop(columns='age')

# Mirror the swap in the scaling list: `age` leaves, `age_log` joins at the end.
FEATURES_TO_SCALE.pop(FEATURES_TO_SCALE.index('age'))
FEATURES_TO_SCALE += ['age_log']

# Standardise the selected columns using statistics fitted on this frame.
scaler = StandardScaler().fit(X_train[FEATURES_TO_SCALE])
X_train[FEATURES_TO_SCALE] = scaler.transform(X_train[FEATURES_TO_SCALE])


# --- 4. One-Hot Encoding ---
print("-> 4. Applying One-Hot Encoding")
# drop_first=True removes the first level of every categorical feature, which
# eliminates the perfect multicollinearity among the dummies of one feature.
X_train = pd.get_dummies(X_train, columns=OHE_FEATURES, drop_first=True)


# --- 5. DIMENSIONALITY REDUCTION (r > 0.8 Threshold) ---
print("-> 5. Performing Dimensionality Reduction (r > 0.8)")
# Maps each redundant dummy column to the partner it is strongly correlated
# with (|r| > 0.8); one column of every such pair is to be removed.
REDUNDANT_PAIRS = {
    'land_surface_condition_t': 'land_surface_condition_n',  # r = -0.883
    'roof_type_q': 'roof_type_n',                            # r = -0.853
    'position_t': 'position_s',                              # r = -0.825
}
REDUNDANT_COLS_TO_DROP = list(REDUNDANT_PAIRS)

# A column is redundant only while its partner is still in the frame. If the
# partner was already removed as the drop_first baseline, the pair no longer
# exists, and dropping the remaining column too would merge two distinct
# categories into the baseline and discard information. Columns that are
# absent are skipped explicitly rather than via errors='ignore'.
cols_to_drop = [
    col
    for col, partner in REDUNDANT_PAIRS.items()
    if col in X_train.columns and partner in X_train.columns
]
X_train = X_train.drop(columns=cols_to_drop)


# --- 6. Target Encoding (CV-based, Leakage Prevention) ---
# Every row is encoded with the mean target of its geo category computed on
# the *other* folds only, so a row's own label never feeds its encoding.
print("-> 6. Applying CV-based Target Encoding to geo_levels")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
global_mean = y_train.mean()

# skf.split yields positional indices. All fold arithmetic is therefore done
# on plain arrays, so the result does not depend on X_train carrying a
# default 0..n-1 index (mixing positions with label-based .loc would silently
# write to the wrong rows otherwise).
raw_values = {col: X_train[col].to_numpy() for col in TARGET_ENCODE_FEATURES}
oof_encodings = {col: np.full(len(X_train), np.nan) for col in TARGET_ENCODE_FEATURES}

for train_idx, val_idx in skf.split(X_train, y_train):
    y_fold = y_train.iloc[train_idx]
    for col in TARGET_ENCODE_FEATURES:
        # Per-category target mean learned from the fitting folds only.
        means = y_fold.groupby(raw_values[col][train_idx]).mean()
        # Categories unseen in the fitting folds map to NaN for now.
        oof_encodings[col][val_idx] = (
            pd.Series(raw_values[col][val_idx]).map(means).to_numpy()
        )

# Final clean up: unseen categories fall back to the global target mean, the
# encoded columns are appended in feature order, and the raw ids are removed.
for col in TARGET_ENCODE_FEATURES:
    encoded = pd.Series(oof_encodings[col], index=X_train.index)
    X_train[f'{col}_target_enc'] = encoded.fillna(global_mean)
X_train = X_train.drop(columns=TARGET_ENCODE_FEATURES)

# --- 7. Final Save ---
# Write the processed training features to CSV (row index omitted), then
# report the resulting column count and the output file name.
feature_count = len(X_train.columns)
X_train.to_csv('final_features_preprocessed_reduced.csv', index=False)
print(
    f"\n✅ Preprocessing complete! Final feature count: {feature_count}",
    "✅ 'final_features_preprocessed_reduced.csv' file created successfully.",
    sep="\n",
)
