In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split

# --- 1. Load Data and Create Split Files (This must be run first) ---
# Reads the raw feature/label CSVs, joins them on 'building_id', and writes a
# stratified 60/20/20 train/validation/test split to three CSV files.
print("STEP 1: Starting Data Split File Creation")
try:
    # Only the reads can raise FileNotFoundError, so only they are guarded.
    train_values = pd.read_csv('train_values.csv')
    train_labels = pd.read_csv('train_labels.csv')
except FileNotFoundError as err:
    print("❌ ERROR: Original files not found. Check 'train_values.csv' and 'train_labels.csv'.")
    # `exit()` is a helper installed by the `site` module for interactive use
    # and exits with a success status. Raising SystemExit directly does not
    # depend on that helper and reports a non-zero status for the failure.
    raise SystemExit(1) from err

# Merge features and target
train_df_full = pd.merge(train_values, train_labels, on='building_id')

X = train_df_full.drop('damage_grade', axis=1)
y = train_df_full['damage_grade']

# Stratified Split (60:20:20)
# 80% (Train+Val) / 20% (Test) Split (Stratified)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# 60% (Train) / 20% (Validation) Split (Stratified):
# 0.25 of the remaining 80% equals 20% of the full data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# Re-select the rows from the merged frame so each saved split still carries
# the 'damage_grade' target column alongside the features.
X_train_df = train_df_full.loc[X_train.index].copy()
X_val_df = train_df_full.loc[X_val.index].copy()
X_test_df = train_df_full.loc[X_test.index].copy()

X_train_df.to_csv('final_train_split.csv', index=False)
X_val_df.to_csv('final_validation_split.csv', index=False)
X_test_df.to_csv('final_test_split.csv', index=False)
print("✅ Train/Validation/Test split files created successfully.")
print("-" * 50)


# --- 2. Applying Preprocessing Pipeline (Final Data File Creation) ---
# Loads the training split from Step 1 and writes a model-ready feature table:
# log-transform 'age', standard-scale the numeric features, one-hot encode the
# categorical features, and replace the geo-level ids with out-of-fold target
# means. The target and 'building_id' are not part of the saved file.
print("STEP 2: Starting Final Preprocessing File Creation")
# Load the file created in Step 1
df_train_raw = pd.read_csv('final_train_split.csv')

# Keep a clean 0..N-1 index on the working copy.
df_train = df_train_raw.reset_index(drop=True).copy()

y_train = df_train['damage_grade']
# Drop target and identifier from features
X_train = df_train.drop(columns=['damage_grade', 'building_id'])

# Define Feature Groups (Based on EDA)
CONT_FEATURES = ['age', 'area_percentage', 'height_percentage']
ORDINAL_FEATURES = ['count_floors_pre_eq', 'count_families']
OHE_FEATURES = [
    'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type',
    'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status'
]
TARGET_ENCODE_FEATURES = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
# Concatenation builds a new list, so the edits below do not touch CONT_FEATURES.
FEATURES_TO_SCALE = CONT_FEATURES + ORDINAL_FEATURES

# Log Transformation and Scaling preparation
print("-> Applying Log Transformation to 'age'")
X_train['age_log'] = np.log1p(X_train['age'])
X_train = X_train.drop('age', axis=1)
# 'age' no longer exists as a column; scale its log-transformed replacement.
FEATURES_TO_SCALE.remove('age')
FEATURES_TO_SCALE.append('age_log')

print("-> Applying Standard Scaling to numerical features")
scaler = StandardScaler()
X_train[FEATURES_TO_SCALE] = scaler.fit_transform(X_train[FEATURES_TO_SCALE])

# One-Hot Encoding
print("-> Applying One-Hot Encoding")
X_train = pd.get_dummies(X_train, columns=OHE_FEATURES, drop_first=True)

# Target Encoding (CV-based)
print("-> Applying CV-based Target Encoding to high cardinality features")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
global_mean = y_train.mean()

# One out-of-fold buffer per encoded feature, initialised to NaN. The fold
# indices returned by split() are positions, so everything below indexes
# positionally (iloc / NumPy) and does not depend on the frame's index labels.
oof_encodings = {
    col: np.full(len(X_train), np.nan) for col in TARGET_ENCODE_FEATURES
}

for train_idx, val_idx in skf.split(X_train, y_train):
    # Target values of the fitting folds; identical for every feature.
    y_fit = y_train.iloc[train_idx]
    for col in TARGET_ENCODE_FEATURES:
        category = X_train[col]
        # Mean target per category, computed on the fitting folds only.
        means = y_fit.groupby(category.iloc[train_idx]).mean()
        # Out-of-fold mapping: rows of the held-out fold receive means that
        # were computed without them. Categories absent from the fitting
        # folds map to NaN here.
        oof_encodings[col][val_idx] = category.iloc[val_idx].map(means).to_numpy()

# Final clean up and save
for col in TARGET_ENCODE_FEATURES:
    encoded_col = f'{col}_target_enc'
    X_train[encoded_col] = oof_encodings[col]
    # Fill remaining NaNs (categories unseen in the fitting folds) with the global mean
    X_train[encoded_col] = X_train[encoded_col].fillna(global_mean)

# Drop original geo_level features
X_train = X_train.drop(columns=TARGET_ENCODE_FEATURES)

X_train.to_csv('final_features_preprocessed.csv', index=False)
print("✅ 'final_features_preprocessed.csv' file created successfully.")

1단계: 훈련/검증/테스트 분할 파일 생성 시작
✅ 훈련/검증/테스트 분할 파일 생성 완료.
--------------------------------------------------
2단계: 최종 전처리 파일 생성 시작
✅ 'final_features_preprocessed.csv' 파일 생성 완료.
