In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold # 특징 그룹 정의를 위해 유지

# --- 1. Load 100% Data ---
print("STEP 1: Starting 100% Full Data Load")
try:
    train_values = pd.read_csv('train_values.csv')
    train_labels = pd.read_csv('train_labels.csv')
    # Join features and labels on building_id to form the full dataset.
    train_df_full = pd.merge(train_values, train_labels, on='building_id')
except FileNotFoundError:
    print("❌ ERROR: 원본 파일을 찾을 수 없습니다. 'train_values.csv'와 'train_labels.csv'를 확인하세요.")
    # Abort with a non-zero status. The site-provided exit() helper is meant
    # for interactive shells and, called without an argument, exits with
    # status 0, which would make this failure look like a success.
    raise SystemExit(1)

# X and y both cover the entire merged dataset (no hold-out split here).
X_train = train_df_full.drop(columns=['damage_grade', 'building_id'])
y_train = train_df_full['damage_grade']
X_full_ids = train_df_full['building_id']  # kept so it can be re-attached before saving

# --- 2. PREPROCESSING PIPELINE ---
print("STEP 2: Applying Preprocessing Pipeline (100% Data)")

# Feature groups driving each preprocessing step below.
CONT_FEATURES = ['age', 'area_percentage', 'height_percentage']
ORDINAL_FEATURES = ['count_floors_pre_eq', 'count_families']
OHE_FEATURES = [
    'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type',
    'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status'
]
TARGET_ENCODE_FEATURES = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
FEATURES_TO_SCALE = CONT_FEATURES + ORDINAL_FEATURES

# Log Transformation and Standard Scaling
print("-> Applying Log Transformation and Standard Scaling")
# Replace raw 'age' with log1p(age); the scaled feature list is updated to match.
X_train['age_log'] = np.log1p(X_train['age'])
X_train = X_train.drop('age', axis=1)
FEATURES_TO_SCALE.remove('age')
FEATURES_TO_SCALE.append('age_log')

# The scaler is fitted on the full dataset and applied in place.
scaler = StandardScaler()
X_train[FEATURES_TO_SCALE] = scaler.fit_transform(X_train[FEATURES_TO_SCALE])

# One-Hot Encoding
print("-> Applying One-Hot Encoding")
X_train = pd.get_dummies(X_train, columns=OHE_FEATURES, drop_first=True)

# Dimensionality Reduction (r > 0.8 Threshold)
print("-> Applying Dimensionality Reduction (r > 0.8)")
# Hard-coded list of dummy columns to remove; errors='ignore' means a column
# that is absent after encoding is skipped silently rather than raising.
REDUNDANT_COLS_TO_DROP = [
    'land_surface_condition_t',
    'roof_type_q',
    'position_t'
]
X_train = X_train.drop(columns=REDUNDANT_COLS_TO_DROP, errors='ignore')


# Target Encoding (full-data mean is used instead of a CV-split scheme)
print("-> Applying Target Encoding (Full Mean)")
global_mean = y_train.mean()

for col in TARGET_ENCODE_FEATURES:
    # Mean target per category, computed over the whole dataset.
    means = y_train.groupby(X_train[col]).mean()
    # Map each row's category to its mean; any value left unmapped (NaN)
    # falls back to the global target mean.
    X_train[f'{col}_target_enc'] = X_train[col].map(means).fillna(global_mean)
    # The raw geo-level id is no longer needed once encoded.
    X_train = X_train.drop(col, axis=1)

# --- NEW STEP: Explicitly convert Boolean (TRUE/FALSE) to Integer (1/0) ---
bool_cols = X_train.select_dtypes(include=['bool']).columns
if not bool_cols.empty:
    X_train[bool_cols] = X_train[bool_cols].astype(int)
    print("-> Boolean columns successfully converted to 1/0.")

# Re-attach building_id as the first column.
X_train.insert(0, 'building_id', X_full_ids)

# Final Save
X_train.to_csv('final_features_preprocessed_100_percent.csv', index=False)
print(f"\n✅ Preprocessing complete! Final feature count: {X_train.shape[1]}")
print("✅ 'final_features_preprocessed_100_percent.csv' 파일이 생성되었습니다. (100% 데이터)")

STEP 1: Starting Data Split File Creation
✅ Split files created (final_train_split.csv).
--------------------------------------------------
STEP 2: Applying Preprocessing Pipeline with Dimensionality Reduction
-> 4. Applying One-Hot Encoding
-> 5. Performing Dimensionality Reduction (r > 0.8)
-> 6. Applying CV-based Target Encoding to geo_levels

✅ Preprocessing complete! Final feature count: 57
✅ 'final_features_preprocessed_reduced.csv' file created successfully.


In [1]:
#without dim reduction (r > 0.8)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold # 특징 그룹 정의를 위해 유지

# --- 1. Load 100% Data ---
print("STEP 1: Starting 100% Full Data Load")
try:
    train_values = pd.read_csv('train_values.csv')
    train_labels = pd.read_csv('train_labels.csv')
    # Join features and labels on building_id to form the full dataset.
    train_df_full = pd.merge(train_values, train_labels, on='building_id')
except FileNotFoundError:
    print("❌ ERROR: 원본 파일을 찾을 수 없습니다. 'train_values.csv'와 'train_labels.csv'를 확인하세요.")
    # Abort with a non-zero status. The site-provided exit() helper is meant
    # for interactive shells and, called without an argument, exits with
    # status 0, which would make this failure look like a success.
    raise SystemExit(1)

# X and y both cover the entire merged dataset (no hold-out split here).
X_train = train_df_full.drop(columns=['damage_grade', 'building_id'])
y_train = train_df_full['damage_grade']
X_full_ids = train_df_full['building_id']  # kept so it can be re-attached before saving

# --- 2. PREPROCESSING PIPELINE ---
print("STEP 2: Applying Preprocessing Pipeline (100% Data, No Dimensionality Reduction)")

# Feature groups driving each preprocessing step below.
CONT_FEATURES = ['age', 'area_percentage', 'height_percentage']
ORDINAL_FEATURES = ['count_floors_pre_eq', 'count_families']
OHE_FEATURES = [
    'land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type',
    'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status'
]
TARGET_ENCODE_FEATURES = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
FEATURES_TO_SCALE = CONT_FEATURES + ORDINAL_FEATURES

# Log Transformation and Standard Scaling
print("-> Applying Log Transformation and Standard Scaling")
# Replace raw 'age' with log1p(age); the scaled feature list is updated to match.
X_train['age_log'] = np.log1p(X_train['age'])
X_train = X_train.drop('age', axis=1)
FEATURES_TO_SCALE.remove('age')
FEATURES_TO_SCALE.append('age_log')

# The scaler is fitted on the full dataset and applied in place.
scaler = StandardScaler()
X_train[FEATURES_TO_SCALE] = scaler.fit_transform(X_train[FEATURES_TO_SCALE])

# One-Hot Encoding
print("-> Applying One-Hot Encoding")
X_train = pd.get_dummies(X_train, columns=OHE_FEATURES, drop_first=True)

# -------------------------------------------------------------
# [Removed] The dimensionality-reduction step (r > 0.8 threshold) is
# intentionally omitted in this variant; all one-hot columns are kept.
# -------------------------------------------------------------

# Target Encoding (full-data mean is used instead of a CV-split scheme)
print("-> Applying Target Encoding (Full Mean)")
global_mean = y_train.mean()

for col in TARGET_ENCODE_FEATURES:
    # Mean target per category, computed over the whole dataset.
    means = y_train.groupby(X_train[col]).mean()
    # Map each row's category to its mean; any value left unmapped (NaN)
    # falls back to the global target mean.
    X_train[f'{col}_target_enc'] = X_train[col].map(means).fillna(global_mean)
    # The raw geo-level id is no longer needed once encoded.
    X_train = X_train.drop(col, axis=1)

# --- NEW STEP: Explicitly convert Boolean (TRUE/FALSE) to Integer (1/0) ---
bool_cols = X_train.select_dtypes(include=['bool']).columns
if not bool_cols.empty:
    X_train[bool_cols] = X_train[bool_cols].astype(int)
    print("-> Boolean columns successfully converted to 1/0.")

# Re-attach building_id as the first column.
X_train.insert(0, 'building_id', X_full_ids)

# Final Save
X_train.to_csv('final_features_preprocessed_100_percent_no_reduction.csv', index=False)
print(f"\n✅ Preprocessing complete! Final feature count: {X_train.shape[1]}")
print("✅ 'final_features_preprocessed_100_percent_no_reduction.csv' 파일이 생성되었습니다.")

STEP 1: Starting 100% Full Data Load
STEP 2: Applying Preprocessing Pipeline (100% Data, No Dimensionality Reduction)
-> Applying Log Transformation and Standard Scaling
-> Applying One-Hot Encoding
-> Applying Target Encoding (Full Mean)
-> Boolean columns successfully converted to 1/0.

✅ Preprocessing complete! Final feature count: 61
✅ 'final_features_preprocessed_100_percent_no_reduction.csv' 파일이 생성되었습니다.
