In [None]:
# !pip install -U scikit-learn==1.3.2 xgboost==1.7.6 lightgbm==3.3.5

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from google.colab import drive

# 1. Load the 2020-2023 training data, build features, and make the hold-out split.
df = pd.read_csv('/content/2020_2023_최종데이터.csv')
df = df.drop(columns=['tm_dt'], errors='ignore')  # tm_dt is discarded whenever it is present

# The models are trained on log1p(call_count); the metrics below undo this with expm1.
df['log_call_count'] = np.log1p(df['call_count'])

# `tm` is parsed with format '%Y%m%d' below, so the integer arithmetic splits YYYYMMDD.
df['year'] = df['tm'] // 10000
df['month'] = (df['tm'] % 10000) // 100
df['day'] = df['tm'] % 100
df['weekday'] = pd.to_datetime(df['tm'], format='%Y%m%d').dt.weekday
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

# NOTE(review): shift() reads the adjacent ROW, not the adjacent calendar day.
# If the file holds several rows per date (e.g. one per sub_address), these flags
# are only correct at date boundaries -- confirm the row layout. The 2024 cell
# builds the flags the same way, so any change must be made in both places.
df['is_before_holiday'] = df['공휴일'].shift(-1, fill_value=0)
df['is_after_holiday'] = df['공휴일'].shift(1, fill_value=0)

encoder = LabelEncoder()
df['address_city'] = encoder.fit_transform(df['address_city'])

# Choose the hold-out rows BEFORE computing target statistics. Splitting row
# positions with the same test_size/random_state selects the same rows as
# splitting X and y directly.
train_pos, val_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Mean-target encoding from the training rows only, so validation targets do not
# leak into the validation features. Keys that never occur in the training rows
# (or are missing) fall back to the overall training mean.
train_rows = df.iloc[train_pos]
train_call_mean = train_rows['call_count'].mean()
for key_col in ('address_gu', 'sub_address'):
    group_means = train_rows.groupby(key_col)['call_count'].mean()
    df[f'{key_col}_mean_target'] = df[key_col].map(group_means).fillna(train_call_mean)

drop_cols = ['tm', 'call_count', 'log_call_count', 'address_gu', 'sub_address']
X = df.drop(columns=drop_cols).astype('float32')
y = df['log_call_count'].astype('float32')
X_train, X_val = X.iloc[train_pos], X.iloc[val_pos]
y_train, y_val = y.iloc[train_pos], y.iloc[val_pos]

# 2. Base learners shared by the voting and stacking ensembles below.
xgb = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    random_state=42
)
lgbm = LGBMRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when the bagging
    # frequency is non-zero; with the default subsample_freq=0 the 0.8 above
    # is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

# 3-1. Voting ensemble: unweighted average of the three base learners.
voting_members = [('xgb', xgb), ('lgbm', lgbm), ('rf', rf)]
voting = VotingRegressor(estimators=voting_members, n_jobs=-1)
voting.fit(X_train, y_train)

# Predictions are in log1p space; score on the original count scale.
pred_v = voting.predict(X_val)
mse_v = mean_squared_error(np.expm1(y_val), np.expm1(pred_v))
rmse_v = np.sqrt(mse_v)
print(f"✅ Voting RMSE: {rmse_v:.3f}")

# 3-2. Stacking ensemble: a linear meta-learner over the base predictions.
# passthrough=True also feeds the original features to the meta-learner.
stack_members = [('xgb', xgb), ('lgbm', lgbm), ('rf', rf)]
meta_learner = LinearRegression()
stack = StackingRegressor(
    estimators=stack_members,
    final_estimator=meta_learner,
    passthrough=True,
    n_jobs=-1,
)
stack.fit(X_train, y_train)

# Predictions are in log1p space; score on the original count scale.
pred_s = stack.predict(X_val)
mse_s = mean_squared_error(np.expm1(y_val), np.expm1(pred_s))
rmse_s = np.sqrt(mse_s)
print(f"✅ Stacking RMSE: {rmse_s:.3f}")

# 4. Keep whichever ensemble has the lower validation RMSE (ties go to stacking).
if rmse_v < rmse_s:
    best_model = voting
else:
    best_model = stack
chosen_name = 'Voting' if best_model is voting else 'Stacking'
print(f"선택된 모델: {chosen_name}")

# 5. The 2024 data is preprocessed, scored with best_model, and written to the
#    submission file in the following cell.



✅ Voting RMSE: 1.404
✅ Stacking RMSE: 1.400
선택된 모델: Stacking


In [None]:
# 5. Load the 2024 data and rebuild the same date features used for training.
valid_df = pd.read_csv('/content/2024_최종데이터.csv')
# Discard tm_dt whenever it is present.
valid_df = valid_df.drop(columns=['tm_dt'], errors='ignore')

# Date-derived features: `tm` is a YYYYMMDD value (see the to_datetime format).
tm_values = valid_df['tm']
valid_df['year'] = tm_values // 10000
valid_df['month'] = tm_values // 100 % 100
valid_df['day'] = tm_values % 100
valid_df['weekday'] = pd.to_datetime(tm_values, format='%Y%m%d').dt.weekday
valid_df['is_weekend'] = (valid_df['weekday'] >= 5).astype(int)  # 5 = Sat, 6 = Sun

# NOTE(review): shift() reads the adjacent ROW, not the adjacent calendar day --
# confirm the file has one row per date, or that this matches the training cell.
holiday_flag = valid_df['공휴일']
valid_df['is_before_holiday'] = holiday_flag.shift(-1, fill_value=0)
valid_df['is_after_holiday'] = holiday_flag.shift(1, fill_value=0)

# Encode address_city with the encoder fitted on the training data.
# Labels the encoder has not seen are replaced by 'Unknown', which is appended
# to the encoder's classes (mutating the fitted encoder) if it is not there yet.
city_labels = valid_df['address_city'].astype(str)
is_known_city = city_labels.isin(encoder.classes_)
valid_df['address_city'] = city_labels.where(is_known_city, 'Unknown')
if 'Unknown' not in encoder.classes_:
    encoder.classes_ = np.append(encoder.classes_, 'Unknown')
valid_df['address_city'] = encoder.transform(valid_df['address_city'])

# Mean-target encoding: look up each key's mean call_count from the training frame.
address_gu_map = df.groupby('address_gu')['call_count'].mean()
sub_address_map = df.groupby('sub_address')['call_count'].mean()

# Keys absent from the training data fall back to the average of the per-key means.
gu_encoded = valid_df['address_gu'].map(address_gu_map)
valid_df['address_gu_mean_target'] = gu_encoded.fillna(address_gu_map.mean())

sub_encoded = valid_df['sub_address'].map(sub_address_map)
valid_df['sub_address_mean_target'] = sub_encoded.fillna(sub_address_map.mean())

# The raw key columns are no longer needed once encoded.
valid_df = valid_df.drop(columns=['address_gu', 'sub_address'], errors='ignore')

# Build the model input: everything except the date key and the target column.
X_valid = valid_df.drop(columns=['tm', 'call_count'], errors='ignore').astype('float32')

# Align with the training matrix: training columns missing here are filled with
# 0.0, then the columns are selected in the training order (extras are dropped).
absent_cols = [name for name in X.columns if name not in X_valid.columns]
for name in absent_cols:
    X_valid[name] = 0.0
X_valid = X_valid.loc[:, X.columns]

# 6. Predict in log1p space and convert back to integer call counts.
preds_log = best_model.predict(X_valid)
# expm1 of a negative log-prediction lies in (-1, 0) and can round to -1;
# a call count cannot be negative, so clip at zero before rounding.
preds = np.clip(np.expm1(preds_log), 0, None).round().astype(int)
valid_df['call_count'] = preds

# 7. Copy the predictions into the test_call119 template and save the submission.
result_df = pd.read_csv('/content/test_call119.csv', encoding='euc-kr')
# Values are copied by position, not by index.
# NOTE(review): assumes the template rows are in the same order as the 2024 file -- confirm.
result_df['call_count'] = valid_df['call_count'].to_numpy()

# Change the output file name as needed.
submission_path = '/content/250028.csv'
result_df.to_csv(submission_path, index=False, encoding='euc-kr')
print("✅ 최종 submission 파일 저장 완료: 250028.csv")


✅ 최종 submission 파일 저장 완료: 250028.csv
