In [None]:
# 1. 패키지 설치
!pip install -U scikit-learn==1.3.2 xgboost==1.7.6 koreanize_matplotlib

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting koreanize_matplotlib
  Downloading koreanize_matplotlib-0.1.1-py3-none-any.whl.metadata (992 bytes)
Collecting numpy<2.0,>=1.17.3 (from scikit-learn==1.3.2)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m812.7 kB/s[0m eta [36m0:00:00[0m
Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.

In [None]:
# 2. Mount Google Drive at /content/drive (Colab-only; `google.colab` is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from tqdm import tqdm

In [None]:
# --- 1. Load data ---
# Per the file names: 2020-2023 rows for training, 2024 rows for the prediction period.
train_df = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/2020_2023_최종데이터.csv')
valid_df = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/2024_최종데이터.csv')

# --- 2. Preprocessing ---
# Remove 'tm_dt' when it exists and put the training rows in 'tm' order.
train_df = train_df.drop(columns=['tm_dt'], errors='ignore').sort_values(by='tm')
valid_df = valid_df.drop(columns=['tm_dt'], errors='ignore')

# --- 3. 피처 엔지니어링 ---
def make_features(df):
    """Add calendar, holiday-adjacency and weather features to ``df`` in place.

    Reads the columns ``tm`` (integer date encoded as YYYYMMDD), ``공휴일``
    (presumably a 0/1 holiday flag -- TODO confirm), ``rn_day``, ``ta_max``,
    ``ta_min``, ``ws_max`` and ``ws_ins_max``.

    Adds ``year``, ``day``, ``weekday``, ``day_of_year``, ``is_weekend``,
    ``is_before_holiday``, ``is_after_holiday``, ``month_sin``, ``month_cos``,
    ``is_rain``, ``ta_range`` and ``ws_diff``. A temporary ``month`` column is
    created and dropped again.

    The holiday-adjacency flags are computed per *date*, not per row: the flag
    of the next/previous date present in ``df`` is looked up for every row of a
    given date. A plain row-wise ``shift`` is wrong as soon as a date has more
    than one row (the neighbouring row is usually the same day) and depends on
    the row order of ``df``. For a frame with one row per date, sorted by
    ``tm``, the result is identical to the row-wise shift.

    Returns:
        The same DataFrame object that was passed in (mutated in place).
    """
    # Parse the date once and reuse it for both calendar attributes.
    dates = pd.to_datetime(df['tm'], format='%Y%m%d')

    df['year'] = df['tm'] // 10000
    df['month'] = (df['tm'] % 10000) // 100
    df['day'] = df['tm'] % 100
    df['weekday'] = dates.dt.weekday
    df['day_of_year'] = dates.dt.dayofyear
    df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

    # One holiday value per date (max over that date's rows), in date order,
    # so shifting moves between dates instead of between rows.
    holiday_by_date = df.groupby('tm')['공휴일'].max().sort_index()
    df['is_before_holiday'] = df['tm'].map(holiday_by_date.shift(-1, fill_value=0))
    df['is_after_holiday'] = df['tm'].map(holiday_by_date.shift(1, fill_value=0))

    # Cyclical encoding of the month so December and January are neighbours.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    df['is_rain'] = (df['rn_day'] > 0).astype(int)
    df['ta_range'] = df['ta_max'] - df['ta_min']
    df['ws_diff'] = df['ws_max'] - df['ws_ins_max']

    # The raw month is only needed for the sin/cos encoding above.
    df.drop(columns=['month'], inplace=True)
    return df

train_df = make_features(train_df)

# --- 4. Lag & rolling features ---
# All history features are built per district ('address_gu'), in the current row order
# of train_df.
# NOTE(review): the windows count *rows* within a district, not calendar days; if a
# district has several rows per date (e.g. one per 'sub_address'), consider grouping by
# ['address_gu', 'sub_address'] instead -- confirm against the data.
gu_calls = train_df.groupby('address_gu')['call_count']
train_df['call_count_lag_1'] = gu_calls.shift(1)
train_df['call_count_lag_7'] = gu_calls.shift(7)
# The rolling window must be evaluated *inside* each group. `groupby(...).shift(1)`
# returns a plain Series, so chaining `.rolling(7)` onto it would slide the window over
# neighbouring rows of other districts.
train_df['call_count_rolling_mean_7'] = gu_calls.transform(
    lambda calls: calls.shift(1).rolling(7, min_periods=1).mean()
)
train_df['call_count_rolling_std_7'] = gu_calls.transform(
    lambda calls: calls.shift(1).rolling(7, min_periods=1).std()
)
# Missing history (first rows of each district) and any other NaN become 0.
train_df.fillna(0, inplace=True)

# --- 5. Log-transform the target ---
# The model is trained on log1p(call_count); predictions are mapped back with expm1.
y = np.log1p(train_df['call_count'])
X = train_df.drop(columns=['call_count'])

# --- 6. Target encoding ---
# Mean log-target per district and per sub-address, with the global mean as fallback.
# NOTE(review): the means are computed over every row of train_df, including the rows
# that are later held out for validation.
overall_mean = y.mean()
gu_mean_map = y.groupby(train_df['address_gu']).mean()
sub_mean_map = y.groupby(train_df['sub_address']).mean()

X = X.assign(
    gu_mean=train_df['address_gu'].map(gu_mean_map),
    sub_mean=train_df['sub_address'].map(sub_mean_map),
).fillna(overall_mean)

# --- 7. Categorical encoding ---
# Integer-encode the city; the fitted encoder is reused for the 2024 data later on.
encoder = LabelEncoder().fit(X['address_city'])
X['address_city'] = encoder.transform(X['address_city'])

# --- 8. Train/validation split (no shuffling, indices reset afterwards) ---
# shuffle=False keeps the row order, so the last 20% of the rows form the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# District label of every row, taken from the split frames themselves so the labels are
# guaranteed to line up with X_train / X_val. Looking them up with
# `train_df['address_gu'].iloc[X_train.index]` treats index *labels* as *positions*,
# which selects the wrong rows once train_df has been re-ordered by sort_values.
gu_train = X_train['address_gu'].reset_index(drop=True)
gu_val = X_val['address_gu'].reset_index(drop=True)

# Positional 0..n-1 indices so the boolean district masks align with the feature frames.
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# --- 9. Model training (one model per district) ---
# Identifier columns are not used as model inputs.
X_train_use = X_train.drop(columns=['tm', 'address_gu', 'sub_address'])
X_val_use = X_val.drop(columns=['tm', 'address_gu', 'sub_address'])

# Fitted model and validation RMSE per district. These must be created here: without
# the initialisation the loop raises NameError on a fresh kernel (Restart & Run All).
gu_models = {}
gu_scores = {}

for gu in tqdm(gu_train.unique(), desc="구별 모델 학습"):
    idx = (gu_val == gu)
    # Skip districts with fewer than 10 validation rows; no model is stored for them.
    if idx.sum() < 10:
        continue
    model = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.03,
        max_depth=5,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        early_stopping_rounds=50,
        random_state=42
    )
    # NOTE(review): every model is fitted on the *full* training set; only the
    # early-stopping eval set is restricted to the district. If true per-district
    # training was intended, filter X_train_use / y_train with (gu_train == gu) as well.
    model.fit(X_train_use, y_train, eval_set=[(X_val_use[idx], y_val[idx])], verbose=False)
    pred = model.predict(X_val_use[idx])
    # RMSE on the original scale (targets and predictions are in log1p space).
    # NOTE(review): measured on the same rows used for early stopping, so it is optimistic.
    rmse = np.sqrt(mean_squared_error(np.expm1(y_val[idx]), np.expm1(pred)))
    gu_models[gu] = model
    gu_scores[gu] = rmse
    print(f"[{gu}] RMSE: {rmse:.4f}")

# --- RMSE over the validation rows of all modelled districts ---
# Predictions and targets are converted back from log1p space before scoring.
# Only districts that have a model in gu_models contribute.
all_preds, all_targets = [], []

for gu, gu_model in gu_models.items():
    mask = (gu_val == gu)
    if not mask.any():
        continue
    all_preds.extend(np.expm1(gu_model.predict(X_val_use[mask])))
    all_targets.extend(np.expm1(y_val[mask]))

total_rmse = np.sqrt(mean_squared_error(all_targets, all_preds))
print(f"\n📊 전체 검증 데이터 RMSE (역변환 기준): {total_rmse:.4f}")

# --- 10. Prepare the 2024 rows for prediction ---
# Training and 2024 rows are stacked so the history features of early-2024 rows can
# look back into 2023.
combined_df = pd.concat([train_df, valid_df], ignore_index=True)
combined_df = make_features(combined_df)
combined_df.sort_values(by=['address_gu', 'tm'], inplace=True)

combined_gu_calls = combined_df.groupby('address_gu')['call_count']
combined_df['call_count_lag_1'] = combined_gu_calls.shift(1)
combined_df['call_count_lag_7'] = combined_gu_calls.shift(7)
# Roll *inside* each district. `groupby(...).shift(1)` returns a plain Series, so a
# chained `.rolling(7)` would let the first rows of a district average over the last
# rows of the previous one.
combined_df['call_count_rolling_mean_7'] = combined_gu_calls.transform(
    lambda calls: calls.shift(1).rolling(7, min_periods=1).mean()
)
combined_df['call_count_rolling_std_7'] = combined_gu_calls.transform(
    lambda calls: calls.shift(1).rolling(7, min_periods=1).std()
)
# NOTE(review): if the 2024 file carries no 'call_count', the lag/rolling values of most
# 2024 rows are NaN here and become 0 below, unlike at training time -- confirm.
combined_df.fillna(0, inplace=True)

# 'tm' is an integer YYYYMMDD, so this keeps the rows from 2024-01-01 onwards.
pred_df = combined_df[combined_df['tm'] >= 20240101].copy()

# Cities the encoder has not seen are mapped to an extra 'Unknown' class.
if 'Unknown' not in encoder.classes_:
    encoder.classes_ = np.append(encoder.classes_, 'Unknown')
pred_df['address_city'] = pred_df['address_city'].apply(lambda x: x if x in encoder.classes_ else 'Unknown')
pred_df['address_city'] = encoder.transform(pred_df['address_city'])
# Reuse the target-encoding maps from training; unseen keys fall back to the global mean.
pred_df['gu_mean'] = pred_df['address_gu'].map(gu_mean_map).fillna(overall_mean)
pred_df['sub_mean'] = pred_df['sub_address'].map(sub_mean_map).fillna(overall_mean)

# Same columns, in the same order, as the training feature matrix.
X_pred = pred_df[X_train_use.columns].astype('float32')

# --- 11. Prediction ---
# Score each district with its own model; districts without a model are skipped here.
result_list = []
for gu in tqdm(pred_df['address_gu'].unique(), desc="2024 예측"):
    gu_model = gu_models.get(gu)
    if gu_model is None:
        continue
    rows = (pred_df['address_gu'] == gu)
    # Back from log1p space, rounded to whole calls and floored at zero.
    counts = np.round(np.expm1(gu_model.predict(X_pred[rows]))).astype(int)
    counts = np.clip(counts, 0, None)
    gu_result = pred_df.loc[rows, ['tm', 'address_city', 'address_gu', 'sub_address']].assign(call_count=counts)
    result_list.append(gu_result)

# --- 12. Build the submission file ---
submission_df = pd.concat(result_list).rename(columns={'tm': 'TM'})

# The template defines which rows must appear in the submission.
template = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/test_call119.csv', encoding='euc-kr')
template = template.drop(columns=['call_count'], errors='ignore')
# Encode the city the same way as the prediction rows so the merge keys match.
template['address_city'] = encoder.transform(
    template['address_city'].map(lambda city: city if city in encoder.classes_ else 'Unknown')
)

# Left join keeps every template row; rows without a prediction get call_count 0.
submission = template.merge(submission_df, on=['TM', 'address_city', 'address_gu', 'sub_address'], how='left')
submission['call_count'] = submission['call_count'].fillna(0).astype(int)
submission.to_csv('/content/drive/MyDrive/Epoch 공모전/250028.csv', index=False, encoding='euc-kr')

print("\n✅ 제출 파일 저장 완료: 250028.csv")

구별 모델 학습:   6%|▋         | 1/16 [00:02<00:38,  2.56s/it]

[금정구] RMSE: 1.3669


구별 모델 학습:  12%|█▎        | 2/16 [00:06<00:44,  3.18s/it]

[남구] RMSE: 1.0796


구별 모델 학습:  19%|█▉        | 3/16 [00:10<00:46,  3.57s/it]

[사하구] RMSE: 0.9074


구별 모델 학습:  25%|██▌       | 4/16 [00:11<00:32,  2.71s/it]

[사상구] RMSE: 1.0336


구별 모델 학습:  31%|███▏      | 5/16 [00:13<00:27,  2.48s/it]

[연제구] RMSE: 1.0587


구별 모델 학습:  38%|███▊      | 6/16 [00:17<00:30,  3.08s/it]

[해운대구] RMSE: 1.1929


구별 모델 학습:  44%|████▍     | 7/16 [00:19<00:23,  2.60s/it]

[중구] RMSE: 1.1133


구별 모델 학습:  50%|█████     | 8/16 [00:23<00:24,  3.03s/it]

[영도구] RMSE: 1.1401


구별 모델 학습:  56%|█████▋    | 9/16 [00:27<00:22,  3.24s/it]

[수영구] RMSE: 1.0117


구별 모델 학습:  62%|██████▎   | 10/16 [00:29<00:17,  2.95s/it]

[서구] RMSE: 1.1240


구별 모델 학습:  69%|██████▉   | 11/16 [00:35<00:19,  3.88s/it]

[강서구] RMSE: 1.3224


구별 모델 학습:  75%|███████▌  | 12/16 [00:38<00:14,  3.61s/it]

[북구] RMSE: 1.0406


구별 모델 학습:  81%|████████▏ | 13/16 [00:41<00:09,  3.33s/it]

[부산진구] RMSE: 1.0283


구별 모델 학습:  88%|████████▊ | 14/16 [00:43<00:06,  3.13s/it]

[동래구] RMSE: 1.1630


구별 모델 학습:  94%|█████████▍| 15/16 [00:46<00:02,  2.93s/it]

[동구] RMSE: 0.9921


구별 모델 학습: 100%|██████████| 16/16 [00:51<00:00,  3.24s/it]

[기장군] RMSE: 1.1459






📊 전체 검증 데이터 RMSE (역변환 기준): 1.1199


2024 예측: 100%|██████████| 16/16 [00:00<00:00, 55.41it/s]



✅ 제출 파일 저장 완료: 250028.csv
