In [None]:
!pip install -U xgboost==2.0.3 optuna scikit-learn



In [None]:
# 필수 모듈 import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from xgboost import XGBRegressor
import optuna


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
import optuna

# Load the 2020-2023 training data; the 'tm_dt' column is dropped when present.
df = pd.read_csv('/content/2020_2023_최종데이터.csv')
df = df.drop(columns=['tm_dt'], errors='ignore')

# Train on log1p(call_count); predictions are mapped back with expm1 later.
df['log_call_count'] = np.log1p(df['call_count'])

# Calendar features derived from the integer YYYYMMDD column 'tm'.
dates = pd.to_datetime(df['tm'], format='%Y%m%d')
df['year'] = df['tm'] // 10000
df['month'] = (df['tm'] % 10000) // 100
df['day'] = df['tm'] % 100
df['weekday'] = dates.dt.weekday
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)

# Holiday adjacency is looked up by calendar date. Shifting the holiday column
# by one *row* is only correct when the frame has exactly one row per day; with
# several rows per date it mostly returns the flag of a same-day neighbour.
# Days whose neighbour is not in the data get 0 (same edge value as before).
holiday_by_date = df['공휴일'].groupby(dates).max()
one_day = pd.Timedelta(days=1)
df['is_before_holiday'] = (dates + one_day).map(holiday_by_date).fillna(0).astype(int)
df['is_after_holiday'] = (dates - one_day).map(holiday_by_date).fillna(0).astype(int)

# Weather indicator flags.
df['is_rain'] = (df['rn_day'] > 0).astype(int)
df['high_temp'] = (df['ta_max'] > 30).astype(int)
df['low_temp'] = (df['ta_min'] < 0).astype(int)

# Lag / rolling-mean features of call_count within each address_gu.
# The rolling window is evaluated inside each group; calling .rolling() on the
# result of groupby().shift() would let windows span two different districts.
# NOTE(review): these features rely on the row order of the CSV (presumably
# chronological — confirm) and need past call_count values, which the 2024
# prediction cell does not compute; there they are filled with a constant.
calls_by_gu = df.groupby('address_gu')['call_count']
df['gu_lag1'] = calls_by_gu.shift(1)
df['gu_roll3'] = calls_by_gu.transform(lambda s: s.shift(1).rolling(3).mean())

# Integer-encode the city name; `encoder` is reused by the 2024 cells.
encoder = LabelEncoder()
df['address_city'] = encoder.fit_transform(df['address_city'])

# Mean target encoding per district and per sub-address.
# NOTE(review): the means are computed on all rows before the train/validation
# split, so validation targets leak into these features and the reported
# validation error is likely optimistic.
df['address_gu_mean_target'] = df.groupby('address_gu')['call_count'].transform('mean')
df['sub_address_mean_target'] = df.groupby('sub_address')['call_count'].transform('mean')

# Drop every row with a missing value in any column (this includes the leading
# rows of each district, whose lag features are undefined).
df = df.dropna()

# Separate features from the target; raw identifiers and target columns are excluded.
drop_cols = ['tm', 'call_count', 'log_call_count', 'address_gu', 'sub_address']
X = df.drop(columns=drop_cols).astype('float32')
y = df['log_call_count'].astype('float32')

# Random 80/20 split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna objective (early stopping configured through an estimator callback).
def objective(trial):
    """Train one XGBoost candidate and return its validation RMSE.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    notebook-level ``X_train`` / ``y_train`` and early-stopped against
    ``X_val`` / ``y_val``. Because the target is log1p-transformed, both the
    labels and the predictions are mapped back with ``expm1`` before scoring.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        RMSE between ``expm1(y_val)`` and ``expm1(predictions)``.
    """
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 200, 600),
        'max_depth': trial.suggest_int("max_depth", 4, 10),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        'subsample': trial.suggest_float("subsample", 0.7, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.7, 1.0),
        'tree_method': 'hist',
        'random_state': 42
    }

    # Callbacks belong on the estimator: passing `callbacks` to fit() is
    # deprecated in the XGBoost scikit-learn API and rejected by newer releases.
    # A fresh EarlyStopping instance is created per trial because it is stateful.
    model = XGBRegressor(
        **params,
        callbacks=[EarlyStopping(rounds=20, save_best=True)],
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(preds)))
    return rmse

# Run the Optuna search. The sampler is seeded so a fresh "Restart & Run All"
# explores the same sequence of hyperparameters.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Refit with the best hyperparameters plus the fixed settings used during the search.
best_params = {**study.best_params, 'tree_method': 'hist', 'random_state': 42}

# Early stopping is configured on the estimator; passing `callbacks` to fit()
# is deprecated in the XGBoost scikit-learn API.
final_model = XGBRegressor(
    **best_params,
    callbacks=[EarlyStopping(rounds=20, save_best=True)],
)
final_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Final evaluation on the original count scale.
preds = final_model.predict(X_val)
y_true = np.expm1(y_val)
# A negative log-scale prediction maps to a negative count, which is not a valid
# call count and makes mean_squared_log_error raise; clip at zero first.
y_pred = np.clip(np.expm1(preds), 0, None)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))

print(f"✅ RMSE: {rmse:.2f}")
print(f"✅ RMSLE: {rmsle:.4f}")


[I 2025-06-18 14:06:28,558] A new study created in memory with name: no-name-e0a94601-e198-4cf0-b121-7a87278ac1bf
[I 2025-06-18 14:06:46,906] Trial 0 finished with value: 1.612262099111962 and parameters: {'n_estimators': 421, 'max_depth': 4, 'learning_rate': 0.012802089322641248, 'subsample': 0.9322890424799122, 'colsample_bytree': 0.8433187840474503}. Best is trial 0 with value: 1.612262099111962.
[I 2025-06-18 14:06:52,351] Trial 1 finished with value: 1.5670977276755709 and parameters: {'n_estimators': 575, 'max_depth': 10, 'learning_rate': 0.06680646332507088, 'subsample': 0.8314812548643848, 'colsample_bytree': 0.7745372793176316}. Best is trial 1 with value: 1.5670977276755709.
[I 2025-06-18 14:06:55,579] Trial 2 finished with value: 1.6064536978552046 and parameters: {'n_estimators': 469, 'max_depth': 10, 'learning_rate': 0.12523240127651755, 'subsample': 0.7101122944684548, 'colsample_bytree': 0.8044231106373455}. Best is trial 1 with value: 1.5670977276755709.
[I 2025-06-18 1

✅ RMSE: 1.44
✅ RMSLE: 0.3030


In [None]:
# Load the 2024 data; the 'tm_dt' column is dropped when present.
df_2024 = pd.read_csv('/content/2024_최종데이터.csv')
df_2024 = df_2024.drop(columns=['tm_dt'], errors='ignore')

# Calendar features derived from the integer YYYYMMDD column 'tm'.
dates_2024 = pd.to_datetime(df_2024['tm'], format='%Y%m%d')
df_2024['year'] = df_2024['tm'] // 10000
df_2024['month'] = (df_2024['tm'] % 10000) // 100
df_2024['day'] = df_2024['tm'] % 100
df_2024['weekday'] = dates_2024.dt.weekday
df_2024['is_weekend'] = df_2024['weekday'].isin([5, 6]).astype(int)

# Holiday adjacency is looked up by calendar date. Shifting the holiday column
# by one *row* is only correct when the frame has exactly one row per day; with
# several rows per date it mostly returns the flag of a same-day neighbour.
# Days whose neighbour is not in this file get 0 (same edge value as before).
holiday_by_date_2024 = df_2024['공휴일'].groupby(dates_2024).max()
one_day = pd.Timedelta(days=1)
df_2024['is_before_holiday'] = (
    (dates_2024 + one_day).map(holiday_by_date_2024).fillna(0).astype(int)
)
df_2024['is_after_holiday'] = (
    (dates_2024 - one_day).map(holiday_by_date_2024).fillna(0).astype(int)
)

# Weather indicator flags.
df_2024['is_rain'] = (df_2024['rn_day'] > 0).astype(int)
df_2024['high_temp'] = (df_2024['ta_max'] > 30).astype(int)
df_2024['low_temp'] = (df_2024['ta_min'] < 0).astype(int)


In [None]:
# Mean target encoding: per-group call_count means are taken from the training
# frame `df` and looked up for each 2024 row.
address_gu_mean_map = df.groupby('address_gu')['call_count'].mean()
sub_address_mean_map = df.groupby('sub_address')['call_count'].mean()

# Groups that do not occur in the training frame fall back to the average of
# the per-group means.
df_2024['address_gu_mean_target'] = (
    df_2024['address_gu'].map(address_gu_mean_map).fillna(address_gu_mean_map.mean())
)
df_2024['sub_address_mean_target'] = (
    df_2024['sub_address'].map(sub_address_mean_map).fillna(sub_address_mean_map.mean())
)

# address_city encoding: reuse the integer codes of the fitted encoder without
# mutating it. Appending to `encoder.classes_` changes shared state for every
# later cell and breaks the sorted order the fitted classes are stored in.
# Cities not seen during fitting get the code right after the known ones (or
# the code of an existing 'Unknown' class), i.e. the same code the
# append-and-transform approach produced.
city_to_code = {label: code for code, label in enumerate(encoder.classes_)}
unknown_city_code = city_to_code.get('Unknown', len(city_to_code))
df_2024['address_city'] = (
    df_2024['address_city'].map(city_to_code).fillna(unknown_city_code).astype(int)
)


In [None]:
# Raw identifier columns are not model features; remove them from the 2024 frame.
df_2024 = df_2024.drop(columns=['address_gu', 'sub_address'], errors='ignore')

# Build the prediction matrix with exactly the training columns, in training order.
# Features absent from the 2024 frame are filled with 0.0; they are reported
# because a constant fill differs from what the model saw during training.
missing_cols = [col for col in X.columns if col not in df_2024.columns]
if missing_cols:
    print(f"[warn] features missing from 2024 data, filled with 0.0: {missing_cols}")
X_2024 = df_2024.reindex(columns=X.columns, fill_value=0.0).astype('float32')

# Predict on the log1p scale and map back to counts. expm1 can return values
# below -0.5, which would round to -1; clip at zero so counts stay non-negative.
preds_log_2024 = final_model.predict(X_2024)
preds_2024 = np.clip(np.expm1(preds_log_2024), 0, None).round().astype(int)

# Attach the predictions to the 2024 frame.
df_2024['predicted_call_count'] = preds_2024


In [None]:
# Write the 2024 frame, including predictions, as an EUC-KR encoded CSV.
# Change the output path as needed.
df_2024.to_csv(
    path_or_buf='/content/250028.csv',
    encoding='euc-kr',
    index=False,
)
print("✅ 예측 결과 저장 완료!")

✅ 예측 결과 저장 완료!


In [None]:
# Load the EUC-KR encoded test file whose call_count column is to be filled in.
result_df = pd.read_csv(
    filepath_or_buffer='/content/test_call119.csv',
    encoding='euc-kr',
)

In [None]:
# Preview the first five rows of the test file before predictions are attached.
result_df.head(n=5)

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,


In [None]:
# Copy the predictions into the test file by row position.
# The two frames are loaded from different files, so an index-aligned
# assignment would silently produce NaN or drop rows when their lengths differ;
# fail loudly instead.
# NOTE(review): this assumes both files list the same rows in the same order —
# confirm against the data source.
if len(result_df) != len(df_2024):
    raise ValueError(
        f"Row count mismatch: result_df has {len(result_df)} rows, "
        f"df_2024 has {len(df_2024)}"
    )
result_df['call_count'] = df_2024['predicted_call_count'].to_numpy()

In [None]:
# Preview the first five rows after call_count has been assigned.
result_df.head(n=5)

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,2
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,1
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,1


In [None]:
# Write the completed test file as an EUC-KR encoded CSV without the index.
result_df.to_csv(
    path_or_buf='/content/250028.csv',
    encoding='euc-kr',
    index=False,
)

In [None]:
# Frequency of each predicted call count, most common first.
result_df['call_count'].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
call_count,Unnamed: 1_level_1
2,4487
1,4230
3,794
4,82
5,7
6,1
