In [None]:
!pip install -U scikit-learn==1.3.2 xgboost==1.7.6 koreanize_matplotlib Optuna koreanize_matplotlib

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting koreanize_matplotlib
  Downloading koreanize_matplotlib-0.1.1-py3-none-any.whl.metadata (992 bytes)
Collecting Optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting numpy<2.0,>=1.17.3 (from scikit-learn==1.3.2)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0 (from Optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from Optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 1.02 이하 상관계수 제거, address city 제거

In [None]:
# --- [사전 준비] 필요한 라이브러리 임포트 ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import joblib
import optuna
from functools import partial

# --- 1. 데이터 불러오기 및 기본 전처리 ---
df = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/2020_2023_최종데이터.csv')
if 'tm_dt' in df.columns:
    df.drop(columns=['tm_dt'], inplace=True)

# address_city 컬럼 제거
df.drop(columns=['address_city'], inplace=True)


df['datetime'] = pd.to_datetime(df['tm'], format='%Y%m%d')
df['year'] = df['datetime'].dt.year

# --- 2. 5~10월만 필터링 ---
df = df[df['datetime'].dt.month.isin([5,6,7,8,9,10])].copy()

# --- 3. 누락된 날짜-동 조합을 0으로 채우기 ---
dong_list = df[['address_gu', 'sub_address']].drop_duplicates()
all_dates = pd.date_range(start='2020-05-01', end='2023-10-31')
all_dates = all_dates[(all_dates.month >= 5) & (all_dates.month <= 10)]
all_dates_int = all_dates.strftime('%Y%m%d').astype(int)

full_index = pd.MultiIndex.from_product(
    [all_dates_int, [dong for dong in map(tuple, dong_list.values)]],
    names=['tm', 'gu_sub']
)
full_df = pd.DataFrame(index=full_index).reset_index()
full_df[['address_gu', 'sub_address']] = pd.DataFrame(full_df['gu_sub'].tolist(), index=full_df.index)
full_df.drop(columns='gu_sub', inplace=True)

# 병합 및 call_count 결측치 처리
df = full_df.merge(df, on=['tm', 'address_gu', 'sub_address'], how='left')
df['call_count'] = df['call_count'].fillna(0)
df['datetime'] = pd.to_datetime(df['tm'], format='%Y%m%d')

# --- 4. 피처 생성 ---
df['day'] = df['datetime'].dt.day
df['weekday'] = df['datetime'].dt.weekday
df['day_of_year'] = df['datetime'].dt.dayofyear
df['is_weekend'] = df['weekday'].isin([5, 6]).astype(int)
df['month_sin'] = np.sin(2 * np.pi * df['datetime'].dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df['datetime'].dt.month / 12)
df['is_before_holiday'] = df['공휴일'].shift(-1, fill_value=0)
df['is_after_holiday'] = df['공휴일'].shift(1, fill_value=0)

# ---- 4-1. 상관계수 낮은 피처 제거 --
# --- 상관관계 계산 ---
columns = df.select_dtypes(exclude='object').columns
corr_matrix = df[columns].corr()

# --- 제거 대상 기준 ---
threshold = 0.02
low_corr_features = corr_matrix['call_count'][corr_matrix['call_count'].abs() <= threshold].index.tolist()

# --- 예외로 반드시 유지할 컬럼 목록 ---
protected_cols = ['datetime', 'tm', 'address_gu', 'sub_address', 'call_count']
low_corr_features = [col for col in low_corr_features if col not in protected_cols]

# --- 컬럼 제거 수행 ---
df = df.drop(columns=low_corr_features)

# --- 결과 출력 ---
print(f"제거된 컬럼 수: {len(low_corr_features)}")
print("제거된 컬럼 목록:")
for col in low_corr_features:
    print("-", col)

# --- 5. 시계열 기반 피처 (동 단위만 유지) ---
group_key = ['address_gu', 'sub_address']
df['days_since_last_call_dong'] = df.groupby(group_key)['datetime'].diff().dt.days.fillna(0)
df['dong_lag_1'] = df.groupby(group_key)['call_count'].shift(1)
df['dong_lag_7'] = df.groupby(group_key)['call_count'].shift(7)
df['dong_rolling_mean_7'] = df.groupby(group_key)['call_count'].shift(1).rolling(window=7, min_periods=1).mean()
df['dong_rolling_std_7'] = df.groupby(group_key)['call_count'].shift(1).rolling(window=7, min_periods=1).std()

lag_cols = ['dong_lag_1', 'dong_lag_7', 'dong_rolling_mean_7', 'dong_rolling_std_7']
df[lag_cols] = df[lag_cols].fillna(0)


# --- 6. 학습/검증 분할 ---
df.sort_values(by='tm', inplace=True)
y = df['call_count'].copy()
X = df.drop(columns=['call_count'])
X_train_full, X_val_full, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# --- 7. 인코딩 및 mean target encoding ---

gu_mean_map = y_train.groupby(X_train_full['address_gu']).mean()
sub_address_mean_map = y_train.groupby(X_train_full['sub_address']).mean()
overall_train_mean = y_train.mean()

X_train_full['address_gu_mean_target'] = X_train_full['address_gu'].map(gu_mean_map)
X_val_full['address_gu_mean_target'] = X_val_full['address_gu'].map(gu_mean_map).fillna(overall_train_mean)
X_train_full['sub_address_mean_target'] = X_train_full['sub_address'].map(sub_address_mean_map)
X_val_full['sub_address_mean_target'] = X_val_full['sub_address'].map(sub_address_mean_map).fillna(overall_train_mean)

X_train_full.fillna(overall_train_mean, inplace=True)
X_val_full.fillna(overall_train_mean, inplace=True)

# --- 8. 학습용 피처 구성 ---
gu_train = X_train_full['address_gu']
gu_val = X_val_full['address_gu']

X_train = X_train_full.drop(columns=['tm', 'datetime', 'address_gu', 'sub_address'])
X_val = X_val_full.drop(columns=['tm', 'datetime', 'address_gu', 'sub_address'])
model_columns = X_train.columns.tolist()

X_train = X_train.astype('float32')
X_val = X_val.astype('float32')
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

# --- 9. Optuna 튜닝 함수 정의 ---
def objective(trial, p_X_train, p_y_train, p_X_val, p_y_val, p_gu_train, p_gu_val):
    params = {
        'n_estimators': 1000,
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'tree_method': 'hist',
        'random_state': 42,
        'early_stopping_rounds': 30
    }
    gu_models = {}
    gu_list = p_gu_train.unique()

    for gu in gu_list:
        val_gu_indices = (p_gu_val == gu)
        X_val_gu = p_X_val[val_gu_indices]
        y_val_gu = p_y_val[val_gu_indices]
        if len(X_val_gu) < 10:
            continue
        model = XGBRegressor(**params)
        model.fit(p_X_train, p_y_train, eval_set=[(X_val_gu, y_val_gu)], verbose=False)
        gu_models[gu] = model

    all_preds, all_targets = [], []
    for gu, model in gu_models.items():
        val_gu_indices = (p_gu_val == gu)
        if val_gu_indices.any():
            y_pred_val = model.predict(p_X_val[val_gu_indices])
            # y_pred_val = np.clip(y_pred_val, 1, 10)
            all_preds.extend(y_pred_val)
            all_targets.extend(p_y_val[val_gu_indices])

    if not all_targets:
        return float('inf')
    return np.sqrt(mean_squared_error(all_targets, all_preds))

# --- 10. Optuna 실행 ---
print("\nOptuna 하이퍼파라미터 튜닝 시작")
objective_with_data = partial(objective,
                              p_X_train=X_train, p_y_train=y_train,
                              p_X_val=X_val, p_y_val=y_val,
                              p_gu_train=gu_train, p_gu_val=gu_val)
study = optuna.create_study(direction='minimize')
study.optimize(objective_with_data, n_trials=30)

print(f"\n최적 RMSE: {study.best_value}")
print(f"최적 파라미터: {study.best_params}")

# --- 11. 최종 모델 재학습 및 저장 ---
best_params = study.best_params
best_params['n_estimators'] = 1000
best_params['early_stopping_rounds'] = 50
best_params['random_state'] = 42
best_params['tree_method'] = 'hist'

final_gu_models = {}
for gu in tqdm(gu_train.unique(), desc="최종 모델 학습 진행"):
    val_gu_indices = (gu_val == gu)
    X_val_gu = X_val[val_gu_indices]
    y_val_gu = y_val[val_gu_indices]
    if len(X_val_gu) < 10:
        continue
    final_model = XGBRegressor(**best_params)
    final_model.fit(X_train, y_train, eval_set=[(X_val_gu, y_val_gu)], verbose=False)
    final_gu_models[gu] = final_model

# --- 12. 모델 및 인코더 저장 ---
joblib.dump(final_gu_models, 'final_gu_models.pkl')
artifacts = {
    'gu_mean_map': gu_mean_map,
    'sub_address_mean_map': sub_address_mean_map,
    'overall_train_mean': overall_train_mean,
    'model_columns': model_columns
}
joblib.dump(artifacts, 'artifacts.pkl')

print("\n모델 및 관련 객체 저장 완료!")

제거된 컬럼 수: 11
제거된 컬럼 목록:
- 가조시간(hr)
- 평균 전운량(1/10)
- 합계 대형증발량(mm)
- 합계 소형증발량(mm)
- 사망
- 공휴일
- 평균 지면온도(°C)
- weekday
- month_cos
- is_before_holiday
- is_after_holiday

Optuna 하이퍼파라미터 튜닝 시작


[I 2025-06-27 00:35:37,812] A new study created in memory with name: no-name-39d15fda-141f-4dbd-b412-e5c3a3a77e29
[I 2025-06-27 00:36:04,500] Trial 0 finished with value: 0.7396824868200007 and parameters: {'max_depth': 5, 'learning_rate': 0.04574793466804872, 'subsample': 0.9075013421376088, 'colsample_bytree': 0.8675940508178319, 'gamma': 6.985794697835589e-06, 'lambda': 0.00010999746895886135, 'alpha': 6.010476690813036e-06}. Best is trial 0 with value: 0.7396824868200007.
[I 2025-06-27 00:37:10,655] Trial 1 finished with value: 0.7422030611063104 and parameters: {'max_depth': 11, 'learning_rate': 0.09617075599377899, 'subsample': 0.776568133452279, 'colsample_bytree': 0.8970898495191988, 'gamma': 0.000610081613089591, 'lambda': 1.8139441171698413e-06, 'alpha': 0.0007366297384976316}. Best is trial 0 with value: 0.7396824868200007.
[I 2025-06-27 00:38:03,085] Trial 2 finished with value: 0.7436935644483161 and parameters: {'max_depth': 11, 'learning_rate': 0.12941641171209983, 'subs


최적 RMSE: 0.727642632454488
최적 파라미터: {'max_depth': 9, 'learning_rate': 0.16817612886481179, 'subsample': 0.9239044455958653, 'colsample_bytree': 0.8921670848424961, 'gamma': 2.733462655193376e-08, 'lambda': 0.8615857613998775, 'alpha': 7.760180286326885e-08}


최종 모델 학습 진행: 100%|██████████| 16/16 [00:52<00:00,  3.30s/it]


모델 및 관련 객체 저장 완료!





In [None]:
# --- 12. 모델 및 인코더 저장 ---
joblib.dump(final_gu_models, 'final_gu_models.pkl')
artifacts = {
    'gu_mean_map': gu_mean_map,
    'sub_address_mean_map': sub_address_mean_map,
    'overall_train_mean': overall_train_mean,
    'model_columns': model_columns,
    'low_corr_features': low_corr_features  # ✅ 제거된 피처 목록 저장
}
joblib.dump(artifacts, 'artifacts.pkl')

['artifacts.pkl']

In [None]:
from sklearn.metrics import r2_score

all_preds_r2, all_targets_r2 = [], []
for gu, model in final_gu_models.items():
    val_gu_indices = (gu_val == gu)
    if val_gu_indices.any():
        y_pred_val = model.predict(X_val[val_gu_indices])
        all_preds_r2.extend(y_pred_val)
        all_targets_r2.extend(y_val[val_gu_indices])

r2 = r2_score(all_targets_r2, all_preds_r2)
print(f"✅ 전체 검증 데이터에 대한 R² Score: {r2:.4f}")

✅ 전체 검증 데이터에 대한 R² Score: 0.6864


In [None]:
# --- [사전 준비] 필요한 라이브러리 임포트 ---
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib
from xgboost import XGBRegressor

# 1. Load model and artifacts
final_gu_models = joblib.load('final_gu_models.pkl')
artifacts = joblib.load('artifacts.pkl')
gu_mean_map = artifacts['gu_mean_map']
sub_address_mean_map = artifacts['sub_address_mean_map']
overall_train_mean = artifacts['overall_train_mean']
model_columns = artifacts['model_columns']
low_corr_features = artifacts['low_corr_features'] # ✅ 제거할 피처 목록 불러오기

# 2. Load and merge train + test data
train_df = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/2020_2023_최종데이터.csv')
test_df = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/2024_최종데이터.csv')
test_df['call_count'] = np.nan
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df.sort_values(by=['address_gu', 'sub_address', 'tm'], inplace=True)
combined_df['datetime'] = pd.to_datetime(combined_df['tm'], format='%Y%m%d')

# 3. Feature engineering
# --- [수정 1] 'year' 피처 생성 코드 추가 ---
combined_df['year'] = combined_df['datetime'].dt.year

combined_df['day'] = combined_df['datetime'].dt.day
combined_df['weekday'] = combined_df['datetime'].dt.weekday
combined_df['day_of_year'] = combined_df['datetime'].dt.dayofyear
combined_df['is_weekend'] = combined_df['weekday'].isin([5, 6]).astype(int)
combined_df['month_sin'] = np.sin(2 * np.pi * combined_df['datetime'].dt.month / 12)
combined_df['month_cos'] = np.cos(2 * np.pi * combined_df['datetime'].dt.month / 12)
combined_df['is_before_holiday'] = combined_df['공휴일'].shift(-1, fill_value=0)
combined_df['is_after_holiday'] = combined_df['공휴일'].shift(1, fill_value=0)

# --- [수정 2] 'days_since_last_call_dong' 피처 생성 코드 추가 ---
group_key = ['address_gu', 'sub_address']
combined_df['days_since_last_call_dong'] = combined_df.groupby(group_key)['datetime'].diff().dt.days.fillna(0)

# --- [사전 준비] 필요한 라이브러리 임포트 ---
import pandas as pd
import numpy as np
from tqdm import tqdm
import joblib
from xgboost import XGBRegressor

# 1. Load model and artifacts
final_gu_models = joblib.load('final_gu_models.pkl')
artifacts = joblib.load('artifacts.pkl')
gu_mean_map = artifacts['gu_mean_map']
sub_address_mean_map = artifacts['sub_address_mean_map']
overall_train_mean = artifacts['overall_train_mean']
model_columns = artifacts['model_columns']

# 2. Load and merge train + test data
train_df = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/2020_2023_최종데이터.csv')
test_df = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/2024_최종데이터.csv')
test_df['call_count'] = np.nan
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_df.sort_values(by=['address_gu', 'sub_address', 'tm'], inplace=True)
combined_df['datetime'] = pd.to_datetime(combined_df['tm'], format='%Y%m%d')

# 3. Feature engineering
# --- [수정 1] 'year' 피처 생성 코드 추가 ---
combined_df['year'] = combined_df['datetime'].dt.year

combined_df['day'] = combined_df['datetime'].dt.day
combined_df['weekday'] = combined_df['datetime'].dt.weekday
combined_df['day_of_year'] = combined_df['datetime'].dt.dayofyear
combined_df['is_weekend'] = combined_df['weekday'].isin([5, 6]).astype(int)
combined_df['month_sin'] = np.sin(2 * np.pi * combined_df['datetime'].dt.month / 12)
combined_df['month_cos'] = np.cos(2 * np.pi * combined_df['datetime'].dt.month / 12)
combined_df['is_before_holiday'] = combined_df['공휴일'].shift(-1, fill_value=0)
combined_df['is_after_holiday'] = combined_df['공휴일'].shift(1, fill_value=0)

# ✅ 불러온 목록을 사용하여 동일한 컬럼 제거
combined_df = combined_df.drop(columns=low_corr_features, errors='ignore')

# --- [수정 2] 'days_since_last_call_dong' 피처 생성 코드 추가 ---
group_key = ['address_gu', 'sub_address']
combined_df['days_since_last_call_dong'] = combined_df.groupby(group_key)['datetime'].diff().dt.days.fillna(0)

combined_df['address_gu_mean_target'] = combined_df['address_gu'].map(gu_mean_map).fillna(overall_train_mean)
combined_df['sub_address_mean_target'] = combined_df['sub_address'].map(sub_address_mean_map).fillna(overall_train_mean)

# 4. Predict iteratively with lag features updated in-place
group_key = ['address_gu', 'sub_address']
pred_df = combined_df[combined_df['tm'] >= 20240101].copy()

# --- [수정 3] 초기 lag 피처를 Null 값으로 생성 (반복문 내에서 채워짐) ---
# 이 피처들은 반복적으로 업데이트되므로 초기에 생성해두는 것이 명확합니다.
lag_cols = ['dong_lag_1', 'dong_lag_7', 'dong_rolling_mean_7', 'dong_rolling_std_7']
for col in lag_cols:
    pred_df[col] = np.nan

pred_df['predicted_call_count'] = 0.0

for idx in tqdm(pred_df.index, desc="시계열 순차 예측"):
    gu = pred_df.loc[idx, 'address_gu']
    sub = pred_df.loc[idx, 'sub_address']
    tm = pred_df.loc[idx, 'tm']

    # --- ✅ history_calls 정의 ---
    history = combined_df[
        (combined_df['address_gu'] == gu) &
        (combined_df['sub_address'] == sub) &
        (combined_df['tm'] < tm)
    ].sort_values(by='tm')
    history_calls = history['call_count'].values

    # lag 피처 계산
    last_1 = history_calls[-1] if len(history_calls) >= 1 else 0
    last_7_window = history_calls[-7:] if len(history_calls) >= 7 else np.pad(history_calls, (7 - len(history_calls), 0))
    rolling_mean = np.mean(last_7_window)
    rolling_std = np.std(last_7_window)
    pred_df.loc[idx, 'dong_lag_1'] = last_1
    pred_df.loc[idx, 'dong_lag_7'] = last_7_window[0]
    pred_df.loc[idx, 'dong_rolling_mean_7'] = rolling_mean
    pred_df.loc[idx, 'dong_rolling_std_7'] = rolling_std

    # 입력 데이터 전처리 및 예측
    x_input_series = pred_df.loc[idx, model_columns]
    x_input_series_filled = pd.to_numeric(x_input_series, errors='coerce').fillna(0)
    x_input = x_input_series_filled.astype('float32').values.reshape(1, -1)

    if gu in final_gu_models:
        pred = final_gu_models[gu].predict(x_input)[0]
        final_pred = round(pred)
        # final_pred = np.clip(round(pred * 0.8), 1, 6)
    else:
        pred = pred_df.loc[idx, 'address_gu_mean_target']
        final_pred = round(pred)
        # final_pred = np.clip(round(pred * 0.8), 1, 6)

    pred_df.loc[idx, 'predicted_call_count'] = final_pred
    combined_df.loc[idx, 'call_count'] = final_pred

# 5. Merge with submission template (이하 동일)
submission_template = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/test_call119.csv', encoding='euc-kr')
result_df = pred_df[['tm', 'address_gu', 'sub_address', 'predicted_call_count']].copy()
result_df.rename(columns={'tm': 'TM'}, inplace=True)

submission = pd.merge(
    submission_template.drop(columns=['call_count'], errors='ignore'),
    result_df,
    on=['TM', 'address_gu', 'sub_address'],
    how='left'
)

submission['call_count'] = submission['predicted_call_count'].astype(int)
submission.drop(columns=['predicted_call_count'], inplace=True)
submission.to_csv('250028.csv', index=False, encoding='euc-kr')
print("\n✅ 제출 파일 저장 완료: 250028.csv")


combined_df['address_gu_mean_target'] = combined_df['address_gu'].map(gu_mean_map).fillna(overall_train_mean)
combined_df['sub_address_mean_target'] = combined_df['sub_address'].map(sub_address_mean_map).fillna(overall_train_mean)

# 4. Predict iteratively with lag features updated in-place
group_key = ['address_gu', 'sub_address']
pred_df = combined_df[combined_df['tm'] >= 20240101].copy()

# --- [수정 3] 초기 lag 피처를 Null 값으로 생성 (반복문 내에서 채워짐) ---
# 이 피처들은 반복적으로 업데이트되므로 초기에 생성해두는 것이 명확합니다.
lag_cols = ['dong_lag_1', 'dong_lag_7', 'dong_rolling_mean_7', 'dong_rolling_std_7']
for col in lag_cols:
    pred_df[col] = np.nan

pred_df['predicted_call_count'] = 0.0

for idx in tqdm(pred_df.index, desc="시계열 순차 예측"):
    gu = pred_df.loc[idx, 'address_gu']
    sub = pred_df.loc[idx, 'sub_address']
    tm = pred_df.loc[idx, 'tm']

    # --- ✅ history_calls 정의 ---
    history = combined_df[
        (combined_df['address_gu'] == gu) &
        (combined_df['sub_address'] == sub) &
        (combined_df['tm'] < tm)
    ].sort_values(by='tm')
    history_calls = history['call_count'].values

    # lag 피처 계산
    last_1 = history_calls[-1] if len(history_calls) >= 1 else 0
    last_7_window = history_calls[-7:] if len(history_calls) >= 7 else np.pad(history_calls, (7 - len(history_calls), 0))
    rolling_mean = np.mean(last_7_window)
    rolling_std = np.std(last_7_window)
    pred_df.loc[idx, 'dong_lag_1'] = last_1
    pred_df.loc[idx, 'dong_lag_7'] = last_7_window[0]
    pred_df.loc[idx, 'dong_rolling_mean_7'] = rolling_mean
    pred_df.loc[idx, 'dong_rolling_std_7'] = rolling_std

    # 입력 데이터 전처리 및 예측
    x_input_series = pred_df.loc[idx, model_columns]
    x_input_series_filled = pd.to_numeric(x_input_series, errors='coerce').fillna(0)
    x_input = x_input_series_filled.astype('float32').values.reshape(1, -1)

    if gu in final_gu_models:
        pred = final_gu_models[gu].predict(x_input)[0]

        scaled_pred = pred * 0.8

        clipped_pred = np.clip(scaled_pred, 1, 6)

        final_pred = round(clipped_pred)

    else:
        pred = pred_df.loc[idx, 'address_gu_mean_target']

        scaled_pred = pred * 0.8

        clipped_pred = np.clip(scaled_pred, 1, 6)

        final_pred = round(clipped_pred)

    pred_df.loc[idx, 'predicted_call_count'] = final_pred
    combined_df.loc[idx, 'call_count'] = final_pred

# 5. Merge with submission template (이하 동일)
submission_template = pd.read_csv('/content/drive/MyDrive/기상청/최종작업/test_call119.csv', encoding='euc-kr')
result_df = pred_df[['tm', 'address_gu', 'sub_address', 'predicted_call_count']].copy()
result_df.rename(columns={'tm': 'TM'}, inplace=True)

submission = pd.merge(
    submission_template.drop(columns=['call_count'], errors='ignore'),
    result_df,
    on=['TM', 'address_gu', 'sub_address'],
    how='left'
)

submission['call_count'] = submission['predicted_call_count'].astype(int)
submission.drop(columns=['predicted_call_count'], inplace=True)
submission.to_csv('250028.csv', index=False, encoding='euc-kr')
print("\n✅ 제출 파일 저장 완료: 250028.csv")

시계열 순차 예측: 100%|██████████| 9601/9601 [02:52<00:00, 55.50it/s]



✅ 제출 파일 저장 완료: 250028.csv


시계열 순차 예측: 100%|██████████| 9601/9601 [02:53<00:00, 55.48it/s]



✅ 제출 파일 저장 완료: 250028.csv


In [None]:
submission['call_count'].value_counts()

Unnamed: 0_level_0,count
call_count,Unnamed: 1_level_1
1,6228
2,2584
3,549
6,182
4,36
5,22
