In [None]:
!pip install -U scikit-learn==1.3.2 xgboost==1.7.6

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting xgboost==1.7.6
  Downloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting numpy<2.0,>=1.17.3 (from scikit-learn==1.3.2)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-1.7.6-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import matplotlib.pyplot as plt

In [None]:
# Load the 2020-2023 table (used below for training) from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/2020_2023_최종데이터.csv')

In [None]:
# Discard the unused 'tm_dt' column when the file carries it; a frame without
# that column passes through unchanged.
df = df.drop(columns=['tm_dt'], errors='ignore')

In [None]:
# Regression target: log(1 + call_count).
df['log_call_count'] = df['call_count'].pipe(np.log1p)

In [None]:
# Calendar features decoded from the numeric date column 'tm' (YYYYMMDD).
df['year'] = df['tm'].floordiv(10000)
df['month'] = df['tm'].mod(10000).floordiv(100)
df['day'] = df['tm'].mod(100)
df['weekday'] = pd.to_datetime(df['tm'], format='%Y%m%d').dt.dayofweek
df['is_weekend'] = df['weekday'].ge(5).astype(int)  # Saturday=5, Sunday=6
# Neighbouring holiday flags; positions with no neighbour are filled with 0.
# NOTE(review): shift() moves the '공휴일' (public holiday) flag by one ROW, not by
# one calendar day. If the table holds several rows per date, these columns mostly
# repeat the same-day flag instead of marking the day before/after a holiday —
# TODO confirm the row layout and, if needed, shift per date in BOTH the training
# and the inference frame together.
df['is_before_holiday'] = df['공휴일'].shift(periods=-1, fill_value=0)
df['is_after_holiday'] = df['공휴일'].shift(periods=1, fill_value=0)

In [None]:
# Additional derived features.
df['is_rainy_day'] = df['rn_day'].gt(0).astype(int)    # 1 when any daily rainfall was recorded
df['humidity_range'] = df['hm_max'].sub(df['hm_min'])  # max minus min humidity

# A district ('address_gu') counts as high demand when its mean call_count is at or
# above the 75th percentile of all district means.
# NOTE(review): the means are taken over every row of df, including rows that are
# later split off as the validation set.
gu_mean = df['call_count'].groupby(df['address_gu']).mean()
threshold = gu_mean.quantile(0.75)
high_demand_gu = gu_mean.loc[gu_mean.ge(threshold)].index
df['high_demand_area'] = df['address_gu'].isin(high_demand_gu).astype(int)

In [None]:
# Integer-encode the city names. The fitted encoder stays in `encoder` so the same
# mapping can be applied to other frames later in the notebook.
encoder = LabelEncoder().fit(df['address_city'])
df['address_city'] = encoder.transform(df['address_city'])

In [None]:
# Mean target encoding: every row receives the mean call_count of its district
# ('address_gu') and of its sub-address.
# NOTE(review): the means are computed on the whole frame before the train/validation
# split, so each validation row contributes to its own encoded value.
df['address_gu_mean_target'] = df['call_count'].groupby(df['address_gu']).transform('mean')
df['sub_address_mean_target'] = df['call_count'].groupby(df['sub_address']).transform('mean')

In [None]:
# Target vector (log-scaled counts) and feature matrix. The raw date, both target
# columns and the two string location columns are excluded from the features.
drop_cols = ['tm', 'call_count', 'log_call_count', 'address_gu', 'sub_address']
y = df.loc[:, 'log_call_count']
X = df.drop(labels=drop_cols, axis=1)

In [None]:
# Downcast the features and the target to 32-bit floats.
X, y = X.astype(np.float32), y.astype(np.float32)

In [None]:
# Random hold-out split: 20% of the rows go to validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Histogram-based gradient-boosted regressor trained on the log-scaled target.
model = XGBRegressor(
    tree_method='hist',
    # capacity / learning schedule
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    # row and column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # early stopping is evaluated on the eval_set handed to fit() below
    early_stopping_rounds=30,
    random_state=42,
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Predict on the hold-out rows and score in the original count scale.
# expm1 undoes the log1p transform applied to the target.
# NOTE(review): this is the same split that drives early stopping during fit().
preds = model.predict(X_val)
preds_real, y_val_real = (np.expm1(values) for values in (preds, y_val))

rmse = np.sqrt(mean_squared_error(y_val_real, preds_real))
print(f"✅ RMSE (추가 파생 변수 포함): {rmse:.2f}")

✅ RMSE (추가 파생 변수 포함): 1.41


In [None]:
# Load the 2024 table that the trained model will score.
valid_df = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/2024_최종데이터.csv')

In [None]:
# 2. Discard 'tm_dt' when present; a frame without that column passes through unchanged.
valid_df = valid_df.drop(columns=['tm_dt'], errors='ignore')

In [None]:
# 3. Calendar features decoded from the numeric date column 'tm' (YYYYMMDD).
valid_df['year'] = valid_df['tm'].floordiv(10000)
valid_df['month'] = valid_df['tm'].mod(10000).floordiv(100)
valid_df['day'] = valid_df['tm'].mod(100)
valid_df['weekday'] = pd.to_datetime(valid_df['tm'], format='%Y%m%d').dt.dayofweek
valid_df['is_weekend'] = valid_df['weekday'].ge(5).astype(int)  # Saturday=5, Sunday=6
# Neighbouring holiday flags; positions with no neighbour are filled with 0.
# NOTE(review): shift() moves the '공휴일' (public holiday) flag by one ROW, not by
# one calendar day. If the table holds several rows per date, these columns mostly
# repeat the same-day flag — TODO confirm, and change it only together with the
# identical computation on the training frame.
valid_df['is_before_holiday'] = valid_df['공휴일'].shift(periods=-1, fill_value=0)
valid_df['is_after_holiday'] = valid_df['공휴일'].shift(periods=1, fill_value=0)

In [None]:
# 4. Rebuild the weather-derived features on the 2024 frame.
valid_df['is_rainy_day'] = valid_df['rn_day'].gt(0).astype(int)          # 1 when any daily rainfall was recorded
valid_df['humidity_range'] = valid_df['hm_max'].sub(valid_df['hm_min'])  # max minus min humidity

In [None]:
# Label high-demand districts on the 2024 frame using statistics from the training
# frame `df`: a district is high demand when its mean call_count is at or above the
# 75th percentile of all district means.
gu_mean = df.groupby('address_gu')['call_count'].mean()
threshold = gu_mean.quantile(0.75)
high_demand_gu = gu_mean[gu_mean >= threshold].index
# Vectorised membership test. A district that never occurs in `df` cannot be in
# `high_demand_gu`, so it gets 0 without a separate per-row lookup of
# df['address_gu'].unique().
valid_df['high_demand_area'] = valid_df['address_gu'].isin(high_demand_gu).astype(int)

In [None]:
# 5. Encode address_city with the encoder fitted on the training frame.
# The encoder gets an extra 'Unknown' class (appended once), and every city it has
# not seen is mapped to that class before transform().
# NOTE(review): running this cell a second time would find already-encoded integers
# in the column and relabel every row as 'Unknown'.
if not ('Unknown' in encoder.classes_):
    encoder.classes_ = np.append(encoder.classes_, 'Unknown')
valid_df['address_city'] = valid_df['address_city'].map(
    lambda city: city if city in encoder.classes_ else 'Unknown'
)
valid_df['address_city'] = encoder.transform(valid_df['address_city'])

In [None]:
# 6. Mean target encoding for the 2024 frame, using means from the training frame `df`.
# Keys absent from `df` receive the average of the per-key means; if the key column
# itself is missing, the overall mean call_count of `df` is used for every row.
if 'address_gu' in valid_df.columns:
    address_gu_mean_map = df['call_count'].groupby(df['address_gu']).mean()
    valid_df['address_gu_mean_target'] = (
        valid_df['address_gu']
        .map(address_gu_mean_map)
        .fillna(address_gu_mean_map.mean())
    )
else:
    valid_df['address_gu_mean_target'] = df['call_count'].mean()

if 'sub_address' in valid_df.columns:
    sub_address_mean_map = df['call_count'].groupby(df['sub_address']).mean()
    valid_df['sub_address_mean_target'] = (
        valid_df['sub_address']
        .map(sub_address_mean_map)
        .fillna(sub_address_mean_map.mean())
    )
else:
    valid_df['sub_address_mean_target'] = df['call_count'].mean()

In [None]:
# 7. Drop the non-feature columns (any that are absent are ignored) and cast the
# remainder to 32-bit floats.
X_valid = (
    valid_df
    .drop(columns=['tm', 'call_count', 'address_gu', 'sub_address'], errors='ignore')
    .astype('float32')
)

In [None]:
# 8. Align X_valid with the training feature matrix X: any training column missing
# here is reported and filled with 0.0, then the columns are put in X's order.
for col in X.columns:
    if col in X_valid.columns:
        continue
    print(f"🟡 '{col}' 누락 → 0으로 채움")
    X_valid[col] = 0.0
X_valid = X_valid.loc[:, X.columns]


In [None]:
# 9. Predict on the 2024 features.
# The model outputs log1p-scaled counts; expm1 maps them back to the count scale.
preds_log = model.predict(X_valid)
# expm1 of a negative log-prediction lies in (-1, 0) and could round to -1.
# A call count cannot be negative, so clip at 0 before rounding.
preds = np.clip(np.expm1(preds_log), 0, None)
preds_rounded = preds.round().astype(int)

In [None]:
# Load the EUC-KR encoded test file that will receive the predictions.
result_df = pd.read_csv('/content/drive/MyDrive/Epoch 공모전/test_call119.csv', encoding='euc-kr')

In [None]:
# Write the rounded predictions into 'call_count', matched by row position.
# NOTE(review): assumes result_df has the same rows in the same order as valid_df — TODO confirm.
result_df['call_count'] = preds_rounded

In [None]:
# Preview the first rows of the filled-in table.
result_df.head()

Unnamed: 0,TM,address_city,address_gu,sub_address,STN,ta_max,ta_min,ta_max_min,hm_min,hm_max,ws_max,ws_ins_max,rn_day,call_count
0,20240501,부산광역시,강서구,대저2동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
1,20240501,부산광역시,강서구,생곡동,904,18.5,11.1,7.4,42.5,82.5,6.5,11.6,0.0,1
2,20240501,부산광역시,강서구,송정동,937,16.9,9.9,7.0,55.3,93.9,4.5,9.7,0.0,2
3,20240501,부산광역시,강서구,신호동,950,16.6,11.4,5.2,48.1,84.6,6.4,13.5,0.0,1
4,20240501,부산광역시,금정구,구서동,940,16.9,10.2,6.7,46.8,91.3,3.3,8.7,0.0,1


In [None]:
# Save the table to Drive without the index column, in the same EUC-KR encoding it was read with.
result_df.to_csv('/content/drive/MyDrive/Epoch 공모전/250028.csv', index=False, encoding='euc-kr')