In [1]:
# pip install xgboost

In [2]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb

In [3]:
try:
    CUDA = torch.cuda.is_available()
except:
    CUDA = False

print(f'CUDA available: {CUDA}')

CUDA available: False


In [4]:
# -----------------------------
# 1) 경로 설정
# -----------------------------
TRAIN_DIR = Path('../open/train')
TEST_DIR  = Path('../open/test')
SAMPLE_FP = Path('../open/sample_submission.csv')

if not TRAIN_DIR.exists() or not TEST_DIR.exists() or not SAMPLE_FP.exists():
    print('Path error: directory or file does not exist. Please check the paths.')
    print(f'Expected file path: {TRAIN_DIR}')
    exit()
else:
    print('File paths are valid.')

File paths are valid.


In [5]:
# -----------------------------
# 2) 데이터 로드
# -----------------------------
print('Data loading...')

train = pd.read_csv('../open/train/train.csv')
train['영업일자'] = pd.to_datetime(train['영업일자'], format='%Y-%m-%d')

sample = pd.read_csv(SAMPLE_FP)

tests = {}
for i in range(10):
    test_name = f'TEST_{i:02d}'
    df = pd.read_csv(TEST_DIR / f'{test_name}.csv')
    df['영업일자'] = pd.to_datetime(df['영업일자'], format='%Y-%m-%d')
    tests[test_name] = df

Data loading...


In [6]:
# -----------------------------
# 3) 피처 엔지니어링
# -----------------------------
print('Feature engineering...')

encoder = LabelEncoder()
train['item_id'] = encoder.fit_transform(train['영업장명_메뉴명'])

def make_date_feats(dataframe: pd.DataFrame) -> pd.DataFrame:
    df_processed = dataframe.copy()
    
    date_column = df_processed['영업일자']
    df_processed['year']    = date_column.dt.year
    df_processed['month']   = date_column.dt.month
    df_processed['day']     = date_column.dt.day
    df_processed['weekday'] = date_column.dt.weekday
    
    df_processed['is_weekend'] = df_processed['weekday'].apply(lambda x: 1 if x in [5, 6] else 0)
    
    df_processed['month_sin'] = np.sin(2 * np.pi * df_processed['month'] / 12)
    df_processed['month_cos'] = np.cos(2 * np.pi * df_processed['month'] / 12)
    df_processed['wday_sin']  = np.sin(2 * np.pi * df_processed['weekday'] / 7)
    df_processed['wday_cos']  = np.cos(2 * np.pi * df_processed['weekday'] / 7)
    
    return df_processed

train = make_date_feats(train)
train.sort_values(['item_id', '영업일자'], inplace=True)

lags = [1, 7, 14]
for lag in lags:
    train[f'lag_{lag}'] = train.groupby('item_id', observed=True)['매출수량'].shift(lag)

groups = train.groupby('item_id', observed=True)['매출수량']
train['rolling_mean_7']     = groups.shift(1).rolling(7).mean().reset_index(0, drop=True)
train['rolling_mean_14']    = groups.shift(1).rolling(14).mean().reset_index(0, drop=True)
train['rolling_std_7']      = groups.shift(1).rolling(7).std().reset_index(0, drop=True)

cols_with_na = [
    'lag_1', 'lag_7', 'lag_14', 'rolling_mean_7', 'rolling_mean_14', 'rolling_std_7'
]
train.dropna(subset=cols_with_na, inplace=True)

feature_cols = [
    'year',
    'month',
    'day',
    'weekday',
    'is_weekend',
    'month_sin',
    'month_cos',
    'wday_sin',
    'wday_cos',
    'item_id',
    'lag_1',
    'lag_7',
    'lag_14',
    'rolling_mean_7',
    'rolling_mean_14',
    'rolling_std_7',
]
X = train[feature_cols]
y = train['매출수량'].astype(float)

XGBparams = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda' if CUDA else 'cpu',
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}

print('Feature engineering complete')

Feature engineering...
Feature engineering complete


In [7]:
# -----------------------------
# 4) TimeSeriesSplit CV로 best_iteration 찾기
# -----------------------------
print('Finding best iteration using TimeSeriesSplit CV...')

time_series_validator = TimeSeriesSplit(n_splits=5)

optimal_iterations_list = []

for fold, (tr_idx, va_idx) in enumerate(time_series_validator.split(X), 1):
    X_tf, X_val = X.iloc[tr_idx], X.iloc[va_idx]
    y_tf, y_val = y.iloc[tr_idx], y.iloc[va_idx]

    dtr = xgb.DMatrix(X_tf, label=y_tf)
    dva = xgb.DMatrix(X_val, label=y_val)

    trained_model = xgb.train(
        XGBparams,
        dtr,
        num_boost_round=5000,
        evals=[(dva, 'val')],
        early_stopping_rounds=100,
        verbose_eval=200,
    )
    optimal_iterations_list.append(trained_model.best_iteration)

if len(optimal_iterations_list) > 0:
    final_boost_round = int(np.median(optimal_iterations_list))
else:
    final_boost_round = 1000

data_dmatrix = xgb.DMatrix(X, label=y)
final_model = xgb.train(
    XGBparams, 
    data_dmatrix, 
    num_boost_round=final_boost_round, 
    verbose_eval=False
)

print(f'Best iteration: {final_boost_round}')

Finding best iteration using TimeSeriesSplit CV...
[0]	val-rmse:10.62126
[145]	val-rmse:7.58975
[0]	val-rmse:9.47710
[179]	val-rmse:7.05368
[0]	val-rmse:19.88510
[155]	val-rmse:16.71569
[0]	val-rmse:37.42719
[180]	val-rmse:29.87412
[0]	val-rmse:87.11863
[200]	val-rmse:59.36537
[212]	val-rmse:59.34093
Best iteration: 79


In [8]:
# -----------------------------
# 5) 재귀 예측
# -----------------------------
print('Recursive prediction...')
prediction_results = []

for test_name, current_test_data in tests.items():
    current_test_data = current_test_data.copy()
    current_test_data['item_id'] = encoder.transform(current_test_data['영업장명_메뉴명'])
    current_test_data = make_date_feats(current_test_data)
    current_test_data = current_test_data.sort_values(['item_id', '영업일자'])

    last_date = current_test_data['영업일자'].max()
    items = current_test_data['영업장명_메뉴명'].unique()

    history = current_test_data[['영업일자', 'item_id', '영업장명_메뉴명', '매출수량']].copy()

    preds_rows = []
    current_date = last_date
    for step in range(1, 8):
        target_date = current_date + pd.Timedelta(days=1)

        # 예측을 위한 데이터 프레임 생성
        frame = pd.DataFrame(
            {'영업일자': np.repeat(target_date, len(items)), '영업장명_메뉴명': items}
        )
        frame['item_id'] = encoder.transform(frame['영업장명_메뉴명'])
        frame = make_date_feats(frame)

        # 특징 계산
        temp = history.copy()
        for lag in [1, 7, 14]:
            lagged = temp[['영업일자', 'item_id', '매출수량']].copy()
            lagged['영업일자'] = lagged['영업일자'] + pd.Timedelta(days=lag)
            frame = frame.merge(
                lagged.rename(columns={'매출수량': f'lag_{lag}'}),
                on=['영업일자', 'item_id'],
                how='left',
            )

        roll_base = history.sort_values(['item_id', '영업일자']).copy()
        gb = roll_base.groupby('item_id', observed=True)['매출수량']
        roll_base['rolling_mean_7']    = gb.rolling(7).mean().reset_index(0, drop=True)
        roll_base['rolling_mean_14']   = gb.rolling(14).mean().reset_index(0, drop=True)
        roll_base['rolling_std_7']     = gb.rolling(7).std().reset_index(0, drop=True)
        roll_base['영업일자']           = roll_base['영업일자'] + pd.Timedelta(days=1)

        frame = frame.merge(
            roll_base[
                ['영업일자', 'item_id', 'rolling_mean_7', 'rolling_mean_14', 'rolling_std_7']
            ],
            on=['영업일자', 'item_id'],
            how='left',
        )

        fill_cols = ['lag_1', 'lag_7', 'lag_14', 'rolling_mean_7', 'rolling_mean_14', 'rolling_std_7']
        frame[fill_cols] = frame[fill_cols].fillna(0)

        X_pred           = frame[feature_cols]
        dpred            = xgb.DMatrix(X_pred)
        predicted_values = final_model.predict(dpred)
        predicted_values = np.maximum(predicted_values, 0)
        frame['pred']    = predicted_values

        # 이전 예측 결과를 history에 추가해 재귀
        add_hist = frame[['영업일자', 'item_id', '영업장명_메뉴명', 'pred']].rename(
            columns={'pred': '매출수량'}
        )
        history = pd.concat([history, add_hist], ignore_index=True)

        # 결과 저장
        frame_out = frame[['영업일자', '영업장명_메뉴명', 'pred']].copy()
        frame_out['영업일자'] = f'{test_name}+{step}일'
        preds_rows.append(frame_out)

        current_date = target_date

    test_pred = pd.concat(preds_rows, ignore_index=True)
    wide = test_pred.pivot(index='영업일자', columns='영업장명_메뉴명', values='pred')
    prediction_results.append(wide)

print('Prediction complete')

Recursive prediction...
Prediction complete


In [9]:
# -----------------------------
# 7) 최종 제출 파일 생성
# -----------------------------
submission = pd.concat(prediction_results)
submission = submission.reset_index().rename(columns={'index': '영업일자'})
submission = submission[sample.columns]
out_path = '../open/out/submission_xgboost_1.csv'
submission.to_csv(out_path, index=False, encoding='utf-8-sig')
print(f'Submission file created: {out_path}')

Submission file created: ../open/out/submission_xgboost_1.csv
