In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import shutil

source_folder = '/content/drive/MyDrive/LGAimers/data'
destination_folder = '/content/data'

try:
    shutil.copytree(source_folder, destination_folder)
except FileExistsError:
    print(f"폴더 '{destination_folder}'가 이미 존재")

폴더 '/content/data'가 이미 존재


In [None]:
!pip install catboost
!pip install holidays



In [None]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
from tqdm import tqdm

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

import pandas as pd
import holidays
from tqdm import tqdm

Fixed RandomSeed & Setting Hyperparameter

In [None]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

LOAD DATA

In [None]:
train_data = pd.read_csv('/content/data/train.csv', encoding='utf-8-sig')
print(train_data)

              영업일자            영업장명_메뉴명  매출수량
0       2023-01-01  느티나무 셀프BBQ_1인 수저세트     0
1       2023-01-02  느티나무 셀프BBQ_1인 수저세트     0
2       2023-01-03  느티나무 셀프BBQ_1인 수저세트     0
3       2023-01-04  느티나무 셀프BBQ_1인 수저세트     0
4       2023-01-05  느티나무 셀프BBQ_1인 수저세트     0
...            ...                 ...   ...
102671  2024-06-11        화담숲카페_현미뻥스크림    12
102672  2024-06-12        화담숲카페_현미뻥스크림    10
102673  2024-06-13        화담숲카페_현미뻥스크림    14
102674  2024-06-14        화담숲카페_현미뻥스크림    12
102675  2024-06-15        화담숲카페_현미뻥스크림    60

[102676 rows x 3 columns]


Data analysis

In [None]:
##sort by menu and chronological time
train_data = train_data.sort_values(['영업장명_메뉴명', '영업일자'])
train_data['영업일자'] = pd.to_datetime(train_data['영업일자'])
train_data['요일'] = train_data['영업일자'].dt.day_name()
train_data[['영업장명', '메뉴명']] = train_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

print(train_data)

             영업일자            영업장명_메뉴명  매출수량         요일        영업장명      메뉴명
0      2023-01-01  느티나무 셀프BBQ_1인 수저세트     0     Sunday  느티나무 셀프BBQ  1인 수저세트
1      2023-01-02  느티나무 셀프BBQ_1인 수저세트     0     Monday  느티나무 셀프BBQ  1인 수저세트
2      2023-01-03  느티나무 셀프BBQ_1인 수저세트     0    Tuesday  느티나무 셀프BBQ  1인 수저세트
3      2023-01-04  느티나무 셀프BBQ_1인 수저세트     0  Wednesday  느티나무 셀프BBQ  1인 수저세트
4      2023-01-05  느티나무 셀프BBQ_1인 수저세트     0   Thursday  느티나무 셀프BBQ  1인 수저세트
...           ...                 ...   ...        ...         ...      ...
102671 2024-06-11        화담숲카페_현미뻥스크림    12    Tuesday       화담숲카페   현미뻥스크림
102672 2024-06-12        화담숲카페_현미뻥스크림    10  Wednesday       화담숲카페   현미뻥스크림
102673 2024-06-13        화담숲카페_현미뻥스크림    14   Thursday       화담숲카페   현미뻥스크림
102674 2024-06-14        화담숲카페_현미뻥스크림    12     Friday       화담숲카페   현미뻥스크림
102675 2024-06-15        화담숲카페_현미뻥스크림    60   Saturday       화담숲카페   현미뻥스크림

[102676 rows x 6 columns]


In [None]:
def make_expanded_train(df, input_window_size=28, predict_window_size=7):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])
    df['분기'] = df['영업일자'].dt.quarter

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        for i in range(len(group) - input_window_size - predict_window_size + 1):
            input_window = sales[i : i + input_window_size]
            predict_window = sales[i + input_window_size : i + input_window_size + predict_window_size]
            start_date = dates[i]

            for j in range(predict_window_size):
                predict_date = dates[i + input_window_size + j]

                result.append({
                    '영업장명_메뉴명': name,
                    'store_name': store_name,
                    'menu_name': menu_name,
                    'start_date': start_date,
                    **{f'X_{k}': input_window[k] for k in range(input_window_size)},
                    'predict_date': predict_date,
                    'predict_date_offset': j + 1,
                    'target': predict_window[j]
                })

    df_result = pd.DataFrame(result)

    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('start_date') + 6, 'start_date_quarter', df_result['start_date'].dt.quarter)


    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('predict_date') + 6, 'predict_date_quarter', df_result['predict_date'].dt.quarter)


    return df_result

In [None]:
df_train = make_expanded_train(train_data.copy(), input_window_size=28, predict_window_size=7)

In [None]:
print(df_train.shape)
df_train.head()

(672798, 47)


Unnamed: 0,영업장명_메뉴명,store_name,menu_name,start_date,start_date_year,start_date_month,start_date_day,start_date_weekday,start_date_holiday,start_date_quarter,...,X_27,predict_date,predict_date_year,predict_date_month,predict_date_day,predict_date_weekday,predict_date_holiday,predict_date_quarter,predict_date_offset,target
0,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,1,...,0,2023-01-29,2023,1,29,Sunday,0,1,1,8
1,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,1,...,0,2023-01-30,2023,1,30,Monday,0,1,2,0
2,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,1,...,0,2023-01-31,2023,1,31,Tuesday,0,1,3,4
3,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,1,...,0,2023-02-01,2023,2,1,Wednesday,0,1,4,6
4,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,1,...,0,2023-02-02,2023,2,2,Thursday,0,1,5,2


Model Train

In [71]:
num_features = [col for col in df_train.columns if col.startswith("X_")] + ['predict_date_offset']
del_features = ["영업일자_메뉴명", "start_date", "start_date_year", "predict_date", "predict_date_year"]
cat_features = [col for col in df_train.columns if col not in num_features and col not in del_features and col != 'target']


In [72]:
features = num_features + cat_features

x_train, x_val, y_train, y_val = train_test_split(
    df_train[features],
    df_train["target"],
    test_size=0.2,
    shuffle=True,
)

In [70]:
model = CatBoostRegressor()

model.fit(
    x_train, y_train,
    eval_set=(x_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)

Learning rate set to 0.136665
0:	learn: 37.1904239	test: 36.9054716	best: 36.9054716 (0)	total: 1.19s	remaining: 19m 44s


KeyboardInterrupt: 

In [None]:
y_pred = model.predict(x_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 13.9474


Prediction

In [None]:
def make_expanded_test(df, test_prefix):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        start_date = dates[0]

        for j in range(7):
            predict_date = start_date + pd.Timedelta(days=28 + j)

            result.append({
                '영업장명_메뉴명': name,
                '영업일자':f"{test_prefix}+{j+1}일",
                'store_name': store_name,
                'menu_name': menu_name,
                'start_date': start_date,
                **{f'X_{k}': sales[k] for k in range(28)},
                'predict_date': predict_date,
                'predict_date_offset': j + 1
            })

    df_result = pd.DataFrame(result)


    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('start_date') + 6, 'start_date_quarter', df_result['start_date'].dt.quarter)


    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('predict_date') + 6, 'predict_date_quarter', df_result['predict_date'].dt.quarter)

    return df_result

In [None]:
df_test_list = []

test_files = sorted(glob.glob('/content/data/TEST_*.csv'))

for path in test_files:
    test_data = pd.read_csv(path, encoding='utf-8-sig')

    test_data = test_data.sort_values(['영업장명_메뉴명', '영업일자'])
    test_data['영업일자'] = pd.to_datetime(test_data['영업일자'])

    test_data['요일'] = test_data['영업일자'].dt.day_name()

    test_data[['영업장명', '메뉴명']] = test_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    df_temp = make_expanded_test(test_data.copy(), test_prefix)

    df_test_list.append(df_temp)

df_test = pd.concat(df_test_list, ignore_index=True)

In [None]:
x_test = df_test[features]
y_test = model.predict(x_test)

In [None]:
df_test['매출수량'] = y_test

In [None]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    # (영업일자, 메뉴) → 매출수량 딕셔너리로 변환
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량']
    ))

    final_df = sample_submission.copy()

    for col in final_df.columns[1:]:
      final_df[col] = final_df[col].astype(float)

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:
            final_df.loc[row_idx, col] = max(0, pred_dict.get((date, col), 0))

    return final_df

In [None]:
sample_submission = pd.read_csv('/content/data/sample_submission.csv')
submission = convert_to_submission_format(df_test, sample_submission)
submission.to_csv('/content/data/submission.csv', index=False, encoding='utf-8-sig')