In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import shutil

source_folder = '/content/drive/MyDrive/LGAimers/data'
destination_folder = '/content/data'

try:
    shutil.copytree(source_folder, destination_folder)
except FileExistsError:
    print(f"폴더 '{destination_folder}'가 이미 존재")

In [4]:
!pip install catboost
!pip install holidays

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
from tqdm import tqdm

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

import pandas as pd
import holidays
from tqdm import tqdm

Fixed RandomSeed & Setting Hyperparameter

In [6]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

In [7]:
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

LOAD DATA

In [8]:
train_data = pd.read_csv('/content/data/train.csv', encoding='utf-8-sig')
print(train_data)

              영업일자            영업장명_메뉴명  매출수량
0       2023-01-01  느티나무 셀프BBQ_1인 수저세트     0
1       2023-01-02  느티나무 셀프BBQ_1인 수저세트     0
2       2023-01-03  느티나무 셀프BBQ_1인 수저세트     0
3       2023-01-04  느티나무 셀프BBQ_1인 수저세트     0
4       2023-01-05  느티나무 셀프BBQ_1인 수저세트     0
...            ...                 ...   ...
102671  2024-06-11        화담숲카페_현미뻥스크림    12
102672  2024-06-12        화담숲카페_현미뻥스크림    10
102673  2024-06-13        화담숲카페_현미뻥스크림    14
102674  2024-06-14        화담숲카페_현미뻥스크림    12
102675  2024-06-15        화담숲카페_현미뻥스크림    60

[102676 rows x 3 columns]


Data analysis

In [9]:
##sort by menu and chronological time
train_data = train_data.sort_values(['영업장명_메뉴명', '영업일자'])
train_data['영업일자'] = pd.to_datetime(train_data['영업일자'])
train_data['요일'] = train_data['영업일자'].dt.day_name()
train_data[['영업장명', '메뉴명']] = train_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

print(train_data)

             영업일자            영업장명_메뉴명  매출수량         요일        영업장명      메뉴명
0      2023-01-01  느티나무 셀프BBQ_1인 수저세트     0     Sunday  느티나무 셀프BBQ  1인 수저세트
1      2023-01-02  느티나무 셀프BBQ_1인 수저세트     0     Monday  느티나무 셀프BBQ  1인 수저세트
2      2023-01-03  느티나무 셀프BBQ_1인 수저세트     0    Tuesday  느티나무 셀프BBQ  1인 수저세트
3      2023-01-04  느티나무 셀프BBQ_1인 수저세트     0  Wednesday  느티나무 셀프BBQ  1인 수저세트
4      2023-01-05  느티나무 셀프BBQ_1인 수저세트     0   Thursday  느티나무 셀프BBQ  1인 수저세트
...           ...                 ...   ...        ...         ...      ...
102671 2024-06-11        화담숲카페_현미뻥스크림    12    Tuesday       화담숲카페   현미뻥스크림
102672 2024-06-12        화담숲카페_현미뻥스크림    10  Wednesday       화담숲카페   현미뻥스크림
102673 2024-06-13        화담숲카페_현미뻥스크림    14   Thursday       화담숲카페   현미뻥스크림
102674 2024-06-14        화담숲카페_현미뻥스크림    12     Friday       화담숲카페   현미뻥스크림
102675 2024-06-15        화담숲카페_현미뻥스크림    60   Saturday       화담숲카페   현미뻥스크림

[102676 rows x 6 columns]


In [16]:
def make_expanded_train(df, input_window_size=28, predict_window_size=7):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        for i in range(len(group) - input_window_size - predict_window_size + 1):
            input_window = sales[i : i + input_window_size]
            predict_window = sales[i + input_window_size : i + input_window_size + predict_window_size]
            start_date = dates[i]

            avg_sale_input = sum(input_window) / len(input_window)
            avg_sale_input = int(avg_sale_input.round())
            input_dates = dates[i : i + input_window_size]
            input_dates_series = pd.Series(input_dates)
            avg_quarter_input = input_dates_series.dt.quarter.mean()
            avg_quarter_input = int(avg_quarter_input.round())


            for j in range(predict_window_size):
                predict_date = dates[i + input_window_size + j]

                result.append({
                    '영업장명_메뉴명': name,
                    'store_name': store_name,
                    'menu_name': menu_name,
                    'start_date': start_date,
                    **{f'X_{k}': input_window[k] for k in range(input_window_size)},
                    'avg_sale_input': avg_sale_input,
                    'avg_quarter_input': avg_quarter_input,
                    'predict_date': predict_date,
                    'predict_date_offset': j + 1,
                    'target': predict_window[j]
                })

    df_result = pd.DataFrame(result)

    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    # df_result.insert(df_result.columns.get_loc('start_date') + 6, 'start_date_quarter', df_result['start_date'].dt.quarter)


    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('predict_date') + 6, 'predict_date_quarter', df_result['predict_date'].dt.quarter)


    return df_result

In [17]:
df_train = make_expanded_train(train_data.copy(), input_window_size=28, predict_window_size=7)

In [18]:
print(df_train.shape)
df_train.head()

(672798, 48)


Unnamed: 0,영업장명_메뉴명,store_name,menu_name,start_date,start_date_year,start_date_month,start_date_day,start_date_weekday,start_date_holiday,X_0,...,avg_quarter_input,predict_date,predict_date_year,predict_date_month,predict_date_day,predict_date_weekday,predict_date_holiday,predict_date_quarter,predict_date_offset,target
0,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,...,1,2023-01-29,2023,1,29,Sunday,0,1,1,8
1,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,...,1,2023-01-30,2023,1,30,Monday,0,1,2,0
2,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,...,1,2023-01-31,2023,1,31,Tuesday,0,1,3,4
3,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,...,1,2023-02-01,2023,2,1,Wednesday,0,1,4,6
4,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,...,1,2023-02-02,2023,2,2,Thursday,0,1,5,2


Model Train

In [20]:
num_features = [col for col in df_train.columns if col.startswith("X_")] + ['predict_date_offset']
del_features = ["영업일자_메뉴명", "start_date", "start_date_year", "predict_date", "predict_date_year"]
cat_features = [col for col in df_train.columns if col not in num_features and col not in del_features and col != 'target']


In [21]:
features = num_features + cat_features

x_train, x_val, y_train, y_val = train_test_split(
    df_train[features],
    df_train["target"],
    test_size=0.2,
    shuffle=True,
)

In [22]:
model = CatBoostRegressor()

model.fit(
    x_train, y_train,
    eval_set=(x_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)

Learning rate set to 0.136665
0:	learn: 37.2038592	test: 36.9048910	best: 36.9048910 (0)	total: 1.27s	remaining: 21m 13s
1:	learn: 34.7472000	test: 34.4739809	best: 34.4739809 (1)	total: 2.29s	remaining: 19m 4s
2:	learn: 32.6251697	test: 32.3866201	best: 32.3866201 (2)	total: 4.06s	remaining: 22m 27s
3:	learn: 30.9090271	test: 30.6800401	best: 30.6800401 (3)	total: 5.66s	remaining: 23m 30s
4:	learn: 29.4601502	test: 29.2683569	best: 29.2683569 (4)	total: 6.98s	remaining: 23m 9s
5:	learn: 28.2440929	test: 28.0406233	best: 28.0406233 (5)	total: 8.03s	remaining: 22m 10s
6:	learn: 27.2411102	test: 27.0571777	best: 27.0571777 (6)	total: 9.06s	remaining: 21m 24s
7:	learn: 26.4264920	test: 26.2303917	best: 26.2303917 (7)	total: 9.97s	remaining: 20m 36s
8:	learn: 25.7346088	test: 25.5428141	best: 25.5428141 (8)	total: 11s	remaining: 20m 14s
9:	learn: 25.1759041	test: 24.9914073	best: 24.9914073 (9)	total: 12.1s	remaining: 19m 56s
10:	learn: 24.7030464	test: 24.5049049	best: 24.5049049 (10)	tot

<catboost.core.CatBoostRegressor at 0x7a4321b37310>

In [23]:
y_pred = model.predict(x_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 13.4783


Feature Importance

In [24]:
from catboost import Pool

val_pool = Pool(
    data=x_val,
    label=y_val,
    cat_features=cat_features
)

importances = model.get_feature_importance(data=val_pool, type='LossFunctionChange')

fi_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values('importance', ascending=False)

print("===== Feature Importance =====")
print(fi_df)

fi_df.to_csv("feature_importance.csv", index=False, encoding="utf-8-sig")
print("Feature importance saved to feature_importance.csv")

===== Feature Importance =====
                 feature  importance
27                  X_27    2.468426
40  predict_date_weekday    1.159119
29              영업장명_메뉴명    1.067569
23                  X_23    0.816386
38    predict_date_month    0.725524
30            store_name    0.618896
25                  X_25    0.547745
26                  X_26    0.530632
39      predict_date_day    0.527014
22                  X_22    0.487571
24                  X_24    0.359068
32      start_date_month    0.273643
31             menu_name    0.217382
36        avg_sale_input    0.173528
41  predict_date_holiday    0.160619
37     avg_quarter_input    0.116751
42  predict_date_quarter    0.108911
21                  X_21    0.073465
19                  X_19    0.068968
16                  X_16    0.045767
6                    X_6    0.030953
18                  X_18    0.030362
20                  X_20    0.029928
15                  X_15    0.026581
33        start_date_day    0.025782
28   pr

Scoring

In [25]:
y_pred = model.predict(x_val)

x_val["predict"] = y_pred
x_val["target"] = y_val

score = 0
for store_name in list(x_val["store_name"].unique()):
    score_store = 0
    df_store = x_val[x_val["store_name"]==store_name].copy()

    for menu_name in list(df_store["menu_name"].unique()):
        score_store_menu = 0
        df_store_menu = df_store[df_store["menu_name"]==menu_name].copy()
        df_store_menu = df_store_menu[df_store_menu["target"]!=0]

        for i in range(len(df_store_menu)):
            A = df_store_menu["target"].iloc[i]
            P = df_store_menu["predict"].iloc[i]
            score_store_menu += 2*abs(A-P)/(abs(A)+abs(P))


        score_store += score_store_menu/len(df_store_menu)

    score += score_store / len(list(df_store["menu_name"].unique()))
    if store_name in ['미라시아', '담하']:
        score += score_store / len(list(df_store["menu_name"].unique()))
score = score / (len(list(x_val["store_name"].unique()))+2)

print(score)

0.5614882300346002


Prediction

In [27]:
def make_expanded_test(df, test_prefix):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        start_date = dates[0]

        input_window = sales[0:28]
        input_dates = dates[0:28]

        avg_sale_input = sum(input_window) / len(input_window)
        avg_sale_input = int(avg_sale_input.round())
        input_dates_series = pd.Series(input_dates)
        avg_quarter_input = input_dates_series.dt.quarter.mean()
        avg_quarter_input = int(avg_quarter_input.round())

        for j in range(7):
            predict_date = start_date + pd.Timedelta(days=28 + j)

            result.append({
                '영업장명_메뉴명': name,
                '영업일자':f"{test_prefix}+{j+1}일",
                'store_name': store_name,
                'menu_name': menu_name,
                'start_date': start_date,
                **{f'X_{k}': sales[k] for k in range(28)},
                'avg_sale_input': avg_sale_input,
                'avg_quarter_input': avg_quarter_input,
                'predict_date': predict_date,
                'predict_date_offset': j + 1
            })

    df_result = pd.DataFrame(result)


    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))


    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))
    df_result.insert(df_result.columns.get_loc('predict_date') + 6, 'predict_date_quarter', df_result['predict_date'].dt.quarter)

    return df_result

In [28]:
df_test_list = []

test_files = sorted(glob.glob('/content/data/TEST_*.csv'))

for path in test_files:
    test_data = pd.read_csv(path, encoding='utf-8-sig')

    test_data = test_data.sort_values(['영업장명_메뉴명', '영업일자'])
    test_data['영업일자'] = pd.to_datetime(test_data['영업일자'])

    test_data['요일'] = test_data['영업일자'].dt.day_name()

    test_data[['영업장명', '메뉴명']] = test_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    df_temp = make_expanded_test(test_data.copy(), test_prefix)

    df_test_list.append(df_temp)

df_test = pd.concat(df_test_list, ignore_index=True)

In [29]:
x_test = df_test[features]
y_test = model.predict(x_test)

In [30]:
df_test['매출수량'] = y_test

In [31]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    # (영업일자, 메뉴) → 매출수량 딕셔너리로 변환
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량']
    ))

    final_df = sample_submission.copy()

    for col in final_df.columns[1:]:
      final_df[col] = final_df[col].astype(float)

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:
            final_df.loc[row_idx, col] = max(0, pred_dict.get((date, col), 0))

    return final_df

In [32]:
sample_submission = pd.read_csv('/content/data/sample_submission.csv')
submission = convert_to_submission_format(df_test, sample_submission)
submission.to_csv('/content/data/submission.csv', index=False, encoding='utf-8-sig')