Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


폴더 '/content/data'가 이미 존재


# Import

In [4]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
from tqdm import tqdm

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

import pandas as pd
import holidays
from tqdm import tqdm


In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
!pip install holidays



# Fixed RandomSeed & Setting Hyperparameter

In [5]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

In [6]:
LOOKBACK, PREDICT, BATCH_SIZE, EPOCHS = 28, 7, 16, 50
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Load

In [7]:
train_data = pd.read_csv("train.csv")

# Data 분석

In [8]:
train_data = train_data.sort_values(['영업장명_메뉴명', '영업일자'])
train_data['영업일자'] = pd.to_datetime(train_data['영업일자'])

In [9]:
train_data['요일'] = train_data['영업일자'].dt.day_name()

In [10]:
train_data[['영업장명', '메뉴명']] = train_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

In [12]:
def make_expanded_train(df, input_window_size=28, predict_window_size=7):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        for i in range(len(group) - input_window_size - predict_window_size + 1):
            input_window = sales[i : i + input_window_size]
            predict_window = sales[i + input_window_size : i + input_window_size + predict_window_size]
            start_date = dates[i]

            for j in range(predict_window_size):
                predict_date = dates[i + input_window_size + j]

                result.append({
                    '영업장명_메뉴명': name,
                    'store_name': store_name,
                    'menu_name': menu_name,
                    'start_date': start_date,
                    **{f'X_{k}': input_window[k] for k in range(input_window_size)},
                    'predict_date': predict_date,
                    'predict_date_offset': j + 1,
                    'target': predict_window[j]
                })

    df_result = pd.DataFrame(result)

    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))

    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))

    return df_result



In [13]:
df_train = make_expanded_train(train_data.copy(), input_window_size=28, predict_window_size=7)

In [None]:
print(df_train.shape)
df_train.head()

(672798, 44)


Unnamed: 0,store_name,menu_name,start_date,start_date_year,start_date_month,start_date_day,start_date_weekday,start_date_holiday,X_0,X_1,...,X_26,X_27,predict_date,predict_date_year,predict_date_month,predict_date_day,predict_date_weekday,predict_date_holiday,predict_date_offset,target
0,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,0,...,8,0,2023-01-29,2023,1,29,Sunday,0,1,8
1,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,0,...,8,0,2023-01-30,2023,1,30,Monday,0,2,0
2,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,0,...,8,0,2023-01-31,2023,1,31,Tuesday,0,3,4
3,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,0,...,8,0,2023-02-01,2023,2,1,Wednesday,0,4,6
4,느티나무 셀프BBQ,1인 수저세트,2023-01-01,2023,1,1,Sunday,1,0,0,...,8,0,2023-02-02,2023,2,2,Thursday,0,5,2


In [None]:
num_features = [col for col in df_train.columns if col.startswith("X_")] + ['predict_date_offset']
del_features = ["영업일자_메뉴명", "start_date", "start_date_year", "predict_date", "predict_date_year"]
cat_features = [col for col in df_train.columns if col not in num_features and col not in del_features and col != 'target']

In [None]:
features = num_features + cat_features

x_train, x_val, y_train, y_val = train_test_split(
    df_train[features],
    df_train["target"],
    test_size=0.2,
    shuffle=True,
)

In [None]:
model = CatBoostRegressor()

model.fit(
    x_train, y_train,
    eval_set=(x_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)

Learning rate set to 0.136665
0:	learn: 37.2620950	test: 36.6558358	best: 36.6558358 (0)	total: 1.53s	remaining: 25m 33s
1:	learn: 34.8064045	test: 34.1852270	best: 34.1852270 (1)	total: 2.26s	remaining: 18m 45s
2:	learn: 32.7945917	test: 32.1386495	best: 32.1386495 (2)	total: 3.03s	remaining: 16m 46s
3:	learn: 31.0976833	test: 30.3767316	best: 30.3767316 (3)	total: 3.85s	remaining: 15m 57s
4:	learn: 29.6188570	test: 28.8626406	best: 28.8626406 (4)	total: 4.56s	remaining: 15m 8s
5:	learn: 28.4223207	test: 27.6440735	best: 27.6440735 (5)	total: 5.3s	remaining: 14m 38s
6:	learn: 27.4073110	test: 26.6390316	best: 26.6390316 (6)	total: 5.9s	remaining: 13m 57s
7:	learn: 26.6169074	test: 25.8339143	best: 25.8339143 (7)	total: 6.64s	remaining: 13m 43s
8:	learn: 25.9409172	test: 25.1364002	best: 25.1364002 (8)	total: 7.38s	remaining: 13m 33s
9:	learn: 25.3645311	test: 24.5380604	best: 24.5380604 (9)	total: 8.08s	remaining: 13m 20s
10:	learn: 24.9008044	test: 24.0958087	best: 24.0958087 (10)	to

<catboost.core.CatBoostRegressor at 0x7ff0bd6ff2d0>

In [None]:
y_pred = model.predict(x_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 14.3448


# Prediction

In [None]:
def make_expanded_test(df, test_prefix):
    df = df.sort_values(['영업장명_메뉴명', '영업일자'])
    df['영업일자'] = pd.to_datetime(df['영업일자'])

    result = []

    for name, group in df.groupby('영업장명_메뉴명'):
        group = group.sort_values('영업일자')
        sales = group['매출수량'].values
        dates = group['영업일자'].values
        store_name = group['영업장명'].iloc[0]
        menu_name = group['메뉴명'].iloc[0]

        start_date = dates[0]

        for j in range(7):
            predict_date = start_date + pd.Timedelta(days=28 + j)

            result.append({
                '영업장명_메뉴명': name,
                '영업일자':f"{test_prefix}+{j+1}일",
                'store_name': store_name,
                'menu_name': menu_name,
                'start_date': start_date,
                **{f'X_{k}': sales[k] for k in range(28)},
                'predict_date': predict_date,
                'predict_date_offset': j + 1
            })

    df_result = pd.DataFrame(result)

    df_result.insert(df_result.columns.get_loc('start_date') + 1, 'start_date_year', df_result['start_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('start_date') + 2, 'start_date_month', df_result['start_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('start_date') + 3, 'start_date_day', df_result['start_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('start_date') + 4, 'start_date_weekday', df_result['start_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('start_date') + 5, 'start_date_holiday',
                     df_result['start_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))

    df_result.insert(df_result.columns.get_loc('predict_date') + 1, 'predict_date_year', df_result['predict_date'].dt.year)
    df_result.insert(df_result.columns.get_loc('predict_date') + 2, 'predict_date_month', df_result['predict_date'].dt.month)
    df_result.insert(df_result.columns.get_loc('predict_date') + 3, 'predict_date_day', df_result['predict_date'].dt.day)
    df_result.insert(df_result.columns.get_loc('predict_date') + 4, 'predict_date_weekday', df_result['predict_date'].dt.day_name())
    df_result.insert(df_result.columns.get_loc('predict_date') + 5, 'predict_date_holiday',
                     df_result['predict_date'].dt.date.isin(holidays.KR(years=[2023, 2024, 2025])).astype(int))

    return df_result


In [None]:
df_test_list = []

test_files = sorted(glob.glob('/content/data/test/TEST_*.csv'))

for path in test_files:
    test_data = pd.read_csv(path, encoding='utf-8-sig')

    test_data = test_data.sort_values(['영업장명_메뉴명', '영업일자'])
    test_data['영업일자'] = pd.to_datetime(test_data['영업일자'])

    test_data['요일'] = test_data['영업일자'].dt.day_name()

    test_data[['영업장명', '메뉴명']] = test_data['영업장명_메뉴명'].str.split('_', n=1, expand=True)

    filename = os.path.basename(path)
    test_prefix = re.search(r'(TEST_\d+)', filename).group(1)

    df_temp = make_expanded_test(test_data.copy(), test_prefix)

    df_test_list.append(df_temp)

df_test = pd.concat(df_test_list, ignore_index=True)

In [None]:
x_test = df_test[features]
y_test = model.predict(x_test)

In [None]:
df_test['매출수량'] = y_test

# Submission

In [None]:
def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    # (영업일자, 메뉴) → 매출수량 딕셔너리로 변환
    pred_dict = dict(zip(
        zip(pred_df['영업일자'], pred_df['영업장명_메뉴명']),
        pred_df['매출수량']
    ))

    final_df = sample_submission.copy()

    for col in final_df.columns[1:]:
      final_df[col] = final_df[col].astype(float)

    for row_idx in final_df.index:
        date = final_df.loc[row_idx, '영업일자']
        for col in final_df.columns[1:]:
            final_df.loc[row_idx, col] = max(0, pred_dict.get((date, col), 0))

    return final_df

In [None]:
sample_submission = pd.read_csv('/content/data/sample_submission.csv')
submission = convert_to_submission_format(df_test, sample_submission)
submission.to_csv('/content/data/submission.csv', index=False, encoding='utf-8-sig')