In [1]:
#1) NumPy를 1.26.4로 강제 재설치 (가장 중요)
pip uninstall -y numpy
pip install -U --force-reinstall "numpy==1.26.4"

#2) 호환 버전으로 재설치 (LightGBM + 사이언스 스택)
pip install -U --force-reinstall \
  "lightgbm==4.3.0" "scikit-learn==1.4.2" "pandas==2.1.4" "joblib==1.3.2"




SyntaxError: invalid syntax (3050080918.py, line 2)

In [8]:
# =========================================
# Colab: LightGBM 7일 수요예측 (가중치 포함) - ONE CELL
# =========================================


import os, json, gc
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

SEED = 42
np.random.seed(SEED)

# 필요한 파일을 /content에 업로드:
# - train.csv
# - sample_submission.csv
# - TEST_00.csv ~ TEST_09.csv

# -----------------------------
# 1) 공통 함수 (피처/평가)
# -----------------------------
LAGS  = [1, 2, 3, 7, 14, 21, 28]
ROLLS = [7, 14, 28]

# =========================================
# 직접 관리할 공휴일 목록 (YYYY-MM-DD 형식)
# =========================================
CUSTOM_HOLIDAYS = [
    # 2023년
    '2023-01-01', '2023-01-21', '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01',
    '2023-05-05', '2023-05-29', '2023-06-06', '2023-08-15', '2023-09-28', '2023-09-29', 
    '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25',
    # 2024년
    '2024-01-01', '2024-02-09', '2024-02-10', '2024-02-11', '2024-02-12', '2024-03-01',
    '2024-04-10', '2024-05-06', '2024-05-15', '2024-06-06', '2024-08-15', '2024-09-16',
    '2024-09-17', '2024-09-18', '2024-10-3', '2024-10-9', '2024-12-25'
]

def smape(y_true, y_pred) -> float:
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff  = np.abs(y_true - y_pred)
    denom[denom == 0] = 1.0
    return float(np.mean(diff / denom) * 100.0)

def build_train_table(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values(['영업장명_메뉴명','영업일자']).reset_index(drop=True)
    df['item_le'] = df['영업장명_메뉴명'].astype('category').cat.codes

    # 휴일
    holiday_set = set(pd.to_datetime(CUSTOM_HOLIDAYS))
    df['is_holiday'] = df['영업일자'].isin(holiday_set).astype(int)

    # Lags
    for k in LAGS:
        df[f'lag_{k}'] = df.groupby('영업장명_메뉴명')['매출수량'].shift(k)

    # Rolling(누수 방지: 1일 시프트 후)
    df['qty_shift1'] = df.groupby('영업장명_메뉴명')['매출수량'].shift(1)
    for w in ROLLS:
        df[f'roll_mean_{w}'] = (
            df.groupby('영업장명_메뉴명')['qty_shift1']
              .rolling(window=w, min_periods=w).mean()
              .reset_index(level=0, drop=True)
        )
        df[f'roll_std_{w}']  = (
            df.groupby('영업장명_메뉴명')['qty_shift1']
              .rolling(window=w, min_periods=w).std()
              .reset_index(level=0, drop=True)
        )

    df['roll_min_7'] = (
        df.groupby('영업장명_메뉴명')['qty_shift1']
          .rolling(window=7, min_periods=7).min()
          .reset_index(level=0, drop=True)
    )
    df['roll_max_7'] = (
        df.groupby('영업장명_메뉴명')['qty_shift1']
          .rolling(window=7, min_periods=7).max()
          .reset_index(level=0, drop=True)
    )
    df['roll_sum_7'] = (
        df.groupby('영업장명_메뉴명')['qty_shift1']
          .rolling(window=7, min_periods=7).sum()
          .reset_index(level=0, drop=True)
    )

    df['is_zero'] = (df['qty_shift1'] == 0).astype(float)
    df['zero_ratio_28'] = (
        df.groupby('영업장명_메뉴명')['is_zero']
          .rolling(window=28, min_periods=28).mean()
          .reset_index(level=0, drop=True)
    )

    # 캘린더(참조일)
    df['ref_year']       = df['영업일자'].dt.year
    df['ref_month']      = df['영업일자'].dt.month
    df['ref_day']        = df['영업일자'].dt.day
    df['ref_weekday']    = df['영업일자'].dt.weekday
    df['ref_is_weekend'] = df['ref_weekday'].isin([5,6]).astype(int)

    # 타깃(1~7일 후)
    for h in range(1, 8):
        df[f'y_h{h}'] = df.groupby('영업장명_메뉴명')['매출수량'].shift(-h)

    base_cols = (
        ['영업장명_메뉴명','item_le','영업일자',
         'ref_year','ref_month','ref_day','ref_weekday','ref_is_weekend']
        + [f'lag_{k}' for k in LAGS]
        + [f'roll_mean_{w}' for w in ROLLS]
        + [f'roll_std_{w}'  for w in ROLLS]
        + ['roll_min_7','roll_max_7','roll_sum_7','zero_ratio_28']
    )

    # long-format: h=1..7
    records = []
    for h in range(1, 8):
        tmp = df[base_cols + [f'y_h{h}']].copy()
        tmp = tmp.rename(columns={f'y_h{h}':'y'})
        tmp['h'] = h
        target_date = df['영업일자'] + pd.to_timedelta(h, unit='D')
        tmp['target_weekday']    = (df['영업일자'] + pd.to_timedelta(h, unit='D')).dt.weekday
        tmp['target_is_weekend'] = tmp['target_weekday'].isin([5,6]).astype(int)
        tmp['target_is_holiday']   = target_date.isin(holiday_set).astype(int)
        records.append(tmp)

    Xy = pd.concat(records, axis=0, ignore_index=True)
    Xy = Xy.rename(columns={'영업일자':'ref_date'})
    Xy = Xy.dropna()
    return Xy

def build_test_features(test_df: pd.DataFrame) -> pd.DataFrame:
    test_df = test_df.sort_values(['영업장명_메뉴명','영업일자']).reset_index(drop=True)
    test_df['item_le'] = test_df['영업장명_메뉴명'].astype('category').cat.codes

    # 휴일
    test_holiday_set = set(pd.to_datetime(CUSTOM_HOLIDAYS))

    feats = []
    for item, g in test_df.groupby('영업장명_메뉴명'):
        g = g.sort_values('영업일자')
        if len(g) < 28:
            continue
        last28 = g.tail(28).copy()
        ref_date = last28['영업일자'].iloc[-1]
        qty = last28['매출수량'].values.astype(float)

        base = {
            'item_le': int(last28['item_le'].iloc[-1]),
            'ref_year': ref_date.year,
            'ref_month': ref_date.month,
            'ref_day': ref_date.day,
            'ref_weekday': ref_date.weekday(),
            'ref_is_weekend': 1 if ref_date.weekday() in (5,6) else 0,
        }
        for k in LAGS:
            base[f'lag_{k}'] = float(qty[-k])
        for w in ROLLS:
            tail = qty[-w:]
            base[f'roll_mean_{w}'] = float(np.mean(tail))
            base[f'roll_std_{w}']  = float(np.std(tail, ddof=0))

        base['roll_min_7'] = float(np.min(qty[-7:]))
        base['roll_max_7'] = float(np.max(qty[-7:]))
        base['roll_sum_7'] = float(np.sum(qty[-7:]))
        base['zero_ratio_28'] = float(np.mean(qty == 0))

        for h in range(1, 8):
            row = base.copy()
            target_date_for_row = ref_date + pd.Timedelta(days=h)
            t_wd = (ref_date + pd.Timedelta(days=h)).weekday()
            row['h'] = h
            row['target_weekday'] = t_wd
            row['target_is_weekend'] = 1 if t_wd in (5,6) else 0
            row['target_is_holiday'] = 1 if target_date_for_row in test_holiday_set else 0
            row['영업장명_메뉴명'] = item
            feats.append(row)

    return pd.DataFrame(feats)

# 가중 SMAPE 계산 함수
def calculate_weighted_smape(y_true: np.ndarray, y_pred: np.ndarray, df_info: pd.DataFrame, weights: dict = None) -> float:
    # 데이터 합치기
    temp_df = df_info.copy()
    temp_df['actual'] = y_true
    temp_df['prediction'] = np.maximum(0, y_pred)

    # 식음업장별 점수 계산
    total_score = 0
    
    # 가중치가 주어지지 않으면 모든 식음업장의 가중치를 1로 설정
    if weights is None:
        unique_stores = temp_df['store'].unique()
        weights = {s: 1.0 for s in unique_stores}

    for s in temp_df['store'].unique():
        store_df = temp_df[temp_df['store'] == s]
        
        # 품목별 SMAPE 계산
        item_smapes = []
        for i in store_df['item'].unique():
            item_df = store_df[store_df['item'] == i]
            
            valid_days_df = item_df[item_df['actual'] != 0].copy()
            
            if len(valid_days_df) == 0:
                item_smapes.append(0.0)
                continue
            
            diff = np.abs(valid_days_df['actual'] - valid_days_df['prediction'])
            denom = np.abs(valid_days_df['actual']) + np.abs(valid_days_df['prediction'])
            
            smape_i = np.mean(2 * diff / denom)
            item_smapes.append(smape_i)

        # 식음업장 점수 집계
        store_smape = np.mean(item_smapes)
        
        # 최종 점수에 가중치 적용하여 누적
        ws = weights.get(s, 1.0)
        total_score += ws * store_smape
        
    return total_score


# -----------------------------
# 2) 데이터 로드
# -----------------------------
train = pd.read_csv("train.csv", parse_dates=["영업일자"])
train = train.sort_values(['영업장명_메뉴명','영업일자'])
print("train shape:", train.shape, "items:", train['영업장명_메뉴명'].nunique())

# -----------------------------
# 3) 학습 테이블 + 가중치 설계
# -----------------------------
Xy = build_train_table(train)

# (A) 버킷 가중치
Xy['w'] = 1.0
rd = Xy['ref_date']

Xy.loc[(rd >= '2023-01-01') & (rd <= '2023-12-31'), 'w'] = 0.9

# 1-예외) 2023-06-01 ~ 2023-07-15만 1.3 (덮어쓰기)
Xy.loc[(rd >= '2023-06-01') & (rd <= '2023-07-15'), 'w'] = 1.3

# 2) 2024년은 6/15
# 2024-01 ~ 2024-04: 1.0
Xy.loc[(rd >= '2024-01-01') & (rd <= '2024-04-30'), 'w'] = 1.0

# 2024-05: 1.2
Xy.loc[(rd >= '2024-05-01') & (rd <= '2024-05-31'), 'w'] = 1.2

# 2024-06-01 ~ 2024-06-15: 1.4
Xy.loc[(rd >= '2024-06-01') & (rd <= '2024-06-15'), 'w'] = 1.4

# (B) 선택 가중치: 시간 감가 + 주말 + 지평(h)
HALF_LIFE_DAYS = 60
days_from_max = (Xy['ref_date'].max() - rd).dt.days.clip(lower=0)
decay = 0.5 ** (days_from_max / HALF_LIFE_DAYS)
Xy['w'] *= decay

Xy.loc[Xy['target_is_weekend'] == 1, 'w'] *= 1.10  # 주말 10% 가중
Xy['w'] *= (1.0 + (8 - Xy['h']) * 0.02)            # h 가까울수록 약간 가중

# -----------------------------
# 4) 시간 검증 분할 & 학습
# -----------------------------
VALID_DAYS      = 14
N_ESTIMATORS    = 2000
EARLY_STOPPING  = 200
LEARNING_RATE   = 0.05
NUM_LEAVES      = 64
FEATURE_FRAC    = 0.9
SUBSAMPLE       = 0.8
MIN_CHILD_SAMP  = 50

cutoff = Xy['ref_date'].max() - pd.Timedelta(days=VALID_DAYS)
train_mask = Xy['ref_date'] <= cutoff
valid_mask = Xy['ref_date'] >  cutoff

feature_cols = [c for c in Xy.columns if c not in ['y','영업장명_메뉴명','ref_date','w']]
X_train, y_train = Xy.loc[train_mask, feature_cols], Xy.loc[train_mask, 'y']
X_valid, y_valid = Xy.loc[valid_mask, feature_cols], Xy.loc[valid_mask, 'y']
w_train = Xy.loc[train_mask, 'w'].values
w_valid = Xy.loc[valid_mask, 'w'].values

from lightgbm import LGBMRegressor, early_stopping
lgbm = LGBMRegressor(
    objective='regression',
    learning_rate=LEARNING_RATE,
    num_leaves=NUM_LEAVES,
    feature_fraction=FEATURE_FRAC,
    subsample=SUBSAMPLE,
    subsample_freq=1,
    min_child_samples=MIN_CHILD_SAMP,
    n_estimators=N_ESTIMATORS,
    random_state=SEED,
    verbose=-1
)

callbacks = [early_stopping(stopping_rounds=EARLY_STOPPING)]
lgbm.fit(
    X_train, y_train,
    sample_weight=w_train,                 #학습 가중치
    eval_set=[(X_valid, y_valid)],
    eval_sample_weight=[w_valid],          #검증 가중치
    eval_metric='l1',
    callbacks=callbacks
)

valid_pred = lgbm.predict(X_valid)
print("Valid SMAPE (Original):", f"{smape(y_valid.values, valid_pred):.4f}")

info_df = Xy.loc[valid_mask, ['영업장명_메뉴명']].reset_index(drop=True)
info_df[['store', 'item']] = info_df['영업장명_메뉴명'].str.split('_', expand=True)

# 가중 SMAPE 점수 계산
score_weighted = calculate_weighted_smape(y_valid.values, valid_pred, info_df)

print(f"Valid SMAPE (Competition Weighted): {score_weighted:.6f}")

# 모델/피처 저장
import joblib
os.makedirs("lgbm_models", exist_ok=True)
joblib.dump(lgbm, "lgbm_models/lgbm_weighted.pkl")
with open("lgbm_models/feature_cols.json", "w", encoding="utf-8") as f:
    json.dump({"feature_cols": feature_cols}, f, ensure_ascii=False, indent=2)

# -----------------------------
# 5) 추론 & 제출파일 생성
# -----------------------------
sample = pd.read_csv("sample_submission.csv")
all_items = list(sample.columns[1:])

test_paths = [f"TEST_{i:02d}.csv" for i in range(10)]
missing = [p for p in test_paths if not os.path.exists(p)]
if missing:
    raise FileNotFoundError(f"테스트 파일이 없습니다: {missing}")

all_rows = []
for i, tp in enumerate(test_paths):
    tdf = pd.read_csv(tp, parse_dates=['영업일자'])
    feats = build_test_features(tdf)
    preds = lgbm.predict(feats[feature_cols])
    feats['pred'] = np.maximum(0.0, preds)  # 음수 하한 0

    for h in range(1, 8):
        tag = f"TEST_{i:02d}+{h}일"
        row = {'영업일자': tag}
        sub = feats[feats['h']==h][['영업장명_메뉴명','pred']].set_index('영업장명_메뉴명')['pred']
        for item in all_items:
            row[item] = float(sub.get(item, 0.0))
        all_rows.append(row)

submission = pd.DataFrame(all_rows, columns=['영업일자'] + all_items)
submission.to_csv("submission.csv", index=False)
print("제출 파일 생성 완료 -> submission.csv")
submission.head()


train shape: (102676, 3) items: 193
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[591]	valid_0's l1: 5.30321	valid_0's l2: 237.959
Valid SMAPE (Original): 131.4260
Valid SMAPE (Competition Weighted): 5.499343
제출 파일 생성 완료 -> submission.csv


Unnamed: 0,영업일자,느티나무 셀프BBQ_1인 수저세트,느티나무 셀프BBQ_BBQ55(단체),"느티나무 셀프BBQ_대여료 30,000원","느티나무 셀프BBQ_대여료 60,000원","느티나무 셀프BBQ_대여료 90,000원","느티나무 셀프BBQ_본삼겹 (단품,실내)",느티나무 셀프BBQ_스프라이트 (단체),느티나무 셀프BBQ_신라면,느티나무 셀프BBQ_쌈야채세트,...,화담숲주막_스프라이트,화담숲주막_참살이 막걸리,화담숲주막_찹쌀식혜,화담숲주막_콜라,화담숲주막_해물파전,화담숲카페_메밀미숫가루,화담숲카페_아메리카노 HOT,화담숲카페_아메리카노 ICE,화담숲카페_카페라떼 ICE,화담숲카페_현미뻥스크림
0,TEST_00+1일,5.949128,0.0,5.804988,4.223709,0.334873,1.147701,0.833934,2.594213,2.261126,...,8.467388,19.152408,16.181392,9.335606,52.477238,38.451493,4.783827,32.35864,7.650978,17.289565
1,TEST_00+2일,3.66798,4.810046,2.471312,1.679871,0.023417,0.259264,1.585526,0.919439,0.564238,...,0.0,1.17249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TEST_00+3일,4.492604,10.386981,3.003854,2.281931,0.32583,0.755501,4.938807,1.521499,1.166298,...,0.95037,6.452738,4.827981,0.605113,20.655953,19.627014,3.348635,17.332304,4.537258,8.439507
3,TEST_00+4일,4.672208,15.841145,3.191018,2.355439,0.32583,0.755501,4.938807,1.595006,1.239806,...,0.704277,6.139398,4.474463,0.35902,18.478692,15.174717,1.752077,16.281991,2.653127,6.208836
4,TEST_00+5일,6.526026,46.243227,3.748316,3.056975,0.61115,1.213168,8.575042,2.185273,1.830072,...,0.712972,9.079987,6.426056,0.721502,25.721935,20.649127,2.007468,22.996245,2.983426,10.105195
