In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

import random
import os
import torch
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline
from pylab import rcParams
import seaborn as sns

import statsmodels.api as sm
import math
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error

from prophet import Prophet
from neuralprophet import NeuralProphet
import lightgbm
import optuna

plt.rc("font", family="Malgun Gothic")
plt.rcParams['axes.unicode_minus'] = False

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # affects subprocesses spawned after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Covers multi-GPU setups; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and choose different kernels from
    # run to run, which defeats determinism, so it must be disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(41) # Fix the random seed for reproducibility

In [61]:
# Load the training table, then drop the ID and the product / category /
# brand label columns so that only the remaining value columns are kept.
train_data = pd.read_csv('./train.csv')
train_data = train_data.drop(
    columns=['ID', '제품', '대분류', '중분류', '소분류', '브랜드']
)

In [76]:
# Load the submission template and discard its ID column.
submit = pd.read_csv('./sample_submission.csv')
submit = submit.drop(columns=['ID'])

In [63]:
# Display the loaded training frame (rendered below by the notebook).
train_data

Unnamed: 0,2022-01-01,2022-01-02,2022-01-03,2022-01-04,2022-01-05,2022-01-06,2022-01-07,2022-01-08,2022-01-09,2022-01-10,...,2023-03-26,2023-03-27,2023-03-28,2023-03-29,2023-03-30,2023-03-31,2023-04-01,2023-04-02,2023-04-03,2023-04-04
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,3,2,0,0,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15886,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,2,4,1,1,3
15887,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [94]:
def make_transposed(df, i):
    """Turn row ``i`` of a wide frame into a long ``ds`` / ``y`` frame.

    Args:
        df: Wide DataFrame whose column labels can be parsed as datetimes.
        i: Positional index of the row to extract.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and two columns: ``ds``
        (the parsed column labels) and ``y`` (the values of row ``i``).
    """
    timestamps = pd.to_datetime(df.columns)
    row_values = df.iloc[i]
    long_frame = pd.DataFrame({'ds': timestamps, 'y': row_values})
    return long_frame.reset_index(drop=True)

def make_derived_variable(df):
    """Add calendar features derived from the ``ds`` datetime column.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a datetime-typed ``ds`` column.

    Returns:
        The same DataFrame with ``weekday``, ``month``, ``hour`` and
        ``year`` columns added (or overwritten).
    """
    # Vectorised accessor instead of a per-row Python loop; Mon=0, Tue=1, ...
    df['weekday'] = df.ds.dt.weekday
    df['month'] = df.ds.dt.month
    df['hour'] = df.ds.dt.hour
    df['year'] = df.ds.dt.year

    return df

def make_previous_y(df):
    """Add a ``previous_y`` column holding the preceding row's ``y``.

    The first row has no predecessor and receives 0. The frame is modified
    in place and also returned.
    """
    # Every y except the last one, pushed down by one row behind a leading 0.
    lagged = list(df.y.iloc[:-1])
    lagged.insert(0, 0)
    df['previous_y'] = lagged

    return df

def make_difference(df, d):
    """Add lagged difference features ``diff1`` .. ``diff<d>``.

    For each lag ``k`` the value at row ``t`` is ``y[t-1] - y[t-1-k]``, i.e.
    the k-step difference shifted down one row so that a row never sees its
    own ``y``. The first ``k + 1`` rows, which have no complete history, are
    set to 0. The frame is modified in place and also returned.

    Args:
        df: DataFrame with a numeric ``y`` column.
        d: Largest lag to generate; lags 1..d are produced.

    Returns:
        The same DataFrame with the ``diff*`` columns added.
    """
    for lag in range(1, d + 1):
        lagged_diff = df.y.diff(lag).shift(1)
        # Zero the warm-up rows. Slicing (rather than building a fixed-size
        # list) keeps this valid when the frame has fewer than lag + 1 rows.
        lagged_diff.iloc[:lag + 1] = 0
        df['diff' + str(lag)] = lagged_diff

    return df

In [95]:
def preprocessing_gbm(df, diff):
    """Run the feature-building steps on ``df`` in a fixed order.

    Applies ``make_derived_variable``, then ``make_previous_y``, then
    ``make_difference`` with ``diff`` as its lag count.

    Args:
        df: DataFrame passed through the three steps.
        diff: Forwarded to ``make_difference`` as ``d``.

    Returns:
        The DataFrame returned by the final step.
    """
    return (
        df.pipe(make_derived_variable)
          .pipe(make_previous_y)
          .pipe(make_difference, diff)
    )

### 하루를 예측하고, 해당 데이터를 학습에 다시 활용

In [134]:
# Recursive one-step-ahead forecast: for each series, append one future day,
# rebuild the features, fit a model on everything before it, predict that day
# and write the prediction back so it becomes history for the next step.
diff = 20  # number of lagged-difference features; loop-invariant, so set once

for i in tqdm(range(len(train_data))):
    df_tmp = make_transposed(train_data, i)
    submit_tmp = make_transposed(submit, i)
    horizon = len(submit_tmp)

    for j in range(horizon):
        df_tmp = pd.concat([df_tmp, submit_tmp[j:j+1]]).reset_index(drop=True)
        # TODO: needs optimisation - features are recomputed for every row on
        # every step, and a new model is fitted for every forecast day.
        df_tmp = preprocessing_gbm(df_tmp, diff)

        features = df_tmp.drop(columns=['ds', 'y'])
        X_train, y_train = features.iloc[:-1], df_tmp['y'].iloc[:-1]
        X_test = features.iloc[-1:]

        model = lightgbm.LGBMRegressor().fit(X_train, y_train)
        prediction = int(np.round(model.predict(X_test)[0]))
        # Single-step .loc write. The chained form df_tmp['y'].iloc[-1] = ...
        # may assign to a temporary and is a no-op under copy-on-write.
        df_tmp.loc[df_tmp.index[-1], 'y'] = prediction

    # Assign a plain array so the values are placed positionally; a Series
    # here carries an integer index that does not match submit's columns.
    submit.iloc[i] = df_tmp['y'].iloc[-horizon:].to_numpy()

  0%|          | 4/15890 [00:05<5:53:24,  1.33s/it]

KeyboardInterrupt

