## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [5]:
test_df

Unnamed: 0,ID,timestamp,item,corporation,location
0,TG_A_J_20230304,2023-03-04,TG,A,J
1,TG_A_J_20230305,2023-03-05,TG,A,J
2,TG_A_J_20230306,2023-03-06,TG,A,J
3,TG_A_J_20230307,2023-03-07,TG,A,J
4,TG_A_J_20230308,2023-03-08,TG,A,J
...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,RD,F,J
1088,RD_F_J_20230328,2023-03-28,RD,F,J
1089,RD_F_J_20230329,2023-03-29,RD,F,J
1090,RD_F_J_20230330,2023-03-30,RD,F,J


## Data Pre-Processing

In [6]:
import holidays
import pandas as pd

def add_weekday(df) :
    df['date'] = pd.to_datetime(df['timestamp'])
    df['weekcode'] = df['date'].dt.weekday

    year_min = df['date'].dt.date.unique().min().year
    year_max = df['date'].dt.date.unique().max().year
    kor_holidays = list(holidays.KOR(years=range(year_min, year_max)).keys())
    idx_kor_holidays = pd.to_datetime(kor_holidays)

    df_temp = df[['date', 'weekcode']].copy()
    df_temp.set_index('date', inplace=True)
    #공휴일은 10으로 표시
    df_temp.loc[idx_kor_holidays, 'weekcode'] = 10
    df_temp.reset_index(inplace=True)

    df['weekcode'] = df_temp['weekcode']

    return df

In [7]:
train_df = add_weekday(train_df)
test_df = add_weekday(test_df)

In [8]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019-01-02,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019-01-03,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019-01-04,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019-01-05,5
...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023-02-27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023-02-28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023-03-01,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023-03-02,3


In [9]:
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg),date,weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019-01-01,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019-01-02,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019-01-03,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019-01-04,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019-01-05,5
...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023-02-27,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023-02-28,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023-03-01,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023-03-02,3


In [11]:
#학습에 사용하지 않을 변수들을 제거합니다
train_df_mod = train_df.drop(columns=['supply(kg)', 'date'])

In [12]:
train_df_mod

Unnamed: 0,ID,timestamp,item,corporation,location,price(원/kg),weekcode
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,10
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,1728.0,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,1408.0,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,1250.0,5
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,468.0,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,531.0,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,574.0,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,523.0,3


In [13]:
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_df_mod[i]=le.fit_transform(train_df_mod[i])
    test_df[i]=le.transform(test_df[i]) #test 데이터에 대해서 fit하는 것은 data leakage에 해당합니다

print('Done.')

Done.


In [15]:
test_df

Unnamed: 0,ID,timestamp,item,corporation,location,date,weekcode
0,TG_A_J_20230304,2023-03-04,4,0,0,2023-03-04,5
1,TG_A_J_20230305,2023-03-05,4,0,0,2023-03-05,6
2,TG_A_J_20230306,2023-03-06,4,0,0,2023-03-06,0
3,TG_A_J_20230307,2023-03-07,4,0,0,2023-03-07,1
4,TG_A_J_20230308,2023-03-08,4,0,0,2023-03-08,2
...,...,...,...,...,...,...,...
1087,RD_F_J_20230327,2023-03-27,3,5,0,2023-03-27,0
1088,RD_F_J_20230328,2023-03-28,3,5,0,2023-03-28,1
1089,RD_F_J_20230329,2023-03-29,3,5,0,2023-03-29,2
1090,RD_F_J_20230330,2023-03-30,3,5,0,2023-03-30,3


## Regression Model Fit

In [17]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor


In [18]:
train_df_mod['item_id']=train_df_mod.ID.str[0:6]

In [19]:
train_data = TimeSeriesDataFrame(train_df_mod.drop(columns=['ID']))

In [20]:
predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)

predictor.fit(train_data, random_seed=42)

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
Global seed set to 42


AutoGluon will save models to AutogluonModels\ag-20231116_142025\
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['item', 'corporation', 'location', 'weekcode']

Starting training. Start time is 2023-11-16 23:20:26
Models that will be trained: ['Naive', 'SeasonalNaive', 'Theta', 'AutoETS', 'RecursiveTabular', 'DeepAR']
Training timeseries model Naive. 
	-997.0634     = Validation score (-RMSE)
	0.03    s     = Training runtime
	7.75    s     = Validation (prediction) runtime
Training timeseries model SeasonalNaive. 
	-843.9258     = Validation score (-RMSE)
	0.03    s     = Training runtime
	0.04    s     = Validation (prediction) runtime
Training timeseries model Theta. 
	-838.8347     = Validation score (-RMSE)
	0.03 

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x1373d020220>

## 모델 학습


In [21]:
preds = predictor.predict(train_data, random_seed=42)

Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


## Inference

In [23]:
preds

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3305.099072,1585.863805,2236.841469,2663.834687,3014.349782,3312.825353,3631.836279,3962.654706,4371.442224,4987.538884
TG_A_J,2023-03-05,543.990556,-1285.068602,-656.704042,-204.257119,180.603015,539.654848,901.427346,1287.269430,1739.741426,2366.138272
TG_A_J,2023-03-06,3253.064348,379.547692,1392.910740,2044.830154,2653.802954,3186.212817,3718.454210,4328.340109,5062.691118,6197.420663
TG_A_J,2023-03-07,3526.161860,474.836194,1550.015314,2289.466743,2921.100309,3494.863582,4099.329227,4723.865927,5506.695405,6591.927269
TG_A_J,2023-03-08,3396.568735,-10.900963,1234.965073,2051.567070,2750.735726,3398.991692,4042.873254,4733.643005,5543.470846,6758.502881
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,525.635440,-89.723425,174.901249,306.852705,428.666252,536.995084,646.435404,760.451198,899.086454,1102.054688
RD_F_J,2023-03-28,541.643659,-72.281207,169.225231,325.749660,440.123665,547.579332,667.326516,782.979773,923.880175,1129.097972
RD_F_J,2023-03-29,547.976154,-58.583322,180.188609,329.778030,444.955090,557.439860,669.830658,788.347764,940.716902,1147.186611
RD_F_J,2023-03-30,530.794590,-203.304543,170.165488,311.098635,428.803774,546.263052,660.345919,791.447021,929.674680,1152.713059


In [24]:
pred_df = pd.DataFrame(preds).reset_index()
pred_df

Unnamed: 0,item_id,timestamp,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,TG_A_J,2023-03-04,3305.099072,1585.863805,2236.841469,2663.834687,3014.349782,3312.825353,3631.836279,3962.654706,4371.442224,4987.538884
1,TG_A_J,2023-03-05,543.990556,-1285.068602,-656.704042,-204.257119,180.603015,539.654848,901.427346,1287.269430,1739.741426,2366.138272
2,TG_A_J,2023-03-06,3253.064348,379.547692,1392.910740,2044.830154,2653.802954,3186.212817,3718.454210,4328.340109,5062.691118,6197.420663
3,TG_A_J,2023-03-07,3526.161860,474.836194,1550.015314,2289.466743,2921.100309,3494.863582,4099.329227,4723.865927,5506.695405,6591.927269
4,TG_A_J,2023-03-08,3396.568735,-10.900963,1234.965073,2051.567070,2750.735726,3398.991692,4042.873254,4733.643005,5543.470846,6758.502881
...,...,...,...,...,...,...,...,...,...,...,...,...
1087,RD_F_J,2023-03-27,525.635440,-89.723425,174.901249,306.852705,428.666252,536.995084,646.435404,760.451198,899.086454,1102.054688
1088,RD_F_J,2023-03-28,541.643659,-72.281207,169.225231,325.749660,440.123665,547.579332,667.326516,782.979773,923.880175,1129.097972
1089,RD_F_J,2023-03-29,547.976154,-58.583322,180.188609,329.778030,444.955090,557.439860,669.830658,788.347764,940.716902,1147.186611
1090,RD_F_J,2023-03-30,530.794590,-203.304543,170.165488,311.098635,428.803774,546.263052,660.345919,791.447021,929.674680,1152.713059


In [25]:
pred_mean = pred_df['mean']
pred_mean

0       3305.099072
1        543.990556
2       3253.064348
3       3526.161860
4       3396.568735
           ...     
1087     525.635440
1088     541.643659
1089     547.976154
1090     530.794590
1091     527.822932
Name: mean, Length: 1092, dtype: float64

In [27]:
(pred_mean<0).sum()

19

In [28]:
pred_mean[pred_mean<0]=0

## Submission

In [30]:
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [31]:
submission['answer']=pred_mean

In [119]:
submission.to_csv('./submission_231116.csv', index=False)