In [1]:
import pandas as pd
import numpy as np
import os
import random
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path = os.getcwd()
file_path = os.path.join(path,"file")
sub_path = os.path.join(path,"제출")
sub_num = 48
train_df = pd.read_csv(os.path.join(file_path,"train.csv"))
submission = pd.read_csv(os.path.join(file_path,"sample_submission.csv"))

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed = 16
seed_everything(seed) # Seed 고정

In [4]:
import holidays
import pandas as pd

def add_weekday(df) :
    df['date'] = pd.to_datetime(df['timestamp'])
    df['weekcode'] = df['date'].dt.weekday

    year_min = df['date'].dt.date.unique().min().year
    year_max = df['date'].dt.date.unique().max().year
    kor_holidays = list(holidays.KOR(years=range(year_min, year_max)).keys())
    idx_kor_holidays = pd.to_datetime(kor_holidays)

    df_temp = df[['date', 'weekcode']].copy()
    df_temp.set_index('date', inplace=True)
    #공휴일은 10으로 표시
    df_temp.loc[idx_kor_holidays, 'weekcode'] = 7.
    df_temp.reset_index(inplace=True)

    df['weekcode'] = df_temp['weekcode']

    return df

In [5]:
train_df2 = add_weekday(train_df)
train_df3 = train_df2.drop(columns=['supply(kg)', 'date'])
#질적 변수들을 수치화합니다
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    le = LabelEncoder()
    train_df3[i]=le.fit_transform(train_df3[i])

print('Done.')

Done.


In [6]:
train_df3['item_id'] = train_df3.ID.str[0:6]

In [7]:
max_y = max(train_df3["price(원/kg)"])
min_y = min(train_df3["price(원/kg)"])
train_df3["price(원/kg)"] = (train_df3["price(원/kg)"]- min_y)/(max_y-min_y)
train_df3

Unnamed: 0,ID,timestamp,item,corporation,location,price(원/kg),weekcode,item_id
0,TG_A_J_20190101,2019-01-01,4,0,0,0.000000,7,TG_A_J
1,TG_A_J_20190102,2019-01-02,4,0,0,0.000000,2,TG_A_J
2,TG_A_J_20190103,2019-01-03,4,0,0,0.082644,3,TG_A_J
3,TG_A_J_20190104,2019-01-04,4,0,0,0.067339,4,TG_A_J
4,TG_A_J_20190105,2019-01-05,4,0,0,0.059783,5,TG_A_J
...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,3,5,0,0.022383,0,RD_F_J
59393,RD_F_J_20230228,2023-02-28,3,5,0,0.025396,1,RD_F_J
59394,RD_F_J_20230301,2023-03-01,3,5,0,0.027452,2,RD_F_J
59395,RD_F_J_20230302,2023-03-02,3,5,0,0.025013,3,RD_F_J


In [8]:
data = TimeSeriesDataFrame(train_df3.drop(columns=['ID']))

In [9]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,item,corporation,location,price(원/kg),weekcode
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TG_A_J,2019-01-01,4,0,0,0.000000,7
TG_A_J,2019-01-02,4,0,0,0.000000,2
TG_A_J,2019-01-03,4,0,0,0.082644,3
TG_A_J,2019-01-04,4,0,0,0.067339,4
TG_A_J,2019-01-05,4,0,0,0.059783,5
...,...,...,...,...,...,...
RD_F_J,2023-02-27,3,5,0,0.022383,0
RD_F_J,2023-02-28,3,5,0,0.025396,1
RD_F_J,2023-03-01,3,5,0,0.027452,2
RD_F_J,2023-03-02,3,5,0,0.025013,3


In [10]:
predictor = TimeSeriesPredictor( 
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)
# seed 고정
predictor.fit( data, random_seed=seed, )

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 16,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
Global seed set to 16
AutoGluon will save models to AutogluonModels\ag-20231118_035116\
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['item', 'corporation', 'location', 'weekcode']

Starting training. Start time is 2023-11-18 12:51:16
Mode

Fitting simple weighted ensemble.
	-0.0369       = Validation score (-RMSE)
	1.35    s     = Training runtime
	69.67   s     = Validation (prediction) runtime
Training complete. Models trained: ['Naive', 'SeasonalNaive', 'Theta', 'AutoETS', 'RecursiveTabular', 'WeightedEnsemble']
Total runtime: 76.90 s
Best model: WeightedEnsemble
Best model score: -0.0369


<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x28121a208e0>

In [11]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	1.74    s     = Training runtime
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'WeightedEnsemble_FULL']
Total runtime: 1.80 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'WeightedEnsemble'). AutoGluon will default to using 'WeightedEnsemble_FULL' for p

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [12]:
# seed 고정
pred = predictor.predict(data, random_seed=seed, )

Global seed set to 16
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [13]:
holiday_test = ["20230305","20230312","20230319","20230326"]
submission['answer'] = pred.reset_index()['mean']
submission['answer'] = submission['answer']*(max_y-min_y) + min_y
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0
submission.loc[submission['ID'].str.split("_").str[3].isin(holiday_test),"answer"] = 0
submission.to_csv(os.path.join(sub_path,f"baseline_submission{sub_num}.csv"), index=False)