## 제주경진대회 - 이혜승 Autogluon 모델링

In [2]:
!pip install mxnet
!pip install autogluon

Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.1
    Uninstalling graphviz-0.20.1:
      Successfully uninstalled graphviz-0.20.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.

## Import

In [8]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

## Fixed Random-Seed

In [6]:
def seed_everything(seed):
    """Seed the random number sources this notebook controls directly.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and seeds
    both Python's ``random`` module and NumPy's legacy global generator.

    NOTE: ``PYTHONHASHSEED`` is read at interpreter start-up, so setting it here
    is only expected to influence processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

## Load Data

In [9]:
# Colab-only helper: opens a browser file picker and stores the uploaded files
# in the runtime. The recorded output shows sample_submission.csv and
# train_final4.csv being saved.
from google.colab import files

uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv
Saving train_final4.csv to train_final4.csv


In [10]:
# Take the first entry yielded when iterating `uploaded`
# (presumably a mapping keyed by file name - confirm).
# NOTE(review): `filename` is not used by the cells visible in this notebook;
# the CSVs are read by their literal names instead.
filename = next(iter(uploaded))

In [50]:
# File names of the two CSVs uploaded into the runtime.
TRAIN_CSV = 'train_final4.csv'
SUBMISSION_CSV = 'sample_submission.csv'

# Training history and the submission template.
train_df = pd.read_csv(TRAIN_CSV)
submission = pd.read_csv(SUBMISSION_CSV)

In [12]:
# Preview the submission template (ID and a placeholder answer column).
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


# 새로운 변수 (sub autogluon5)

In [51]:
# Peek at the first rows of the training data.
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,week,...,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S
0,0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,...,0.0,172.0,True,False,False,False,False,False,True,False
1,1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,1,...,0.0,172.0,True,False,False,False,False,False,True,False
2,2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,1,...,0.0,172.0,True,False,False,False,False,False,True,False
3,3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,1,...,0.0,172.0,True,False,False,False,False,False,True,False
4,4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,1,...,0.0,172.0,True,False,False,False,False,False,True,False


In [52]:
# Preview the first six characters of every ID (e.g. "TG_A_J").
train_df.ID.str[0:6]

0        TG_A_J
1        TG_A_J
2        TG_A_J
3        TG_A_J
4        TG_A_J
          ...  
59392    RD_F_J
59393    RD_F_J
59394    RD_F_J
59395    RD_F_J
59396    RD_F_J
Name: ID, Length: 59397, dtype: object

In [54]:
# Series identifier: the 6-character prefix of the ID (e.g. "TG_A_J"),
# i.e. everything before the date part.
train_df['item_id'] = train_df['ID'].str.slice(0, 6)

In [55]:
# Count the missing values in each column
missing_values_count = train_df.isna().sum()

# Print the per-column counts
print(missing_values_count)

Unnamed: 0       0
ID               0
timestamp        0
item             0
supply(kg)       0
price(원/kg)      0
year             0
month            0
day              0
week             0
isWeekday        0
isSaturday       0
isSunday         0
holiday          0
수출 중량            0
수출 금액            0
수입 중량            0
수입 금액            0
무역수지             0
corporation_A    0
corporation_B    0
corporation_C    0
corporation_D    0
corporation_E    0
corporation_F    0
location_J       0
location_S       0
item_id          0
dtype: int64


In [16]:
# Replace missing values with 0 (modifies train_df in place).
# The recorded null-count output above reports 0 for every column, so this is
# expected to act only as a safeguard.
train_df.fillna(0, inplace=True)

In [57]:
def preprocess_time_series(df):
    """Add cyclical month encodings and season features to ``df``.

    The frame is modified in place and the very same object is returned, so
    the caller's frame gains the new columns too.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``month`` column; the season lists below treat it as a
        calendar month number (1-12).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns added in this order: ``sin_month``,
        ``cos_month``, ``spring``, ``summer``, ``fall``, ``winter``, then
        ``<season>_sin`` / ``<season>_cos`` for each of the four seasons.
    """
    # Insertion order defines the output column order.
    season_months = {
        'spring': [3, 4, 5],
        'summer': [6, 7, 8],
        'fall': [9, 10, 11],
        'winter': [12, 1, 2],
    }

    # Map the month onto a 12-step circle so December and January are adjacent.
    month_angle = 2 * np.pi * df['month'] / 12
    df['sin_month'] = np.sin(month_angle)
    df['cos_month'] = np.cos(month_angle)

    # 0/1 season flags, computed vectorised instead of row by row.
    for season, months in season_months.items():
        df[season] = df['month'].isin(months).astype(int)

    # NOTE(review): the flags are only 0 or 1, so these columns take just two
    # values each (sin: 0 or 0.5, cos: 1 or ~0.866) and are rescaled copies of
    # the flag. Kept so the produced feature set is unchanged.
    for season in season_months:
        season_angle = 2 * np.pi * df[season] / 12
        df[f'{season}_sin'] = np.sin(season_angle)
        df[f'{season}_cos'] = np.cos(season_angle)
    return df



In [58]:
# Add the calendar features. The function returns the frame it was given, so
# train_data and train_df refer to the same (now extended) object.
train_data = preprocess_time_series(train_df)

In [59]:
# train_df shows the engineered columns as well, because the preprocessing
# step added them to the frame it was passed.
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,week,...,fall,winter,spring_sin,spring_cos,summer_sin,summer_cos,fall_sin,fall_cos,winter_sin,winter_cos
0,0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,...,0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025
1,1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,1,...,0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025
2,2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,1,...,0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025
3,3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,1,...,0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025
4,4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,1,...,0,1,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025


In [60]:
# Build the AutoGluon time-series frame in one step: drop the columns that are
# not wanted as model inputs, then let TimeSeriesDataFrame index the rows by
# (item_id, timestamp).
data = TimeSeriesDataFrame(
    train_data.drop(columns=['ID', 'item', 'supply(kg)', 'Unnamed: 0'])
)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,price(원/kg),year,month,day,week,isWeekday,isSaturday,isSunday,holiday,수출 중량,...,fall,winter,spring_sin,spring_cos,summer_sin,summer_cos,fall_sin,fall_cos,winter_sin,winter_cos
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TG_A_J,2019-01-01,0.0,2019,1,1,1,1,0,0,1,58368.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
TG_A_J,2019-01-02,0.0,2019,1,2,1,1,0,0,0,58368.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
TG_A_J,2019-01-03,1728.0,2019,1,3,1,1,0,0,0,58368.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
TG_A_J,2019-01-04,1408.0,2019,1,4,1,1,0,0,0,58368.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
TG_A_J,2019-01-05,1250.0,2019,1,5,1,0,1,0,1,58368.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-02-27,468.0,2023,2,27,9,1,0,0,0,0.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
RD_F_J,2023-02-28,531.0,2023,2,28,9,1,0,0,0,0.0,...,0,1,0.0,1.000000,0.0,1.0,0.0,1.0,0.5,0.866025
RD_F_J,2023-03-01,574.0,2023,3,1,9,1,0,0,1,0.0,...,0,0,0.5,0.866025,0.0,1.0,0.0,1.0,0.0,1.000000
RD_F_J,2023-03-02,523.0,2023,3,2,9,1,0,0,0,0.0,...,0,0,0.5,0.866025,0.0,1.0,0.0,1.0,0.0,1.000000


Reference snippet (kept as text, not run as a code cell):

```python
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)
```

# Predictor fit

In [61]:
# Forecast the price column 28 steps ahead for every series (the fit log
# reports daily frequency); models are scored with RMSE.
predictor = TimeSeriesPredictor(
    target="price(원/kg)",
    eval_metric="RMSE",
    prediction_length=28,
)


In [62]:
# Fix the seed. No hyperparameters or time limit are passed, so the fit log
# reports the 'default' hyperparameters and time_limit None.
# fit() returns the predictor, which is why its repr is shown as cell output.
predictor.fit(data,  random_seed=42,)


TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
INFO:lightning_fabric.utilities.seed:Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231115_145207/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['year', 'month', 'day', 'week', 'isWeekday', 'isSaturday', 'isSunday

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7e468ead97e0>

In [63]:
# Refit the trained models on all data (train + validation combined, per the
# log). The returned mapping pairs each model with its "_FULL" counterpart.
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	5.70    s     = Training runtime
Fitting model: DeepAR_FULL | Skipping fit via cloning parent ...
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'DeepAR_FULL', 'WeightedEnsemble_FULL']
Total runtime: 5.77 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'W

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR': 'DeepAR_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [64]:
# Fix the seed. No model is named, so per the log the predictor falls back to
# its best model (WeightedEnsemble_FULL after the refit).
pred = predictor.predict(data, random_seed=42,)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [65]:
# Forecast table indexed by (item_id, timestamp): a mean column plus the
# 0.1 ... 0.9 quantile columns.
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3348.149784,1390.396974,2076.701129,2539.256282,2953.394319,3312.425015,3702.036055,4115.160209,4600.379043,5309.063278
TG_A_J,2023-03-05,600.211059,-1531.421366,-800.973685,-274.728461,171.819004,587.284538,1009.279466,1457.722531,1981.984655,2707.407289
TG_A_J,2023-03-06,3158.511358,27.542819,1150.418513,1906.071977,2569.287445,3167.429172,3797.872139,4489.051826,5238.966959,6285.080141
TG_A_J,2023-03-07,3437.620075,-55.226727,1175.229210,2063.297277,2795.398808,3475.201692,4149.793124,4885.435397,5740.289014,7047.823439
TG_A_J,2023-03-08,3379.721990,-461.245426,921.714069,1865.999624,2668.823703,3396.173664,4114.562266,4912.528482,5835.471833,7333.412629
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,527.559869,-164.957287,110.786269,271.638248,402.114047,520.025062,636.617106,771.834754,938.762250,1199.259385
RD_F_J,2023-03-28,522.948168,-238.019607,105.819032,279.898679,417.359598,541.796236,665.691807,802.773264,968.928185,1196.282466
RD_F_J,2023-03-29,522.287128,-248.895112,92.589562,272.027764,412.201432,539.672862,667.445479,810.219065,974.840095,1204.601490
RD_F_J,2023-03-30,505.508347,-271.223064,61.200060,251.778946,395.017314,525.742280,648.571698,798.761624,964.520470,1195.967194


In [66]:
# Attach the mean forecast to the submission by key instead of by row
# position, so a different row order in `pred` cannot silently misalign rows.
# Submission IDs have the form "<item_id>_<YYYYMMDD>" (e.g. TG_A_J_20230304).
pred_flat = pred.reset_index()
pred_ids = pred_flat['item_id'].astype(str) + '_' + pred_flat['timestamp'].dt.strftime('%Y%m%d')
mean_by_id = pd.Series(pred_flat['mean'].to_numpy(), index=pred_ids.to_numpy())
submission['answer'] = submission['ID'].map(mean_by_id)

# Fail loudly if any submission row did not receive a forecast.
assert submission['answer'].notna().all(), "some submission IDs have no matching forecast"

# Replace negative forecasts with 0.
submission.loc[submission['answer'] < 0.0, 'answer'] = 0.0

In [67]:
# Parse the date from the trailing YYYYMMDD of each ID and flag the Sundays
# (dayofweek: 0=Monday, 6=Sunday). A standalone mask is used, so no temporary
# columns are added to the submission.
id_dates = pd.to_datetime(submission['ID'].str[-8:], format='%Y%m%d')
sunday_rows = id_dates.dt.dayofweek == 6

# Set 'answer' to 0 on Sundays
submission.loc[sunday_rows, 'answer'] = 0


In [68]:
# Inspect the submission after post-processing.
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3348.149784
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3158.511358
3,TG_A_J_20230307,3437.620075
4,TG_A_J_20230308,3379.721990
...,...,...
1087,RD_F_J_20230327,527.559869
1088,RD_F_J_20230328,522.948168
1089,RD_F_J_20230329,522.287128
1090,RD_F_J_20230330,505.508347


In [69]:
# Save the result to a CSV file (index=False leaves out the row index).
submission.to_csv('submission_gluon5.csv', index=False)

In [70]:
from google.colab import files

# Download the submission file. The same relative name that to_csv wrote is
# used, so the path resolves against the current working directory rather than
# a hard-coded /content location.
files.download('submission_gluon5.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>