## 제주경진대회 - 이혜승 Autogluon 모델링

In [1]:
!pip install mxnet
!pip install autogluon

Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.1
    Uninstalling graphviz-0.20.1:
      Successfully uninstalled graphviz-0.20.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.

## Import

In [113]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

## Fixed Random-Seed

In [114]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [6]:
from google.colab import files

uploaded = files.upload()

Saving train_final3.csv to train_final3.csv
Saving submission.csv to submission.csv


In [115]:
# Assuming you uploaded a CSV file
filename = next(iter(uploaded))

In [166]:
train_df = pd.read_csv('train_final3.csv')
submission = pd.read_csv('submission.csv')

In [167]:
submission

Unnamed: 0,ID,price(원/kg)
0,TG_A_J_20230304,0.00000
1,TG_A_J_20230305,-147.88028
2,TG_A_J_20230306,2697.44600
3,TG_A_J_20230307,2929.46530
4,TG_A_J_20230308,2970.20210
...,...,...
1087,RD_F_J_20230327,87.23653
1088,RD_F_J_20230328,297.42844
1089,RD_F_J_20230329,297.08304
1090,RD_F_J_20230330,296.74770


# 새로운 변수 (sub autogluon5)

In [168]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,week,...,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,무역 규모,평균 수출 가격,평균 수입 가격,무역수지 비율
0,0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,...,False,False,False,False,True,False,58368.0,0.002947,0.0,1.0
1,1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,1,...,False,False,False,False,True,False,58368.0,0.002947,0.0,1.0
2,2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,1,...,False,False,False,False,True,False,58368.0,0.002947,0.0,1.0
3,3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,1,...,False,False,False,False,True,False,58368.0,0.002947,0.0,1.0
4,4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,1,...,False,False,False,False,True,False,58368.0,0.002947,0.0,1.0


In [169]:
train_df.ID.str[0:6]

0        TG_A_J
1        TG_A_J
2        TG_A_J
3        TG_A_J
4        TG_A_J
          ...  
59392    RD_F_J
59393    RD_F_J
59394    RD_F_J
59395    RD_F_J
59396    RD_F_J
Name: ID, Length: 59397, dtype: object

In [170]:
train_df['item_id'] = train_df.ID.str[0:6]

In [171]:
train_df['week by month'] = train_df['month'].astype(str) + "_" + train_df['week'].astype(str)

In [172]:
# unique_values_quality 리스트 생성
unique_values_quality = train_df['item_id'].unique()

# 결과를 저장할 리스트 생성
filtered_dataframes = []
test_dataframes = []
# for 루프를 통해 데이터프레임 필터링
for i in unique_values_quality:
    filtered_dataframes.append(train_df[train_df['item_id'] == i])

In [173]:
for i in range(len(filtered_dataframes)):
    monthly_means = filtered_dataframes[i].groupby(['month']).mean()[["수출 금액","수입 금액",]].add_suffix('month_mean')
    filtered_dataframes[i] = filtered_dataframes[i].merge(monthly_means, on='month', suffixes=('', '_mean'))
    weekly_means = filtered_dataframes[i].groupby(['week by month']).mean()[["supply(kg)","price(원/kg)"]].add_suffix('week_mean')
    filtered_dataframes[i] = filtered_dataframes[i].merge(weekly_means, on='week by month', suffixes=('', '_mean'))
    filtered_dataframes[i] = filtered_dataframes[i].drop(['수출 중량','수출 금액','수입 중량','수입 금액','무역수지'], axis = 1)

In [174]:
train_df = pd.concat(filtered_dataframes, ignore_index=True)

In [175]:
train_df.columns

Index(['Unnamed: 0', 'ID', 'timestamp', 'item', 'supply(kg)', 'price(원/kg)',
       'year', 'month', 'day', 'week', 'isWeekday', 'isSaturday', 'isSunday',
       'holiday', '기간', 'corporation_A', 'corporation_B', 'corporation_C',
       'corporation_D', 'corporation_E', 'corporation_F', 'location_J',
       'location_S', '무역 규모', '평균 수출 가격', '평균 수입 가격', '무역수지 비율', 'item_id',
       'week by month', '수출 금액month_mean', '수입 금액month_mean',
       'supply(kg)week_mean', 'price(원/kg)week_mean'],
      dtype='object')

In [176]:
train_df = train_df.drop(['무역 규모', '평균 수출 가격', '평균 수입 가격', '무역수지 비율', '기간', 'week by month'], axis = 1)

In [177]:
# 각 열에서 결측치의 개수 확인
missing_values_count =train_df.isnull().sum()

# 결과 출력
print(missing_values_count)

Unnamed: 0              0
ID                      0
timestamp               0
item                    0
supply(kg)              0
price(원/kg)             0
year                    0
month                   0
day                     0
week                    0
isWeekday               0
isSaturday              0
isSunday                0
holiday                 0
corporation_A           0
corporation_B           0
corporation_C           0
corporation_D           0
corporation_E           0
corporation_F           0
location_J              0
location_S              0
item_id                 0
수출 금액month_mean         0
수입 금액month_mean         0
supply(kg)week_mean     0
price(원/kg)week_mean    0
dtype: int64


In [178]:
# 결측치를 0으로 대체
train_df.fillna(0, inplace=True)

In [179]:
def preprocess_time_series(df):
    df['sin_month'] = np.sin(2 * np.pi * df['month'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month'] / 12)
    df['spring'] = df['month'].apply(lambda x: 1 if x in [3, 4, 5] else 0)
    df['summer'] = df['month'].apply(lambda x: 1 if x in [6, 7, 8] else 0)
    df['fall'] = df['month'].apply(lambda x: 1 if x in [9, 10, 11] else 0)
    df['winter'] = df['month'].apply(lambda x: 1 if x in [12, 1, 2] else 0)
    return df



In [180]:
train_data = preprocess_time_series(train_df)

In [181]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,week,...,수출 금액month_mean,수입 금액month_mean,supply(kg)week_mean,price(원/kg)week_mean,sin_month,cos_month,spring,summer,fall,winter
0,0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,...,313.019355,0.0,35149.3,1106.90625,0.5,0.866025,0,0,0,1
1,1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,1,...,313.019355,0.0,35149.3,1106.90625,0.5,0.866025,0,0,0,1
2,2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,1,...,313.019355,0.0,35149.3,1106.90625,0.5,0.866025,0,0,0,1
3,3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,1,...,313.019355,0.0,35149.3,1106.90625,0.5,0.866025,0,0,0,1
4,4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,1,...,313.019355,0.0,35149.3,1106.90625,0.5,0.866025,0,0,0,1


In [182]:
data = TimeSeriesDataFrame(train_data.drop(columns=['ID']))
data = TimeSeriesDataFrame(data.drop(columns=['item']))
data = TimeSeriesDataFrame(data.drop(columns=['supply(kg)']))
data = TimeSeriesDataFrame(data.drop(columns=['Unnamed: 0']))

data

Unnamed: 0_level_0,Unnamed: 1_level_0,price(원/kg),year,month,day,week,isWeekday,isSaturday,isSunday,holiday,corporation_A,...,수출 금액month_mean,수입 금액month_mean,supply(kg)week_mean,price(원/kg)week_mean,sin_month,cos_month,spring,summer,fall,winter
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TG_A_J,2019-01-01,0.0,2019,1,1,1,1,0,0,1,True,...,313.019355,0.0,35149.3,1106.90625,5.000000e-01,0.866025,0,0,0,1
TG_A_J,2019-01-02,0.0,2019,1,2,1,1,0,0,0,True,...,313.019355,0.0,35149.3,1106.90625,5.000000e-01,0.866025,0,0,0,1
TG_A_J,2019-01-03,1728.0,2019,1,3,1,1,0,0,0,True,...,313.019355,0.0,35149.3,1106.90625,5.000000e-01,0.866025,0,0,0,1
TG_A_J,2019-01-04,1408.0,2019,1,4,1,1,0,0,0,True,...,313.019355,0.0,35149.3,1106.90625,5.000000e-01,0.866025,0,0,0,1
TG_A_J,2019-01-05,1250.0,2019,1,5,1,0,1,0,0,True,...,313.019355,0.0,35149.3,1106.90625,5.000000e-01,0.866025,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2019-12-31,1408.0,2020,12,31,1,1,0,0,0,False,...,0.000000,0.0,111170.0,1332.50000,-2.449294e-16,1.000000,0,0,0,1
RD_F_J,2020-12-28,606.0,2020,12,28,53,1,0,0,0,False,...,0.000000,0.0,301490.0,642.00000,-2.449294e-16,1.000000,0,0,0,1
RD_F_J,2020-12-29,648.0,2020,12,29,53,1,0,0,0,False,...,0.000000,0.0,301490.0,642.00000,-2.449294e-16,1.000000,0,0,0,1
RD_F_J,2020-12-30,686.0,2020,12,30,53,1,0,0,0,False,...,0.000000,0.0,301490.0,642.00000,-2.449294e-16,1.000000,0,0,0,1


data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)

# Predictor fit

In [208]:
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)


In [209]:
from autogluon.common import space

In [None]:
# seed 고정

# predictor.fit(
#     ...
#     hyperparameters={
#         "DeepAR": {
#             "hidden_size": space.Int(20, 100),
#             "dropout_rate": space.Categorical(0.1, 0.3),
#         },
#     },
#     hyperparameter_tune_kwargs="auto",
# )


predictor.fit(data,  random_seed=42,
              hyperparameters={
            'Theta':{},
            'AutoETS':{},
            'RecursiveTabular':{},
            "DeepAR": {
            "hidden_size": space.Int(20, 100),
            "dropout_rate": space.Categorical(0.1, 0.3),
            "learning_rate": space.Real(0.001, 0.1),
                'epochs': space.Int(10, 100),
                'context_length': space.Int(7, 90),
                'mini_batch_size': space.Int(32, 128)},
            },
              hyperparameter_tune_kwargs={
        "scheduler": "local",
        "searcher": "random",
        "num_trials": 50,
    },
        )

TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': {'num_trials': 50,
                                'scheduler': 'local',
                                'searcher': 'random'},
 'hyperparameters': {'AutoETS': {},
                     'DeepAR': {'context_length': Int: lower=7, upper=90,
                                'dropout_rate': Categorical[0.1, 0.3],
                                'epochs': Int: lower=10, upper=100,
                                'hidden_size': Int: lower=20, upper=100,
                                'learning_rate': Real: lower=0.001, upper=0.1,
                                'mini_batch_size': Int: lower=32, upper=128},
                     'RecursiveTabular': {},
                     'Theta': {}},
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provi

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
print(predictor.leaderboard(silent = True))

In [190]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	6.73    s     = Training runtime
Fitting model: DeepAR/T1_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T2_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T3_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T4_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T5_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T6_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T7_FULL | Skipping fit via cloning parent ...
Fitting model: DeepAR/T8_FULL | Skipping fit v

{'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR/T1': 'DeepAR/T1_FULL',
 'DeepAR/T2': 'DeepAR/T2_FULL',
 'DeepAR/T3': 'DeepAR/T3_FULL',
 'DeepAR/T4': 'DeepAR/T4_FULL',
 'DeepAR/T5': 'DeepAR/T5_FULL',
 'DeepAR/T6': 'DeepAR/T6_FULL',
 'DeepAR/T7': 'DeepAR/T7_FULL',
 'DeepAR/T8': 'DeepAR/T8_FULL',
 'DeepAR/T9': 'DeepAR/T9_FULL',
 'DeepAR/T10': 'DeepAR/T10_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [199]:
# seed 고정
pred = predictor.predict(data, random_seed=42,)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [200]:
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3413.847302,2446.023835,2813.686405,3041.635967,3221.546893,3373.076794,3543.809943,3748.146589,4004.190023,4347.575738
TG_A_J,2023-03-05,38.514133,-534.528057,-337.899824,-195.842987,-74.994388,38.046381,151.078417,272.074671,414.039928,610.758229
TG_A_J,2023-03-06,3314.881943,2051.675850,2518.932441,2814.579448,3072.981636,3275.962632,3504.393456,3739.003896,4102.368006,4503.141259
TG_A_J,2023-03-07,3370.879185,1921.854259,2453.972002,2838.041841,3132.064316,3401.586094,3680.460960,3973.096411,4342.950979,4849.694699
TG_A_J,2023-03-08,3252.891165,1667.621404,2275.047459,2680.330232,2994.330522,3290.291067,3588.304530,3899.369456,4293.091377,4951.609350
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,578.231440,308.841331,399.999976,457.954818,508.961677,562.075968,624.027356,689.089604,771.037381,885.653714
RD_F_J,2023-03-28,563.672848,291.785440,384.955324,457.036048,510.394733,562.065853,615.049031,687.948136,760.000586,880.949782
RD_F_J,2023-03-29,563.006673,247.450181,385.549998,453.247417,508.789535,566.990041,615.977947,683.592407,766.410355,883.107356
RD_F_J,2023-03-30,539.403190,249.955866,375.983986,444.818168,499.228859,550.746391,600.723655,665.322960,758.929801,871.632748


In [201]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3413.847302
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3314.881943
3,TG_A_J_20230307,3370.879185
4,TG_A_J_20230308,3252.891165
...,...,...
1087,RD_F_J_20230327,578.231440
1088,RD_F_J_20230328,563.672848
1089,RD_F_J_20230329,563.006673
1090,RD_F_J_20230330,539.403190


In [202]:
submission['answer'] = pred.reset_index()['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0

In [204]:
submission['date'] = pd.to_datetime(submission['ID'].str[-8:], format='%Y%m%d')

# Determine the day of the week (0=Monday, 6=Sunday)
submission['day_of_week'] = submission['date'].dt.dayofweek

# Set 'answer' to 0 where the day of the week is Sunday
submission.loc[submission['day_of_week'] == 6, 'answer'] = 0

# Drop the temporary columns if they are not needed
submission.drop(columns=['date', 'day_of_week'], inplace=True)


In [205]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3413.847302
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3314.881943
3,TG_A_J_20230307,3370.879185
4,TG_A_J_20230308,3252.891165
...,...,...
1087,RD_F_J_20230327,578.231440
1088,RD_F_J_20230328,563.672848
1089,RD_F_J_20230329,563.006673
1090,RD_F_J_20230330,539.403190


In [206]:
# 결과를 CSV 파일로 저장
submission.to_csv('autogluon__hyper_ensemble.csv', index=False)

In [207]:
from google.colab import files

files.download('/content/autogluon__hyper_ensemble.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>