## 제주경진대회 - 이혜승 Autogluon 모델링

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
from google.colab import files

uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv
Saving test1110.csv to test1110.csv
Saving train1110.csv to train1110.csv


In [19]:
from google.colab import files

uploaded = files.upload()

Saving test_final3.csv to test_final3.csv
Saving train_final3.csv to train_final3.csv


In [20]:
# Assuming you uploaded a CSV file
filename = next(iter(uploaded))

In [21]:
train_df = pd.read_csv('train_final3.csv')
test_df = pd.read_csv('test_final3.csv')
submission = pd.read_csv('sample_submission.csv')

In [42]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
train_df['day_of_week'] = train_df['timestamp'].dt.dayofweek


In [43]:
# 공휴일
holi_weekday = ['2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01', '2019-05-05', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12', '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25',
                '2020-01-01' ,'2020-01-24' ,'2020-01-25', '2020-01-26', '2020-03-01', '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09', '2020-12-25',
                '2021-01-01' ,'2021-02-11' ,'2021-02-12', '2021-02-13', '2021-03-01', '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-09-20', '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
                '2022-01-01' ,'2022-01-31' ,'2022-02-01', '2022-02-02', '2022-03-01', '2022-05-05', '2022-05-08', '2022-06-06', '2022-08-15', '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2020-10-10', '2022-12-25',
                '2023-01-01' ,'2023-01-21' ,'2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01']

In [44]:
train_df['holiday'] = np.where((train_df.day_of_week >= 5) | (train_df.timestamp.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)

In [53]:
test_df .info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1092 non-null   int64  
 1   ID             1092 non-null   object 
 2   timestamp      1092 non-null   object 
 3   item           1092 non-null   object 
 4   supply(kg)     1092 non-null   float64
 5   year           1092 non-null   int64  
 6   month          1092 non-null   int64  
 7   day            1092 non-null   int64  
 8   week           1092 non-null   int64  
 9   isWeekday      1092 non-null   int64  
 10  isSaturday     1092 non-null   int64  
 11  isSunday       1092 non-null   int64  
 12  holiday        1092 non-null   int64  
 13  수출 중량          1092 non-null   float64
 14  수출 금액          1092 non-null   float64
 15  수입 중량          1092 non-null   float64
 16  수입 금액          1092 non-null   float64
 17  무역수지           1092 non-null   float64
 18  corporat

In [38]:
train_df.drop('무역 규모', axis=1, inplace=True)
train_df.drop('평균 수출 가격', axis=1, inplace=True)
train_df.drop('평균 수입 가격', axis=1, inplace=True)
train_df.drop('무역수지 비율', axis=1, inplace=True)
train_df.drop('기간', axis=1, inplace=True)
train_df.drop('day_of_week', axis=1, inplace=True)

In [52]:
test_df.drop('무역 규모', axis=1, inplace=True)
test_df.drop('평균 수출 가격', axis=1, inplace=True)
test_df.drop('평균 수입 가격', axis=1, inplace=True)
test_df.drop('무역수지 비율', axis=1, inplace=True)

# 1. 과일 + 회사 + 지역 + 월 별로나누기

In [49]:
# unique_values_quality 리스트 생성
unique_values_quality = train_df['item_id'].unique()

# 결과를 저장할 리스트 생성
filtered_dataframes = []
test_dataframes = []
# for 루프를 통해 데이터프레임 필터링
for i in unique_values_quality:
    filtered_dataframes.append(train_df[train_df['item_id'] == i])
    test_dataframes.append(test_df[test_df["item_id"]==i])

In [None]:
for i in range(len(filtered_dataframes)):
    monthly_means = filtered_dataframes[i].groupby(['month']).mean()[["supply(kg)","price(원/kg)","수출 중량","수출 금액","수입 중량","수입 금액","무역수지"]].add_suffix('_mean')
    filtered_dataframes[i] = filtered_dataframes[i].merge(monthly_means, on='month', suffixes=('', '_mean'))
    test_dataframes[i] = test_dataframes[i].merge(monthly_means, on='month', suffixes=('', '_mean'))

In [50]:
for i in range(len(filtered_dataframes)):
    company =filtered_dataframes[i]['item_id'].iloc[1]
    df = filtered_dataframes[i].drop(['ID','timestamp','item',"기간",'item_id','corporation_A','corporation_B','corporation_C',
                                      "corporation_D","corporation_E","corporation_F","location_J","location_S"],axis = 1)

    sns.set(rc={"figure.figsize": (20, 20)})
    sns.heatmap(df.corr(),annot=True,cmap='Reds').set(title=company)
    plt.rc('font', family='NanumGothic')
    plt.show()

In [40]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1092 non-null   int64  
 1   ID             1092 non-null   object 
 2   timestamp      1092 non-null   object 
 3   item           1092 non-null   object 
 4   supply(kg)     1092 non-null   float64
 5   year           1092 non-null   int64  
 6   month          1092 non-null   int64  
 7   day            1092 non-null   int64  
 8   week           1092 non-null   int64  
 9   isWeekday      1092 non-null   int64  
 10  isSaturday     1092 non-null   int64  
 11  isSunday       1092 non-null   int64  
 12  holiday        1092 non-null   int64  
 13  수출 중량          1092 non-null   float64
 14  수출 금액          1092 non-null   float64
 15  수입 중량          1092 non-null   float64
 16  수입 금액          1092 non-null   float64
 17  무역수지           1092 non-null   float64
 18  corporat

TG : 감귤 (Citrus fruits)

BC : 브로콜리 (Broccoli)

RD : 무 (Radish)

CR : 당근 (Carrots)

CB : 양배추 (Cabbage)

In [8]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [9]:
!pip install mxnet
!pip install autogluon

Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.1
    Uninstalling graphviz-0.20.1:
      Successfully uninstalled graphviz-0.20.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Collecting autogluon
  Downloading autogluon-0.8.2-py3-none-any.whl (9.7 kB)
Collecting autogluon.core[all]==0.8.2 (from autogluon)
  Downloading autogluon.core-0.8.2-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.0/224.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.features==0.8.2 (from autogluon)
  Downloading autogluon.features-0.8.2-py3-none-any.

In [10]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [56]:
train_df.ID.str[0:6]

0        TG_A_J
1        TG_A_J
2        TG_A_J
3        TG_A_J
4        TG_A_J
          ...  
59392    RD_F_J
59393    RD_F_J
59394    RD_F_J
59395    RD_F_J
59396    RD_F_J
Name: ID, Length: 59397, dtype: object

In [57]:
train_df['item_id'] = train_df.ID.str[0:6]
test_df['item_id'] = test_df.ID.str[0:6]

In [58]:
# 각 열에서 결측치의 개수 확인
missing_values_count =train_df.isnull().sum()

# 결과 출력
print(missing_values_count)

Unnamed: 0       0
ID               0
timestamp        0
item             0
supply(kg)       0
price(원/kg)      0
year             0
month            0
day              0
week             0
isWeekday        0
isSaturday       0
isSunday         0
holiday          0
수출 중량            0
수출 금액            0
수입 중량            0
수입 금액            0
무역수지             0
corporation_A    0
corporation_B    0
corporation_C    0
corporation_D    0
corporation_E    0
corporation_F    0
location_J       0
location_S       0
item_id          0
dtype: int64


In [59]:
# 결측치를 0으로 대체
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

In [60]:
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
data2 = TimeSeriesDataFrame(data.drop(columns=['item']))
data2 = TimeSeriesDataFrame(data2.drop(columns=['supply(kg)']))

data2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,price(원/kg),year,month,day,week,isWeekday,isSaturday,isSunday,holiday,...,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TG_A_J,2019-01-01,0,0.0,2019,1,1,1,1,0,0,1,...,0.0,172.0,True,False,False,False,False,False,True,False
TG_A_J,2019-01-02,1,0.0,2019,1,2,1,1,0,0,0,...,0.0,172.0,True,False,False,False,False,False,True,False
TG_A_J,2019-01-03,2,1728.0,2019,1,3,1,1,0,0,0,...,0.0,172.0,True,False,False,False,False,False,True,False
TG_A_J,2019-01-04,3,1408.0,2019,1,4,1,1,0,0,0,...,0.0,172.0,True,False,False,False,False,False,True,False
TG_A_J,2019-01-05,4,1250.0,2019,1,5,1,0,1,0,1,...,0.0,172.0,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-02-27,59392,468.0,2023,2,27,9,1,0,0,0,...,0.0,0.0,False,False,False,False,False,True,True,False
RD_F_J,2023-02-28,59393,531.0,2023,2,28,9,1,0,0,0,...,0.0,0.0,False,False,False,False,False,True,True,False
RD_F_J,2023-03-01,59394,574.0,2023,3,1,9,1,0,0,1,...,0.0,0.0,False,False,False,False,False,True,True,False
RD_F_J,2023-03-02,59395,523.0,2023,3,2,9,1,0,0,0,...,0.0,0.0,False,False,False,False,False,True,True,False


data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)

In [61]:
test_df['item_id'] = test_df.ID.str[0:6]

In [62]:
test = TimeSeriesDataFrame(test_df.drop(columns=['ID']))
test = TimeSeriesDataFrame(test.drop(columns=['item']))

In [None]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,day,public_holiday,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,weekday_dummy,saturday_dummy,sunday_dummy
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TG_A_J,2023-03-04,2023,3,4,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,0,1,0
TG_A_J,2023-03-05,2023,3,5,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,0,0,1
TG_A_J,2023-03-06,2023,3,6,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
TG_A_J,2023-03-07,2023,3,7,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
TG_A_J,2023-03-08,2023,3,8,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,2023,3,27,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-28,2023,3,28,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-29,2023,3,29,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-30,2023,3,30,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0


In [63]:
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)


In [64]:
# seed 고정
predictor.fit(data2,  random_seed=42,)


TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
INFO:lightning_fabric.utilities.seed:Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231115_114346/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['Unnamed: 0', 'year', 'month', 'day', 'week', 'isWeekday', 'isSaturd

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7ca0f51156c0>

In [65]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	6.58    s     = Training runtime
Fitting model: DeepAR_FULL | Skipping fit via cloning parent ...
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'DeepAR_FULL', 'WeightedEnsemble_FULL']
Total runtime: 6.67 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'W

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR': 'DeepAR_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [66]:
# seed 고정
pred = predictor.predict(data2, random_seed=42,)

INFO:lightning_fabric.utilities.seed:Global seed set to 42
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


In [67]:
pred

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TG_A_J,2023-03-04,3309.629806,1252.716753,1973.674595,2456.460071,2892.270271,3270.190024,3678.717406,4112.025811,4615.661397,5362.042258
TG_A_J,2023-03-05,629.586961,-1608.642483,-843.663518,-292.320907,173.360978,605.231258,1048.891185,1518.042657,2066.267777,2822.561403
TG_A_J,2023-03-06,3134.373623,-36.272894,1092.270301,1868.681181,2526.350343,3131.846459,3760.980251,4448.408040,5210.901550,6290.512187
TG_A_J,2023-03-07,3411.484519,-121.771310,1095.293878,2010.297931,2740.037598,3420.933974,4099.380557,4860.357170,5689.636201,7041.347607
TG_A_J,2023-03-08,3346.819216,-483.823765,872.537515,1822.490785,2613.878802,3344.280844,4072.966379,4868.422707,5827.314952,7265.042765
...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,522.733171,-91.763941,122.820814,271.402918,399.601347,515.436344,630.312030,756.934543,921.019560,1150.462472
RD_F_J,2023-03-28,520.690109,-112.602066,127.031220,275.122132,406.650610,526.779755,648.925944,779.255955,935.432994,1161.412181
RD_F_J,2023-03-29,525.264705,-140.453774,116.479614,274.057585,406.202259,526.512181,650.626146,787.704882,948.613686,1172.169106
RD_F_J,2023-03-30,505.481055,-159.660676,100.874003,255.833001,389.108606,510.796484,634.417103,773.486996,930.871450,1162.689850


In [69]:
submission['answer'] = pred.reset_index()['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0

In [70]:
submission['date'] = pd.to_datetime(submission['ID'].str[-8:], format='%Y%m%d')

# Determine the day of the week (0=Monday, 6=Sunday)
submission['day_of_week'] = submission['date'].dt.dayofweek

# Set 'answer' to 0 where the day of the week is Sunday
submission.loc[submission['day_of_week'] == 6, 'answer'] = 0

# Drop the temporary columns if they are not needed
submission.drop(columns=['date', 'day_of_week'], inplace=True)


In [71]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3309.629806
1,TG_A_J_20230305,0.000000
2,TG_A_J_20230306,3134.373623
3,TG_A_J_20230307,3411.484519
4,TG_A_J_20230308,3346.819216
...,...,...
1087,RD_F_J_20230327,522.733171
1088,RD_F_J_20230328,520.690109
1089,RD_F_J_20230329,525.264705
1090,RD_F_J_20230330,505.481055


In [72]:
# 결과를 CSV 파일로 저장
submission.to_csv('submission_gluon4.csv', index=False)

In [74]:
from google.colab import files

files.download('/content/submission_gluon4.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>