## 제주경진대회 - 이혜승 Autogluon 모델링

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [3]:
path = "/gdrive/My Drive/filename"

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from google.colab import files

uploaded = files.upload()

Saving train1110.csv to train1110 (2).csv
Saving test1110.csv to test1110 (2).csv
Saving sample_submission.csv to sample_submission (2).csv


In [6]:
# Assuming you uploaded a CSV file
filename = next(iter(uploaded))

In [61]:
train_df = pd.read_csv('train1110.csv')
test_df = pd.read_csv('test1110.csv')
submission = pd.read_csv('sample_submission.csv')

In [62]:
# For the training data
train_df['weekday_dummy'] = (train_df['weekday'].between(0, 4)).astype(int)
train_df['saturday_dummy'] = (train_df['weekday'] == 5).astype(int)
train_df['sunday_dummy'] = (train_df['weekday'] == 6).astype(int)

# For the testing data
test_df['weekday_dummy'] = (test_df['weekday'].between(0, 4)).astype(int)
test_df['saturday_dummy'] = (test_df['weekday'] == 5).astype(int)
test_df['sunday_dummy'] = (test_df['weekday'] == 6).astype(int)

test_df.drop(columns=['weekday'], inplace=True)
train_df.drop(columns=['weekday'], inplace=True)

In [63]:
train_df

Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,public_holiday,기간,...,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,weekday_dummy,saturday_dummy,sunday_dummy
0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,2019-01,...,0,0,0,0,0,1,0,1,0,0
1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,0,2019-01,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,452440.0,468.0,2023,2,27,0,,...,0,0,0,0,1,1,0,1,0,0
59393,RD_F_J_20230228,2023-02-28,RD,421980.0,531.0,2023,2,28,0,,...,0,0,0,0,1,1,0,1,0,0
59394,RD_F_J_20230301,2023-03-01,RD,382980.0,574.0,2023,3,1,1,,...,0,0,0,0,1,1,0,1,0,0
59395,RD_F_J_20230302,2023-03-02,RD,477220.0,523.0,2023,3,2,0,,...,0,0,0,0,1,1,0,1,0,0


TG : 감귤 (Citrus fruits)

BC : 브로콜리 (Broccoli)

RD : 무 (Radish)

CR : 당근 (Carrots)

CB : 양배추 (Cabbage)

In [64]:
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [None]:
# Initialize the TabularPredictor
from autogluon.tabular import TabularPredictor

# Define the target column
target_column = 'price(원/kg)'

predictor = TabularPredictor(label=target_column)

# Specify hyperparameters for time series models
hyperparameters = {
    'GBM': [{'num_boost_round': 100, 'learning_rate': lr} for lr in [0.01, 0.05, 0.1]],
    'CAT': [{'iterations': 100, 'learning_rate': lr} for lr in [0.01, 0.05, 0.1]],
    'XGB': [{'n_estimators': 100, 'learning_rate': lr} for lr in [0.01, 0.05, 0.1]]
}

predictor.fit(
    train_data=train_df,
    # presets='best_quality',
    # hyperparameters=hyperparameters,
    # num_stack_levels=1,  # Enables simple stacking (weighted ensemble)
    # time_limit=None,  # 1 hour, increase or decrease as per your computational resource
    # search_strategy='auto'
)

predictions = predictor.predict(test_df)

# Add the predictions to your test data or create a new DataFrame for submission
test_df['answer'] = predictions

# Save the predictions to a new CSV file for submission
test_df.to_csv('your_submission_file.csv', index=False)

No path specified. Models will be saved in: "AutogluonModels/ag-20231113_183940/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231113_183940/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 30 11:19:59 UTC 2023
Disk Space Avail:   43.92 GB / 83.96 GB (52.3%)
Train Data Rows:    59397
Train Data Columns: 26
Label Column: price(원/kg)
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (20909.0, 0.0, 1131.68067, 2029.94145)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerato

[1000]	valid_set's rmse: 471.877
[2000]	valid_set's rmse: 450.634
[3000]	valid_set's rmse: 445.536
[4000]	valid_set's rmse: 441.606
[5000]	valid_set's rmse: 439.878
[6000]	valid_set's rmse: 439.411
[7000]	valid_set's rmse: 440.082
[8000]	valid_set's rmse: 439.439
[9000]	valid_set's rmse: 440.367
[10000]	valid_set's rmse: 440.23


	-438.9341	 = Validation score   (-root_mean_squared_error)
	56.57s	 = Training   runtime
	2.98s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 380.912
[2000]	valid_set's rmse: 375.505
[3000]	valid_set's rmse: 374.166


	-373.8616	 = Validation score   (-root_mean_squared_error)
	16.69s	 = Training   runtime
	0.68s	 = Validation runtime
Fitting model: RandomForestMSE ...


In [65]:
import pandas as pd
import numpy as np
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [66]:
train_df.ID.str[0:6]

0        TG_A_J
1        TG_A_J
2        TG_A_J
3        TG_A_J
4        TG_A_J
          ...  
59392    RD_F_J
59393    RD_F_J
59394    RD_F_J
59395    RD_F_J
59396    RD_F_J
Name: ID, Length: 59397, dtype: object

In [67]:
train_df['item_id'] = train_df.ID.str[0:6]
test_df['item_id'] = test_df.ID.str[0:6]

In [68]:
# 각 열에서 결측치의 개수 확인
missing_values_count =train_df.isnull().sum()

# 결과 출력
print(missing_values_count)

ID                    0
timestamp             0
item                  0
supply(kg)            0
price(원/kg)           0
year                  0
month                 0
day                   0
public_holiday        0
기간                12277
수출 중량             12277
수출 금액             12277
수입 중량             12277
수입 금액             12277
무역수지              12277
corporation_A         0
corporation_B         0
corporation_C         0
corporation_D         0
corporation_E         0
corporation_F         0
location_J            0
location_S            0
weekday_dummy         0
saturday_dummy        0
sunday_dummy          0
item_id               0
dtype: int64


In [69]:
# 결측치를 0으로 대체
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

In [70]:
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
data2 = TimeSeriesDataFrame(data.drop(columns=['item']))
data2 = TimeSeriesDataFrame(data2.drop(columns=['기간']))
data2 = TimeSeriesDataFrame(data2.drop(columns=['supply(kg)']))

data2

Unnamed: 0_level_0,Unnamed: 1_level_0,price(원/kg),year,month,day,public_holiday,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,...,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,weekday_dummy,saturday_dummy,sunday_dummy
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
TG_A_J,2019-01-01,0.0,2019,1,1,1,58368.0,172.0,0.0,0.0,172.0,...,0,0,0,0,0,1,0,1,0,0
TG_A_J,2019-01-02,0.0,2019,1,2,0,58368.0,172.0,0.0,0.0,172.0,...,0,0,0,0,0,1,0,1,0,0
TG_A_J,2019-01-03,1728.0,2019,1,3,0,58368.0,172.0,0.0,0.0,172.0,...,0,0,0,0,0,1,0,1,0,0
TG_A_J,2019-01-04,1408.0,2019,1,4,0,58368.0,172.0,0.0,0.0,172.0,...,0,0,0,0,0,1,0,1,0,0
TG_A_J,2019-01-05,1250.0,2019,1,5,0,58368.0,172.0,0.0,0.0,172.0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-02-27,468.0,2023,2,27,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-02-28,531.0,2023,2,28,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-01,574.0,2023,3,1,1,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-02,523.0,2023,3,2,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,0,0


data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)

In [None]:
data = TimeSeriesDataFrame(train_df.drop(columns=['ID']))
predictor = TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)

In [52]:
test_df['item_id'] = test_df.ID.str[0:6]

In [53]:
test = TimeSeriesDataFrame(test_df.drop(columns=['ID']))
test = TimeSeriesDataFrame(test.drop(columns=['item']))

In [54]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,day,public_holiday,수출 중량,수출 금액,수입 중량,수입 금액,무역수지,corporation_A,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,weekday_dummy,saturday_dummy,sunday_dummy
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TG_A_J,2023-03-04,2023,3,4,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,0,1,0
TG_A_J,2023-03-05,2023,3,5,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,0,0,1
TG_A_J,2023-03-06,2023,3,6,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
TG_A_J,2023-03-07,2023,3,7,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
TG_A_J,2023-03-08,2023,3,8,0,12674.5,59.0,7031.5,23.0,36.0,1,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RD_F_J,2023-03-27,2023,3,27,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-28,2023,3,28,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-29,2023,3,29,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0
RD_F_J,2023-03-30,2023,3,30,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,1,0,1,0,0


In [55]:
predictor= TimeSeriesPredictor(
    prediction_length=28,
    target="price(원/kg)",
    eval_metric="RMSE",
)


In [56]:
# seed 고정
predictor.fit(data2,random_seed=42, )


TimeSeriesPredictor.fit() called
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'RMSE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': None,
 'hyperparameters': 'default',
 'num_val_windows': 1,
 'prediction_length': 28,
 'random_seed': 42,
 'target': 'price(원/kg)',
 'time_limit': None,
 'verbosity': 2}
Provided training data set with 59397 rows, 39 items (item = single time series). Average time series length is 1523.0. Data frequency is 'D'.
INFO:lightning_fabric.utilities.seed:Global seed set to 42
AutoGluon will save models to AutogluonModels/ag-20231113_182136/
AutoGluon will gauge predictive performance using evaluation metric: 'RMSE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'price(원/kg)'
	past covariates:  ['year', 'month', 'day', 'public_holiday', '수출 중량', '수출 금액', '수입 중량',

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x78ad467ebb50>

In [57]:
predictor.refit_full()

Refitting models via `refit_full` using all of the data (combined train and validation)...
	Models trained in this way will have the suffix '_FULL' and have NaN validation score.
	This process is not bound by time_limit, but should take less time than the original `fit` call.
Fitting model: Naive_FULL | Skipping fit via cloning parent ...
Fitting model: SeasonalNaive_FULL | Skipping fit via cloning parent ...
Fitting model: Theta_FULL | Skipping fit via cloning parent ...
Fitting model: AutoETS_FULL | Skipping fit via cloning parent ...
Fitting model: RecursiveTabular_FULL
	5.34    s     = Training runtime
Fitting model: DeepAR_FULL | Skipping fit via cloning parent ...
Fitting model: WeightedEnsemble_FULL | Skipping fit via cloning parent ...
Refit complete. Models trained: ['Naive_FULL', 'SeasonalNaive_FULL', 'Theta_FULL', 'AutoETS_FULL', 'RecursiveTabular_FULL', 'DeepAR_FULL', 'WeightedEnsemble_FULL']
Total runtime: 5.42 s
Updated best model to 'WeightedEnsemble_FULL' (Previously 'W

{'Naive': 'Naive_FULL',
 'SeasonalNaive': 'SeasonalNaive_FULL',
 'Theta': 'Theta_FULL',
 'AutoETS': 'AutoETS_FULL',
 'RecursiveTabular': 'RecursiveTabular_FULL',
 'DeepAR': 'DeepAR_FULL',
 'WeightedEnsemble': 'WeightedEnsemble_FULL'}

In [58]:
# seed 고정
pred = predictor.predict(test,random_seed=42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


In [None]:
pred.head()

In [None]:
submission['answer'] = pred.reset_index()['mean']
submission.loc[ submission['answer'] < 0.0, 'answer'] = 0.0

In [None]:
submission['date'] = pd.to_datetime(submission['ID'].str[-8:], format='%Y%m%d')

# Determine the day of the week (0=Monday, 6=Sunday)
submission['day_of_week'] = submission['date'].dt.dayofweek

# Set 'answer' to 0 where the day of the week is Sunday
submission.loc[submission['day_of_week'] == 6, 'answer'] = 0

# Drop the temporary columns if they are not needed
submission.drop(columns=['date', 'day_of_week'], inplace=True)


In [None]:
submission.head()

In [None]:
# 결과를 CSV 파일로 저장
submission.to_csv('submission_gluon5.csv', index=False)

In [None]:
from google.colab import files

files.download('/content/submission_gluon5.csv')


In [None]:
$