## 제주경진대회 - 이승우 TabularPredictor 모델링

## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

## Fixed Random-Seed

In [2]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed value applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix the seed

## Load Data

In [3]:
# NOTE(review): `path` is not referenced by any later cell in this notebook, and it
# points under /gdrive while the Drive mount uses /content/drive — confirm whether
# it is still needed.
path = "/gdrive/My Drive/filename"

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from google.colab import files

# Ask Colab for a file upload; the result is kept in `uploaded` and iterated by the
# next cell. The CSVs read later in the notebook are the files saved by this step.
uploaded = files.upload()

Saving train1110.csv to train1110.csv
Saving test1110.csv to test1110.csv
Saving sample_submission.csv to sample_submission.csv


In [6]:
# Take the first item yielded when iterating `uploaded`.
# NOTE(review): `filename` is not used by the later cells, which read the CSVs by
# hard-coded name — confirm whether this cell is still needed.
filename = next(iter(uploaded))

In [26]:
# Read the training table, the test table and the submission template from the
# working directory, in that order.
train_df, test_df, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train1110.csv', 'test1110.csv', 'sample_submission.csv')
)

In [27]:
def add_weekday_dummies(df):
    """Replace the integer ``weekday`` column with three 0/1 indicator columns.

    Adds ``weekday_dummy`` (weekday between 0 and 4 inclusive),
    ``saturday_dummy`` (weekday == 5) and ``sunday_dummy`` (weekday == 6),
    then drops ``weekday``.

    Args:
        df: Frame holding a ``weekday`` column.

    Returns:
        A new frame with the indicator columns appended and ``weekday`` removed.
        If ``weekday`` is already absent the input is returned unchanged, so
        re-running the cell does not raise ``KeyError``.
    """
    if 'weekday' not in df.columns:
        # Already transformed by an earlier run of this cell.
        return df

    out = df.copy()
    weekday = out['weekday']
    out['weekday_dummy'] = weekday.between(0, 4).astype(int)
    out['saturday_dummy'] = (weekday == 5).astype(int)
    out['sunday_dummy'] = (weekday == 6).astype(int)
    return out.drop(columns=['weekday'])


# Apply the same transformation to the training and the testing data.
train_df = add_weekday_dummies(train_df)
test_df = add_weekday_dummies(test_df)

In [28]:
# Display the training frame after the weekday column was replaced by the dummies.
train_df

Unnamed: 0,ID,timestamp,item,supply(kg),price(원/kg),year,month,day,public_holiday,기간,...,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_J,location_S,weekday_dummy,saturday_dummy,sunday_dummy
0,TG_A_J_20190101,2019-01-01,TG,0.0,0.0,2019,1,1,1,2019-01,...,0,0,0,0,0,1,0,1,0,0
1,TG_A_J_20190102,2019-01-02,TG,0.0,0.0,2019,1,2,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
2,TG_A_J_20190103,2019-01-03,TG,60601.0,1728.0,2019,1,3,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
3,TG_A_J_20190104,2019-01-04,TG,25000.0,1408.0,2019,1,4,0,2019-01,...,0,0,0,0,0,1,0,1,0,0
4,TG_A_J_20190105,2019-01-05,TG,32352.0,1250.0,2019,1,5,0,2019-01,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,452440.0,468.0,2023,2,27,0,,...,0,0,0,0,1,1,0,1,0,0
59393,RD_F_J_20230228,2023-02-28,RD,421980.0,531.0,2023,2,28,0,,...,0,0,0,0,1,1,0,1,0,0
59394,RD_F_J_20230301,2023-03-01,RD,382980.0,574.0,2023,3,1,1,,...,0,0,0,0,1,1,0,1,0,0
59395,RD_F_J_20230302,2023-03-02,RD,477220.0,523.0,2023,3,2,0,,...,0,0,0,0,1,1,0,1,0,0


TG : 감귤 (Citrus fruits)

BC : 브로콜리 (Broccoli)

RD : 무 (Radish)

CR : 당근 (Carrots)

CB : 양배추 (Cabbage)

In [29]:
# Display the submission template (ID / answer columns).
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [30]:
# Replace missing values with 0 in both frames
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [31]:
# Remove the '기간' column from the training frame; test_df is not touched here.
train_df = train_df.drop(columns=['기간'])

In [32]:
!pip install autogluon

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 92, in resolve
    result = self._result = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py", line 546, in resolve
    state = resolution.resolve(requirements, max_rounds=max_rounds)
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/resolvelib/resolvers.py", line 443, in resolve
    newly_unsatisfied_names = {
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor

In [44]:
# Stage 1: fit a model for supply(kg) and write its predictions into test_df,
# where the price model fitted in the following cell reads them as a feature.

# Imported here rather than in the top import cell because AutoGluon is only
# installed by the pip cell of this notebook.
from autogluon.tabular import TabularPredictor

# Train the supply model without the price column.
train_notprice = train_df.drop(columns=['price(원/kg)'])

# Define the target column
target_column = 'supply(kg)'

predictor = TabularPredictor(label=target_column)

predictor.fit(
    train_data=train_notprice,
    time_limit=None,  # no time limit: training runs until all models are fitted
)

# The training label is never below 0, but a regressor can still emit negative
# values; floor them at 0 so the price model is not fed a negative supply.
predictions = predictor.predict(test_df).clip(lower=0)

# Add the predictions to the test data
test_df['supply(kg)'] = predictions
# Force the predicted supply to 0 on Sundays.
test_df.loc[test_df['sunday_dummy'] == 1, 'supply(kg)'] = 0

No path specified. Models will be saved in: "AutogluonModels/ag-20231113_194744/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231113_194744/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 30 11:19:59 UTC 2023
Disk Space Avail:   44.75 GB / 83.96 GB (53.3%)
Train Data Rows:    59397
Train Data Columns: 23
Label Column: supply(kg)
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (1222800.0, 0.0, 11894.52519, 52263.99983)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGener

[1000]	valid_set's rmse: 23522.3
[2000]	valid_set's rmse: 22272
[3000]	valid_set's rmse: 21717.2
[4000]	valid_set's rmse: 21405.1
[5000]	valid_set's rmse: 21170.5
[6000]	valid_set's rmse: 21030.2
[7000]	valid_set's rmse: 21002.2
[8000]	valid_set's rmse: 20952.3
[9000]	valid_set's rmse: 20945.7
[10000]	valid_set's rmse: 20926.3


	-20908.5254	 = Validation score   (-root_mean_squared_error)
	47.29s	 = Training   runtime
	2.53s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 20252.8
[2000]	valid_set's rmse: 20115.6


	-19990.2354	 = Validation score   (-root_mean_squared_error)
	9.76s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-18608.8039	 = Validation score   (-root_mean_squared_error)
	70.73s	 = Training   runtime
	0.41s	 = Validation runtime
Fitting model: CatBoost ...
	-20415.5518	 = Validation score   (-root_mean_squared_error)
	330.18s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-19163.9959	 = Validation score   (-root_mean_squared_error)
	37.16s	 = Training   runtime
	0.47s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-23786.5787	 = Validation score   (-root_mean_squared_error)
	66.72s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	-17448.5118	 = Validation score   (-root_mean_squared_error)
	11.09s	 = Training   runtime
	0.14s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-27461.5504	 = Validation score   (-root_mean_squared_error)
	135.13s	 = Training

[1000]	valid_set's rmse: 19228.8


	-19163.4851	 = Validation score   (-root_mean_squared_error)
	11.86s	 = Training   runtime
	0.48s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-17339.9878	 = Validation score   (-root_mean_squared_error)
	0.62s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 740.55s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231113_194744/")


In [45]:
# Stage 2: fit a model for price(원/kg) on the full training frame and write its
# predictions for test_df into the submission's 'answer' column.

# Imported here rather than in the top import cell because AutoGluon is only
# installed by the pip cell of this notebook.
from autogluon.tabular import TabularPredictor

# Define the target column
target_column = 'price(원/kg)'

predictor = TabularPredictor(label=target_column)

predictor.fit(
    train_data=train_df,
    time_limit=None,  # no time limit: training runs until all models are fitted
)

predictions = predictor.predict(test_df)

# Column assignment aligns on the index. NOTE(review): this pairs rows correctly
# only if test_df and submission share the same index and row order — presumably
# true since both come from read_csv with the default index; confirm that
# test1110.csv and sample_submission.csv list rows in the same order.
submission['answer'] = predictions

No path specified. Models will be saved in: "AutogluonModels/ag-20231113_200006/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231113_200006/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Aug 30 11:19:59 UTC 2023
Disk Space Avail:   43.51 GB / 83.96 GB (51.8%)
Train Data Rows:    59397
Train Data Columns: 24
Label Column: price(원/kg)
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (20909.0, 0.0, 1131.68067, 2029.94145)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerato

[1000]	valid_set's rmse: 461.772
[2000]	valid_set's rmse: 443.658
[3000]	valid_set's rmse: 438.553
[4000]	valid_set's rmse: 432.278
[5000]	valid_set's rmse: 430.239
[6000]	valid_set's rmse: 429.177
[7000]	valid_set's rmse: 428.477
[8000]	valid_set's rmse: 426.914
[9000]	valid_set's rmse: 426.271
[10000]	valid_set's rmse: 425.176


	-424.958	 = Validation score   (-root_mean_squared_error)
	50.06s	 = Training   runtime
	2.63s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 394.345
[2000]	valid_set's rmse: 384.552
[3000]	valid_set's rmse: 381.205
[4000]	valid_set's rmse: 378.305
[5000]	valid_set's rmse: 377.83
[6000]	valid_set's rmse: 377.583
[7000]	valid_set's rmse: 378.41


	-376.4638	 = Validation score   (-root_mean_squared_error)
	33.38s	 = Training   runtime
	1.27s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-377.3031	 = Validation score   (-root_mean_squared_error)
	50.98s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: CatBoost ...
	-378.136	 = Validation score   (-root_mean_squared_error)
	442.98s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-483.0341	 = Validation score   (-root_mean_squared_error)
	46.86s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 6: early stopping
	-974.6919	 = Validation score   (-root_mean_squared_error)
	81.64s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: XGBoost ...
	-384.149	 = Validation score   (-root_mean_squared_error)
	25.92s	 = Training   runtime
	0.35s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-380.4505	 = Validation score   (-root_mean_squar

[1000]	valid_set's rmse: 369.501
[2000]	valid_set's rmse: 368.258


	-367.4662	 = Validation score   (-root_mean_squared_error)
	18.41s	 = Training   runtime
	0.49s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-350.8498	 = Validation score   (-root_mean_squared_error)
	0.39s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1051.64s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231113_200006/")


In [46]:
# Floor negative predicted prices at 0.
negative_rows = submission['answer'].lt(0.0)
submission.loc[negative_rows, 'answer'] = 0.0

In [47]:
# The last 8 characters of each ID are the date (YYYYMMDD).
submission_dates = pd.to_datetime(submission['ID'].str[-8:], format='%Y%m%d')

# dayofweek: 0=Monday ... 6=Sunday
is_sunday = submission_dates.dt.dayofweek == 6

# Set 'answer' to 0 on Sundays; no helper columns are left on `submission`.
submission.loc[is_sunday, 'answer'] = 0

In [48]:
# Preview the first rows of the filled-in submission.
submission.head()

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3401.557373
1,TG_A_J_20230305,0.0
2,TG_A_J_20230306,3088.053711
3,TG_A_J_20230307,3044.452637
4,TG_A_J_20230308,3266.715576


In [49]:
# Save the result to a CSV file (without the index column)
submission.to_csv('submission_tabl.csv', index=False)

In [51]:
from google.colab import files

# Hand the saved submission file to Colab's download helper.
# NOTE(review): the CSV was written with a relative name, so this absolute path
# assumes the notebook's working directory is /content — confirm.
files.download('/content/submission_tabl.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>