# import / 라이브러리 호출

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from autogluon.tabular import TabularDataset, TabularPredictor
import warnings
warnings.filterwarnings("ignore")

# Fix Random Seed / 랜덤 시드 고정

seed를 고정하지 않으면 같은 코드라도 실행할 때마다 다른 결과가 나올 수 있습니다.

항상 동일한 결과를 얻기 위해서 사용합니다.

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): PYTHONHASHSEED is presumably only read at interpreter
    start-up, so the assignment here would affect child processes rather than
    the already-running kernel -- confirm.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(seed=1994)  # fix the seed for the whole notebook

# Data Load / 데이터 불러오기

In [3]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Preview the first five rows of the training table.
train.head(n=5)

Unnamed: 0,ID,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량
0,TRAIN_0000,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28
1,TRAIN_0001,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27
2,TRAIN_0002,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769
3,TRAIN_0003,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27
4,TRAIN_0004,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337


In [5]:
# Wrap the raw frames for AutoGluon without the 'ID' column. The drop is done
# on a new frame, so `train` and `test` themselves are left untouched.
train_data = TabularDataset(train.drop(columns=['ID']))
test_data = TabularDataset(test.drop(columns=['ID']))

# Training configuration consumed by the model-fitting cell.
label = '수요량'        # name of the column to predict
eval_metric = 'rmse'   # metric handed to TabularPredictor
time_limit = 60 * 60   # one hour, expressed in seconds

# Model fitting / 모델 학습

In [9]:
# Options forwarded to TabularPredictor.fit; collected here so they can be
# read and tuned in one place.
fit_options = dict(
    presets='best_quality',
    num_stack_levels=3,
    time_limit=time_limit,
    num_gpus=1,
)

# Build the predictor for `label`, scored with `eval_metric`, and train it on
# the prepared training data.
predictor = TabularPredictor(label=label, eval_metric=eval_metric).fit(
    train_data, **fit_options
)

No path specified. Models will be saved in: "AutogluonModels\ag-20230930_021527\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 3600s
AutoGluon will save models to "AutogluonModels\ag-20230930_021527\"
AutoGluon Version:  0.6.0
Python Version:     3.8.13
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Train Data Rows:    5872
Train Data Columns: 8
Label Column: 수요량
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (4455, 13, 264.31369, 338.78921)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to prepr

# Inference / 추론

In [10]:
# Print the table of trained models with their validation scores and timings.
leaderboard_table = predictor.leaderboard(silent=True)
print(leaderboard_table)

                     model   score_val  pred_time_val     fit_time   
0      WeightedEnsemble_L3 -121.571063       7.304684  1236.380791  \
1   NeuralNetFastAI_BAG_L2 -121.777652       6.424649  1138.232935   
2      WeightedEnsemble_L2 -122.371272       2.489990   935.776309   
3      WeightedEnsemble_L4 -122.549777      14.034612  2226.043240   
4   NeuralNetFastAI_BAG_L3 -123.601617      13.864613  2225.410239   
5      WeightedEnsemble_L5 -123.621480      19.526532  3083.405011   
6   NeuralNetFastAI_BAG_L4 -123.977172      19.356532  3082.747011   
7     ExtraTreesMSE_BAG_L2 -124.791474       3.055035   936.658302   
8   RandomForestMSE_BAG_L3 -125.538774      10.918646  2044.078336   
9     ExtraTreesMSE_BAG_L3 -126.317820      10.916646  2043.874335   
10          XGBoost_BAG_L2 -126.742956       3.402039  1033.463162   
11  RandomForestMSE_BAG_L2 -126.890902       3.060065   936.925305   
12          XGBoost_BAG_L3 -127.223658      11.226654  2128.905087   
13  NeuralNetFastAI_

In [12]:
# Display the name of the model the predictor reports as best.
predictor.get_model_best()

'WeightedEnsemble_L3'

In [13]:
# Look up the model reported as best and use it to predict on the test data.
model_to_use = predictor.get_model_best()
model_pred = predictor.predict(data=test_data, model=model_to_use)

# submit / 제출

In [14]:
# Load the sample submission file that the predictions will be written into.
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [None]:
# Write the predictions into the submission by position, not by index label.
# Assigning a pandas Series directly aligns on the index, so a mismatched
# index or length would silently leave NaN in the column; assigning a plain
# array raises a ValueError when the lengths differ.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv -- confirm.
submission['수요량'] = np.asarray(model_pred)

In [16]:
# Save the filled-in submission as CSV, without writing the row index.
submission.to_csv(path_or_buf='./submit_autogluon.csv', index=False)