# 라이브러리 설치

In [1]:
# Requires AutoGluon 0.8.2 (the version reported in the training log below).
# To install, uncomment: %pip install autogluon==0.8.2

# 라이브러리 로딩 및 초기화

In [2]:
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

In [3]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its string
            form is stored in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once for the whole notebook.
SEED = 42
seed_everything(SEED)

# 데이터 로딩 및 라벨 영문화

In [4]:
# Korean column headers in the CSV files -> English names used in the rest of the notebook.
# The same mapping is applied to both files; DataFrame.rename skips keys that are
# not present in a frame.
_COLUMN_NAMES_EN = {
    '추석까지 남은 기간(주)': 'remain_week',
    '쇼핑몰 구분': 'shop',
    '가격(원)': 'price',
    '프로모션 여부': 'promotion',
    '도시 유형': 'city',
    '지역 유형': 'region',
    '쇼핑몰 유형': 'shop_type',
    '선물 유형': 'gift',
    '수요량': 'order_count',
}

train_df = pd.read_csv('train.csv').rename(columns=_COLUMN_NAMES_EN)
test_df = pd.read_csv('test.csv').rename(columns=_COLUMN_NAMES_EN)

# 데이터에 특성 변수 추가

In [5]:
# Per-category mean / standard deviation of the label (order_count), computed on
# the training frame and stored as {category value: statistic} dicts.
# NOTE(review): these statistics use the label of every training row and are later
# mapped back onto those same rows, so validation scores may be optimistic — confirm.

# Demand statistics per gift set
_demand_by_gift = train_df.groupby('gift')['order_count']
gift_mean_map = _demand_by_gift.mean().to_dict()
gift_std_map = _demand_by_gift.std().to_dict()

# Demand statistics per city type
_demand_by_city = train_df.groupby('city')['order_count']
city_mean_map = _demand_by_city.mean().to_dict()
city_std_map = _demand_by_city.std().to_dict()

# Demand statistics per shopping mall
_demand_by_shop = train_df.groupby('shop')['order_count']
mall_mean_map = _demand_by_shop.mean().to_dict()
mall_std_map = _demand_by_shop.std().to_dict()

In [6]:
def preprocess(df):
    """Return a copy of ``df`` with demand-statistic and ``gift_type`` features added.

    Adds ``gift_mean``/``gift_std``, ``city_mean``/``city_std`` and
    ``mall_mean``/``mall_std`` by looking up the ``gift``, ``city`` and ``shop``
    columns in the module-level ``*_mean_map`` / ``*_std_map`` dicts, then adds an
    integer-valued ``gift_type`` column derived from ``price``.

    Args:
        df: Frame containing at least the ``gift``, ``city``, ``shop`` and
            ``price`` columns. It is not modified.

    Returns:
        A new DataFrame with the seven feature columns appended.
    """
    df = df.copy()
    df['gift_mean'] = df['gift'].map(gift_mean_map)
    df['gift_std'] = df['gift'].map(gift_std_map)

    df['city_mean'] = df['city'].map(city_mean_map)
    df['city_std'] = df['city'].map(city_std_map)

    df['mall_mean'] = df['shop'].map(mall_mean_map)
    df['mall_std'] = df['shop'].map(mall_std_map)

    # Author's note: some gift sets share a name but come in two price tiers, and
    # gift_type was added to capture that. It was found at the very end that this
    # feature actually lowers the score (dropping it gains roughly 0.4 points),
    # but that finding did not make it into the submitted result.
    df['gift_type'] = 0
    # (gift set name, price divisor): gift_type = price // divisor for that gift set.
    gift_type_list = [
        ('실속스팸선물세트', 50000),
        ('행복스팸선물세트', 50000),
        ('특선스팸선물세트', 50000),
        ('특별한선택스팸선물세트', 95000),
        ('한과종합선물세트', 200000),
        ('프리미엄고당도샤인머스캣선물세트', 145000),
    ]

    for gift, price in gift_type_list:
        # Select rows with a boolean mask rather than by index labels, so the
        # assignment stays correct even when the frame's index is not unique.
        is_gift = df['gift'] == gift
        df.loc[is_gift, 'gift_type'] = df.loc[is_gift, 'price'] // price

    return df

train = preprocess( train_df )
test  = preprocess( test_df )

# 공용 함수

In [7]:
# Print a model's name and validation score
def print_msg(predictor, title_msg='predict', model_name=None):
    """Print and return the validation score of one model on the leaderboard.

    Args:
        predictor: Object whose ``leaderboard(silent=True)`` returns a DataFrame
            with ``model`` and ``score_val`` columns.
        title_msg: First line of the printed message.
        model_name: Model to report. When ``None``, the model in the first
            leaderboard row is used.

    Returns:
        The ``score_val`` of the first leaderboard row whose ``model`` equals
        ``model_name``.
    """
    lb = predictor.leaderboard(silent=True)
    if model_name is None:
        # Positional access: "first row" must not depend on the index having a label 0.
        model_name = lb['model'].iloc[0]
    score_val = lb.loc[lb['model'] == model_name, 'score_val'].iloc[0]
    msg = f'{title_msg}\nmodel:{model_name}\nscore:{score_val:.2f}'
    print(msg)
    return score_val

In [8]:
# Train the predictor, report its score, and save test-set predictions
def model_fit(predictor, train_data, **kwargs):
    """Fit ``predictor``, print the score reported by ``print_msg``, and write a submission.

    Args:
        predictor: Object exposing ``fit(train_data=..., **kwargs)``.
        train_data: Training data forwarded to ``predictor.fit``.
        **kwargs: Extra keyword arguments forwarded to ``predictor.fit``.

    Returns:
        The score returned by ``print_msg`` for the default (first-ranked) model.
    """
    label = 'predict'
    predictor.fit(train_data=train_data, **kwargs)
    best_score = print_msg(predictor, label, model_name=None)
    print(f'{label}: {best_score:.2f}')
    model_predict(predictor, title_msg=label)
    return best_score

In [9]:
# Predict with a trained model and save the submission file
def model_predict(predictor, model_name = None, title_msg=None ):
    """Predict on the module-level ``test`` frame and write a timestamped submission CSV.

    Reads ``sample_submission.csv``, fills its ``수요량`` column with the
    predictions for ``test`` (with the ``ID`` column dropped), prints the first
    rows, and saves the result as ``./submit_{title_msg}_{timestamp}.csv``.

    Args:
        predictor: Object exposing ``leaderboard(silent=True)`` and
            ``predict(data, model=...)``.
        model_name: Model used for prediction. When ``None``, the model in the
            first leaderboard row is used.
        title_msg: Tag embedded in the output file name. When ``None``, the
            lower-cased model name is used.
    """
    submission = pd.read_csv('sample_submission.csv')
    if model_name is None:
        # The leaderboard is only needed to pick a default model; take its first
        # row by position so the lookup does not depend on the index labels.
        model_name = predictor.leaderboard(silent=True)['model'].iloc[0]
    if title_msg is None:
        title_msg = model_name.lower()
    pred = predictor.predict( test.drop(columns='ID'), model=model_name )
    submission['수요량'] = pred
    print(submission.head())
    # Timestamp keeps successive runs from overwriting each other's files.
    time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    save_filename = f'./submit_{title_msg}_{time_str}.csv'
    submission.to_csv(save_filename, index = False)
    print(f'file saved : {save_filename}')

In [10]:
# Distill with and without data augmentation, saving each result
def model_distill(predictor):
    """Run ``predictor.distill`` twice and save a submission for each run.

    The first run passes ``augment_method=None``; the second passes
    ``augment_method='spunge'`` with ``size_factor`` 5. Both use the
    ``{'GBM': {}}`` hyperparameters. For each run, the first entry of the value
    returned by ``distill`` is treated as the model name, its score is printed via
    ``print_msg``, and ``model_predict`` writes the submission file.

    Args:
        predictor: A fitted predictor exposing ``distill``.

    Returns:
        List with the two scores returned by ``print_msg``, in run order.
    """
    student_hyperparameters = {'GBM': {}}
    scores = []
    for size_factor in (None, 5):
        if size_factor is None:
            augment_kwargs = {'augment_method': None}
        else:
            augment_kwargs = {
                'augment_method': 'spunge',
                'augment_args': {'size_factor': size_factor},
            }
        distilled = predictor.distill(hyperparameters=student_hyperparameters, **augment_kwargs)

        student_name = distilled[0]
        tag = f'DSTLx{size_factor}'
        student_score = print_msg(predictor, tag, model_name=student_name)
        scores.append(student_score)
        model_predict(predictor, model_name=student_name, title_msg=tag)
    return scores

# 모델 훈련 및 결과 저장

In [11]:
# Author's note: the 'best_quality' preset by itself gives bag_fold=8 and bag_sets=1;
# increasing bag_sets produced better scores.
fit_parameters = dict(
    presets='best_quality',
    # Larger values improve the score, but if too large the validation folds
    # become small and the results misleading.
    num_bag_folds=8,
    # Larger values improve the score at the cost of training time.
    num_bag_sets=3,
    # The second stack level adds nothing; a single level is enough.
    num_stack_levels=2,
    # Slow model families are excluded.
    excluded_model_types=['CAT', 'NN_TORCH'],
)

predictor = TabularPredictor(
    label='order_count',
    problem_type='regression',
    eval_metric='rmse',
    sample_weight='auto_weight',
)
model_fit(predictor, train_data=train.drop(columns=['ID']), **fit_parameters)

No path specified. Models will be saved in: "AutogluonModels/ag-20231004_052546/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=8, num_bag_sets=3
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231004_052546/"
AutoGluon Version:  0.8.2
Python Version:     3.8.17
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #224-Ubuntu SMP Mon Jun 19 13:30:12 UTC 2023
Disk Space Avail:   80.72 GB / 121.67 GB (66.3%)
Train Data Rows:    5872
Train Data Columns: 15
Label Column: order_count
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    15967.08 MB
	Train Data (Original)  Memory Usage: 3.57 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators

predict
model:WeightedEnsemble_L2
score:-114.90
predict: -114.90
          ID         수요량
0  TEST_0000  203.106400
1  TEST_0001   42.830845
2  TEST_0002  350.928070
3  TEST_0003  174.663818
4  TEST_0004  249.217163
file saved : ./submit_predict_20231004_143349.csv


-114.89955418023898

In [12]:
# Apply distillation (with and without data augmentation) to the fitted predictor
model_distill(predictor)

Distilling with teacher='WeightedEnsemble_L2', teacher_preds=soft, augment_method=None ...
Distilling with each of these student models: ['LightGBM_DSTL']
Fitting 1 L1 models ...
Fitting model: LightGBM_DSTL ...


[1000]	valid_set's rmse: 105.763
[2000]	valid_set's rmse: 105.125
[3000]	valid_set's rmse: 105.034


	-105.0206	 = Validation score   (-root_mean_squared_error)
	3.29s	 = Training   runtime
	0.06s	 = Validation runtime
Distilling with each of these student models: ['WeightedEnsemble_L2_DSTL']
Fitting model: WeightedEnsemble_L2_DSTL ...
	-105.0206	 = Validation score   (-root_mean_squared_error)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
Distilled model leaderboard:
                      model   score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0             LightGBM_DSTL -105.020638       0.058983  3.287784                0.058983           3.287784            1       True         27
1  WeightedEnsemble_L2_DSTL -105.020638       0.059355  3.290339                0.000372           0.002556            2       True         28


DSTLxNone
model:LightGBM_DSTL
score:-105.02


Distilling with teacher='WeightedEnsemble_L2', teacher_preds=soft, augment_method=spunge ...
SPUNGE: Augmenting training data with 26420 synthetic samples for distillation...


          ID         수요량
0  TEST_0000  237.418594
1  TEST_0001   38.506824
2  TEST_0002  326.258331
3  TEST_0003  177.262268
4  TEST_0004  189.185791
file saved : ./submit_DSTLxNone_20231004_143416.csv


Distilling with each of these student models: ['LightGBM_2_DSTL']
Fitting 1 L1 models ...
Fitting model: LightGBM_2_DSTL ...


[1000]	valid_set's rmse: 102.035
[2000]	valid_set's rmse: 99.4658
[3000]	valid_set's rmse: 98.3735
[4000]	valid_set's rmse: 97.9612
[5000]	valid_set's rmse: 97.6781
[6000]	valid_set's rmse: 97.4859
[7000]	valid_set's rmse: 97.3972
[8000]	valid_set's rmse: 97.2813
[9000]	valid_set's rmse: 97.2761
[10000]	valid_set's rmse: 97.2772


	-97.2479	 = Validation score   (-root_mean_squared_error)
	13.59s	 = Training   runtime
	0.2s	 = Validation runtime
Distilling with each of these student models: ['WeightedEnsemble_2_L2_DSTL']
Fitting model: WeightedEnsemble_2_L2_DSTL ...
	-97.2479	 = Validation score   (-root_mean_squared_error)
	0.0s	 = Training   runtime
	0.0s	 = Validation runtime
Distilled model leaderboard:
                        model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0             LightGBM_2_DSTL -97.247944       0.198548  13.589706                0.198548          13.589706            1       True         29
1  WeightedEnsemble_2_L2_DSTL -97.247944       0.199022  13.592421                0.000473           0.002714            2       True         30


DSTLx5
model:LightGBM_2_DSTL
score:-97.25
          ID         수요량
0  TEST_0000  226.371185
1  TEST_0001   55.560329
2  TEST_0002  384.381836
3  TEST_0003  173.837433
4  TEST_0004  228.028076
file saved : ./submit_DSTLx5_20231004_143656.csv


[-105.0206377368004, -97.24794392918379]