In [1]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# for "4. Modeling with Pycaret"
from pycaret.regression import *

# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# 데이터 로드
pre_tr = pd.read_csv('data_preprocess/pre_tr_1118.csv',encoding = "cp949")
pre_te = pd.read_csv('data_preprocess/pre_te_1118.csv',encoding = "cp949")
pre_tr = pre_tr.set_index("일자")
pre_te = pre_te.set_index("일자")
today = "1118_2"

In [3]:
# 데이터 확인
pre_tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 2016-02-01 to 2021-01-26
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   요일      1205 non-null   int64  
 1   정원수     1205 non-null   int64  
 2   휴가자수    1205 non-null   int64  
 3   출장자수    1205 non-null   int64  
 4   야근자수    1205 non-null   int64  
 5   재택근무자수  1205 non-null   float64
 6   조식메뉴    1205 non-null   object 
 7   중식메뉴    1205 non-null   object 
 8   석식메뉴    1205 non-null   object 
 9   중식계     1205 non-null   float64
 10  석식계     1205 non-null   float64
 11  출근자수    1205 non-null   float64
 12  월       1205 non-null   int64  
 13  년도      1205 non-null   int64  
dtypes: float64(4), int64(7), object(3)
memory usage: 141.2+ KB


조식메뉴, 중식메뉴, 석식메뉴 다 없어도 될거 같은데,,,?
일자도 일단 제외하고 진행

In [4]:
#중식 타겟 데이터
y_lun = pre_tr["중식계"]
# 석식 타겟 데이터
y_din = pre_tr["석식계"]

## 중식계 예측

In [5]:
train_dr = pre_tr.loc[:,"조식메뉴":"석식계"]
X_train = pre_tr.drop(train_dr,axis=1)
X_train["중식계"] = pre_tr["중식계"]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,중식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0,2,2016,1039.0
2016-02-02,2,2601,50,173,319,0.0,2378.0,2,2016,867.0
2016-02-03,3,2601,56,180,111,0.0,2365.0,2,2016,1017.0
2016-02-04,4,2601,104,220,355,0.0,2277.0,2,2016,978.0
2016-02-05,5,2601,278,181,34,0.0,2142.0,2,2016,925.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0,1,2021,1093.0
2021-01-21,4,2983,92,231,462,351.0,2309.0,1,2021,832.0
2021-01-22,5,2983,255,248,1,303.0,2177.0,1,2021,579.0
2021-01-25,1,2983,107,153,616,327.0,2396.0,1,2021,1145.0


In [6]:
reg = setup(X_train, 
            preprocess = False, # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능해진다.
            train_size = 0.999,  # 우리는 전체 데이터를 학습해 test를 예측하는게 목표이기 때문에, 0.999로 설정한다.
            target = '중식계', # 목표 변수는 중식계 이다.
            silent = True, # 엔터를 누르기 귀찮다. 궁금하면 풀어보세요
            use_gpu = False, # GPU가 있으면 사용하세요 (Cat BOost 속도 향상)
            numeric_features=list(X_train.drop(columns = ['중식계']).columns), # 모든 변수가 숫자로써의 의미가 있다.
            session_id = 2021,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,중식계
2,Original Data,"(1205, 10)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,0
6,Transformed Train Set,"(1203, 9)"
7,Transformed Test Set,"(2, 9)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [7]:
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,64.8528,7581.0303,86.6841,0.8243,0.106,0.0781,0.549
gbr,Gradient Boosting Regressor,66.4967,8145.576,89.7545,0.8109,0.1102,0.0806,0.031
et,Extra Trees Regressor,69.8119,9094.6202,94.7995,0.7888,0.1174,0.085,0.069
lightgbm,Light Gradient Boosting Machine,70.25,8791.6637,93.235,0.7954,0.1149,0.085,0.143
rf,Random Forest Regressor,70.4964,8977.1772,94.2017,0.7913,0.1164,0.0857,0.093
xgboost,Extreme Gradient Boosting,72.7911,9002.0387,94.5132,0.7906,0.116,0.0878,0.343
ada,AdaBoost Regressor,85.0271,11588.5128,107.4326,0.7307,0.1326,0.1048,0.024
lar,Least Angle Regression,86.7158,12719.2215,112.3469,0.7065,0.1354,0.1036,0.004
lr,Linear Regression,86.7169,12722.7467,112.3835,0.7064,0.1355,0.1037,0.648
ridge,Ridge Regression,86.7188,12719.1398,112.3467,0.7065,0.1354,0.1037,0.004


In [8]:
models = []
for m in top5:
    models.append(tune_model(m, 
                             optimize = 'MAE', 
                             choose_better = True,
                            n_iter = 30))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,78.9133,11023.2563,104.9917,0.7111,0.1264,0.0963
1,63.0461,6535.581,80.8429,0.8674,0.105,0.08
2,68.2223,7886.016,88.8032,0.8365,0.1225,0.0888
3,71.7691,9510.7079,97.5229,0.7821,0.1135,0.0837
4,78.1947,10877.3321,104.2944,0.7403,0.1305,0.0957
5,73.5371,9807.812,99.0344,0.7935,0.1199,0.0866
6,61.1062,6165.5632,78.5211,0.8393,0.0983,0.0761
7,66.4347,7420.5241,86.1425,0.8212,0.1025,0.0776
8,64.478,7564.1511,86.9721,0.8074,0.1055,0.078
9,72.6857,9972.1079,99.8604,0.7885,0.1272,0.0903


In [9]:
test_dr = pre_te.loc[:,"조식메뉴":"석식메뉴"]
X_test = pre_te.drop(test_dr,axis=1)
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-27,3,2983,88,182,5,358.0,2401.0,1,2021
2021-01-28,4,2983,104,212,409,348.0,2378.0,1,2021
2021-01-29,5,2983,270,249,0,294.0,2365.0,1,2021
2021-02-01,1,2924,108,154,538,322.0,2277.0,2,2021
2021-02-02,2,2924,62,186,455,314.0,2142.0,2,2021
2021-02-03,3,2924,59,199,5,286.0,2075.0,2,2021
2021-02-04,4,2924,61,211,476,288.0,2056.0,2,2021
2021-02-05,5,2924,169,252,0,256.0,2310.0,2,2021
2021-02-08,1,2924,88,174,690,329.0,2293.0,2,2021
2021-02-09,2,2924,94,183,542,329.0,2273.0,2,2021


In [10]:
voting = blend_models(models, optimize = 'MAE')
voting = tune_model(voting, 
                 optimize = 'MAE', 
                 choose_better = True,
                 n_iter = 30)

voting = finalize_model(voting)
sample = pd.read_csv('sub/sample_submission.csv', encoding = "cp949")
layer1_pred = voting.predict(X_test)
sample['중식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,72.8386,10267.0319,101.3264,0.7309,0.1202,0.088
1,59.9191,6155.7416,78.4585,0.8751,0.1049,0.0768
2,61.6992,6527.112,80.7905,0.8647,0.1122,0.0799
3,64.723,7851.9428,88.6112,0.8201,0.1035,0.0757
4,70.3462,9168.5053,95.7523,0.7811,0.1181,0.0859
5,68.2992,8532.2282,92.3701,0.8204,0.1124,0.0805
6,58.1261,5530.3385,74.3662,0.8559,0.0953,0.0724
7,62.5336,6495.246,80.5931,0.8435,0.0929,0.0716
8,58.2902,6263.796,79.1441,0.8405,0.0921,0.0687
9,67.1601,8984.5731,94.787,0.8094,0.121,0.0836


## 석식계 예측

In [11]:
train_dr = pre_tr.loc[:,"조식메뉴":"석식계"]
X_train = pre_tr.drop(train_dr,axis=1)
X_train["석식계"] = pre_tr["석식계"]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0,2,2016,331.0
2016-02-02,2,2601,50,173,319,0.0,2378.0,2,2016,560.0
2016-02-03,3,2601,56,180,111,0.0,2365.0,2,2016,573.0
2016-02-04,4,2601,104,220,355,0.0,2277.0,2,2016,525.0
2016-02-05,5,2601,278,181,34,0.0,2142.0,2,2016,330.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0,1,2021,421.0
2021-01-21,4,2983,92,231,462,351.0,2309.0,1,2021,353.0
2021-01-22,5,2983,255,248,1,303.0,2177.0,1,2021,217.0
2021-01-25,1,2983,107,153,616,327.0,2396.0,1,2021,502.0


In [12]:
reg = setup(X_train, 
            preprocess = False, # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능해진다.
            train_size = 0.999,  # 우리는 전체 데이터를 학습해 test를 예측하는게 목표이기 때문에, 0.999로 설정한다.
            target = '석식계', # 목표 변수는 중식계 이다.
            silent = True, # 엔터를 누르기 귀찮다. 궁금하면 풀어보세요
            use_gpu = False, # GPU가 있으면 사용하세요 (Cat BOost 속도 향상)
            numeric_features=list(X_train.drop(columns = ['석식계']).columns), # 모든 변수가 숫자로써의 의미가 있다.
            session_id = 2021,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,석식계
2,Original Data,"(1205, 10)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,0
6,Transformed Train Set,"(1203, 9)"
7,Transformed Test Set,"(2, 9)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [13]:
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,60.76,9429.0205,96.4159,0.5104,1.0984,0.1173,0.072
catboost,CatBoost Regressor,60.9388,9087.1311,94.7219,0.5278,1.1039,0.1164,0.551
rf,Random Forest Regressor,62.4998,9449.7826,96.6563,0.5097,1.1062,0.12,0.097
gbr,Gradient Boosting Regressor,62.6725,9178.9025,95.37,0.524,1.1073,0.1213,0.031
lightgbm,Light Gradient Boosting Machine,64.7391,9923.2519,98.8226,0.4848,1.1057,0.1252,0.051
xgboost,Extreme Gradient Boosting,67.0079,10425.2287,101.6715,0.4538,1.1072,0.1303,0.227
knn,K Neighbors Regressor,68.1121,11239.8584,105.2175,0.4209,1.1261,0.1312,0.006
lasso,Lasso Regression,69.3655,10446.383,101.6034,0.4599,1.1167,0.1349,0.005
ridge,Ridge Regression,69.5112,10442.5511,101.5859,0.46,1.1161,0.1353,0.004
lar,Least Angle Regression,69.5155,10442.6132,101.5861,0.46,1.1161,0.1353,0.005


In [14]:
models = []
for m in top5:
    models.append(tune_model(m, 
                             optimize = 'MAE', 
                             choose_better = True,
                            n_iter = 30))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,63.1694,9611.9795,98.0407,0.5111,1.2017,0.1239
1,60.0873,10774.1344,103.7985,0.4104,1.2277,0.1044
2,57.7665,7492.898,86.5615,0.5543,0.9367,0.1191
3,56.5318,7052.7179,83.9805,0.6168,0.927,0.1217
4,74.6915,13537.2881,116.3499,0.48,1.3318,0.159
5,57.5131,6570.1503,81.0565,0.6386,0.9103,0.1135
6,54.68,6708.8386,81.9075,0.6165,0.9255,0.1102
7,74.8618,11966.1424,109.3899,0.4598,1.419,0.1376
8,57.9118,7071.7276,84.0936,0.5704,0.9341,0.1156
9,62.979,9651.9074,98.2441,0.4791,1.2146,0.1082


In [15]:
test_dr = pre_te.loc[:,"조식메뉴":"석식메뉴"]
X_test = pre_te.drop(test_dr,axis=1)
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-27,3,2983,88,182,5,358.0,2401.0,1,2021
2021-01-28,4,2983,104,212,409,348.0,2378.0,1,2021
2021-01-29,5,2983,270,249,0,294.0,2365.0,1,2021
2021-02-01,1,2924,108,154,538,322.0,2277.0,2,2021
2021-02-02,2,2924,62,186,455,314.0,2142.0,2,2021
2021-02-03,3,2924,59,199,5,286.0,2075.0,2,2021
2021-02-04,4,2924,61,211,476,288.0,2056.0,2,2021
2021-02-05,5,2924,169,252,0,256.0,2310.0,2,2021
2021-02-08,1,2924,88,174,690,329.0,2293.0,2,2021
2021-02-09,2,2924,94,183,542,329.0,2273.0,2,2021


In [16]:
voting = blend_models(models, optimize = 'MAE')
voting = tune_model(voting, 
                 optimize = 'MAE', 
                 choose_better = True,
                 n_iter = 30)

voting = finalize_model(voting)
layer1_pred = voting.predict(X_test)
sample['석식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,61.1692,10130.8555,100.6522,0.4847,1.2164,0.1161
1,56.7054,10353.2812,101.7511,0.4334,1.2302,0.0949
2,56.2733,7693.6234,87.7133,0.5423,0.9328,0.112
3,56.4075,7290.716,85.3857,0.6039,0.9286,0.1197
4,68.2172,12200.2571,110.4548,0.5314,1.3252,0.1386
5,52.2206,6030.6166,77.657,0.6683,0.9211,0.0979
6,51.611,6676.175,81.7079,0.6184,0.9337,0.0985
7,69.1178,12260.1617,110.7256,0.4465,1.4391,0.1185
8,52.0136,6691.0918,81.7991,0.5935,0.9378,0.105
9,61.2773,9758.9224,98.7873,0.4733,1.2209,0.1025


In [17]:
sample

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1015.248763,383.667581
1,2021-01-28,935.156816,394.715289
2,2021-01-29,627.762992,291.483673
3,2021-02-01,1197.140135,530.464559
4,2021-02-02,965.302636,460.924395
5,2021-02-03,922.009973,388.087844
6,2021-02-04,892.967347,457.70442
7,2021-02-05,657.765397,349.671924
8,2021-02-08,1233.564635,598.234468
9,2021-02-09,1023.76043,509.414848


In [18]:
sample.to_csv('sub/Model_Cat_{}.csv'.format(today), index=False)