In [1]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# for "4. Modeling with Pycaret"
from pycaret.regression import *

# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

### 1121
추가 사항 석식계 예측에 자기계발 열을 제거하여 학습 진행

In [2]:
# 데이터 로드
pre_tr = pd.read_csv('data_preprocess/pre_tr_1118.csv',encoding = "cp949")
pre_te = pd.read_csv('data_preprocess/pre_te_1118.csv',encoding = "cp949")
pre_tr = pre_tr.set_index("일자")
pre_te = pre_te.set_index("일자")
today = "1121_1"

In [3]:
# 데이터 확인
pre_tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 2016-02-01 to 2021-01-26
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   요일      1205 non-null   int64  
 1   정원수     1205 non-null   int64  
 2   휴가자수    1205 non-null   int64  
 3   출장자수    1205 non-null   int64  
 4   야근자수    1205 non-null   int64  
 5   재택근무자수  1205 non-null   float64
 6   조식메뉴    1205 non-null   object 
 7   중식메뉴    1205 non-null   object 
 8   석식메뉴    1205 non-null   object 
 9   중식계     1205 non-null   float64
 10  석식계     1205 non-null   float64
 11  출근자수    1205 non-null   float64
 12  월       1205 non-null   int64  
 13  년도      1205 non-null   int64  
dtypes: float64(4), int64(7), object(3)
memory usage: 141.2+ KB


조식메뉴, 중식메뉴, 석식메뉴 다 없어도 될거 같은데,,,?
일자도 일단 제외하고 진행

In [4]:
#중식 타겟 데이터
y_lun = pre_tr["중식계"]
# 석식 타겟 데이터
y_din = pre_tr["석식계"]

## 중식계 예측

In [5]:
train_dr = pre_tr.loc[:,"조식메뉴":"석식계"]
X_train = pre_tr.drop(train_dr,axis=1)
X_train["중식계"] = pre_tr["중식계"]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,중식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0,2,2016,1039.0
2016-02-02,2,2601,50,173,319,0.0,2378.0,2,2016,867.0
2016-02-03,3,2601,56,180,111,0.0,2365.0,2,2016,1017.0
2016-02-04,4,2601,104,220,355,0.0,2277.0,2,2016,978.0
2016-02-05,5,2601,278,181,34,0.0,2142.0,2,2016,925.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0,1,2021,1093.0
2021-01-21,4,2983,92,231,462,351.0,2309.0,1,2021,832.0
2021-01-22,5,2983,255,248,1,303.0,2177.0,1,2021,579.0
2021-01-25,1,2983,107,153,616,327.0,2396.0,1,2021,1145.0


In [6]:
reg = setup(X_train, 
            preprocess = False, # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능해진다.
            train_size = 0.999,  # 우리는 전체 데이터를 학습해 test를 예측하는게 목표이기 때문에, 0.999로 설정한다.
            target = '중식계', # 목표 변수는 중식계 이다.
            silent = True, # 엔터를 누르기 귀찮다. 궁금하면 풀어보세요
            use_gpu = False, # GPU가 있으면 사용하세요 (Cat BOost 속도 향상)
            numeric_features=list(X_train.drop(columns = ['중식계']).columns), # 모든 변수가 숫자로써의 의미가 있다.
            session_id = 2021,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,중식계
2,Original Data,"(1205, 10)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,0
6,Transformed Train Set,"(1203, 9)"
7,Transformed Test Set,"(2, 9)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [7]:
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,64.8528,7581.0303,86.6841,0.8243,0.106,0.0781,0.528
gbr,Gradient Boosting Regressor,66.4967,8145.576,89.7545,0.8109,0.1102,0.0806,0.032
et,Extra Trees Regressor,69.8119,9094.6202,94.7995,0.7888,0.1174,0.085,0.076
lightgbm,Light Gradient Boosting Machine,70.25,8791.6637,93.235,0.7954,0.1149,0.085,0.135
rf,Random Forest Regressor,70.4964,8977.1772,94.2017,0.7913,0.1164,0.0857,0.091
xgboost,Extreme Gradient Boosting,72.7911,9002.0387,94.5132,0.7906,0.116,0.0878,0.317
ada,AdaBoost Regressor,85.0271,11588.5128,107.4326,0.7307,0.1326,0.1048,0.028
lar,Least Angle Regression,86.7158,12719.2215,112.3469,0.7065,0.1354,0.1036,0.005
lr,Linear Regression,86.7169,12722.7467,112.3835,0.7064,0.1355,0.1037,0.558
ridge,Ridge Regression,86.7188,12719.1398,112.3467,0.7065,0.1354,0.1037,0.006


In [8]:
models = []
for m in top5:
    models.append(tune_model(m, 
                             optimize = 'MAE', 
                             choose_better = True,
                            n_iter = 30))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,78.9133,11023.2563,104.9917,0.7111,0.1264,0.0963
1,63.0461,6535.581,80.8429,0.8674,0.105,0.08
2,68.2223,7886.016,88.8032,0.8365,0.1225,0.0888
3,71.7691,9510.7079,97.5229,0.7821,0.1135,0.0837
4,78.1947,10877.3321,104.2944,0.7403,0.1305,0.0957
5,73.5371,9807.812,99.0344,0.7935,0.1199,0.0866
6,61.1062,6165.5632,78.5211,0.8393,0.0983,0.0761
7,66.4347,7420.5241,86.1425,0.8212,0.1025,0.0776
8,64.478,7564.1511,86.9721,0.8074,0.1055,0.078
9,72.6857,9972.1079,99.8604,0.7885,0.1272,0.0903


In [9]:
test_dr = pre_te.loc[:,"조식메뉴":"석식메뉴"]
X_test = pre_te.drop(test_dr,axis=1)
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-27,3,2983,88,182,5,358.0,2401.0,1,2021
2021-01-28,4,2983,104,212,409,348.0,2378.0,1,2021
2021-01-29,5,2983,270,249,0,294.0,2365.0,1,2021
2021-02-01,1,2924,108,154,538,322.0,2277.0,2,2021
2021-02-02,2,2924,62,186,455,314.0,2142.0,2,2021
2021-02-03,3,2924,59,199,5,286.0,2075.0,2,2021
2021-02-04,4,2924,61,211,476,288.0,2056.0,2,2021
2021-02-05,5,2924,169,252,0,256.0,2310.0,2,2021
2021-02-08,1,2924,88,174,690,329.0,2293.0,2,2021
2021-02-09,2,2924,94,183,542,329.0,2273.0,2,2021


In [10]:
voting = blend_models(models, optimize = 'MAE')
voting = tune_model(voting, 
                 optimize = 'MAE', 
                 choose_better = True,
                 n_iter = 30)

voting = finalize_model(voting)
sample = pd.read_csv('sub/sample_submission.csv', encoding = "cp949")
layer1_pred = voting.predict(X_test)
sample['중식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,72.8386,10267.0319,101.3264,0.7309,0.1202,0.088
1,59.9191,6155.7416,78.4585,0.8751,0.1049,0.0768
2,61.6992,6527.112,80.7905,0.8647,0.1122,0.0799
3,64.723,7851.9428,88.6112,0.8201,0.1035,0.0757
4,70.3462,9168.5053,95.7523,0.7811,0.1181,0.0859
5,68.2992,8532.2282,92.3701,0.8204,0.1124,0.0805
6,58.1261,5530.3385,74.3662,0.8559,0.0953,0.0724
7,62.5336,6495.246,80.5931,0.8435,0.0929,0.0716
8,58.2902,6263.796,79.1441,0.8405,0.0921,0.0687
9,67.1601,8984.5731,94.787,0.8094,0.121,0.0836


## 석식계 예측

In [11]:
train_dr = pre_tr.loc[:,"조식메뉴":"석식계"]
X_train = pre_tr.drop(train_dr,axis=1)
X_train["석식계"] = pre_tr["석식계"]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0,2,2016,331.0
2016-02-02,2,2601,50,173,319,0.0,2378.0,2,2016,560.0
2016-02-03,3,2601,56,180,111,0.0,2365.0,2,2016,573.0
2016-02-04,4,2601,104,220,355,0.0,2277.0,2,2016,525.0
2016-02-05,5,2601,278,181,34,0.0,2142.0,2,2016,330.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0,1,2021,421.0
2021-01-21,4,2983,92,231,462,351.0,2309.0,1,2021,353.0
2021-01-22,5,2983,255,248,1,303.0,2177.0,1,2021,217.0
2021-01-25,1,2983,107,153,616,327.0,2396.0,1,2021,502.0


In [12]:
# 자기계발 날 제거 
# (석식계 0인 이상치가 발견되는 열들 제거 하여 학습 데이터 생성) 
# 총 43개 열 제거
day_of_dinner0 = X_train.loc[X_train["석식계"] == 0]
display(day_of_dinner0)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-11-30,3,2689,68,207,0,0.0,2414.0,11,2016,0.0
2016-12-28,3,2705,166,225,0,0.0,2314.0,12,2016,0.0
2017-01-25,3,2697,79,203,0,0.0,2415.0,1,2017,0.0
2017-02-22,3,2632,75,252,0,0.0,2305.0,2,2017,0.0
2017-03-22,3,2627,53,235,0,0.0,2339.0,3,2017,0.0
2017-04-26,3,2626,45,304,0,0.0,2277.0,4,2017,0.0
2017-05-31,3,2637,43,265,0,0.0,2329.0,5,2017,0.0
2017-06-28,3,2648,58,259,0,0.0,2331.0,6,2017,0.0
2017-07-26,3,2839,254,246,0,0.0,2339.0,7,2017,0.0
2017-09-01,5,2642,177,303,45,0.0,2162.0,9,2017,0.0


In [13]:
# 자기계발 날 제거 
# (석식계 0인 이상치가 발견되는 열들 제거 하여 학습 데이터 생성) 
# 총 43개 열 제거
X_train = X_train.drop(day_of_dinner0.index,axis=0)
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0,2,2016,331.0
2016-02-02,2,2601,50,173,319,0.0,2378.0,2,2016,560.0
2016-02-03,3,2601,56,180,111,0.0,2365.0,2,2016,573.0
2016-02-04,4,2601,104,220,355,0.0,2277.0,2,2016,525.0
2016-02-05,5,2601,278,181,34,0.0,2142.0,2,2016,330.0
...,...,...,...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0,1,2021,421.0
2021-01-21,4,2983,92,231,462,351.0,2309.0,1,2021,353.0
2021-01-22,5,2983,255,248,1,303.0,2177.0,1,2021,217.0
2021-01-25,1,2983,107,153,616,327.0,2396.0,1,2021,502.0


In [14]:
reg = setup(X_train, 
            preprocess = False, # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능해진다.
            train_size = 0.999,  # 우리는 전체 데이터를 학습해 test를 예측하는게 목표이기 때문에, 0.999로 설정한다.
            target = '석식계', # 목표 변수는 석식계 이다.
            silent = True, # 엔터를 누르기 귀찮다. 궁금하면 풀어보세요
            use_gpu = False, # GPU가 있으면 사용하세요 (Cat BOost 속도 향상)
            numeric_features=list(X_train.drop(columns = ['석식계']).columns), # 모든 변수가 숫자로써의 의미가 있다.
            session_id = 2021,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,석식계
2,Original Data,"(1162, 10)"
3,Missing Values,False
4,Numeric Features,9
5,Categorical Features,0
6,Transformed Train Set,"(1160, 9)"
7,Transformed Test Set,"(2, 9)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [15]:
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,43.4579,3451.3073,58.4054,0.704,0.1428,0.1035,0.65
gbr,Gradient Boosting Regressor,44.7495,3644.2389,59.9329,0.6893,0.142,0.1046,0.034
et,Extra Trees Regressor,45.5735,3889.0646,61.7017,0.6671,0.1486,0.1081,0.079
lightgbm,Light Gradient Boosting Machine,45.6344,3683.9193,60.4097,0.6835,0.1454,0.1077,0.075
rf,Random Forest Regressor,46.189,3871.8292,61.6732,0.6686,0.1481,0.109,0.099
xgboost,Extreme Gradient Boosting,47.4496,4105.1217,63.6527,0.6492,0.1516,0.1108,0.282
knn,K Neighbors Regressor,51.916,4945.4081,69.7131,0.5802,0.1667,0.1236,0.007
ridge,Ridge Regression,53.2876,5016.1805,70.4193,0.5714,0.1691,0.1272,0.005
lar,Least Angle Regression,53.2876,5016.1811,70.4193,0.5714,0.1691,0.1272,0.005
lasso,Lasso Regression,53.294,5019.46,70.443,0.5712,0.1695,0.1274,0.005


In [16]:
models = []
for m in top5:
    models.append(tune_model(m, 
                             optimize = 'MAE', 
                             choose_better = True,
                            n_iter = 30))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,46.8728,4424.5371,66.5172,0.6339,0.177,0.1207
1,50.8662,5255.8704,72.4974,0.5018,0.1398,0.1023
2,51.2296,4214.6299,64.9202,0.614,0.1778,0.1325
3,48.4059,3959.7486,62.9265,0.6552,0.1499,0.1139
4,50.9509,5646.9274,75.146,0.5932,0.1882,0.1291
5,43.5191,3232.2699,56.8531,0.73,0.1345,0.1021
6,43.2433,3244.9736,56.9647,0.7237,0.1446,0.1062
7,43.9487,3402.0225,58.3269,0.74,0.1625,0.1149
8,40.0318,2650.6251,51.4842,0.682,0.1121,0.0885
9,50.506,4971.4967,70.5088,0.6283,0.1814,0.128


In [17]:
test_dr = pre_te.loc[:,"조식메뉴":"석식메뉴"]
X_test = pre_te.drop(test_dr,axis=1)
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-27,3,2983,88,182,5,358.0,2401.0,1,2021
2021-01-28,4,2983,104,212,409,348.0,2378.0,1,2021
2021-01-29,5,2983,270,249,0,294.0,2365.0,1,2021
2021-02-01,1,2924,108,154,538,322.0,2277.0,2,2021
2021-02-02,2,2924,62,186,455,314.0,2142.0,2,2021
2021-02-03,3,2924,59,199,5,286.0,2075.0,2,2021
2021-02-04,4,2924,61,211,476,288.0,2056.0,2,2021
2021-02-05,5,2924,169,252,0,256.0,2310.0,2,2021
2021-02-08,1,2924,88,174,690,329.0,2293.0,2,2021
2021-02-09,2,2924,94,183,542,329.0,2273.0,2,2021


In [18]:
voting = blend_models(models, optimize = 'MAE')
voting = tune_model(voting, 
                 optimize = 'MAE', 
                 choose_better = True,
                 n_iter = 30)

voting = finalize_model(voting)
layer1_pred = voting.predict(X_test)
sample['석식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,44.8779,3892.0085,62.386,0.678,0.1661,0.1137
1,50.5556,4948.5212,70.3457,0.531,0.1379,0.1025
2,45.3688,3261.598,57.1104,0.7013,0.1482,0.113
3,43.1735,3111.5031,55.7808,0.7291,0.1282,0.0981
4,46.3778,4751.1331,68.9285,0.6577,0.1755,0.1175
5,39.3728,2714.2633,52.0986,0.7733,0.1175,0.0885
6,37.2317,2450.9653,49.5072,0.7913,0.1193,0.0882
7,42.1146,2975.7838,54.5507,0.7726,0.1507,0.1081
8,38.4144,2338.6093,48.3592,0.7194,0.104,0.0844
9,43.1777,3816.6995,61.7794,0.7146,0.1599,0.1088


In [19]:
sample

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1015.248763,383.431533
1,2021-01-28,935.156816,396.384741
2,2021-01-29,627.762992,280.762065
3,2021-02-01,1197.140135,548.302926
4,2021-02-02,965.302636,457.910859
5,2021-02-03,922.009973,395.864416
6,2021-02-04,892.967347,450.565957
7,2021-02-05,657.765397,351.613373
8,2021-02-08,1233.564635,632.984212
9,2021-02-09,1023.76043,534.007242


In [20]:
sample.to_csv('sub/Model_Cat_{}.csv'.format(today), index=False)