In [1]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# for "4. Modeling with Pycaret"
from pycaret.regression import *

# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

### 1121
추가 사항: 석식계 예측 시 자기계발의 날(석식계가 0인 날)에 해당하는 행을 제거하고 학습 진행

### 1122
추가 사항: 독립변수 스케일링(MinMaxScaler) 진행

### 1124
- 타겟 변수와 상관관계가 낮은 "재택근무자수"열 제거
- 앙상블 개수 변화

In [2]:
# Load the preprocessed train / test tables (cp949-encoded CSV files) and
# use the "일자" column as the row index of each.
pre_tr = pd.read_csv('data_preprocess/pre_tr_1118.csv', encoding="cp949").set_index("일자")
pre_te = pd.read_csv('data_preprocess/pre_te_1118.csv', encoding="cp949").set_index("일자")

# Run tag; it is formatted into the submission file name at the end of the notebook.
today = "1124"

In [3]:
# Inspect the training table: column names, non-null counts and dtypes.
pre_tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 2016-02-01 to 2021-01-26
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   요일      1205 non-null   int64  
 1   정원수     1205 non-null   int64  
 2   휴가자수    1205 non-null   int64  
 3   출장자수    1205 non-null   int64  
 4   야근자수    1205 non-null   int64  
 5   재택근무자수  1205 non-null   float64
 6   조식메뉴    1205 non-null   object 
 7   중식메뉴    1205 non-null   object 
 8   석식메뉴    1205 non-null   object 
 9   중식계     1205 non-null   float64
 10  석식계     1205 non-null   float64
 11  출근자수    1205 non-null   float64
 12  월       1205 non-null   int64  
 13  년도      1205 non-null   int64  
dtypes: float64(4), int64(7), object(3)
memory usage: 141.2+ KB


조식메뉴, 중식메뉴, 석식메뉴 열은 모두 없어도 될 것 같다.
일자도 일단 피처에서 제외하고 진행한다.

In [4]:
# Target series taken from the training table:
# y_lun is the lunch target ("중식계"), y_din is the dinner target ("석식계").
y_lun, y_din = pre_tr["중식계"], pre_tr["석식계"]

In [5]:
# Correlation of every numeric column with the lunch target.
# pre_tr also holds the three menu columns (object dtype). Older pandas dropped
# such columns silently inside DataFrame.corr(); pandas >= 2.0 raises on them
# instead, so restrict the frame to numeric dtypes explicitly.
pre_tr.select_dtypes(include="number").corr()["중식계"]

요일       -0.731563
정원수      -0.115529
휴가자수     -0.391975
출장자수     -0.512680
야근자수      0.535611
재택근무자수    0.076509
중식계       1.000000
석식계       0.508287
출근자수      0.286810
월        -0.154664
년도       -0.078804
Name: 중식계, dtype: float64

In [6]:
# Correlation of every numeric column with the dinner target.
# pre_tr also holds the three menu columns (object dtype). Older pandas dropped
# such columns silently inside DataFrame.corr(); pandas >= 2.0 raises on them
# instead, so restrict the frame to numeric dtypes explicitly.
pre_tr.select_dtypes(include="number").corr()["석식계"]

요일       -0.312112
정원수      -0.173852
휴가자수     -0.316894
출장자수     -0.188164
야근자수      0.571168
재택근무자수   -0.057534
중식계       0.508287
석식계       1.000000
출근자수      0.172373
월        -0.127142
년도       -0.194792
Name: 석식계, dtype: float64

## 중식계 예측

In [7]:
# Columns from "조식메뉴" through "석식계" (the menu text columns and both targets)
# are not used as features.
train_dr = pre_tr.loc[:, "조식메뉴":"석식계"]
# Remove them, and also remove "재택근무자수" (dropped for its low correlation
# with the targets, per the 1124 changelog note).
X_train = pre_tr.drop(columns=train_dr.columns).drop(columns="재택근무자수")

In [8]:
# Min-max scale the feature columns (MinMaxScaler's default output range is [0, 1]).
from sklearn.preprocessing import MinMaxScaler

minmax = MinMaxScaler()

# Names of the columns from "요일" through "년도"; kept so that later cells can
# scale the same columns in the same order.
col_na = X_train.loc[:, "요일":"년도"].columns

# Replace the columns by label instead of writing through `.loc[:, slice] = ...`.
# Most of these columns are int64 and the scaler returns floats; recent pandas
# releases deprecate that kind of dtype-incompatible in-place write, whereas a
# plain column assignment swaps in the new float columns on every version.
X_train[col_na] = minmax.fit_transform(X_train[col_na])

In [9]:
# Append the lunch target as the last column, so features and target live in
# one frame (that frame is what gets passed to PyCaret's setup()).
X_train = X_train.assign(**{"중식계": pre_tr["중식계"]})
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도,중식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-02-01,0.00,0.000000,0.022481,0.323442,0.227969,0.664300,0.090909,0.0,1039.0
2016-02-02,0.25,0.000000,0.022481,0.391691,0.305556,0.649451,0.090909,0.0,867.0
2016-02-03,0.50,0.000000,0.027477,0.412463,0.106322,0.641059,0.090909,0.0,1017.0
2016-02-04,0.75,0.000000,0.067444,0.531157,0.340038,0.584248,0.090909,0.0,978.0
2016-02-05,1.00,0.000000,0.212323,0.415430,0.032567,0.497095,0.090909,0.0,925.0
...,...,...,...,...,...,...,...,...,...
2021-01-20,0.50,0.542614,0.043297,0.465875,0.003831,0.611362,0.000000,1.0,1093.0
2021-01-21,0.75,0.542614,0.057452,0.563798,0.442529,0.604906,0.000000,1.0,832.0
2021-01-22,1.00,0.542614,0.193172,0.614243,0.000958,0.519690,0.000000,1.0,579.0
2021-01-25,0.00,0.542614,0.069942,0.332344,0.590038,0.661072,0.000000,1.0,1145.0


In [10]:
# Initialise the PyCaret regression experiment for the lunch target.
reg = setup(
    X_train,
    target='중식계',  # the target variable is the lunch total
    # every column other than the target is meaningful as a number
    numeric_features=[col for col in X_train.columns if col != '중식계'],
    # the goal is to train on (almost) all rows and predict the separate test
    # file, so only a token hold-out split is kept
    train_size=0.999,
    # author's note: with True, PyCaret performs additional feature engineering
    # of its own and predicting on the test frame no longer works
    preprocess=False,
    fold_shuffle=True,
    session_id=2021,
    silent=True,  # author's note: avoids having to press Enter to confirm
    use_gpu=False,  # author's note: enable if a GPU is available (faster CatBoost)
)

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,중식계
2,Original Data,"(1205, 9)"
3,Missing Values,False
4,Numeric Features,8
5,Categorical Features,0
6,Transformed Train Set,"(1203, 8)"
7,Transformed Test Set,"(2, 8)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [11]:
# Rank the candidate regressors by MAE and keep the five best.
top5 = compare_models(sort='MAE', n_select=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,65.1557,7663.8813,87.1872,0.8224,0.1073,0.0787,0.499
gbr,Gradient Boosting Regressor,66.768,8128.6041,89.6218,0.8113,0.1095,0.0806,0.029
et,Extra Trees Regressor,70.1822,9191.4909,95.2951,0.7863,0.1182,0.0856,0.069
rf,Random Forest Regressor,70.5879,9026.133,94.4479,0.7902,0.1165,0.0857,0.09
lightgbm,Light Gradient Boosting Machine,70.7518,8806.0815,93.3002,0.7949,0.1153,0.0857,0.122
xgboost,Extreme Gradient Boosting,72.7449,9332.3995,96.2259,0.7829,0.118,0.0875,0.32
knn,K Neighbors Regressor,77.5355,10676.6241,102.8873,0.7517,0.1275,0.0944,0.007
ada,AdaBoost Regressor,84.5244,11637.5152,107.7307,0.7302,0.1327,0.1042,0.028
huber,Huber Regressor,86.223,12848.7381,112.8962,0.7036,0.1367,0.1035,0.007
lar,Least Angle Regression,86.7158,12719.2244,112.3469,0.7065,0.1354,0.1036,0.004


In [12]:
# Tune each of the selected models on MAE (30 search iterations per model).
# choose_better=True is passed so the untuned model is kept when tuning does
# not help -- see the PyCaret tune_model documentation.
models = [
    tune_model(candidate, optimize='MAE', choose_better=True, n_iter=30)
    for candidate in top5
]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,75.6648,11000.7429,104.8844,0.7117,0.1333,0.0944
1,60.9145,6173.0499,78.5688,0.8748,0.1022,0.0769
2,68.0592,7776.3786,88.1838,0.8388,0.1221,0.0881
3,61.3076,6834.6432,82.6719,0.8434,0.0963,0.0714
4,75.3883,9075.316,95.2645,0.7833,0.1216,0.0937
5,75.6746,10712.777,103.5025,0.7744,0.1332,0.0919
6,65.2731,7059.338,84.0199,0.816,0.1139,0.0831
7,65.3406,7196.8695,84.8344,0.8266,0.0958,0.0734
8,64.8833,7879.1867,88.7648,0.7994,0.1049,0.0767
9,74.2154,10092.6444,100.4622,0.7859,0.1279,0.0919


In [13]:
# Build the test feature frame the same way as the training one: drop the menu
# text columns and "재택근무자수".
test_dr = pre_te.loc[:, "조식메뉴":"석식메뉴"]
X_test = pre_te.drop(columns=test_dr.columns).drop(columns="재택근무자수")

# Apply the already-fitted scaler (transform only, no refit on test data).
# Assign by column label rather than through `.loc[:, "요일":"년도"] = ...`:
#  - the scaled floats would otherwise be written in place into int64 columns,
#    a dtype-incompatible write that recent pandas releases deprecate;
#  - the values are produced in `col_na` order, so they are stored under those
#    same labels instead of relying on X_test's positional column order.
X_test[col_na] = minmax.transform(X_test[col_na])
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-27,0.5,0.542614,0.054122,0.418398,0.004789,0.6643,0.0,1.0
2021-01-28,0.75,0.542614,0.067444,0.507418,0.391762,0.649451,0.0,1.0
2021-01-29,1.0,0.542614,0.205662,0.617211,0.0,0.641059,0.0,1.0
2021-02-01,0.0,0.458807,0.070774,0.335312,0.515326,0.584248,0.090909,1.0
2021-02-02,0.25,0.458807,0.032473,0.430267,0.435824,0.497095,0.090909,1.0
2021-02-03,0.5,0.458807,0.029975,0.468843,0.004789,0.453841,0.090909,1.0
2021-02-04,0.75,0.458807,0.03164,0.504451,0.455939,0.441575,0.090909,1.0
2021-02-05,1.0,0.458807,0.121565,0.626113,0.0,0.605552,0.090909,1.0
2021-02-08,0.0,0.458807,0.054122,0.394659,0.66092,0.594577,0.090909,1.0
2021-02-09,0.25,0.458807,0.059117,0.421365,0.519157,0.581666,0.090909,1.0


In [14]:
# Blend the tuned models into a single ensemble, then tune that ensemble on MAE.
voting = tune_model(
    blend_models(models, optimize='MAE'),
    optimize='MAE',
    choose_better=True,
    n_iter=30,
)

# finalize_model: per the PyCaret docs this refits on the full data set,
# including the hold-out rows.
voting = finalize_model(voting)

# Load the submission template and fill its lunch column with the predictions
# for the test frame.
sample = pd.read_csv('sub/sample_submission.csv', encoding="cp949")
layer1_pred = voting.predict(X_test)
sample['중식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,73.4998,10479.3203,102.3686,0.7253,0.1207,0.0885
1,59.856,6109.9132,78.1659,0.876,0.1032,0.0766
2,63.2513,6988.3739,83.5965,0.8551,0.1164,0.0824
3,64.1827,7947.3236,89.1478,0.8179,0.1046,0.0753
4,71.73,9315.7414,96.5181,0.7776,0.1196,0.0877
5,67.9307,8530.6015,92.3613,0.8204,0.1109,0.0792
6,58.6766,5429.8283,73.6874,0.8585,0.0928,0.0724
7,61.5556,6506.1936,80.661,0.8432,0.0928,0.0703
8,59.5097,6411.5777,80.0723,0.8368,0.0949,0.0709
9,68.0848,8967.22,94.6954,0.8098,0.12,0.0843


## 석식계 예측

In [15]:
# Rebuild the feature frame from the raw training table for the dinner model:
# drop the menu text columns, both targets and "재택근무자수".
train_dr = pre_tr.loc[:, "조식메뉴":"석식계"]
X_train = pre_tr.drop(columns=train_dr.columns).drop(columns="재택근무자수")

# Refit the scaler on these feature columns and replace them by label.
# Writing the float output through `.loc[:, "요일":"년도"] = ...` would be an
# in-place write into int64 columns, which recent pandas releases deprecate;
# a plain column assignment swaps in the float columns on every version.
X_train[col_na] = minmax.fit_transform(X_train[col_na])

# Append the dinner target as the last column.
X_train["석식계"] = pre_tr["석식계"]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-02-01,0.00,0.000000,0.022481,0.323442,0.227969,0.664300,0.090909,0.0,331.0
2016-02-02,0.25,0.000000,0.022481,0.391691,0.305556,0.649451,0.090909,0.0,560.0
2016-02-03,0.50,0.000000,0.027477,0.412463,0.106322,0.641059,0.090909,0.0,573.0
2016-02-04,0.75,0.000000,0.067444,0.531157,0.340038,0.584248,0.090909,0.0,525.0
2016-02-05,1.00,0.000000,0.212323,0.415430,0.032567,0.497095,0.090909,0.0,330.0
...,...,...,...,...,...,...,...,...,...
2021-01-20,0.50,0.542614,0.043297,0.465875,0.003831,0.611362,0.000000,1.0,421.0
2021-01-21,0.75,0.542614,0.057452,0.563798,0.442529,0.604906,0.000000,1.0,353.0
2021-01-22,1.00,0.542614,0.193172,0.614243,0.000958,0.519690,0.000000,1.0,217.0
2021-01-25,0.00,0.542614,0.069942,0.332344,0.590038,0.661072,0.000000,1.0,502.0


In [16]:
# "Self-development" days: rows whose dinner total is 0 are treated as outliers
# and excluded from the dinner training data (43 rows in total).
# Collect those rows here so they can be inspected before removal.
day_of_dinner0 = X_train[X_train["석식계"].eq(0)]
display(day_of_dinner0)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-11-30,0.5,0.125,0.037469,0.492582,0.0,0.672692,0.909091,0.0,0.0
2016-12-28,0.5,0.147727,0.119067,0.545994,0.0,0.608134,1.0,0.0,0.0
2017-01-25,0.5,0.136364,0.046628,0.480712,0.0,0.673338,0.0,0.2,0.0
2017-02-22,0.5,0.044034,0.043297,0.626113,0.0,0.602324,0.090909,0.2,0.0
2017-03-22,0.5,0.036932,0.024979,0.575668,0.0,0.624274,0.181818,0.2,0.0
2017-04-26,0.5,0.035511,0.018318,0.780415,0.0,0.584248,0.272727,0.2,0.0
2017-05-31,0.5,0.051136,0.016653,0.664688,0.0,0.617818,0.363636,0.2,0.0
2017-06-28,0.5,0.066761,0.029142,0.646884,0.0,0.619109,0.454545,0.2,0.0
2017-07-26,0.5,0.338068,0.19234,0.608309,0.0,0.624274,0.545455,0.2,0.0
2017-09-01,1.0,0.058239,0.128226,0.777448,0.043103,0.510006,0.727273,0.2,0.0


In [17]:
# Remove the "self-development" days (rows whose dinner total is 0, 43 rows in
# total) to build the dinner training data.
# Filter by index membership instead of DataFrame.drop(labels): the result is
# the same, but re-running this cell is a harmless no-op, whereas drop() raises
# KeyError once the labels have already been removed.
X_train = X_train.loc[~X_train.index.isin(day_of_dinner0.index)]
display(X_train)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도,석식계
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-02-01,0.00,0.000000,0.022481,0.323442,0.227969,0.664300,0.090909,0.0,331.0
2016-02-02,0.25,0.000000,0.022481,0.391691,0.305556,0.649451,0.090909,0.0,560.0
2016-02-03,0.50,0.000000,0.027477,0.412463,0.106322,0.641059,0.090909,0.0,573.0
2016-02-04,0.75,0.000000,0.067444,0.531157,0.340038,0.584248,0.090909,0.0,525.0
2016-02-05,1.00,0.000000,0.212323,0.415430,0.032567,0.497095,0.090909,0.0,330.0
...,...,...,...,...,...,...,...,...,...
2021-01-20,0.50,0.542614,0.043297,0.465875,0.003831,0.611362,0.000000,1.0,421.0
2021-01-21,0.75,0.542614,0.057452,0.563798,0.442529,0.604906,0.000000,1.0,353.0
2021-01-22,1.00,0.542614,0.193172,0.614243,0.000958,0.519690,0.000000,1.0,217.0
2021-01-25,0.00,0.542614,0.069942,0.332344,0.590038,0.661072,0.000000,1.0,502.0


In [18]:
# Initialise the PyCaret regression experiment for the dinner target.
reg = setup(
    X_train,
    target='석식계',  # the target variable is the dinner total
    # every column other than the target is meaningful as a number
    numeric_features=[col for col in X_train.columns if col != '석식계'],
    # the goal is to train on (almost) all rows and predict the separate test
    # file, so only a token hold-out split is kept
    train_size=0.999,
    # author's note: with True, PyCaret performs additional feature engineering
    # of its own and predicting on the test frame no longer works
    preprocess=False,
    fold_shuffle=True,
    session_id=2021,
    silent=True,  # author's note: avoids having to press Enter to confirm
    use_gpu=False,  # author's note: enable if a GPU is available (faster CatBoost)
)

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,석식계
2,Original Data,"(1162, 9)"
3,Missing Values,False
4,Numeric Features,8
5,Categorical Features,0
6,Transformed Train Set,"(1160, 8)"
7,Transformed Test Set,"(2, 8)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [19]:
# Rank the candidate regressors by MAE and keep the five best.
top5 = compare_models(sort='MAE', n_select=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,42.9592,3384.7969,57.838,0.7103,0.1417,0.1022,0.525
gbr,Gradient Boosting Regressor,45.0255,3679.3122,60.2324,0.6864,0.1435,0.1057,0.029
et,Extra Trees Regressor,45.3397,3846.7146,61.3393,0.6712,0.1478,0.1074,0.069
lightgbm,Light Gradient Boosting Machine,45.8032,3731.2264,60.7818,0.6797,0.1459,0.1079,0.05
rf,Random Forest Regressor,46.0168,3833.4677,61.4219,0.672,0.1473,0.1084,0.09
xgboost,Extreme Gradient Boosting,47.5448,4117.1186,63.8415,0.6478,0.1515,0.1105,0.235
knn,K Neighbors Regressor,49.0245,4557.7954,67.0042,0.6098,0.1666,0.12,0.007
huber,Huber Regressor,52.982,5060.4596,70.7196,0.5676,0.1715,0.1278,0.008
ridge,Ridge Regression,53.1735,5016.6406,70.4126,0.5716,0.1699,0.1274,0.004
br,Bayesian Ridge,53.2472,5014.3407,70.4025,0.5717,0.1693,0.1272,0.005


In [20]:
# Tune each of the selected models on MAE (30 search iterations per model).
# choose_better=True is passed so the untuned model is kept when tuning does
# not help -- see the PyCaret tune_model documentation.
models = [
    tune_model(candidate, optimize='MAE', choose_better=True, n_iter=30)
    for candidate in top5
]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,45.5591,3966.7344,62.982,0.6718,0.1654,0.1152
1,51.9473,5207.5538,72.1634,0.5064,0.1414,0.1055
2,48.262,3671.4726,60.5927,0.6638,0.1714,0.1257
3,47.0934,3609.4411,60.0786,0.6857,0.141,0.1095
4,52.4576,5914.8665,76.9082,0.5739,0.1898,0.1309
5,44.1243,3275.4089,57.2312,0.7264,0.1397,0.1046
6,42.9942,3132.2441,55.9665,0.7333,0.1389,0.1035
7,46.537,3679.7252,60.6607,0.7188,0.1672,0.1204
8,39.7545,2597.2586,50.9633,0.6884,0.1126,0.0883
9,49.1075,4939.1031,70.2788,0.6307,0.1826,0.126


In [21]:
# Build the test feature frame the same way as the training one: drop the menu
# text columns and "재택근무자수".
test_dr = pre_te.loc[:, "조식메뉴":"석식메뉴"]
X_test = pre_te.drop(columns=test_dr.columns).drop(columns="재택근무자수")

# Apply the already-fitted scaler (transform only, no refit on test data).
# Assign by column label rather than through `.loc[:, "요일":"년도"] = ...`:
#  - the scaled floats would otherwise be written in place into int64 columns,
#    a dtype-incompatible write that recent pandas releases deprecate;
#  - the values are produced in `col_na` order, so they are stored under those
#    same labels instead of relying on X_test's positional column order.
X_test[col_na] = minmax.transform(X_test[col_na])
display(X_test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,출근자수,월,년도
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-27,0.5,0.542614,0.054122,0.418398,0.004789,0.6643,0.0,1.0
2021-01-28,0.75,0.542614,0.067444,0.507418,0.391762,0.649451,0.0,1.0
2021-01-29,1.0,0.542614,0.205662,0.617211,0.0,0.641059,0.0,1.0
2021-02-01,0.0,0.458807,0.070774,0.335312,0.515326,0.584248,0.090909,1.0
2021-02-02,0.25,0.458807,0.032473,0.430267,0.435824,0.497095,0.090909,1.0
2021-02-03,0.5,0.458807,0.029975,0.468843,0.004789,0.453841,0.090909,1.0
2021-02-04,0.75,0.458807,0.03164,0.504451,0.455939,0.441575,0.090909,1.0
2021-02-05,1.0,0.458807,0.121565,0.626113,0.0,0.605552,0.090909,1.0
2021-02-08,0.0,0.458807,0.054122,0.394659,0.66092,0.594577,0.090909,1.0
2021-02-09,0.25,0.458807,0.059117,0.421365,0.519157,0.581666,0.090909,1.0


In [22]:
# Blend the tuned models into a single ensemble, then tune that ensemble on MAE.
voting = tune_model(
    blend_models(models, optimize='MAE'),
    optimize='MAE',
    choose_better=True,
    n_iter=30,
)

# finalize_model: per the PyCaret docs this refits on the full data set,
# including the hold-out rows.
voting = finalize_model(voting)

# Fill the dinner column of the submission frame with the predictions for the
# test frame.
layer1_pred = voting.predict(X_test)
sample['석식계'] = layer1_pred

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,44.2113,3712.8014,60.9328,0.6928,0.1638,0.1123
1,49.0984,4785.5833,69.1779,0.5464,0.1359,0.0998
2,44.6108,3209.6169,56.6535,0.7061,0.1495,0.1119
3,43.9281,3175.1703,56.3486,0.7235,0.1294,0.0996
4,47.1439,4787.2228,69.1898,0.6551,0.1763,0.1192
5,38.9417,2592.2236,50.9139,0.7835,0.115,0.0872
6,37.4068,2512.052,50.1204,0.7861,0.1187,0.0879
7,42.5495,2987.7902,54.6607,0.7717,0.1497,0.1081
8,39.071,2401.4721,49.0048,0.7119,0.1056,0.0859
9,43.0119,3849.9622,62.0481,0.7122,0.1604,0.1088


In [23]:
# Show the submission frame with both predicted columns filled in.
sample

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1014.185975,387.51472
1,2021-01-28,921.108744,392.075767
2,2021-01-29,612.982036,291.552848
3,2021-02-01,1204.35775,532.405499
4,2021-02-02,967.206057,428.304486
5,2021-02-03,946.698836,409.318064
6,2021-02-04,894.097187,443.34947
7,2021-02-05,655.809704,360.925782
8,2021-02-08,1221.382024,620.696388
9,2021-02-09,1023.654208,513.915398


In [24]:
# Write the submission file; the run tag in `today` becomes part of the file name.
submission_path = 'sub/Model_Cat_{}.csv'.format(today)
sample.to_csv(submission_path, index=False)