In [1]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np

# for "3-2. Feature Engineering"
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import RobustScaler, StandardScaler

# for "4. Modeling with Pycaret"
from pycaret.regression import *

# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold

In [2]:
# 데이터 로드
pre_tr = pd.read_csv('data_preprocess/pre_tr_1117.csv',encoding = "cp949")
pre_te = pd.read_csv('data_preprocess/pre_te_1117.csv',encoding = "cp949")
pre_tr = pre_tr.set_index("일자")
pre_te = pre_te.set_index("일자")

In [3]:
# 데이터 확인
pre_tr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1205 entries, 2016-02-01 to 2021-01-26
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   요일      1205 non-null   int64  
 1   정원수     1205 non-null   int64  
 2   휴가자수    1205 non-null   int64  
 3   출장자수    1205 non-null   int64  
 4   야근자수    1205 non-null   int64  
 5   재택근무자수  1205 non-null   float64
 6   조식메뉴    1205 non-null   object 
 7   중식메뉴    1205 non-null   object 
 8   석식메뉴    1205 non-null   object 
 9   중식계     1205 non-null   float64
 10  석식계     1205 non-null   float64
 11  출근자수    1205 non-null   float64
dtypes: float64(4), int64(5), object(3)
memory usage: 122.4+ KB


조식메뉴, 중식메뉴, 석식메뉴 다 없어도 될거 같은데,,,?
일자도 일단 제외하고 진행

In [4]:
#학습용 데이터셋 생성
train = pre_tr.loc[:,"요일":"재택근무자수"]
train["출근자수"] = pre_tr["출근자수"]
display(train)
test = pre_te.loc[:,"요일":"재택근무자수"]
test["출근자수"] = pre_te["출근자수"]
display(test)

Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-02-01,1,2601,50,150,238,0.0,2401.0
2016-02-02,2,2601,50,173,319,0.0,2378.0
2016-02-03,3,2601,56,180,111,0.0,2365.0
2016-02-04,4,2601,104,220,355,0.0,2277.0
2016-02-05,5,2601,278,181,34,0.0,2142.0
...,...,...,...,...,...,...,...
2021-01-20,3,2983,75,198,4,391.0,2319.0
2021-01-21,4,2983,92,231,462,351.0,2309.0
2021-01-22,5,2983,255,248,1,303.0,2177.0
2021-01-25,1,2983,107,153,616,327.0,2396.0


Unnamed: 0_level_0,요일,정원수,휴가자수,출장자수,야근자수,재택근무자수,출근자수
일자,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-27,3,2983,88,182,5,358.0,2401.0
2021-01-28,4,2983,104,212,409,348.0,2378.0
2021-01-29,5,2983,270,249,0,294.0,2365.0
2021-02-01,1,2924,108,154,538,322.0,2277.0
2021-02-02,2,2924,62,186,455,314.0,2142.0
2021-02-03,3,2924,59,199,5,286.0,2075.0
2021-02-04,4,2924,61,211,476,288.0,2056.0
2021-02-05,5,2924,169,252,0,256.0,2310.0
2021-02-08,1,2924,88,174,690,329.0,2293.0
2021-02-09,2,2924,94,183,542,329.0,2273.0


In [5]:
#중식 타겟 데이터
y_lun = pre_tr["중식계"]
# 석식 타겟 데이터
y_din = pre_tr["석식계"]

## 중식계 예측

In [6]:
def objective(trial: Trial) -> float:
    params_cat = {
        "random_state": 42,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "verbose" : 1,
        "objective" : "MAE",
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_tr, X_val, y_tr, y_val = train_test_split(train, y_lun, test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)
    
    return log_score

In [7]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="cat_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-11-18 13:16:29,344][0m A new study created in memory with name: cat_opt[0m
[32m[I 2021-11-18 13:16:29,839][0m Trial 0 finished with value: 63.96868458525011 and parameters: {'max_depth': 6, 'colsample_bylevel': 0.9901428612819833, 'subsample': 0.8123957592679836, 'min_child_samples': 62, 'max_bin': 246}. Best is trial 0 with value: 63.96868458525011.[0m
[32m[I 2021-11-18 13:16:30,134][0m Trial 1 finished with value: 67.14945225927596 and parameters: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}. Best is trial 0 with value: 63.96868458525011.[0m
[32m[I 2021-11-18 13:16:30,383][0m Trial 2 finished with value: 75.75011471651162 and parameters: {'max_depth': 1, 'colsample_bylevel': 0.9939819704323989, 'subsample': 0.8827098485602951, 'min_child_samples': 25, 'max_bin': 254}. Best is trial 0 with value: 63.96868458525011.[0m
[32m[I 2021-11-18 13:16:30,587][0m Trial 3 finished with

Best Score: 62.339536117475134
Best trial: {'max_depth': 10, 'colsample_bylevel': 0.8278987721304084, 'subsample': 0.5045012539746527, 'min_child_samples': 40, 'max_bin': 337}


In [8]:
cat_p = study.best_trial.params
cat = CatBoostRegressor(**cat_p)

In [9]:
y_cat = pd.cut(y_lun, 10, labels=range(10))
skf = StratifiedKFold(5)

preds = []
for tr_id, val_id in skf.split(train, y_cat) : 
    X_tr = train.iloc[tr_id]
    y_tr = y_lun.iloc[tr_id]
    
    cat.fit(X_tr, y_tr, verbose = 0)
    
    pred = cat.predict(test)
    preds.append(pred)
cat_pred = np.mean(preds, axis = 0)

In [10]:
sample = pd.read_csv('sub/sample_submission.csv', encoding = "cp949")
sample['중식계'] = cat_pred

## 석식계 예측

In [11]:
def objective(trial: Trial) -> float:
    params_cat = {
        "random_state": 42,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "verbose" : 1,
        "objective" : "MAE",
        "max_depth": trial.suggest_int("max_depth", 1, 16),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_tr, X_val, y_tr, y_val = train_test_split(train, y_din, test_size=0.2)

    model = CatBoostRegressor(**params_cat)
    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_tr, y_tr), (X_val, y_val)],
        early_stopping_rounds=10,
        verbose=False,
    )

    cat_pred = model.predict(X_val)
    log_score = mean_absolute_error(y_val, cat_pred)
    
    return log_score

In [12]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="cat_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2021-11-18 13:18:05,329][0m A new study created in memory with name: cat_opt[0m
[32m[I 2021-11-18 13:18:05,595][0m Trial 0 finished with value: 52.803652780267825 and parameters: {'max_depth': 6, 'colsample_bylevel': 0.9901428612819833, 'subsample': 0.8123957592679836, 'min_child_samples': 62, 'max_bin': 246}. Best is trial 0 with value: 52.803652780267825.[0m
[32m[I 2021-11-18 13:18:05,949][0m Trial 1 finished with value: 55.34701134447699 and parameters: {'max_depth': 3, 'colsample_bylevel': 0.8116167224336399, 'subsample': 0.9063233020424546, 'min_child_samples': 62, 'max_bin': 413}. Best is trial 0 with value: 52.803652780267825.[0m
[32m[I 2021-11-18 13:18:06,126][0m Trial 2 finished with value: 63.71481414652357 and parameters: {'max_depth': 1, 'colsample_bylevel': 0.9939819704323989, 'subsample': 0.8827098485602951, 'min_child_samples': 25, 'max_bin': 254}. Best is trial 0 with value: 52.803652780267825.[0m
[32m[I 2021-11-18 13:18:06,408][0m Trial 3 finished 

Best Score: 52.803652780267825
Best trial: {'max_depth': 6, 'colsample_bylevel': 0.9901428612819833, 'subsample': 0.8123957592679836, 'min_child_samples': 62, 'max_bin': 246}


In [13]:
cat_p2 = study.best_trial.params
cat2 = CatBoostRegressor(**cat_p2)

In [14]:
y_cat2 = pd.cut(y_din, 10, labels=range(10))
skf2 = StratifiedKFold(5)

preds = []
for tr_id, val_id in skf2.split(train, y_cat2) : 
    X_tr2 = train.iloc[tr_id]
    y_tr2 = y_din.iloc[tr_id]
    
    cat2.fit(X_tr2, y_tr2, verbose = 0)
    
    pred = cat2.predict(test)
    preds.append(pred)
cat_pred_din = np.mean(preds, axis = 0)

In [15]:
sample['석식계'] = cat_pred_din

In [16]:
sample

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1048.002154,384.389518
1,2021-01-28,952.936643,419.124756
2,2021-01-29,647.882146,285.219639
3,2021-02-01,1136.738303,517.082375
4,2021-02-02,936.370267,442.616347
5,2021-02-03,921.128117,403.62347
6,2021-02-04,895.130907,433.050957
7,2021-02-05,639.170769,321.101801
8,2021-02-08,1128.917646,561.957233
9,2021-02-09,995.686545,485.834047


In [17]:
sample.to_csv('sub/Model_Cat_1117_1.csv', index=False)