## 데이터 불러오기

In [22]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV
import numpy as np
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
from hyperopt import hp
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, Trials, STATUS_OK

In [23]:
df = pd.read_csv('crowd.csv') #read csv
df.head()

Unnamed: 0,day,home,away,stadium,crowd
0,토,두산,한화,잠실,16271
1,토,KIA,LG,광주,16908
2,토,키움,롯데,고척,8257
3,토,NC,SSG,창원,7814
4,토,KT,삼성,수원,17057


In [24]:
df.info() #데이터 타입 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   day      720 non-null    object
 1   home     720 non-null    object
 2   away     720 non-null    object
 3   stadium  720 non-null    object
 4   crowd    720 non-null    object
dtypes: object(5)
memory usage: 28.2+ KB


In [25]:
#쉼표제거 후 타입변경
df.crowd = df.crowd.str.replace(",","" ).astype(int)

In [26]:
# 이레귤러 데이터인 월요일 경기를 삭제해줌
df = df[df.day != '월']

In [27]:
df.reset_index(drop=True).to_csv("crowd_tr.csv",index = False)

In [18]:
feature = ['day','home','away','stadium']
target = 'crowd'
X = df[feature]
y = df[target]

## 머신러닝 모델링

### 기준모델

In [19]:
#기준모델 설정(평균)
y_real = y
y_base = [y.mean()] * len(y)
r2 = r2_score(y_real, y_base)
print(f"baseline score: {r2}")

baseline score: 0.0


### ridge회귀

In [20]:
#ridge회귀모델
model = make_pipeline(
    OneHotEncoder(use_cat_names=True, cols = feature),
    RidgeCV(alphas=np.arange(0.1, 10, 0.1), cv=5, scoring='r2')
)
model.fit(X,y)

  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():
  for cat_name, class_ in values.iteritems():


In [174]:
#모델 cv 성능 확인
print(f"alpha: {model.named_steps['ridgecv'].alpha_}")
print(f"cv best score: {model.named_steps['ridgecv'].best_score_}")

alpha: 5.7
cv best score: 0.611533826401347


### LGBMRegressor

In [52]:
pipe_lgbm = make_pipeline(
        OrdinalEncoder(),
        LGBMRegressor(random_state=53,
                       learning_rate=0.1,
                               n_jobs=-1)
        )


In [149]:
params = {
    "lgbmregressor__max_depth": hp.quniform("max_depth", 5,15,1),
    "lgbmregressor__colsample_bytree" : hp.uniform("colsample_bytree", 0.8, 1.0),
    "lgbmregressor__subsample": hp.uniform("subsample", 0.6, 1.0),
    "lgbmregressor__min_child_weight": hp.quniform("min_child_weight", 4, 20, 4),
    "lgbmregressor__n_estimators": hp.quniform("n_estimators", 100,2000,50),
    "lgbmregressor__learning_rate": hp.quniform("learning_rate", 0.06, 0.2, 0.02),
    "lgbmregressor__alpha": hp.quniform("alpha", 0.1, 10, 0.1)
}

In [150]:
def get_pipe(params):
    params["lgbmregressor__n_estimators"] = int(
        params["lgbmregressor__n_estimators"])
    params["lgbmregressor__max_depth"] = int(
        params["lgbmregressor__max_depth"])
    
    pipe = make_pipeline(
        OrdinalEncoder(),
        LGBMRegressor(random_state=53,                              
                               n_jobs=-1)
        )
    pipe = pipe.set_params(**params)
    return pipe

def fit_and_eval(params):
    pipe = get_pipe(params)  
    score = cross_val_score(pipe, X, y, cv=5, scoring="r2")
    avg_cv_score = np.mean(score)
    return {"loss": -avg_cv_score, "status": STATUS_OK}

trials = (Trials())

best_params = fmin(
    fn=fit_and_eval, trials=trials, space=params, algo=tpe.suggest, max_evals=30)

100%|██████████| 30/30 [00:28<00:00,  1.04trial/s, best loss: -0.5827566098517233]


In [151]:
print("최적 하이퍼파라미터: ", trials.best_trial["misc"]["vals"])
print("최적 r2: ", -trials.best_trial["result"]["loss"])

최적 하이퍼파라미터:  {'alpha': [9.8], 'colsample_bytree': [0.8970662975615508], 'learning_rate': [0.14], 'max_depth': [6.0], 'min_child_weight': [12.0], 'n_estimators': [100.0], 'subsample': [0.7400532061333471]}
최적 r2:  0.5827566098517233


r2스코어가 높은 ridge회귀 모델을 사용하기로 결정.

In [21]:
#임의값으로 경기를 넣어서 모델이 어떻게 예측하는지 확인해보기
X_test =pd.DataFrame([['일','LG','두산','한화']],columns=['day','home','away','stadium'])
model.predict(X_test)

array([12698.0227065])

### 피클링

In [175]:
import pickle

with open('model.pkl','wb') as pickle_file:
    pickle.dump(model, pickle_file)

In [5]:
import pickle

model_decoded = None
with open('model.pkl','rb') as pickle_file:
   model_decoded = pickle.load(pickle_file)

In [12]:
X_test =pd.DataFrame([['일','LG','두산','한화']],columns=['day','home','away','stadium'])
int(model_decoded.predict(X_test))
#복호화 해서 예측값을 확인한 결과 부호화 전의 값과 일치했다.

12698