# 하이퍼 파라미터 최적화: Grid Search & Random Search

- Grid Search는 사용자가 탐색할 parameter 값을 직접 지정해 주고, 지정한 값들의 모든 조합을 평가한다. 지정한 값 이외의 지점은 탐색하지 않으므로 최적 지점을 놓칠 확률이 random search에 비해 상대적으로 높다. 탐색 시간은 (지정한 조합 수가 적을 때) 상대적으로 적게 걸린다.

- Random Search는 사용자가 범위를 설정해 주고 샘플링 수를 정해 주면, 알고리즘이 그 범위 안에서 조합을 랜덤하게 뽑아 cross validation으로 평가하면서 최적점을 찾으려고 한다. 그래서 최적점을 찾을 확률이 상대적으로 높다. 샘플링 수를 높이면 학습하는 시간은 더 오래 걸리지만 최적점을 찾을 확률은 더 높아진다.

In [1]:
import pandas as pd
import time

In [8]:
# Load the train/test CSV files and report how long loading took.
# perf_counter() is a monotonic clock intended for measuring elapsed
# intervals, so the result is not affected by system clock adjustments.
start = time.perf_counter()
train = pd.read_csv('../train.csv', index_col=0)  # first CSV column becomes the index
test = pd.read_csv('../test.csv', index_col=0)
print(time.perf_counter() - start)  # elapsed seconds

125.44715166091919


![](https://miro.medium.com/max/1200/1*ZTlQm_WRcrNqL-nLnx6GJA.png)

In [32]:
# Using all of the data takes too long, so only a subset is extracted:
# the first five columns (by position) serve as the features.
y = train.loc[:, 'label']
X = train.iloc[:, 0:5]

## GridSearch

In [33]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV # GridSearchCV에서 CV는 CROSS VALIDATION, 성능평가를 같이 하기위해
# Grid of candidate values; every combination is evaluated, which gives two
# candidates here as (n_estimators, num_leaves): (50, 10) and (50, 15).
param = {
    'num_leaves': [10, 15],
    'n_estimators': [50],
}

model = lgb.LGBMClassifier(learning_rate=0.03, random_state=0)

# scoring='neg_log_loss': GridSearchCV treats a higher score as better, while
# log loss is better when lower, so the negated log loss is used.
# cv=3: 3-fold cross-validation -- each candidate is trained on two folds and
# validated on the remaining one, three times in total.
gs = GridSearchCV(
    estimator=model,
    param_grid=param,
    scoring='neg_log_loss',
    cv=3,
)

gs.fit(X, y)

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.03, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=0, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=None,
             param_grid={'n_estimators': [50], 'num_leaves': [10, 15]},
             pre_dispatc

In [34]:
import pprint # short for "pretty print"; used to show nested results readably
pprint.pprint(gs.best_params_) # best_params_ holds the best-scoring parameter combination

pprint.pprint(gs.cv_results_) # cv_results_ stores the results recorded while running the search
# mean_test_score: the evaluation score (mean over the folds)
# rank_test_score: the ranking of each candidate
# split0_test_score, ...: the score of each of the 3 folds

{'n_estimators': 50, 'num_leaves': 10}
{'mean_fit_time': array([5.62044366, 9.72578049]),
 'mean_score_time': array([2.6352133 , 3.34682218]),
 'mean_test_score': array([-4.34061464, -4.37139598]),
 'param_n_estimators': masked_array(data=[50, 50],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_num_leaves': masked_array(data=[10, 15],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'params': [{'n_estimators': 50, 'num_leaves': 10},
            {'n_estimators': 50, 'num_leaves': 15}],
 'rank_test_score': array([1, 2]),
 'split0_test_score': array([-4.333528  , -4.36132573]),
 'split1_test_score': array([-4.34583619, -4.37842419]),
 'split2_test_score': array([-4.34247974, -4.37443802]),
 'std_fit_time': array([0.19897029, 0.87574399]),
 'std_score_time': array([0.37787969, 0.12742558]),
 'std_test_score': array([0.00519499, 0.00730433])}


In [35]:
# The search object retrains itself with the best hyperparameters found above
# (GridSearchCV does this internally; refit is left at its default), so only
# the prediction step is needed here.
y_pred = gs.predict_proba(test.iloc[:, :5]) 

## Random search

In [24]:
# Random search
from sklearn.model_selection import RandomizedSearchCV


# Search space: instead of listing exact values, give ranges to sample from.
param = {
    'num_leaves': range(10, 15),
    'n_estimators': range(100, 200),
}

model = lgb.LGBMClassifier(learning_rate=0.03, random_state=0)

rs = RandomizedSearchCV(
    estimator=model,  # the model to tune
    param_distributions=param,
    n_iter=2,  # how many combinations to sample: only 2 here
    scoring='neg_log_loss',
    cv=3,
    n_jobs=-1,
    random_state=0,
)

rs.fit(X, y)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMClassifier(boosting_type='gbdt',
                                            class_weight=None,
                                            colsample_bytree=1.0,
                                            importance_type='split',
                                            learning_rate=0.03, max_depth=-1,
                                            min_child_samples=20,
                                            min_child_weight=0.001,
                                            min_split_gain=0.0,
                                            n_estimators=100, n_jobs=-1,
                                            num_leaves=31, objective=None,
                                            random_state=0, reg_alpha=0.0,
                                            reg_lambda=0.0, silent=True,
                                            subsample=1.0,
                                            subsample_for_bin=2

In [25]:
pprint.pprint(rs.best_params_) # the best-scoring combination among those sampled

pprint.pprint(rs.cv_results_) # table of per-candidate results

{'n_estimators': 109, 'num_leaves': 12}
{'mean_fit_time': array([60.57826002, 48.56829977]),
 'mean_score_time': array([83.7765859 , 71.58044314]),
 'mean_test_score': array([-4.49011554, -4.44166262]),
 'param_n_estimators': masked_array(data=[134, 109],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'param_num_leaves': masked_array(data=[12, 12],
             mask=[False, False],
       fill_value='?',
            dtype=object),
 'params': [{'n_estimators': 134, 'num_leaves': 12},
            {'n_estimators': 109, 'num_leaves': 12}],
 'rank_test_score': array([2, 1]),
 'split0_test_score': array([-4.48527946, -4.43597508]),
 'split1_test_score': array([-4.49465466, -4.44535484]),
 'split2_test_score': array([-4.4904125 , -4.44365794]),
 'std_fit_time': array([0.82462336, 1.94688146]),
 'std_score_time': array([0.11454041, 0.2121693 ]),
 'std_test_score': array([0.00383316, 0.00408093])}


In [26]:
# As with the grid search, the search object retrains itself with the best
# hyperparameters found, so only the prediction step is needed.
# NOTE(review): this calls predict() while the other predictions in this file
# use predict_proba(), and the search was scored with neg_log_loss -- confirm
# that predict() is what is wanted here.
y_pred_rs = rs.predict(test.iloc[:, :5])

# 모델 앙상블
여러 모델을 합쳐서 예측을 하면 새로 들어오는 값에 대해 일반화가 잘될 수 있다.

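##  보팅(Voting)
soft voting 또는 hard voting처럼 여러 모델의 예측을 합치는 방식. (엄밀히 말하면 보팅은 배깅과 구분되는 앙상블 기법이다. 배깅은 부트스트랩으로 뽑은 샘플마다 같은 종류의 모델을 학습시켜 결과를 합치는 방식이다.)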

In [36]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [38]:
# Declare the two base models (not fitted yet).
clf1 = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
    n_jobs=-1,
)
clf2 = lgb.LGBMClassifier(
    learning_rate=0.03,
    n_estimators=10,
    random_state=0,
)

In [39]:
# Combine the two models declared above into a soft-voting ensemble and train
# it; fit() is the method used for training and its return value is kept as
# the fitted ensemble.
eclf1 = VotingClassifier(
    estimators=[('rf', clf1), ('lgbm', clf2)],
    voting='soft',
).fit(X, y)

In [43]:
# Show the (name, estimator) pairs that were passed to the VotingClassifier.
eclf1.estimators

[('rf',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                         oob_score=False, random_state=0, verbose=0,
                         warm_start=False)),
 ('lgbm',
  LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                 importance_type='split', learning_rate=0.03, max_depth=-1,
                 min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
                 n_estimators=10, n_jobs=-1, num_leaves=31, objective=None,
                 random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
                 subsample=1.0,

In [41]:
# With soft voting, the final output is the average of the base models'
# predicted probabilities.
y_pred_vote = eclf1.predict_proba(test.iloc[:, :5])

In [46]:
# Display the predicted class probabilities from the voting ensemble.
y_pred_vote

array([[0.00113591, 0.00109712, 0.00113622, ..., 0.00055586, 0.00053781,
        0.000507  ],
       [0.0011484 , 0.0010874 , 0.00112431, ..., 0.00050887, 0.00100015,
        0.00051258],
       [0.00109916, 0.00099029, 0.00136401, ..., 0.00046762, 0.00053066,
        0.00050647],
       ...,
       [0.00095534, 0.00094977, 0.00092074, ..., 0.00044184, 0.00053738,
        0.00046011],
       [0.05109638, 0.00106991, 0.00111967, ..., 0.00046408, 0.00052065,
        0.00055006],
       [0.00109108, 0.00326838, 0.00103893, ..., 0.00053392, 0.00050373,
        0.00049263]])

## 스태킹
예측 확률을 단순히 평균 내는 대신, 각 모델의 예측 확률을 입력으로 받는 최종 모델(final estimator)을 학습시켜 확률 값별로 가중치를 다르게 주어 예측한다.

In [49]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [48]:
# Declare the two base models used for stacking (not fitted yet).
clf1 = RandomForestClassifier(
    n_estimators=10,
    random_state=0,
    n_jobs=-1,
)
clf2 = lgb.LGBMClassifier(
    learning_rate=0.03,
    n_estimators=10,
    random_state=0,
)

In [53]:
# Stack the two base models: the predictions from rf and lgbm (their
# predict_proba outputs) are fed into a new final model, LogisticRegression,
# which weights them and is trained on top.
clf = StackingClassifier(
    estimators=[('rf', clf1), ('lgbm', clf2)],
    final_estimator=LogisticRegression(),
    stack_method='predict_proba',
    cv=3,
    n_jobs=-1,
)

In [54]:
# Fit the stacking ensemble on the training features and labels.
clf = clf.fit(X, y)

In [55]:
# Class probabilities for the test set, using the same first five columns
# (by position) that were selected as training features.
y_pred_stack = clf.predict_proba(test.iloc[:, :5])

In [59]:
# Display the predicted class probabilities from the stacking ensemble.
y_pred_stack

array([[0.00216586, 0.00210159, 0.0021522 , ..., 0.00127722, 0.00137432,
        0.00135569],
       [0.0020739 , 0.00215997, 0.00196321, ..., 0.00153504, 0.00146594,
        0.00151122],
       [0.00196064, 0.00182352, 0.00195937, ..., 0.00117609, 0.00134152,
        0.00121692],
       ...,
       [0.00213839, 0.00195489, 0.00243081, ..., 0.00135868, 0.00124919,
        0.00139169],
       [0.00215344, 0.00206992, 0.00241268, ..., 0.00138046, 0.00138189,
        0.00143969],
       [0.00223416, 0.00217609, 0.00205226, ..., 0.00136906, 0.00130006,
        0.00142649]])

# Low Code ML

In [63]:
# Put the features and the target side by side in a single DataFrame; the
# pycaret setup() call later in this file takes one frame plus the name of
# the target column.
train_pycaret = pd.concat([X, y], axis = 1)

In [64]:
# Preview the first rows of the combined frame.
train_pycaret.head()

Unnamed: 0,V0000,V0001,V0002,V0003,V0004,label
0,30.474394,8.691177,8.714483,8.687399,8.72123,110
0,30.470463,8.736521,8.682769,8.717135,8.682402,110
0,30.465427,8.753559,8.663426,8.700049,8.734147,110
0,30.458532,8.715056,8.714854,8.717174,8.699257,110
0,30.475773,8.790241,8.735125,8.703167,8.72103,110


In [3]:
from pycaret.datasets import get_data
# Load the 'iris' sample dataset through pycaret's get_data helper.
dataset = get_data('iris')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the iris data as a test split (shuffled, fixed seed).
# NOTE(review): this rebinds `train` and `test`, the same names used for the
# frames loaded from '../train.csv' and '../test.csv' -- if the notebook is
# run top to bottom those frames are replaced; confirm this is intended.
train, test = train_test_split(dataset, test_size = 0.2, random_state = 0, shuffle = True)

In [65]:
from pycaret.classification import *
# Initialise the pycaret experiment on the combined frame with 'label' as the
# target; session_id=0 presumably fixes the random seed -- verify against the
# pycaret documentation.
# NOTE(review): this rebinds `clf`, which earlier in this file names the
# fitted StackingClassifier.
clf = setup(data = train_pycaret, target = 'label', session_id=0)

Unnamed: 0,Description,Value
0,session_id,0
1,Target,label
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(24810, 6)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# Cross-validate only the models listed in `include`, rank them by accuracy
# and return the top `n_select` of them.
# The cross-validation argument of compare_models() is named `fold`; `folds`
# is not a parameter of that function.
best = compare_models(include=['rf', 'lightgbm', 'dt'], sort='acc', n_select=3, fold=3)

IntProgress(value=0, description='Processing: ', max=19)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.0228,0.1177,0.0139,0.021,0.0215,0.0201,0.0202,6.937
