In [36]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier

In [37]:
train = pd.read_csv('melon_train_feature.csv').iloc[:,1:]
test =  pd.read_csv('melon_test_feature.csv').iloc[:,1:]

In [38]:
y_train = pd.read_csv('y_train.csv').Top10
y_test = pd.read_csv('y_test.csv')

### 빠른 모델링

In [39]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=2020)

In [40]:
from sklearn.model_selection import train_test_split
X_train2, X_dev, y_train2, y_dev = train_test_split(train, y_train, test_size=0.3, random_state=2020, shuffle=True, stratify=y_train)

In [41]:
X_train2.shape, X_dev.shape, y_train2.shape, y_dev.shape

((621, 70), (267, 70), (621,), (267,))

In [42]:
lgbm = LGBMClassifier(random_state=2020,n_jobs= -1)
rf = RandomForestClassifier(random_state = 2020)
lr = LogisticRegression(random_state=2020,n_jobs= -1)
knn = KNeighborsClassifier(n_jobs= -1)
et = ExtraTreesClassifier(random_state=2020,n_jobs= -1)

In [43]:
lgbm.fit(X_train2, y_train2)
rf.fit(X_train2, y_train2)
lr.fit(X_train2, y_train2)
knn.fit(X_train2, y_train2)
et.fit(X_train2, y_train2)

ExtraTreesClassifier(n_jobs=-1, random_state=2020)

In [45]:
from sklearn.model_selection import cross_val_score
print('#### 만들어진 피처의 결과는....? ####')
cv_score1 = cross_val_score(lgbm, X_dev, y_dev, cv=skf, scoring = 'roc_auc')
print('lgbm : ',cv_score1)
print('평균 : ',np.mean(cv_score1))
cv_score2 = cross_val_score(rf, X_dev, y_dev, cv=skf, scoring = 'roc_auc')
print('rf : ',cv_score2)
print('평균 : ',np.mean(cv_score2))
cv_score3 = cross_val_score(lr, X_dev, y_dev, cv=skf, scoring = 'roc_auc')
print('lr : ',cv_score3)
print('평균 : ',np.mean(cv_score3))
cv_score4 = cross_val_score(knn, X_dev, y_dev, cv=skf, scoring = 'roc_auc')
print('knn : ',cv_score4)
print('평균 : ',np.mean(cv_score4))
cv_score5 = cross_val_score(et, X_dev, y_dev, cv=skf, scoring = 'roc_auc')
print('et : ',cv_score5)
print('평균 : ',np.mean(cv_score5))

#### 만들어진 피처의 결과는....? ####
lgbm :  [0.55523256 0.50857143 0.47904762 0.53670635]
평균 :  0.519889488741233
rf :  [0.48885659 0.50095238 0.50333333 0.51140873]
평균 :  0.5011377583979328
lr :  [0.58042636 0.48       0.43238095 0.3452381 ]
평균 :  0.45951135105204877
knn :  [0.42005814 0.50904762 0.52428571 0.60218254]
평균 :  0.5138935031376891
et :  [0.50145349 0.43809524 0.48190476 0.59275794]
평균 :  0.5035528562200073


## Hyperparameter Tuning

### LGBM

In [46]:
from bayes_opt import BayesianOptimization

pbounds = {'learning_rate' : (0.01,0.1),
           'n_estimators' : (5,50),
           'max_depth' : (2,20), # 과적합된 것 같다 _ max_depth 값을 줄이기
           'subsample' : (0.7,1),
           'colsample_bytree' : (0.8,1.0),
           'min_child_samples' : (3, 10),  # 보통 큰 값으로 설정하면 트리가 깊어지는 것을 방지
           'num_leaves': (2,10)
            # num_leaves의 개수를 높이면 정확도가 높아지지만, 반대로 트리의 깊이가 깊어지고 모델이 복잡도가 커져 과적합 영향도가 커집니다
           }

def lgb_opt(learning_rate, n_estimators, max_depth, subsample, colsample_bytree, min_child_samples, num_leaves):
    
    params = {
        'learning_rate' : learning_rate,
        'n_estimators' : int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'subsample' : subsample,
        'colsample_bytree' : colsample_bytree,
        'min_child_samples' : int(round(min_child_samples)),
        'num_leaves' : int(round(num_leaves)),
        'objective' : 'binary',
        'random_state' : 42,
        'n_jobs' : -1
    }

    lgb_reg = LGBMClassifier(**params)
    
    scores = cross_val_score(lgb_reg, X_train2, y_train2, scoring = 'roc_auc', cv=skf, n_jobs=-1)
    mean_score = np.mean(scores)
    
    return mean_score


BO_lgb = BayesianOptimization(f = lgb_opt, pbounds = pbounds, random_state=42)

In [47]:
BO_lgb.maximize(init_points=10, n_iter=10)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5201  [0m | [0m 0.8749  [0m | [0m 0.09556 [0m | [0m 15.18   [0m | [0m 7.191   [0m | [0m 12.02   [0m | [0m 3.248   [0m | [0m 0.7174  [0m |
| [95m 2       [0m | [95m 0.5463  [0m | [95m 0.9732  [0m | [95m 0.0641  [0m | [95m 14.75   [0m | [95m 3.144   [0m | [95m 48.65   [0m | [95m 8.66    [0m | [95m 0.7637  [0m |
| [0m 3       [0m | [0m 0.5261  [0m | [0m 0.8364  [0m | [0m 0.02651 [0m | [0m 7.476   [0m | [0m 6.673   [0m | [0m 24.44   [0m | [0m 4.33    [0m | [0m 0.8836  [0m |
| [0m 4       [0m | [0m 0.5254  [0m | [0m 0.8279  [0m | [0m 0.03629 [0m | [0m 8.595   [0m | [0m 6.192   [0m | [0m 40.33   [0m | [0m 3.597   [0m | [0m 0.8543  [0m |
| [0m 5       [0m | [0m 0.5284  [0m | 

In [48]:
max_lgbm = BO_lgb.max['params']
max_lgbm['n_estimators'] = int(round(max_lgbm['n_estimators']))
max_lgbm['num_leaves'] = int(round(max_lgbm['num_leaves']))
max_lgbm['max_depth'] = int(round(max_lgbm['max_depth']))
max_lgbm['min_child_samples'] = int(round(max_lgbm['min_child_samples']))

max_lgbm

{'colsample_bytree': 1.0,
 'learning_rate': 0.1,
 'max_depth': 13,
 'min_child_samples': 9,
 'n_estimators': 50,
 'num_leaves': 10,
 'subsample': 0.7027316113530102}

In [49]:
lgb_tune = LGBMClassifier(**max_lgbm)
lgb_tune.fit(X_train2, y_train2)
cross_val_score(lgb_tune, X_dev, y_dev, cv=skf, scoring = 'roc_auc')

array([0.54554264, 0.51142857, 0.47619048, 0.50198413])

### ExtraTree, RandomForest

In [50]:
from sklearn.model_selection import RandomizedSearchCV
import time
from tqdm import tqdm

clfs = [
           (
                ExtraTreesClassifier(random_state=2020),
                {'n_estimators': [3, 5, 7, 10, 15, 20, 30, 40, 50, 100, 200],
                 'criterion' : ["gini","entropy"],
                 'min_samples_split' : [1, 2, 3, 4, 5],
                 'max_depth' : [3, 5, 8, 10, 12, 15, 20, 30, 40]}
                # 'max_features': (np.arange(0.5, 1.0, 0.1)*X_train.shape[1]).astype(int)}
            ),
            (
            RandomForestClassifier(random_state=2020),
            {'n_estimators': [3, 5, 7, 10, 15, 20, 30, 40, 50, 100, 200],
             'max_depth': [3,5,7, 9, 10, 12, 15, 20],}
             # 'max_features': (np.arange(0.5, 1.0, 0.1)*X_train.shape[1]).astype(int)}
             )
        ]

clfs_tuned = []  # 튜닝된 모델을 저장
for clf, param_grid in tqdm(clfs):
    start = time.time()
    rand_search = RandomizedSearchCV(clf, param_grid, n_iter=5, scoring='roc_auc', 
                                     cv=skf, random_state=2020, n_jobs=-1)
    rand_search.fit(X_train2, y_train2)
    clf_name = type(clf).__name__
    clf_score = rand_search.score(X_dev, y_dev)
    print('{:30s} {:30f} {:.1f}'.format(clf_name, clf_score, time.time() - start))
    print(rand_search.best_estimator_)
    clfs_tuned.append((clf_name, rand_search, clf_score))

 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  7.46it/s]

ExtraTreesClassifier                                 0.468331 0.1
ExtraTreesClassifier(criterion='entropy', max_depth=20, min_samples_split=4,
                     n_estimators=30, random_state=2020)


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.60it/s]

RandomForestClassifier                               0.470565 0.4
RandomForestClassifier(max_depth=7, random_state=2020)





In [51]:
model_et = ExtraTreesClassifier(criterion='entropy', max_depth=20, min_samples_split=4,n_estimators=30, random_state=2020)
model_et.fit(X_train2, y_train2)

ExtraTreesClassifier(criterion='entropy', max_depth=20, min_samples_split=4,
                     n_estimators=30, random_state=2020)

In [52]:
cross_val_score(model_et, X_dev, y_dev, cv=skf, scoring = 'roc_auc')

array([0.47141473, 0.46571429, 0.52761905, 0.60168651])

In [53]:
model_rf = RandomForestClassifier(max_depth=7, random_state=2020)
model_rf.fit(X_train2, y_train2)

RandomForestClassifier(max_depth=7, random_state=2020)

In [54]:
cross_val_score(model_rf, X_dev, y_dev, cv=skf, scoring = 'roc_auc')

array([0.50193798, 0.44952381, 0.48857143, 0.52678571])

## 결과 test

In [59]:
from sklearn.metrics import roc_auc_score

In [105]:
# 전부 0으로 예측
roc_auc_score(y_test,np.zeros(test.shape[0]))

0.5

In [79]:
y_test = pd.read_csv('y_test.csv').astype('float')

In [88]:
# lgbm
pred_lgbm = pd.DataFrame(lgb_tune.predict_proba(test)[:,1])
pred_lgbm.columns = ['Top10']

In [91]:
roc_auc_score(y_test,pred_lgbm)

0.5956937799043063

In [94]:
pred_et = pd.DataFrame(model_et.predict_proba(test)[:,1])
pred_et.columns = ['Top10']

In [95]:
roc_auc_score(y_test,pred_et)

0.5526315789473684

In [96]:
pred_rf = pd.DataFrame(model_rf.predict_proba(test)[:,1])
pred_rf.columns = ['Top10']

In [97]:
roc_auc_score(y_test,pred_rf)

0.49222488038277507