In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import pandas as pd
# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Wrap the feature matrix in a DataFrame (columns named after the features) and preview the first rows.
data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head()

- 로지스틱 회귀와 KNN을 기반으로 소프트 보팅 방식으로 새롭게 보팅 분류기 만듦
- VotingClassifier 클래스 : 보팅 분류기 생성
- 매개변수: estimators(리스트 값으로 보팅에 사용될 여러개의 Classifier 객체들을 튜플형식으로)와 voting('hard'시 하드보팅, 'soft'시 소프트 보팅)

In [None]:
# Base estimators: logistic regression and k-nearest neighbours.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Soft-voting ensemble built from the two base estimators.
vo_clf = VotingClassifier(estimators=[('LR',lr_clf),('KNN',knn_clf)], voting='soft')

# Hold out 20% of the samples as the test set; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.2, random_state=156)

# Fit, predict, and report test accuracy.
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도: {0: .4f}'.format(accuracy_score(y_test, pred)))

- 개별 모델의 학습/예측/평가

In [None]:
# Train and score each base estimator on its own so it can be compared with the voting ensemble.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    class_name = type(classifier).__name__
    # fit() returns the estimator itself, so prediction can be chained onto it.
    pred = classifier.fit(X_train, y_train).predict(X_test)
    print('{0}정확도: {1: .4f}'.format(class_name, accuracy_score(y_test, pred)))

### 랜덤포레스트(배깅에 속함!)

In [None]:
def get_new_feature_name_df(old_feature_name_df):
    """Return a feature-name table in which repeated names are made unique.

    The first occurrence of a name is kept as-is; each later occurrence gets
    a ``_<n>`` suffix, where ``n`` is its running duplicate count (1, 2, ...).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a ``column_name`` column holding string feature names.

    Returns
    -------
    pandas.DataFrame
        The input columns (with ``column_name`` rewritten) followed by a
        ``dup_cnt`` column holding the per-name occurrence counter.
    """
    # cumcount() numbers the rows inside each group of identical names: 0, 1, 2, ...
    feature_dup_df = pd.DataFrame(
        data=old_feature_name_df.groupby('column_name').cumcount(),
        columns=['dup_cnt'],
    ).reset_index()
    # Both frames expose the original index as an 'index' column, which is the shared merge key.
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')

    # Label-based, vectorised renaming. The previous row-wise lambda used x[0]/x[1],
    # i.e. positional access on a label-indexed Series, which pandas has deprecated.
    names = new_feature_name_df['column_name']
    dup_cnt = new_feature_name_df['dup_cnt']
    new_feature_name_df['column_name'] = names.where(dup_cnt <= 0, names + '_' + dup_cnt.astype(str))
    return new_feature_name_df.drop(['index'], axis=1)

In [None]:
def get_human_dataset(data_dir='/Users/stillssi/Desktop/MLP-Python/실습/실습 채우기/머신러닝/datas/human_activity'):
    """Load the human-activity train/test feature and label tables.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``features.txt`` plus the ``train/`` and ``test/``
        sub-directories. Defaults to the location this notebook originally
        hard-coded, so existing zero-argument calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``; the label frames have a single
        ``action`` column.
    """
    import os

    def _read_table(relative_path, names):
        # All files are whitespace-separated and header-less; the raw string
        # avoids the invalid-escape warning that '\s+' triggers.
        return pd.read_csv(os.path.join(data_dir, relative_path), sep=r'\s+', header=None, names=names)

    feature_name_df = _read_table('features.txt', ['column_index', 'column_name'])
    # De-duplicate repeated feature names before using them as column labels.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    # Second column holds the (now unique) names; convert to a plain list for `names=`.
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()

    X_train = _read_table(os.path.join('train', 'X_train.txt'), feature_name)
    X_test = _read_table(os.path.join('test', 'X_test.txt'), feature_name)

    y_train = _read_table(os.path.join('train', 'y_train.txt'), ['action'])
    y_test = _read_table(os.path.join('test', 'y_test.txt'), ['action'])

    return X_train, X_test, y_train, y_test

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Rebinds X_train/X_test/y_train/y_test to the human-activity data (replacing the breast-cancer split above).
X_train, X_test, y_train, y_test = get_human_dataset()

# Train a random forest and evaluate it on the separate test set.
rf_clf = RandomForestClassifier(random_state=0, max_depth=8)
rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy= accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {0: .4f}'.format(accuracy))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate combinations.
params = {
    'max_depth': [8,16,24],
    'min_samples_leaf':[1,6,12],
    'min_samples_split':[2,8,16]
}

# Create the RandomForestClassifier, then run GridSearchCV over the grid with 2-fold CV.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0: .4f}'.format(grid_cv.best_score_))

- 최적의 파라미터로 학습, 예측 정확도

In [None]:
# Refit with fixed hyper-parameters and score on the test set.
# NOTE(review): the values are hard-coded, presumably copied from a grid-search run — confirm they match grid_cv.best_params_.
rf_clf1 = RandomForestClassifier(n_estimators=100, min_samples_leaf=6, max_depth=16, min_samples_split=2, random_state=0)
rf_clf1.fit(X_train, y_train)
# Keep the predictions: previously the result was discarded and the accuracy below
# was computed from the stale `pred` left behind by an earlier cell.
pred = rf_clf1.predict(X_test)
print('예측 정확도: {0: .4f}'.format(accuracy_score(y_test, pred)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pair each importance score of the fitted forest with its feature name, then keep the 20 largest.
ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index=X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
plt.figure(figsize=(8,6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

## GBM(부스팅 알고리즘) 기반 분류

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')

X_train, X_test, y_train, y_test = get_human_dataset()

# Record the start time so the GBM training + prediction time can be reported.
start_time = time.time()
gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
# Score the GBM's own predictions; the previous code passed `pred`, which
# belongs to a model from an earlier cell.
gb_accuracy = accuracy_score(y_test, gb_pred)

print('GBM 정확도: {0: .4f}'.format(gb_accuracy))
print('GBM 수행 시간: {0: .1f} 초'.format(time.time()-start_time))


### XGBoost 

In [None]:
from tkinter.ttk import LabeledScale
import xgboost as xgb
from xgboost import plot_importance
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer data and split it into a feature matrix and a label vector.
dataset = load_breast_cancer()
features = dataset.data
labels = dataset.target

# Build a DataFrame of the features with the label appended as the last column, then preview it.
cancer_df = pd.DataFrame(data=features, columns=dataset.feature_names)
cancer_df['target']=labels
cancer_df.head(3)



In [None]:
# Show the class names and how many samples fall into each label value.
print(dataset.target_names)
print(cancer_df['target'].value_counts())

In [None]:
# Every column except the last is a feature; the last column is the label.
X_features = cancer_df.iloc[:, :-1]
y_label = cancer_df.iloc[:, -1]

# 80% train / 20% test.
X_train, X_test, y_train, y_test = train_test_split(X_features, y_label, test_size=0.2, random_state=156)

# Split the training part again: 90% for fitting, 10% held out for validation.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=156)

print(X_train.shape, X_test.shape)
print(X_tr.shape, X_val.shape)

In [None]:
# Wrap the train / validation / test splits in DMatrix objects for the native xgb.train API.
dtr = xgb.DMatrix(data=X_tr, label=y_tr)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Booster settings handed to xgb.train: binary logistic objective, evaluated with log-loss.
params = {
    'max_depth':3,
    'eta':0.05,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}
# Upper bound on boosting rounds.
num_rounds= 400

In [None]:
# Evaluate on both the training and the validation DMatrix each round, with an early-stopping patience of 50.
eval_list = [(dtr, 'train'), (dval, 'eval')]
xgb_model = xgb.train(params=params, dtrain=dtr, num_boost_round = num_rounds, early_stopping_rounds=50, evals=eval_list)

In [None]:
# With the binary:logistic objective configured above, predict() yields probabilities rather than class labels.
pred_probs=xgb_model.predict(dtest)
print('predict() 수행 결과값을 10개만 표시, 예측 확률값으로 표시됨')
print(np.round(pred_probs[:10],3))

# Threshold the probabilities at 0.5 to obtain 0/1 class predictions.
preds=[1 if x>0.5 else 0 for x in pred_probs]
print('예측값 10개만 표시', preds[:10])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted class labels; used for the confusion matrix, accuracy,
        precision, recall and F1.
    pred_proba : array-like
        Scores passed to ``roc_auc_score`` for the AUC.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    confusion = confusion_matrix(y_test, pred)
    # Collected in the order the format string below consumes them.
    metric_values = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, pred_proba),
    )
    print('오차행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도:{1: .4f}, 재현율:{2: .4f}, f1점수: {3: .4f}, AUC점수:{4: .4f}'.format(*metric_values))

In [None]:
# Evaluate the native-API XGBoost model: thresholded labels plus raw probabilities for the AUC.
get_clf_eval(y_test, preds, pred_probs)

## 사이킷런 래퍼 XGBClassifier, XGBRegressor
- 매개변수: eta -> learning_rate, sub_sample->subsample, lambda->reg_lambda, alpha->reg_alpha

In [None]:
from xgboost import XGBClassifier

# scikit-learn style wrapper, fitted on the full training split without early stopping.
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.05, max_depth=3, eval_metric='logloss')
xgb_wrapper.fit(X_train, y_train, verbose=True)
w_preds = xgb_wrapper.predict(X_test)
# Column 1 of predict_proba is taken as the positive-class probability.
w_pred_proba=xgb_wrapper.predict_proba(X_test)[:,1]

In [None]:
# Evaluate the wrapper model trained without early stopping.
get_clf_eval(y_test, w_preds, w_pred_proba)

- 조기중단 적용

In [None]:
from __future__ import nested_scopes
from xgboost import XGBClassifier
# Same model, now fitted on X_tr only, with the validation split monitored for early stopping (patience 50).
xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.05, max_depth=3)
evals = [(X_tr, y_tr), (X_val, y_val)]
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the constructor
# rather than on fit() — confirm against the installed version.
xgb_wrapper.fit(X_tr, y_tr, early_stopping_rounds=50, eval_metric='logloss', eval_set=evals, verbose=True)
ws50_preds = xgb_wrapper.predict(X_test)
ws50_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]


In [None]:
# Evaluate the model trained with an early-stopping patience of 50.
get_clf_eval(y_test, ws50_preds, ws50_pred_proba)

- 조기종료 라운드 수 10으로 변경

In [None]:
# Refit the same estimator object with a much shorter patience (10 rounds) and evaluate it.
# NOTE(review): newer xgboost releases may reject early_stopping_rounds / eval_metric in fit() — confirm.
xgb_wrapper.fit(X_tr, y_tr, early_stopping_rounds=10, eval_metric='logloss', eval_set=evals, verbose=True)
ws10_pred = xgb_wrapper.predict(X_test)
ws10_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1]
get_clf_eval(y_test, ws10_pred, ws10_pred_proba)

## LightGBM

In [None]:
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Rebuild the breast-cancer DataFrame: features plus a trailing 'target' column.
dataset = load_breast_cancer()
cancer_df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
cancer_df['target'] = dataset.target
X_feature = cancer_df.iloc[:,:-1]
y_label = cancer_df.iloc[:,-1]

# 80/20 train/test split, then 90/10 fit/validation split of the training part.
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_label, test_size=0.2, random_state=156)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=156)

lgbm_wrapper = LGBMClassifier(n_estimators=400, learning_rate=0.05)

# Monitor both splits during training, with an early-stopping patience of 50.
evals = [(X_tr, y_tr), (X_val, y_val)]
# NOTE(review): newer LightGBM releases may no longer accept early_stopping_rounds / verbose in fit()
# (callbacks are used instead) — confirm against the installed version.
lgbm_wrapper.fit(X_tr, y_tr, early_stopping_rounds=50, eval_metric='logloss', eval_set=evals, verbose=True)
preds = lgbm_wrapper.predict(X_test)
pred_proba = lgbm_wrapper.predict_proba(X_test)[:,1]

In [None]:
# Evaluate the LightGBM model (here `preds` / `pred_proba` are the LightGBM outputs from the previous cell).
get_clf_eval(y_test, preds, pred_proba)

In [None]:
from lightgbm import plot_importance
import matplotlib.pyplot as plt

# Plot LightGBM feature importances on an explicit Axes.
# `plot_importance` here is the lightgbm function imported in this cell, not the xgboost one imported earlier.
fig, ax = plt.subplots(figsize=(10,12))
plot_importance(lgbm_wrapper, ax=ax)

- 베이지안 최적화 기반의 HyperOpt를 이용한 하이퍼 파라미터 튜닝

In [None]:
# Candidate hyper-parameter values, given as discrete lists.
# NOTE(review): this rebinds `params` (previously the xgb.train settings), and no visible cell consumes it — confirm intent.
params = {
    'max_depth' : [10,20,30,40,50], 
    'num_leaves' : [35,45,55,65],
    'colsample_bytree':[0.5, 0.6,0.7,0.8,0.9], 
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'min_child_weight': [10,20,30,40],
    'reg_alpha': [0.01, 0.05, 0.1]
}

## 분류 실습 - 캐글 산탄데르 고객 만족 예측

- 데이터 전처리

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Read the training CSV (latin1-encoded) from an absolute local path.
# NOTE(review): the path is machine-specific; the notebook will not run elsewhere without editing it.
cust_df = pd.read_csv('/Users/stillssi/Desktop/MLP-Python/실습/실습 채우기/머신러닝/datas/train.csv', encoding='latin1')
print('dataset shape', cust_df.shape)
cust_df.head(3)

In [None]:
# Column dtypes and non-null counts.
cust_df.info()

In [None]:
# Class balance of the label: raw counts first, then the share of rows with TARGET == 1.
target = cust_df['TARGET']
print(target.value_counts())
unsatisfied_cnt = target[target == 1].count()
total_cnt = target.count()
print('unsatisfied 비율은 {0: .2f}'.format(unsatisfied_cnt/total_cnt))

var3의 최솟값이 -999999임 -> NaN이나 특정 예외값을 -999999로 변환한 것으로 보임

In [None]:
# Summary statistics for every numeric column.
cust_df.describe()

In [None]:
print(cust_df.var3.value_counts()[:10]) # -999999 occurs 116 times; it deviates sharply from the other values, so it is replaced with 2 below

In [None]:
# Replace the -999999 sentinel in var3 with 2.
# Assign the result back instead of calling replace(..., inplace=True) on the column selection:
# the chained in-place form may act on a temporary and leave cust_df unchanged.
cust_df['var3'] = cust_df['var3'].replace(-999999, 2)
# NOTE(review): dropping the 'ID' column was disabled in the original, so ID is still part of
# X_feature below. An identifier is unlikely to be a meaningful feature — consider
# `cust_df = cust_df.drop(columns='ID', errors='ignore')` (safe to re-run).

# Split features and label; the label is the last column of the DataFrame.
X_feature = cust_df.iloc[:,:-1]
y_labels = cust_df.iloc[:, -1]
print('피처 데이터 shape:{0}'.format(X_feature.shape))

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split of the customer data.
X_train, X_test, y_train, y_test = train_test_split(X_feature, y_labels, test_size=0.2, random_state=0)

train_cnt = y_train.count()
test_cnt = y_test.count()

# Compare the label distribution of the two splits to check that they are similar.
print('학습 세트 Shape: {0}, 테스트 세트 shape:{1}'.format(X_train.shape, X_test.shape))
print('학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)
print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt)

- XGBoost 조기 중단의 검증 데이터 세트로 사용하기 위해 학습 데이터 쪼개기

In [None]:
# Hold out 30% of the training split as a validation set (used as eval_set in the cells below).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=0)

- XGBoost 모델 학습과 하이퍼 파라미터 튜닝

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Use 500 boosting rounds; random_state is fixed so repeated runs give the same result.
# The keyword is `n_estimators`: the previous `n_estimator` spelling is not a recognised
# parameter, so the intended 500 rounds were never applied.
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=156)

# Train with AUC as the evaluation metric and an early-stopping patience of 100.
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the constructor — confirm.
xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set=[(X_tr, y_tr), (X_val, y_val)])

xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0: .4f}'.format(xgb_roc_score))

In [None]:
from hyperopt import hp
# Search space:
#   max_depth         5..15 in steps of 1
#   min_child_weight  1..6 in steps of 1
#   colsample_bytree  uniform in [0.5, 0.95]
#   learning_rate     uniform in [0.01, 0.2]
# hp.quniform samples on a fixed step; hp.uniform samples anywhere in the interval.
xgb_search_space = {
    'max_depth': hp.quniform('max_depth',5,15,1),
    'min_child_weight': hp.quniform('min_child_weight', 1,6,1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
    # Upper bound corrected from 2 to 0.2 to match the range stated for this search.
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
}


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def objective_func(search_space):
    """HyperOpt objective: negative mean 3-fold ROC AUC of an XGBClassifier.

    Reads the notebook-level ``X_train`` / ``y_train`` and splits them with
    ``KFold(n_splits=3)``.

    Parameters
    ----------
    search_space : dict
        Sampled values for ``max_depth``, ``min_child_weight``,
        ``colsample_bytree`` and ``learning_rate``. The first two are cast to
        ``int`` before being handed to the classifier.

    Returns
    -------
    float
        ``-1 * mean(AUC over the folds)``; negated because HyperOpt minimises
        the objective.
    """
    # `n_estimators` is the recognised keyword; the previous `n_estimator` spelling was a typo.
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        colsample_bytree=search_space['colsample_bytree'],
        learning_rate=search_space['learning_rate'],
    )

    # ROC AUC measured on the validation part of each fold.
    roc_auc_list = []

    kf = KFold(n_splits=3)
    # Split X_train again into fit / validation parts, one pair per fold.
    for tr_index, val_index in kf.split(X_train):
        X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the constructor — confirm.
        xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric='auc', eval_set=[(X_tr, y_tr), (X_val, y_val)])

        score = roc_auc_score(y_val, xgb_clf.predict_proba(X_val)[:,1])
        roc_auc_list.append(score)

    # HyperOpt looks for the minimum, so return the negated mean AUC.
    return -1*np.mean(roc_auc_list)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, Trials
import numpy as np
# Trials object that records every evaluation made by fmin().
trials = Trials()
# Same search space as defined earlier, repeated here so this cell is self-contained.
xgb_search_space = {
    'max_depth': hp.quniform('max_depth',5,15,1),
    'min_child_weight': hp.quniform('min_child_weight', 1,6,1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
    # Upper bound corrected from 2 to 0.2 (the learning-rate range is 0.01 to 0.2).
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
}

# Run fmin() for max_evals iterations and keep the inputs that minimise the objective.
best = fmin(fn=objective_func, space=xgb_search_space, algo=tpe.suggest, max_evals=50, trials=trials, rstate=np.random.default_rng(seed=30))

# print('best:',best)

In [None]:
# Raise n_estimators to 500 and train/predict with the hyper-parameters found by the search.
# max_depth and min_child_weight are cast to int before being passed on.
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=round(best['learning_rate'],5), max_depth=int(best['max_depth']),
min_child_weight=int(best['min_child_weight']), colsample_bytree=round(best['colsample_bytree'],15))

# Train with AUC as the evaluation metric and an early-stopping patience of 100.
# NOTE(review): newer xgboost releases expect early_stopping_rounds / eval_metric on the constructor — confirm.
xgb_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric='auc', eval_set=[(X_tr, y_tr),(X_val, y_val)])

xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
print('ROC AUC 점수: {0: .4f}'.format(xgb_roc_score))

In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
%matplotlib inline

# plt.subplots (plural) returns the (figure, axes) pair and accepts figsize;
# plt.subplot returns a single Axes and cannot be unpacked into two names.
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Show the 20 most important features of the tuned XGBoost model.
plot_importance(xgb_clf, ax=ax, max_num_features=20, height=0.4)

- LightGBM 모델 학습과 하이퍼 파라미터 튜닝

In [25]:
from lightgbm import LGBMClassifier

# Baseline LightGBM model with 500 boosting rounds.
lgbm_clf = LGBMClassifier(n_estimators=500)

# Monitor AUC on both the fit and the validation split, with an early-stopping patience of 100.
eval_set = [(X_tr, y_tr),(X_val, y_val)]
# NOTE(review): newer LightGBM releases may no longer accept early_stopping_rounds in fit() — confirm.
lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric='auc', eval_set=eval_set)

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0: .4f}'.format(lgbm_roc_score))


[1]	training's auc: 0.82625	training's binary_logloss: 0.15523	valid_1's auc: 0.809814	valid_1's binary_logloss: 0.15774
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.83366	training's binary_logloss: 0.149566	valid_1's auc: 0.812647	valid_1's binary_logloss: 0.153249
[3]	training's auc: 0.839786	training's binary_logloss: 0.145331	valid_1's auc: 0.814983	valid_1's binary_logloss: 0.150043
[4]	training's auc: 0.84588	training's binary_logloss: 0.142002	valid_1's auc: 0.820013	valid_1's binary_logloss: 0.147504
[5]	training's auc: 0.848189	training's binary_logloss: 0.139394	valid_1's auc: 0.821242	valid_1's binary_logloss: 0.145447
[6]	training's auc: 0.853423	training's binary_logloss: 0.137158	valid_1's auc: 0.820464	valid_1's binary_logloss: 0.143963
[7]	training's auc: 0.85535	training's binary_logloss: 0.135295	valid_1's auc: 0.821721	valid_1's binary_logloss: 0.142658
[8]	training's auc: 0.85958	training's binary_logloss: 0.133521	valid_1's au

In [None]:
# HyperOpt search space for LightGBM; every entry is sampled on an integer step.
lgbm_search_space={
    'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
    'max_depth': hp.quniform('max_depth', 100, 160, 1),
    # TODO(review): this entry had no value in the original (a SyntaxError).
    # The range below is a placeholder — adjust it to the intended search range.
    'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1)
}