# ch.4 분류

## Logistic Regression

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset object; later cells read its data, target and feature_names.
iris = load_iris()

In [None]:
# Attributes read from the dataset object: data, target, target_names, DESCR, feature_names

data = iris.data
label = iris.target
columns = iris.feature_names
# Wrap the feature matrix in a DataFrame (one column per feature name) for inspection.
df = pd.DataFrame(data, columns=columns)
df.head()

In [None]:
# Display the target labels taken from iris.target.
label

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the split is shuffled,
# stratified on the labels and seeded.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    shuffle=True,
    stratify=label,
    random_state=2019,
)

- 모델 객체 생성

In [None]:
from sklearn.linear_model import LogisticRegression
# Create the logistic-regression model object with verbose output turned on.
lr = LogisticRegression(verbose=1)

- 모델 학습

In [None]:
# Train the logistic-regression model on the training split.
lr.fit(x_train, y_train)

- 예측 (결과 확인)

In [None]:
# Predict labels for the held-out test split.
y_pred_lr = lr.predict(x_test)

In [None]:
# Accuracy of the logistic-regression predictions on the test split
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred_lr)

In [None]:
# Coefficients (weights) of the fitted model
lr.coef_

In [None]:
# Intercept (bias) of the fitted model
lr.intercept_

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Create the support-vector classifier with default settings.
svc = SVC()

In [None]:
# Train the SVC on the training split.
svc.fit(x_train, y_train)

In [None]:
# Predict labels for the test split with the SVC.
y_pred_svc = svc.predict(x_test)

In [None]:
# Accuracy of the SVC predictions on the test split.
accuracy_score(y_test, y_pred_svc)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Create a decision tree with default settings (no random_state is passed here).
tree = DecisionTreeClassifier()

In [None]:
# Train the decision tree on the training split.
tree.fit(x_train, y_train)

In [None]:
# Predict labels for the test split with the decision tree.
y_pred_tree = tree.predict(x_test)

In [None]:
# Accuracy of the decision-tree predictions on the test split.
accuracy_score(y_test, y_pred_tree)

In [None]:
#### Compare the predictions of the three models
# One column per model plus the true labels ('정답').
# Note: this rebinds df, which previously held the iris feature table.

df = pd.DataFrame({'LR' : y_pred_lr, 'SVC' : y_pred_svc, 'TREE' : y_pred_tree, '정답': y_test})

In [None]:
# Display the side-by-side prediction table.
df

In [None]:
# 위에 4단계 그냥 외우기

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the iris data and split it into training and test sets (20% held out).
iris_data = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris_data.data,
    iris_data.target,
    test_size=0.2,
    random_state=11,
)

# Create the seeded DecisionTreeClassifier and train it on the training split.
dt_clf = DecisionTreeClassifier(random_state=156)
dt_clf.fit(x_train, y_train)

In [None]:
from sklearn.tree import export_graphviz

# Export the fitted tree to the file named by out_file (tree.dot).
export_graphviz(
    dt_clf,
    out_file="tree.dot",
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=True,
)

In [None]:
import graphviz

# Read the tree.dot file generated above and let Graphviz render it inside the notebook.
with open("tree.dot") as f :
    dot_graph = f.read()
    
graphviz.Source(dot_graph)     # author's note: lighter fill = higher impurity (classes more mixed), darker = closer to a single class

In [None]:
# Predictions of the unconstrained tree (dt_clf) on the test split.
y_pred = dt_clf.predict(x_test)

In [None]:
# Accuracy of the unconstrained tree's predictions.
accuracy_score(y_test, y_pred)

### 제약 조건
- max_depth = 3 (위에껀 안 줌, 무한임)

In [None]:
# Decision tree limited to max_depth=3, trained on the same training split.
dt_clf2 = DecisionTreeClassifier(max_depth = 3, random_state=156)
dt_clf2.fit(x_train, y_train)

In [None]:
# Export the depth-limited tree to tree.dot (nodes left unfilled), then read
# the file back and render it in the notebook.
export_graphviz(
    dt_clf2,
    out_file="tree.dot",
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=False,
)
with open("tree.dot") as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
# Predictions of the max_depth=3 tree on the test split.
y_pred2 = dt_clf2.predict(x_test)

In [None]:
# Accuracy of the max_depth=3 tree: score its own predictions (y_pred2),
# not y_pred, which belongs to the unconstrained tree.
accuracy_score(y_test, y_pred2)

- min_samples_split = 4 인 경우

In [None]:
# Decision tree with min_samples_split=4, trained on the same training split.
dt_clf3 = DecisionTreeClassifier(min_samples_split = 4, random_state=156)
dt_clf3.fit(x_train, y_train)

In [None]:
# Export the min_samples_split=4 tree to tree.dot, then read the file back
# and render it in the notebook.
export_graphviz(
    dt_clf3,
    out_file="tree.dot",
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=True,
)
with open("tree.dot") as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
# Predict with the min_samples_split=4 tree and score those predictions
# (y_pred3), not y_pred, which belongs to the unconstrained tree.
y_pred3 = dt_clf3.predict(x_test)
accuracy_score(y_test, y_pred3)

- min_samples_leaf = 4 인 경우

In [None]:
# Decision tree with min_samples_leaf=4, trained on the same training split.
dt_clf4 = DecisionTreeClassifier(min_samples_leaf = 4, random_state=156)
dt_clf4.fit(x_train, y_train)

In [None]:
# Export the min_samples_leaf=4 tree to tree.dot, then read the file back
# and render it in the notebook.
export_graphviz(
    dt_clf4,
    out_file="tree.dot",
    class_names=iris_data.target_names,
    feature_names=iris_data.feature_names,
    impurity=True,
    filled=True,
)
with open("tree.dot") as f:
    dot_graph = f.read()

graphviz.Source(dot_graph)

In [None]:
# Predict with the min_samples_leaf=4 tree and score those predictions
# (y_pred4), not y_pred, which belongs to the unconstrained tree.
y_pred4 = dt_clf4.predict(x_test)
accuracy_score(y_test, y_pred4)

### Feature 중요도

In [None]:
dt_clf.feature_importances_   # plural attribute name (trailing "s"); a higher value means a more important feature

In [None]:
# Pair each feature name with its importance and print it to three decimals.
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))
    

In [None]:
# Feature importances of the unconstrained tree next to the three constrained
# trees. The unconstrained column uses dt_clf, which was fitted on the same
# training split and with the same random_state as dt_clf2..dt_clf4; `tree`
# from the earlier section was fitted on a different split without a seed,
# so its importances are not comparable with the other columns.
df = pd.DataFrame({
    'name': iris_data.feature_names,
    '제약 무': dt_clf.feature_importances_,
    'max_depth = 3': dt_clf2.feature_importances_,
    'min_samples_split = 4': dt_clf3.feature_importances_,
    'min_samples_leaf = 4': dt_clf4.feature_importances_,
})
df

## 앙상블 학습

In [None]:
import pandas as pd

from sklearn.datasets import load_breast_cancer # dataset used for this section

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


# Load the breast-cancer dataset object.
cancer = load_breast_cancer()

# Put the feature matrix into a DataFrame (columns named after the features) and preview it.
data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
data_df.head()

In [None]:
# Count of non-missing values per column.
data_df.count()

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

# The individual models are logistic regression and KNN.
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Create the ensemble classifier that combines the individual models with soft voting.
vo_clf = VotingClassifier( estimators=[('LR', lr_clf), ('KNN', knn_clf)], voting='soft')
# Hold out 20% of the breast-cancer samples as the test set (seeded split).
X_train, X_test, y_train, y_test = \
train_test_split(cancer.data, cancer.target, test_size = 0.2, random_state=156)


In [None]:
# Train the VotingClassifier, predict on the test split and print its accuracy.
vo_clf.fit(X_train , y_train)
pred = vo_clf.predict(X_test)
print('Voting 분류기 정확도: {0:.4f}'.format(accuracy_score(y_test , pred)))

In [None]:
# Train, predict with and evaluate each individual model.
classifiers = [lr_clf, knn_clf]   # two models, so two accuracy lines are printed
for classifier in classifiers:
    classifier.fit(X_train , y_train)
    pred = classifier.predict(X_test)
    # Use the estimator's class name as the label in the printed line.
    class_name= classifier.__class__.__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_test , pred)))

## Random Forest

In [None]:
# page 204, 205를 먼저 쳐야지 랜덤 포레스트 예시 진행 가능

def get_new_feature_name_df(old_feature_name_df):
    """Return a copy of the feature-name table with duplicate names made unique.

    Every repeated occurrence of a value in the 'column_name' column gets a
    '_<n>' suffix, where n is its running occurrence count within that name
    (the first occurrence keeps its name; later ones become name_1, name_2, ...).

    Parameters
    ----------
    old_feature_name_df : pandas.DataFrame
        Table with a 'column_name' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The original columns (with 'column_name' de-duplicated) followed by a
        'dup_cnt' column holding the occurrence count, on a fresh 0..n-1 index.
    """
    # 0 for the first occurrence of a name, 1 for the second, and so on.
    dup_cnt = old_feature_name_df.groupby('column_name').cumcount()

    # reset_index(drop=True) returns a new frame, so the caller's table stays untouched.
    new_feature_name_df = old_feature_name_df.reset_index(drop=True)
    new_feature_name_df['dup_cnt'] = dup_cnt.to_numpy()

    # Label-based, vectorised rename. This replaces the row-wise lambda that
    # read x[0] / x[1] positionally from a string-labelled Series, a lookup
    # style pandas has deprecated.
    is_dup = new_feature_name_df['dup_cnt'] > 0
    suffix = '_' + new_feature_name_df['dup_cnt'].astype(str)
    new_feature_name_df.loc[is_dup, 'column_name'] = (
        new_feature_name_df.loc[is_dup, 'column_name'] + suffix[is_dup]
    )
    return new_feature_name_df

In [None]:
def get_human_dataset(data_dir='./human_activity'):
    """Load the human-activity train/test features and labels as DataFrames.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing features.txt plus the train/ and test/
        sub-directories. Defaults to './human_activity', the location the
        notebook has always read from.

    Returns
    -------
    tuple of pandas.DataFrame
        (X_train, X_test, y_train, y_test). The label frames have a single
        column named 'action'.
    """
    # Raw string: '\s' is a regex escape, not a valid Python string escape.
    whitespace = r'\s+'

    # features.txt holds whitespace-separated (index, name) pairs.
    feature_name_df = pd.read_csv(data_dir + '/features.txt', sep=whitespace,
                                  header=None, names=['column_index', 'column_name'])

    # Make repeated feature names unique before using them as column labels.
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    feature_name = new_feature_name_df['column_name'].values.tolist()

    X_train = pd.read_csv(data_dir + '/train/X_train.txt', sep=whitespace, names=feature_name)
    X_test = pd.read_csv(data_dir + '/test/X_test.txt', sep=whitespace, names=feature_name)

    y_train = pd.read_csv(data_dir + '/train/y_train.txt', sep=whitespace,
                          header=None, names=['action'])
    y_test = pd.read_csv(data_dir + '/test/y_test.txt', sep=whitespace,
                         header=None, names=['action'])

    return X_train, X_test, y_train, y_test

# Load the human-activity splits; this rebinds X_train/X_test/y_train/y_test used by earlier cells.
X_train, X_test, y_train, y_test = get_human_dataset()

In [None]:
# Print a header line, then the summary produced by DataFrame.info() for the training features.
print('## 학습 피처 데이터셋 info()')
print(X_train.info())

In [None]:
# Now the random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Parameter grid searched below.
params = {'n_estimators':[100], 'max_depth':[6, 8, 10, 12], 'min_samples_leaf':[8, 12, 18],\
         'min_samples_split':[8, 16, 20]}

# Create the RandomForestClassifier, then run GridSearchCV (2-fold) over the grid.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
# NOTE(review): y_train is the single-column DataFrame returned by get_human_dataset;
# scikit-learn presumably wants a 1-D label array here (e.g. y_train.values.ravel()) — confirm.
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)  # best combination among the candidates given above
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Random forest with 300 trees and fixed depth/leaf/split settings.
rf_clf1 = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_leaf=8, \
                                min_samples_split=8, random_state=0)

# Train on the training split and print the accuracy on the test split.
rf_clf1.fit(X_train, y_train)
pred = rf_clf1.predict(X_test)
print('예측 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Label each importance of rf_clf1 with its feature (column) name.
ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values,index=X_train.columns  )
# Keep the 20 largest importances, in descending order.
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

# Horizontal bar chart: importance on x, feature name on y.
plt.figure(figsize=(8,6))
plt.title('Feature importances Top 20')
sns.barplot(x=ftr_top20 , y = ftr_top20.index)
plt.show()

In [None]:

GridSearchCV 로 교차검증 및 하이퍼 파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Same parameter grid and estimator settings as the earlier grid-search cell.
params = {
    'n_estimators': [100],
    'max_depth': [6, 8, 10, 12], 
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20]
}
# Create the RandomForestClassifier, then run GridSearchCV (2-fold) over the grid.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Random forest with 100 trees and fixed depth/leaf/split settings; rebinds rf_clf1.
rf_clf1 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_leaf=8, \
                                 min_samples_split=8, random_state=0)
# Train on the training split and print the accuracy on the test split.
rf_clf1.fit(X_train, y_train)
pred = rf_clf1.predict(X_test)
print('예측 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')


# Start time for measuring how long GBM takes (currently disabled, so nothing is timed).
# start_time = time.time()

# Train a gradient-boosting classifier with progress output, then score it on the test split.
gb_clf = GradientBoostingClassifier(random_state=0, verbose=1)
gb_clf.fit(X_train , y_train)
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)


In [None]:
# Reload the human-activity splits and fit gb_clf again on the training split.
X_train, X_test, y_train, y_test = get_human_dataset()

gb_clf.fit(X_train, y_train)

In [None]:
# Predict on the test split and print the GBM accuracy to four decimals.
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print('GBM 정확도: {0:.4f}'.format(gb_accuracy))

In [None]:
## 4.6 XGBoost(eXtra Gradient Boosting)

In [None]:
import xgboost
# Show which XGBoost version is installed.
xgboost.__version__

In [None]:
import xgboost as xgb
from xgboost import plot_importance
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer dataset and keep its feature matrix and labels.
dataset = load_breast_cancer()
X_features= dataset.data
y_label = dataset.target

# Feature DataFrame with the label appended as a 'target' column; preview the first 3 rows.
cancer_df = pd.DataFrame(data=X_features, columns=dataset.feature_names)
cancer_df['target']= y_label
cancer_df.head(3)

In [None]:
# Print the class names, then how many rows carry each target value.
print(dataset.target_names)
print(cancer_df['target'].value_counts())

In [None]:
# Hold out 20% of the breast-cancer samples as the test set (seeded split).
X_train, X_test, y_train, y_test = \
    train_test_split(X_features, y_label, test_size=0.2, random_state=156)

In [None]:
# Wrap the train and test splits, with their labels, in xgb.DMatrix objects.
dtrain = xgb.DMatrix(data=X_train , label=y_train)
dtest = xgb.DMatrix(data=X_test , label=y_test)

In [None]:
# Booster hyper-parameters. Early stopping is not a booster parameter: XGBoost
# takes it as the early_stopping_rounds argument of xgb.train(), so a key for
# it in this dict has no effect and is left out.
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
}
# Maximum number of boosting rounds.
num_rounds = 400

In [None]:
# Label the training set 'train' and the evaluation (test) set 'eval'.
wlist = [(dtrain, 'train'), (dtest, 'eval')]
# Pass the hyper-parameters and the early-stopping setting to train().
# Early stopping only takes effect through the early_stopping_rounds keyword
# of train() itself; the last entry of evals ('eval') is the one it monitors.
xgb_model = xgb.train(params=params, dtrain=dtrain,
                      num_boost_round=num_rounds, evals=wlist,
                      early_stopping_rounds=100)

In [None]:
# NOTE(review): get_clf_eval and ws100_preds are not defined in any visible cell of
# this notebook — presumably they came from cells that were removed or from another
# chapter; confirm they exist before running this cell.
get_clf_eval(y_test , ws100_preds)

In [None]:
# Retrain with early_stopping_rounds set to 10.
# NOTE(review): xgb_wrapper, evals and get_clf_eval are not defined in any visible
# cell of this notebook — confirm they exist before running this cell.
# NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds and
# eval_metric on the estimator constructor rather than on fit() — verify against the
# installed version.
xgb_wrapper.fit(X_train, y_train, early_stopping_rounds=10, 
                eval_metric="logloss", eval_set=evals, verbose=True)

# Predict on the test split and evaluate the retrained model.
ws10_preds = xgb_wrapper.predict(X_test)
get_clf_eval(y_test , ws10_preds)

In [None]:
# Create a 10x12 figure and draw the feature-importance plot on its axes.
fig, ax = plt.subplots(figsize=(10, 12))
# Passing the scikit-learn wrapper class works as well.
plot_importance(xgb_wrapper, ax=ax)