In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

import matplotlib.pyplot as plt

## 분류분석 개요

In [2]:
# Load the Titanic passenger table from a CSV in the working directory and preview it.
titanic = pd.read_csv('Titanic.csv', engine = 'python')
titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


In [3]:
# Summarise column dtypes and non-null counts.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    889 non-null    int64  
 1   Sex       889 non-null    object 
 2   Age       889 non-null    float64
 3   SibSp     889 non-null    int64  
 4   Parch     889 non-null    int64  
 5   Fare      889 non-null    float64
 6   Embarked  889 non-null    object 
 7   Survived  889 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 55.7+ KB


In [4]:
# List the distinct embarkation codes before encoding them.
titanic['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [5]:
# Integer-encode the two text columns so scikit-learn estimators can consume them.
# NOTE(review): Embarked is nominal; the 1/2/3 codes impose an arbitrary order on it.
mapping = {'S':1, 'C':2, 'Q':3}
mapping2 = {'male':1, 'female':0}

# Series.map turns every value that is not a key of the dict into NaN, so re-running
# this cell on already-encoded columns would wipe them out. Only encode a column while
# it still holds text, which makes the cell safe to execute more than once.
if not pd.api.types.is_numeric_dtype(titanic['Embarked']):
    titanic['Embarked'] = titanic['Embarked'].map(mapping)
if not pd.api.types.is_numeric_dtype(titanic['Sex']):
    titanic['Sex'] = titanic['Sex'].map(mapping2)
titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,1,22.0,1,0,7.25,1,0
1,1,0,38.0,1,0,71.2833,2,1
2,3,0,26.0,0,0,7.925,1,1
3,1,0,35.0,1,0,53.1,1,1
4,3,1,35.0,0,0,8.05,1,0


In [6]:
# devide independent variables and label
X = titanic.iloc[:, :-1].values
y = titanic.iloc[:, -1].values
X

array([[ 3.    ,  1.    , 22.    , ...,  0.    ,  7.25  ,  1.    ],
       [ 1.    ,  0.    , 38.    , ...,  0.    , 71.2833,  2.    ],
       [ 3.    ,  0.    , 26.    , ...,  0.    ,  7.925 ,  1.    ],
       ...,
       [ 3.    ,  0.    , 30.    , ...,  2.    , 23.45  ,  1.    ],
       [ 1.    ,  1.    , 26.    , ...,  0.    , 30.    ,  2.    ],
       [ 3.    ,  1.    , 32.    , ...,  0.    ,  7.75  ,  3.    ]])

In [None]:
# declare classifier object and train the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)

# make predictions
y_pred = knn.predict(X)

In [None]:
# Accuracy, recall, precision and F1 of the KNN predictions against the true labels.
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y, y_pred), 3))

In [None]:
# Render the local image file "confusion matrix.png" inline.
from IPython.display import Image
Image("confusion matrix.png")

In [None]:
# Confusion matrix of the KNN predictions against the true labels.
confusion_matrix(y, y_pred)

In [None]:
# Plot the confusion matrix of the fitted KNN model on (X, y).
# `plot_confusion_matrix` is never imported in this notebook and was removed from
# scikit-learn in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement and is
# reachable through the already-imported `metrics` module.
metrics.ConfusionMatrixDisplay.from_estimator(knn, X, y)

In [None]:
# Class probabilities from the fitted KNN; column 1 is used as the score for the ROC curve.
probs = knn.predict_proba(X)
preds = probs[:,1]
# ROC points (false/true positive rates per threshold) and the area under that curve.
fpr, tpr, threshold = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# ROC curve of the KNN classifier drawn against the chance diagonal.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal reference line
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

# https://www.it-swarm.dev/ko/python/python%EC%97%90%EC%84%9C-roc-%EA%B3%A1%EC%84%A0%EC%9D%84-%EA%B7%B8%EB%A6%AC%EB%8A%94-%EB%B0%A9%EB%B2%95/1048726952/

### 실습: iris data로 정확도, confusion matrix 도출

In [None]:
# Load the iris data set that ships with scikit-learn.
from sklearn.datasets import load_iris

iris = load_iris()

In [None]:
# Iris measurements as a DataFrame, plus a readable species label in `target`.
species_names = {0:"setosa", 1:"versicolor", 2:"virginica"}

# Map the integer class codes (0, 1, 2) to their species names while building the frame.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=pd.Series(iris.target).map(species_names)
)
df.head()

In [None]:
# Feature matrix (all columns except the last) and species labels (last column).
x_data = df.iloc[:, :-1].values
# Select the label column with a scalar index so y_data is 1-D, shape (n_samples,).
# The original list index [-1] produced an (n_samples, 1) column vector, which makes
# scikit-learn emit a DataConversionWarning on every fit that receives it.
y_data = df.iloc[:, -1].values

In [None]:
# declare classifier object and train the model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_data, y_data)

# make predictions
y_data_pred = knn.predict(x_data)

In [None]:
# Accuracy of the iris KNN predictions, rounded to three decimals.
print('정확도: ', round(accuracy_score(y_data, y_data_pred),3))

In [None]:
# Confusion matrix of the iris KNN predictions against the true species.
confusion_matrix(y_data, y_data_pred)

### train/test 분리 진행

In [None]:
# Hold out 20% of the Titanic rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1004)

In [None]:
# Report the shape of each piece of the train/test split.
for split_name, split_array in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(split_name + ' shape:', split_array.shape)

In [None]:
# Refit `knn` on the Titanic training split (it was last fitted on the iris data).
knn.fit(X_train,y_train)

In [None]:
# Predictions of the refitted KNN on both halves of the split.
y_train_pred = knn.predict(X_train)
y_test_pred_knn = knn.predict(X_test)

# 5-fold cross-validated accuracy on the training split; print the mean over the folds.
cv_score = cross_val_score(knn, X_train, y_train, cv=5, scoring="accuracy")
print(f"교차 검증 평균 점수: {cv_score.mean():.2f}")

In [None]:
# Confusion matrix of the KNN predictions on the training split.
confusion_matrix(y_train, y_train_pred)

In [None]:
# KNN scores on the training split.
print('Train Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_train, y_train_pred), 3))

In [None]:
# KNN scores on the held-out test split.
print('Test Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred_knn), 3))

### 실습: iris 데이터 train/test dataset 나눠서 각각 정확도 구해보기

In [None]:
# Hold out 20% of the iris rows as a test set; the fixed random_state keeps the split repeatable.
X_data_train, X_data_test, y_data_train, y_data_test = train_test_split(x_data,
                                                    y_data,
                                                    test_size=0.2,
                                                    random_state=25)

In [None]:
# Refit the KNN on the iris training split, then compare train and test accuracy.
knn.fit(X_data_train, y_data_train)

y_train_pred = knn.predict(X_data_train)
y_test_pred = knn.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

## 로지스틱 리그레션

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Render the local image file "logistic regression.png" inline.
Image("logistic regression.png")

In [None]:
# Logistic regression with default settings, fitted on the full Titanic data.
logistic = LogisticRegression()
logistic.fit(X, y)

In [None]:
# Print the LogisticRegression docstring for reference.
help(LogisticRegression)

In [None]:
# Fitted coefficients, and the estimator's own score on the data it was fitted on.
print('회귀계수:', logistic.coef_)
print('정확도:', round(logistic.score(X, y),3))

In [None]:
# In-sample predictions of the logistic model, followed by the four classification scores.
y_pred_logistic = logistic.predict(X)

for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y, y_pred_logistic), 3))

In [None]:
# Confusion matrix of the logistic-regression predictions.
confusion_matrix(y, y_pred_logistic)

### 실습: iris data train/test 나눠서 logistic regression 진행

In [None]:
# Refit the logistic model on the iris training split, then compare train and test accuracy.
logistic.fit(X_data_train, y_data_train)

y_train_pred = logistic.predict(X_data_train)
y_test_pred = logistic.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

## 트리 모형

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Unconstrained decision tree fitted on the full Titanic data.
# random_state fixes the estimator's internal randomness so a rerun reproduces the same
# tree; 42 matches the seed used for the other seeded estimators in this notebook.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X, y)

# In-sample predictions.
y_pred_tree = tree.predict(X)

In [None]:
# Class probabilities and predicted class for one hand-written row of 7 feature values
# (presumably in X's column order: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked).
sample_passenger = [[1,1,26,1,0,20,1]]
print(tree.predict_proba(sample_passenger))
print(tree.predict(sample_passenger))


In [None]:
# Scores of the unconstrained tree on the data it was fitted on.
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y, y_pred_tree), 3))

In [None]:
# Confusion matrix of the in-sample tree predictions.
confusion_matrix(y, y_pred_tree)

#### overfitting 주의!

In [None]:
# Re-create the 80/20 Titanic split (same arguments and seed as the earlier KNN split).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1004)

In [None]:
# Refit the unconstrained tree on the training split only.
tree.fit(X_train,y_train)

In [None]:
# Predictions of the refitted tree on both halves of the split.
y_train_pred_tree = tree.predict(X_train)
y_test_pred_tree = tree.predict(X_test)

# 5-fold cross-validated accuracy on the training split; print the mean over the folds.
cv_score_tree = cross_val_score(tree, X_train, y_train, cv=5, scoring="accuracy")
print(f"교차 검증 평균 점수: {cv_score_tree.mean():.2f}")

In [None]:
# Confusion matrix of the tree predictions on the training split.
confusion_matrix(y_train, y_train_pred_tree)

In [None]:
# Unconstrained-tree scores on the training split.
print('Train Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_train, y_train_pred_tree), 3))

In [None]:
# Unconstrained-tree scores on the held-out test split.
print('Test Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred_tree), 3))

### 복잡한 트리는 기존에 있는 데이터들은 잘 예측하나, 새로운 데이터는 잘 예측하지 못하고 있음

### 적당한 가지에서 잘라내야함, pruning

In [None]:
# Render the local image file "decision tree.png" inline.
Image("decision tree.png")

In [None]:
# Print the DecisionTreeClassifier docstring for reference.
help(DecisionTreeClassifier)

criterion의 default값은 gini  
max_depth: 트리의 최대 깊이  
min_samples_split: 분할되기 위해 노드가 가져야 하는 최소 샘플의 수  
min_samples_leaf: 리프 노드(자식이 없는 노드)가 가지고 있어야할 최소 샘플 수

In [None]:
# Depth-limited tree (at most 3 levels) fitted on the training split.
# random_state makes reruns reproduce the same tree; 42 matches the seed used for the
# other seeded estimators in this notebook.
tree3 = DecisionTreeClassifier(max_depth=3, random_state=42)
tree3.fit(X_train,y_train)

In [None]:
# Predictions of the depth-3 tree on both halves of the split.
y_train_pred_tree3 = tree3.predict(X_train)
y_test_pred_tree3 = tree3.predict(X_test)

# 5-fold cross-validated accuracy on the training split; print the mean over the folds.
cv_score_tree3 = cross_val_score(tree3, X_train, y_train, cv=5, scoring="accuracy")
print(f"교차 검증 평균 점수: {cv_score_tree3.mean():.2f}")

In [None]:
# Confusion matrix of the depth-3 tree predictions on the training split.
confusion_matrix(y_train, y_train_pred_tree3)

In [None]:
# Depth-3 tree scores on the training split.
print('Train Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_train, y_train_pred_tree3), 3))

In [None]:
# Depth-3 tree scores on the held-out test split.
print('Test Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred_tree3), 3))

In [None]:
# Entropy-criterion tree fitted on the training split.
# Note: despite the "tree3" in its name, this model uses max_depth=4.
# random_state makes reruns reproduce the same tree; 42 matches the seed used for the
# other seeded estimators in this notebook.
tree3_entropy = DecisionTreeClassifier(criterion = 'entropy', max_depth=4, random_state=42)
tree3_entropy.fit(X_train,y_train)

In [None]:
# Predictions of the entropy tree on both halves of the split.
y_train_pred_tree3_entropy = tree3_entropy.predict(X_train)
y_test_pred_tree3_entropy = tree3_entropy.predict(X_test)

# 5-fold cross-validated accuracy on the training split; print the mean over the folds.
cv_score_tree3_entropy = cross_val_score(tree3_entropy, X_train, y_train, cv=5, scoring="accuracy")
print(f"교차 검증 평균 점수: {cv_score_tree3_entropy.mean():.2f}")

In [None]:
# Entropy-tree scores on the training split.
print('Train Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_train, y_train_pred_tree3_entropy), 3))

In [None]:
# Entropy-tree scores on the held-out test split.
print('Test Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred_tree3_entropy), 3))

### iris data 트리기반 모형으로 예측해보기

In [None]:
# Refit the depth-3 tree on the iris training split, then compare train and test accuracy.
tree3.fit(X_data_train, y_data_train)

y_train_pred = tree3.predict(X_data_train)
y_test_pred = tree3.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

#### 심화과정: 그리드서치

In [None]:
from sklearn.model_selection import GridSearchCV

# One grid: 2 criteria x 6 depths x 4 min_samples_split x 4 min_samples_leaf = 192 candidates.
param_grid = [
    {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5, 6, 7],
     'min_samples_split': [10, 20, 30, 40], 'min_samples_leaf': [5, 10, 15, 20]},
  ]

tree_grid = DecisionTreeClassifier(random_state=42)
# 192 candidates x 5 folds = 960 cross-validation fits (the earlier "(12+6)*5=90" figure did not match this grid).
# NOTE(review): the search is fitted on the full (X, y), which includes the rows used as X_test elsewhere.
grid_search = GridSearchCV(tree_grid, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X, y)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
grid_search.best_params_

In [None]:
# Estimator configured with the best parameter combination found by the search.
grid_search.best_estimator_

In [None]:
# Print the mean cross-validated accuracy next to each parameter combination that was tried.
cvres = grid_search.cv_results_
for candidate in range(len(cvres["params"])):
    print(cvres["mean_test_score"][candidate], cvres["params"][candidate])

In [None]:
# Tree with hard-coded hyperparameters, fitted on the training split.
# NOTE(review): these values were presumably copied from grid_search.best_params_ of an
# earlier run — confirm they still match the search output.
tree_best = DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=5,
                                   min_samples_split=10, random_state=42)
tree_best.fit(X_train,y_train)

In [None]:
# Predictions of the tuned tree on both halves of the split.
y_train_pred_best = tree_best.predict(X_train)
y_test_pred_best = tree_best.predict(X_test)

# 5-fold cross-validated accuracy on the training split; print the mean over the folds.
cv_score_tree_best = cross_val_score(tree_best, X_train, y_train, cv=5, scoring="accuracy")
print(f"교차 검증 평균 점수: {cv_score_tree_best.mean():.2f}")

In [None]:
# Tuned-tree scores on the training split.
print('Train Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_train, y_train_pred_best), 3))

In [None]:
# Tuned-tree scores on the held-out test split.
print('Test Data')
for metric_label, metric_fn in (
    ('정확도: ', accuracy_score),
    ('재현율: ', recall_score),
    ('정밀도:', precision_score),
    ('F1 Score:', f1_score),
):
    print(metric_label, round(metric_fn(y_test, y_test_pred_best), 3))

### 트리 기반 회귀

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the Boston housing CSV from the working directory (this rebinds `df`, which held the iris frame).
df = pd.read_csv('Boston_house.csv')
df.head()

In [None]:
# Features are every column but the last; the regression target is the last column.
Input= df.iloc[:,:-1]
Output = df.iloc[:,-1]

In [None]:
# Unconstrained regression tree fitted on the full housing data.
# random_state makes reruns reproduce the same tree; 42 matches the seed used for the
# other seeded estimators in this notebook.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(Input, Output)

In [None]:
# In-sample predictions of the regression tree.
Output_hat = tree_reg.predict(Input)

print('R^2: ', tree_reg.score(Input, Output))
# `mean_squared_error` is never imported by name in this notebook (NameError);
# reach it through the `metrics` module alias that is already imported.
print('MSE: ', metrics.mean_squared_error(Output,Output_hat))

In [None]:
# Hold out 20% of the housing rows as a test set; the fixed random_state keeps the split repeatable.
Input_train, Input_test, Output_train, Output_test = train_test_split(Input,
                                                    Output,
                                                    test_size=0.2,
                                                    random_state=1004)

In [None]:
# Refit the regression tree on the training split and report MSE / R^2 on both splits.
# `mean_squared_error` is never imported by name in this notebook (NameError), so it is
# called through the already-imported `metrics` module alias.
tree_reg.fit(Input_train, Output_train)

Output_train_hat = tree_reg.predict(Input_train)
print('train셋 MSE: ', metrics.mean_squared_error(Output_train,Output_train_hat))
print('train셋 R^2: ', tree_reg.score(Input_train, Output_train))

Output_test_hat = tree_reg.predict(Input_test)
print('*' *30)  # visual separator between the train and test figures
print('test셋 MSE:', metrics.mean_squared_error(Output_test,Output_test_hat))
print('test셋 R^2: ', tree_reg.score(Input_test, Output_test))

#### 그리드 서치를 이용해서 하이퍼파라미터를 설정할 수 있음

## 앙상블

### 랜덤포레스트

In [None]:
# Render the local image file "random forest.png" inline.
Image("random forest.png")

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings, fitted on the Titanic training split.
# A forest is built from random bootstrap samples and feature subsets, so without a seed
# every rerun reports different accuracies; 42 matches the seed used for the other
# seeded estimators in this notebook.
randomforest = RandomForestClassifier(random_state=42)
randomforest.fit(X_train, y_train)

In [None]:
# Random-forest predictions on both halves of the split.
y_train_pred_randomforest = randomforest.predict(X_train)
y_test_pred_randomforest = randomforest.predict(X_test)

In [None]:
# Random-forest accuracy on the training and test splits.
for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred_randomforest),
    ('Test Data', y_test, y_test_pred_randomforest),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

In [None]:
# Print the RandomForestClassifier docstring for reference.
help(RandomForestClassifier)

### iris데이터 randomforest 진행하여 test data 정확도 구하기

In [None]:
# Refit the random forest on the iris training split, then compare train and test accuracy.
randomforest.fit(X_data_train, y_data_train)

y_train_pred = randomforest.predict(X_data_train)
y_test_pred = randomforest.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

n_estimators: 생성할 tree의 갯수   
max_features: 최대 선택할 특성의 수 (전체 특성 선택시 무작위성 들어가지 않음) ('sqrt', 'log2', None; 'auto'는 scikit-learn 1.3에서 제거됨)  
n_jobs = -1 지정시 컴퓨터의 모든 코어 사용

In [None]:
import time
start = time.time()  # record the start time

# 5 forest sizes x 2 feature-subset rules = 10 candidates, each cross-validated 5-fold.
# max_features='auto' was removed in scikit-learn 1.3 and now raises a parameter error;
# for classifiers it was an alias of 'sqrt', so 'sqrt' searches the same setting.
param_grid = [
    {'n_estimators': [100,200,300,400,500], 'max_features': ['sqrt', 'log2']},
  ]

randomforest_grid = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(randomforest_grid, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X, y)

print("time :", time.time() - start)  # elapsed seconds = current time - start time

In [None]:
# Best parameter combination and the matching estimator from the random-forest search.
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# Random forest with a hard-coded tree count, fitted on the training split using all CPU cores.
# NOTE(review): n_estimators=200 was presumably taken from the grid search above — confirm.
randomforest_best = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs = -1)
randomforest_best.fit(X_train,y_train)

In [None]:
# Tuned random-forest predictions on both halves of the split.
y_train_pred_randomforest_best = randomforest_best.predict(X_train)
y_test_pred_randomforest_best = randomforest_best.predict(X_test)

In [None]:
# Tuned random-forest accuracy on the training and test splits.
for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred_randomforest_best),
    ('Test Data', y_test, y_test_pred_randomforest_best),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

In [None]:
# Per-feature importance scores of the tuned random forest.
print("특성 중요도 : ",randomforest_best.feature_importances_)

In [None]:
# Horizontal bar chart of the tuned random forest's feature importances.
aa = titanic.iloc[:,:-1]  # feature columns only (last column dropped)

n_feature = aa.shape[1]
index = np.arange(n_feature)

fig, ax = plt.subplots()
ax.barh(index, randomforest_best.feature_importances_, align='center')
ax.set_yticks(index)
ax.set_yticklabels(aa.columns)  # one tick per feature, labelled with its column name
ax.set_ylim(-1, n_feature)
ax.set_xlabel('feature importance', size=15)
ax.set_ylabel('feature', size=15)
plt.show()

## 투표기반

In [None]:
from sklearn.datasets import make_moons

# Synthetic two-moons data set (500 noisy points) for the voting demo, split with the
# default train/test proportions and a fixed seed.
X1, y1 = make_moons(n_samples=500, noise=0.30, random_state=42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # not covered in this course so far

# Three different base models, all seeded for repeatability.
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", random_state=42)

# Hard voting: the ensemble is configured with voting='hard'.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

In [None]:
# Fit the hard-voting ensemble on the two-moons training split.
voting_clf.fit(X1_train, y1_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each base model and the ensemble in turn, printing its class name and test accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X1_train, y1_train)
    y1_pred = clf.predict(X1_test)
    print(clf.__class__.__name__, accuracy_score(y1_test, y1_pred))
    
# hard voting

In [None]:
# Rebuild the three base models for soft voting; SVC is created with probability=True
# here, unlike in the hard-voting setup.
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

# Soft voting: the ensemble is configured with voting='soft'.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X1_train, y1_train)

In [None]:
# Fit each base model and the soft-voting ensemble, printing its class name and test accuracy.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X1_train, y1_train)
    # Store the predictions under the name that is scored on the next line. The original
    # assigned `y_pred` but scored the stale `y1_pred` left over from the hard-voting
    # loop, so all four lines printed the same wrong accuracy (and the Titanic `y_pred`
    # was overwritten as a side effect).
    y1_pred = clf.predict(X1_test)
    print(clf.__class__.__name__, accuracy_score(y1_test, y1_pred))
    
# soft voting

## 배깅

In [None]:
# Render the local image file "bagging.png" inline.
Image("bagging.png")

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Print the BaggingClassifier docstring for reference.
help(BaggingClassifier)

In [None]:
# Bagging ensemble of 500 seeded decision trees, trained using all CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, random_state=42, n_jobs=-1)

# Each tree trains on 100 samples drawn at random with replacement (bootstrap=False would be pasting)

bag_clf.fit(X_train, y_train)
y_pred_bagging = bag_clf.predict(X_test)

In [None]:
# Test-split accuracy of the bagging ensemble.
print('Test Data')
print('정확도: ', round(accuracy_score(y_test, y_pred_bagging),3))
# previous value: 80.9%  (NOTE(review): it is not stated which earlier model this refers to)

In [None]:
# Bagging ensemble with out-of-bag scoring enabled; the cell displays the resulting oob_score_.
bag_clf_oob = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    bootstrap=True, oob_score=True, random_state=40, n_jobs=-1)
bag_clf_oob.fit(X_train, y_train)
bag_clf_oob.oob_score_

In [None]:
# Test-split accuracy of the out-of-bag bagging ensemble.
y_pred_bagging_oob = bag_clf_oob.predict(X_test)
print('Test Data')
print('정확도: ', round(accuracy_score(y_test, y_pred_bagging_oob),3))

In [None]:
# Display the out-of-bag decision function stored on the fitted ensemble.
bag_clf_oob.oob_decision_function_

### iris 데이터 bagging 진행

In [None]:
# Refit the bagging ensemble on the iris training split, then compare train and test accuracy.
bag_clf.fit(X_data_train, y_data_train)

y_train_pred = bag_clf.predict(X_data_train)
y_test_pred = bag_clf.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

## 부스팅

In [None]:
# Render the local image file "boosting.jpg" inline.
Image("boosting.jpg")

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Print the AdaBoostClassifier docstring for reference.
help(AdaBoostClassifier)

In [None]:
# AdaBoost with 100 boosting rounds, fitted on the Titanic training split.
# random_state makes reruns reproducible; 42 matches the seed used for the other
# seeded estimators in this notebook.
Ada = AdaBoostClassifier(n_estimators=100,
                        learning_rate=1,
                        random_state=42)
Ada.fit(X_train, y_train)

In [None]:
# AdaBoost predictions on both halves of the split, followed by the accuracy of each.
y_train_pred_Ada = Ada.predict(X_train)
y_test_pred_Ada = Ada.predict(X_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred_Ada),
    ('Test Data', y_test, y_test_pred_Ada),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

### iris data adaboosting 진행

In [None]:
# Refit AdaBoost on the iris training split, then compare train and test accuracy.
Ada.fit(X_data_train, y_data_train)

y_train_pred = Ada.predict(X_data_train)
y_test_pred = Ada.predict(X_data_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_data_train, y_train_pred),
    ('Test Data', y_data_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

In [None]:
start = time.time()  # record the start time

# 5 ensemble sizes x 3 learning rates = 15 candidates, each cross-validated 5-fold.
param_grid = [
    {'n_estimators': [100,200,300,400,500], 'learning_rate': [5,1,0.1]},
  ]

Ada_grid = AdaBoostClassifier(random_state=42)

# NOTE(review): the search is fitted on the full (X, y), which includes the rows used as X_test elsewhere.
grid_search = GridSearchCV(Ada_grid, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(X, y)

print("time :", time.time() - start)  # elapsed seconds = current time - start time

In [None]:
# Best parameter combination and the matching estimator from the AdaBoost search.
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# AdaBoost with hard-coded hyperparameters, fitted on the training split.
# NOTE(review): these values were presumably copied from the grid search above — confirm.
Ada_best = AdaBoostClassifier(learning_rate=1, n_estimators=400, random_state=42)
Ada_best.fit(X_train, y_train)

In [None]:
# Tuned AdaBoost predictions on both halves of the split, followed by the accuracy of each.
y_train_pred_Ada_best = Ada_best.predict(X_train)
y_test_pred_Ada_best = Ada_best.predict(X_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred_Ada_best),
    ('Test Data', y_test, y_test_pred_Ada_best),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

In [None]:
# Per-feature importance scores of the tuned AdaBoost model.
print("특성 중요도 : ",Ada_best.feature_importances_)

In [None]:
# Horizontal bar chart of the tuned AdaBoost model's feature importances.
aa = titanic.iloc[:,:-1]  # feature columns only (last column dropped)

n_feature = aa.shape[1]
index = np.arange(n_feature)

fig, ax = plt.subplots()
ax.barh(index, Ada_best.feature_importances_, align='center')
ax.set_yticks(index)
ax.set_yticklabels(aa.columns)  # one tick per feature, labelled with its column name
ax.set_ylim(-1, n_feature)
ax.set_xlabel('feature importance', size=15)
ax.set_ylabel('feature', size=15)
plt.show()

In [None]:
# Side-by-side test-split accuracy of the Titanic models trained above.
#
# Logistic regression was only ever scored on the full data set it was fitted on
# (y vs. y_pred_logistic), which is not comparable with the test-split figures of the
# other models, and the `logistic` object has since been refit on iris. Fit a fresh model
# on the Titanic training split and score it on the same test split as everything else.
# max_iter is raised above the default to give the solver more room to converge on the
# unscaled features.
y_test_pred_logistic = LogisticRegression(max_iter=1000).fit(X_train, y_train).predict(X_test)

print('KNN(5) 정확도: ', round(accuracy_score(y_test, y_test_pred_knn),3))
print('Logistic Regression 정확도: ', round(accuracy_score(y_test, y_test_pred_logistic),3))
print('Tree 정확도: ', round(accuracy_score(y_test, y_test_pred_tree),3))
print('Tree(3) 정확도: ', round(accuracy_score(y_test, y_test_pred_tree3),3))
print('Tree(Entropy,4) 정확도: ', round(accuracy_score(y_test, y_test_pred_tree3_entropy),3))
print('Tree(Best) 정확도: ', round(accuracy_score(y_test, y_test_pred_best),3))
print('RandomForest 정확도: ', round(accuracy_score(y_test, y_test_pred_randomforest),3))
print('RandomForest(Best) 정확도: ', round(accuracy_score(y_test, y_test_pred_randomforest_best),3))
print('Bagging 정확도: ', round(accuracy_score(y_test, y_pred_bagging),3))
print('Bagging(oob) 정확도: ', round(accuracy_score(y_test, y_pred_bagging_oob),3))
print('AdaBoosting 정확도: ', round(accuracy_score(y_test, y_test_pred_Ada),3))
print('AdaBoosting(Best) 정확도: ', round(accuracy_score(y_test, y_test_pred_Ada_best),3))

# 어느 분류분석이 좋은지는 도메인 지식, 하이퍼파라미터 설정 등에 따라 달라질 수 있음

#### 추가

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with default settings.
svm = SVC()

In [None]:
# Fit the SVM on the current training split, then compare train and test accuracy.
svm.fit(X_train, y_train)

y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred),
    ('Test Data', y_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature column of X with a StandardScaler.
# NOTE(review): the scaler is fitted on the full X, i.e. including rows that later end up in the test split.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

In [None]:
# Split the raw features first, then fit the scaler on the training rows only, so the
# test rows do not influence the statistics used for standardisation (the original split
# an array that had been scaled with statistics from all rows, test rows included).
# Seed, test_size and row count are the same as before, so each split holds the same rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y,
                                                      test_size = 0.2, random_state = 1004)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Refit the SVM on the current (standardised) training split, then compare train and test accuracy.
svm.fit(X_train, y_train)

y_train_pred = svm.predict(X_train)
y_test_pred = svm.predict(X_test)

for split_name, y_true, y_hat in (
    ('Train Data', y_train, y_train_pred),
    ('Test Data', y_test, y_test_pred),
):
    print(split_name)
    print('정확도: ', round(accuracy_score(y_true, y_hat), 3))