# 앙상블

## Voting

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X = cancer['data']
y = cancer['target']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0, stratify=y)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [3]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2',knn2), ('lr',lr),('dt3',dt3),('dt5',dt5)])
soft = VotingClassifier([('knn1', knn1), ('knn2',knn2), ('lr',lr),('dt3',dt3),('dt5',dt5)], voting= 'soft')

In [4]:
names = 'hard,soft,knn1,knn2,lr,dt3,dt5'.split(sep=",")

for idx, model in enumerate([hard,soft,knn1,knn2,lr,dt3,dt5]):
    model.fit(X_train,y_train)
    name = names[idx]
    train_score = model.score(X_train, y_train)*100
    test_score = model.score(X_test,y_test)*100
    print(f'{name} Train Accuracy:{train_score:.2f}%')
    print(f'{name} Test Accuracy:{test_score:.2f}%')
    print()

hard Train Accuracy:98.12%
hard Test Accuracy:95.10%

soft Train Accuracy:99.53%
soft Test Accuracy:95.80%

knn1 Train Accuracy:94.60%
knn1 Test Accuracy:91.61%

knn2 Train Accuracy:95.77%
knn2 Test Accuracy:91.61%

lr Train Accuracy:96.71%
lr Test Accuracy:93.71%

dt3 Train Accuracy:97.65%
dt3 Test Accuracy:93.01%

dt5 Train Accuracy:100.00%
dt5 Test Accuracy:91.61%



## Bagging

In [6]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(X_train,y_train)
model.score(X_train,y_train),model.score(X_test,y_test)

(0.9976525821596244, 0.951048951048951)

## Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(X_train, y_train)
model.score(X_train,y_train), model.score(X_test, y_test)

(1.0, 0.951048951048951)

## Stacking

In [9]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()), ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators = estimators, final_estimator=LogisticRegression())

model.fit(X_train,y_train).score(X_test,y_test)

0.958041958041958

### Self Study - Classification

In [10]:
from sklearn.datasets import load_digits
digits = load_digits()
X=digits['data']
y=digits['target']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0, stratify=y)



In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier

In [14]:
knn = KNeighborsClassifier().fit(X,y)
print(knn.score(X_train,y_train), knn.score(X_test, y_test))

(0.9910913140311804, 0.9888888888888889)

In [15]:
lr = LogisticRegression(max_iter=10000).fit(X_train,y_train)
print(lr.score(X_train,y_train), lr.score(X_test, y_test))

(1.0, 0.9644444444444444)

In [17]:
dt = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
print(dt.score(X_train,y_train), dt.score(X_test, y_test))

(0.47438752783964366, 0.4688888888888889)

In [18]:
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr1 = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [19]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2',knn2), ('lr1',lr1),('dt3',dt3),('dt5',dt5)])
soft = VotingClassifier([('knn1', knn1), ('knn2',knn2), ('lr1',lr1),('dt3',dt3),('dt5',dt5)], voting= 'soft')

In [31]:
names = 'hard,soft,knn1,knn2,lr1,dt3,dt5'.split(sep=",")

for idx, model in enumerate([hard,soft,knn1,knn2,lr1,dt3,dt5]):
    model.fit(X_train,y_train)
    name = names[idx]
    train_score = model.score(X_train, y_train)*100
    test_score = model.score(X_test,y_test)*100
    print(f'{name} Train Accuracy:{train_score:.2f}%')
    print(f'{name} Test Accuracy:{test_score:.2f}%')
    print()

hard Train Accuracy:99.48%
hard Test Accuracy:98.22%

soft Train Accuracy:99.63%
soft Test Accuracy:97.78%

knn1 Train Accuracy:99.11%
knn1 Test Accuracy:98.00%

knn2 Train Accuracy:99.11%
knn2 Test Accuracy:98.67%

lr1 Train Accuracy:100.00%
lr1 Test Accuracy:96.44%

dt3 Train Accuracy:47.44%
dt3 Test Accuracy:46.89%

dt5 Train Accuracy:70.08%
dt5 Test Accuracy:67.56%



In [28]:
gb = GradientBoostingClassifier().fit(X_train, y_train)
print(gb.score(X_train,y_train)*100, gb.score(X_test, y_test)*100)

1e+02 96.88888888888889


In [26]:
rf = RandomForestClassifier(max_depth=5).fit(X_train,y_train)
print(rf.score(X_train,y_train)*100,rf.score(X_test,y_test) *100)

96.73348181143281 94.88888888888889


In [30]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()), ('gb', GradientBoostingClassifier())]

stack = StackingClassifier(estimators = estimators, final_estimator=LogisticRegression())

print(stack.fit(X_train,y_train).score(X_test,y_test))

9.711111111111111e-01


In [32]:
from sklearn.datasets import load_digits
digits = load_digits()
best_model = {}

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(digits['data'],
                                                    digits['target'],
                                                    stratify=digits['target'],
                                                    random_state=0)

# 모델 설정
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)], voting='soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    best_model[name] = [test_score]
    
# bagging
for i in range(1, 6):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(max_depth=i).fit(x_tr, y_tr)
    best_model[f'bagging, max_depth={i}'] = [model.score(x_te, y_te)]
    
# boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
best_model['boosting'] = [model.score(x_te, y_te)]

# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

best_model['stacking'] = [model.fit(x_tr, y_tr).score(x_te, y_te)]



In [45]:
import pandas as pd
best_model_df = pd.DataFrame(best_model).T
best_model_df.sort_values(0, ascending=False).iloc[0]

0    98.666667
Name: knn2, dtype: float64

In [46]:
import pandas as pd
best_model_df = pd.DataFrame(best_model).T
print(best_model_df)
print(best_model_df.sort_values(0, ascending=False).reset_index())
best_model_df.sort_values(0, ascending=False).reset_index().loc[0,"index"]

                              0
hard                  98.222222
soft                  97.777778
knn1                  98.000000
knn2                  98.666667
lr                    96.444444
dt3                   46.888889
dt5                   67.777778
bagging, max_depth=1   0.733333
bagging, max_depth=2   0.824444
bagging, max_depth=3   0.877778
bagging, max_depth=4   0.924444
bagging, max_depth=5   0.944444
boosting               0.968889
stacking               0.971111
                   index          0
0                   knn2  98.666667
1                   hard  98.222222
2                   knn1  98.000000
3                   soft  97.777778
4                     lr  96.444444
5                    dt5  67.777778
6                    dt3  46.888889
7               stacking   0.971111
8               boosting   0.968889
9   bagging, max_depth=5   0.944444
10  bagging, max_depth=4   0.924444
11  bagging, max_depth=3   0.877778
12  bagging, max_depth=2   0.824444
13  bagging, max

'knn2'

In [None]:
#조익준

from sklearn.datasets import load_digits
digits = load_digits()
# 모델이름을 키로 성능점수를 저장한다
# 나중에 데이터프레임으로 저장하기위해 사전타입에 저장한다.
# 성능점수를 배열로 저장해야 데이터프레임으로 변환이 가능하다.
best_model = {}  

# 데이터 분할
x_tr, x_te, y_tr, y_te = train_test_split(digits['data'],
                                                    digits['target'],
                                                    stratify=digits['target'],
                                                    random_state=0)

# 모델 설정
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

# voting
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr), ('dt3', dt3), ('dt5', dt5)], voting='soft')

names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) * 100
    test_score = model.score(x_te, y_te) * 100
    best_model[name] = [test_score] # 성능점수를 배열로 저장한다
    
# bagging
for i in range(1, 6):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(max_depth=i).fit(x_tr, y_tr)
    best_model[f'bagging, max_depth={i}'] = [model.score(x_te, y_te)] # 성능점수를 배열로 저장한다
    
# boosting
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_tr, y_tr)
best_model['boosting'] = [model.score(x_te, y_te)] # 성능점수를 배열로 저장한다

# stacking
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

best_model['stacking'] = [model.fit(x_tr, y_tr).score(x_te, y_te)] # 성능점수를 배열로 저장한다

import pandas as pd
pd.DataFrame(best_model) # 가로로 긴 모양이 생성된다. 정렬하려면 세로로 길어야한다.
best_model_df = pd.DataFrame(best_model).T # 가로를 세로로 변환한다.
print(best_model_df)
#0 칼럼기준 내림차순 정렬. 접근이 쉽도록 인덱스를 일반칼럼으로 변환하고 인덱스는 숫자로 새로 만들어준다.
print(best_model_df.sort_values(0, ascending=False).reset_index()) 
# 맨 첫번째 행, "index"칼럼이 가장 성능좋은 모델이름이다.
best_model_df.sort_values(0, ascending=False).reset_index().loc[0,"index"]
