## VOTING 방식 모델 구현
 - 데이터 : breast cancer
  - 유형 : 지도학습 -> 분류
  - 방법 : Voting 방식으로 진행 : LogisticRegression, DecisionTreeClassifier, SVC
  - 학습 데이터셋: 동일한 데이터셋으로 3개의 모델 학습 진행

1. 모듈 로딩, 데이터 준비

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [3]:
x,y = load_breast_cancer(as_frame = True, return_X_y=True)

In [4]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y, random_state=5, test_size=0.2, stratify= y)

2. 학습 진행
<hr>

2 - 1. 앙상블 보팅 학습에 사용할 모델 인스턴스 생성

In [5]:
#LogisticRegression

from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(xtrain, ytrain)

2 - 2. DecisionTree

In [6]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model.fit(xtrain, ytrain)

2 - 3. SVC

In [7]:
from sklearn.svm import SVC

In [8]:
svc_model = SVC()
svc_model.fit(xtrain, ytrain)

In [9]:
from sklearn.ensemble import VotingClassifier

#동일 데이터셋으로 병렬학습 진행할 모델 리스트 선정 및 결과 결정 방법 설정하기
vt_model = VotingClassifier(estimators=
                            [('lr_model',lr_model),
                             ('dt_model',dt_model),
                             ('svc_model',svc_model)], 
                            verbose=True, voting='hard')

In [10]:
# 동일 데이터셋을 전달해서 3개의 모델 동시에 학습 진행
vt_model.fit(xtrain, ytrain)

[Voting] ................. (1 of 3) Processing lr_model, total=   0.0s
[Voting] ................. (2 of 3) Processing dt_model, total=   0.0s
[Voting] ................ (3 of 3) Processing svc_model, total=   0.0s


In [14]:
#예측하기
(1)
new_data = pd.DataFrame([xtest.iloc[0]], columns=xtest.columns)
new_data

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1])

In [15]:
#(2)
xtest.iloc[0].to_frame().T # 한 행으로 줄 수 있음.

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
525,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,...,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049


In [17]:
vt_model.predict(new_data)
# vt_model.predict_proba(new_data) => 지금은 hard voting이라 안됨.

array([1])

In [12]:
#voting instance 내의 학습기들
vt_model.estimators_[0].predict(xtest)

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0], dtype=int64)

In [21]:
#voting instance 내의 학습기들 => 접근방법 (1)
vt_model.estimators_[0]

In [24]:
#voting instance 내의 학습기들 => 접근방법 (2) ---- 딕셔너리 형태. 키-값으로 접근 가능.
vt_model.named_estimators_.get('lr_model').predict(new_data), vt_model.named_estimators_.get('dt_model').predict(new_data), vt_model.named_estimators_.get('svc_model').predict(new_data)

(array([1], dtype=int64), array([1], dtype=int64), array([1], dtype=int64))

In [27]:
for key, val in vt_model.named_estimators_.items():
    print(f'{key} : {val.predict(new_data)[0]}')

    # 소프트 보팅 하고 싶으면 보팅 만들때 soft로 하면되고, svc는 probabilty=True로 줘야 proba 값을 출력할 수 있음.

lr_model : 1
dt_model : 1
svc_model : 1


In [13]:
#3개 알고리즘 모델에 대한 성능 평가
print(f'[Logistic Regression]\n Train; {lr_model.score(xtrain, ytrain)}\n Test; {lr_model.score(xtest,ytest)}\n')
print(f'[DecisionTree]\n Train; {dt_model.score(xtrain, ytrain)}\n Test; {dt_model.score(xtest,ytest)}\n')
print(f'[Random Forest]\n Train; {rf_model.score(xtrain, ytrain)}\n test; {rf_model.score(xtest,ytest)}')

[Logistic Regression]
 Train; 0.9582417582417583
 Test; 0.9649122807017544

[DecisionTree]
 Train; 1.0
 Test; 0.9122807017543859



NameError: name 'rf_model' is not defined