In [32]:
# Required module imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

In [33]:
# Load the raw iris data into a DataFrame: four feature columns (renamed to
# snake_case) followed by the integer class label.
iris = load_iris()
df = pd.DataFrame(
    iris['data'],
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
df['label'] = iris['target']
# Print column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   label         150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [34]:
# No missing values or outliers to handle, but duplicate rows exist:
# keep the first occurrence of each fully-identical row and drop the rest.
is_repeated_row = df.duplicated(keep='first')
df = df[~is_repeated_row]

In [35]:
# Normalization
# Separate the feature matrix from the target vector (as plain arrays).
x_data = df.drop(columns='label').values
t_data = df['label'].values

# Min-max scale every feature column.
# Note: the scaler is fit on every row of x_data, i.e. before any
# train/test split has been made.
scaler = MinMaxScaler()
x_data_norm = scaler.fit_transform(x_data)

In [36]:
# Train/test split (70% / 30%, stratified by label, fixed seed).
#
# The raw features are split FIRST and the scaler is then fit on the training
# rows only. Fitting MinMaxScaler on the whole dataset before splitting lets
# the test set's min/max influence the training features (data leakage),
# which makes the test accuracy optimistic.
x_data_train, x_data_test, t_data_train, t_data_test = train_test_split(
    x_data,
    t_data,
    test_size=0.3,
    stratify=t_data,
    random_state=0,
)

# Learn the scaling parameters from the training split only, then apply the
# same transform to both splits. Scaled test values may fall slightly outside
# [0, 1]; that is expected.
scaler = MinMaxScaler()
x_data_train_norm = scaler.fit_transform(x_data_train)
x_data_test_norm = scaler.transform(x_data_test)

In [40]:
# Baseline classifiers: each is fit on the normalized training split and
# scored (accuracy) on the normalized test split.

# 1. KNN model (nearest single neighbour)
knn = KNeighborsClassifier(n_neighbors=1)

# 2. SVM model (linear kernel)
svm = SVC(C=0.5, kernel='linear')

# 3. Decision-tree model
# random_state is fixed so the tree -- and therefore its score -- is
# reproducible across kernel restarts.
dtc = DecisionTreeClassifier(random_state=0)

# One fit/score/print loop instead of three copy-pasted stanzas; the printed
# lines are in the same order and format as before.
for model_name, model in (('KNN', knn), ('SVM', svm), ('DT', dtc)):
    model.fit(x_data_train_norm, t_data_train)
    test_accuracy = accuracy_score(t_data_test, model.predict(x_data_test_norm))
    print(f'{model_name} 모델의 정확도 : {test_accuracy}')

KNN 모델의 정확도 : 0.9777777777777777
SVM 모델의 정확도 : 0.9777777777777777
DT 모델의 정확도 : 0.9555555555555556


In [46]:
# Ensemble classifiers: each is fit on the normalized training split and
# scored (accuracy) on the normalized test split.
# RandomForestClassifier, GradientBoostingClassifier and XGBClassifier are
# imported in the notebook's import cell rather than mid-cell.

# 1. Voting ensemble (hard voting) over the three baseline estimators
hvc = VotingClassifier(estimators=[('KNN', knn), ('SVM', svm), ('DT', dtc)],
                       voting='hard')

# 2. Bagging ensemble: random forest
rfc = RandomForestClassifier(n_estimators=50,
                             max_depth=3,
                             random_state=20)

# 3. Boosting ensemble: gradient boosting
# random_state is fixed so repeated runs give the same model and score.
gbc = GradientBoostingClassifier(random_state=0)

# 4. Boosting ensemble: XGBoost
xgbc = XGBClassifier(max_depth=3,
                     learning_rate=0.1)

# One fit/score/print loop instead of four copy-pasted stanzas; the printed
# lines are in the same order and format as before.
for model_name, model in (('Voting', hvc),
                          ('RF', rfc),
                          ('Boosting', gbc),
                          ('XGBC', xgbc)):
    model.fit(x_data_train_norm, t_data_train)
    test_accuracy = accuracy_score(t_data_test, model.predict(x_data_test_norm))
    print(f'{model_name} 모델의 정확도 : {test_accuracy}')

Voting 모델의 정확도 : 0.9777777777777777
RF 모델의 정확도 : 1.0
Boosting 모델의 정확도 : 0.9777777777777777
XGBC 모델의 정확도 : 0.9555555555555556
