# 0. Import libraries

In [1]:
import numpy as np
import pandas as pd

# 1. 데이터 적재

## sepal: 꽃받침, petal: 꽃잎

In [2]:
# 머신러닝과 통계 분야에서 오래전부터 사용해 온 붗꽃(Iris) 데이터 셋
from sklearn.datasets import load_iris
# sklearn에 데이터 셋으로부터 load_iris라는 함수를 사용할 것이다.
iris_dataset = load_iris()

In [3]:
print("Iris_dataset size:\n", iris_dataset.keys())

Iris_dataset size:
 dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [4]:
print(iris_dataset['DESCR'][:193] + "\n...")
# Discription: 이 데이터가 무엇인지 설명을 써놓은 것

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, pre
...


In [5]:
print("name of target:", iris_dataset['target_names'])

name of target: ['setosa' 'versicolor' 'virginica']


In [6]:
print("name of feature:", iris_dataset['feature_names']) # 판단 기준

name of feature: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [7]:
print("type of data:", type(iris_dataset['data']))

type of data: <class 'numpy.ndarray'>


In [8]:
print("shape of data:", iris_dataset['data'].shape)

shape of data: (150, 4)


In [9]:
print("first five rows of data\n", iris_dataset['data'][:5])

first five rows of data
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [10]:
print("type of target", type(iris_dataset['target']))

type of target <class 'numpy.ndarray'>


In [11]:
print("size of target", iris_dataset['target'].shape)

size of target (150,)


In [12]:
print("target:", iris_dataset['target']) # 0, 1, 2

target: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# 2. 성과 측정: 훈련 데이터와 테스트 데이터

In [13]:
from sklearn.model_selection import train_test_split
# sklean 라이브러리에 model_selection이라는 부분으로부터 train_test_split 함수를 사용할 것이다.

X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state=0)
# 데이터 셋을 무작위로 섞은 뒤, train data와 test data를 75:25로 분리. (default → 75:25)

In [14]:
print("size of X_train:\n", X_train.shape) # (# of data * 75%, # of features)
print("size of y_train:\n", y_train.shape) # (# of data * 75%, )

size of X_train:
 (112, 4)
size of y_train:
 (112,)


In [15]:
print("size of X_test:\n", X_test.shape)
print("size of y_test:\n", y_test.shape)

size of X_test:
 (38, 4)
size of y_test:
 (38,)


# 3. 가장 먼저 할 일: 데이터 살펴보기

In [16]:
# X_train 데이터를 사용해서 데이터프레임을 만듦.
# 열의 이름은 iris_dataset.feature_names에 있는 문자열을 사용
iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
# 데이터프레임을 사용해 y_train에 따라 색으로 구분된 산점도 행렬을 만듦
pd.plotting.scatter_matrix(iris_dataframe, c = y_train, figsize=(15, 15), marker='o', hist_kwds={'bins':20}, s=60, alpha=.8)

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fa21e402780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21de78940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dea2cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21de54208>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fa21ddfc780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21de22cf8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21ddd22b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dd79860>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dd79898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dd51358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dcfa8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dd21e48>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fa21dcd1400>,
        <matplotlib.axes._subplots.

# 4. 첫 번째 머신러닝 모델: k-최근접 이웃 알고리즘(k-Nearest Neighbors, k-NN)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
# sklean의 여러 클래스 중에 하나(KNeightborsClassifier)에 미리 정의가 되어있다.

knn = KNeighborsClassifier(n_neighbors=1)

In [18]:
knn.fit(X_train, y_train) # 학습

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

# 5. 예측하기

In [19]:
X_new = np.array([[5, 2.9, 1, 0.2],
                 [3, 1.1, 0.8, 0.6]])
print("X_new.shape:", X_new.shape)

X_new.shape: (2, 4)


In [20]:
prediction = knn.predict(X_new)
print("prediction.shape:", prediction.shape)

print("prediction:", prediction)
print("predicted target name:", iris_dataset['target_names'][prediction])

prediction.shape: (2,)
prediction: [0 0]
predicted target name: ['setosa' 'setosa']


# 6. 모델 평가하기

In [21]:
y_pred = knn.predict(X_test)
print("X_test.shape:", X_test.shape)
print("y_pred.shape:", y_pred.shape)

print("predicted test data set:\n", y_pred)

X_test.shape: (38, 4)
y_pred.shape: (38,)
predicted test data set:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]


In [22]:
print("accuracy for test set: {:.2f}".format(np.mean(y_pred == y_test)))

accuracy for test set: 0.97


# 요약 - 훈련과 평가의 과정

In [23]:
X_train, X_test, y_train, y_test = train_test_split(iris_dataset['data'], iris_dataset['target'], random_state=0)

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train) # 학습

y_pred = knn.predict(X_test)
print("accuracy for test set: {:.2f}".format(np.mean(y_pred == y_test)))

accuracy for test set: 0.97
