# Classification algorithms
1. k-nearest neighbors (k-NN)
2. Naive Bayes classifier (NB)

# 0. Load iris dataset
- Fisher's iris dataset, the most famous benchmark dataset
- Wikipedia: https://en.wikipedia.org/wiki/Iris_flower_data_set
- UCI data repository: https://archive.ics.uci.edu/ml/datasets/Iris

In [1]:
# Load Fisher's iris measurements from the UCI repository into a DataFrame.
import pandas as pd

# header=None below tells pandas the file's first line is data, so the
# column labels are supplied explicitly here.
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = pd.read_csv(url, names=col_names, header=None)

In [2]:
# Preview the first five rows to check that the columns were parsed as expected.
iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# (number of rows, number of columns) of the loaded table.
iris.shape

(150, 5)

### 참고: seaborn
- Homepage: https://stanford.edu/~mwaskom/software/seaborn/index.html
- Seaborn is a Python visualization library based on matplotlib.
- It provides a high-level interface for drawing statistical graphs.
- You can install this package with conda or pip: just type `conda install seaborn` or `pip install seaborn`.

In [None]:
%matplotlib inline
import seaborn as sns
sns.pairplot(iris, hue="species", size = 3)

In [4]:
# Divide the data into X and y: X keeps every column except the 'species' label.
X = iris.drop(columns='species')
print(type(X), X.shape, sep='\n')
X.head(5)

<class 'pandas.core.frame.DataFrame'>
(150, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Target vector y: the 'species' column of the table.
y = iris['species']
print(type(y), y.shape, sep='\n')
y.head(5)

<class 'pandas.core.series.Series'>
(150,)


0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: species, dtype: object

In [None]:
# Hold out part of the data for evaluation. No test_size is given, so
# scikit-learn's default split proportion applies; random_state fixes the
# shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 123)

## 1. k-nearest neighbors 

In [None]:
# make an instance of a k-NN classifier object
# (n_neighbors=1: each prediction takes the label of the single closest training sample)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
# confirm the class of the object just created
type(knn)

In [None]:
# Show the estimator's printed representation.
print(knn)

In [None]:
# IPython help syntax: a trailing `?` displays the documentation for KNeighborsClassifier.
KNeighborsClassifier?

In [None]:
# Fit the classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = knn.predict(X_test)

In [None]:
# calculate classification accuracy
# (fraction of test rows whose predicted label equals the true label)
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, y_pred)
# confusion matrix: rows are the true classes, columns the predicted classes
cm = metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Display the test-set accuracy and confusion matrix currently held in `accuracy` and `cm`.
print(accuracy)
print(cm)

In [1]:
from sklearn import metrics

In [None]:
# Text summary of per-class precision, recall, F1-score and support for the predictions in y_pred.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Compare k-NN classifiers over several neighbourhood sizes, recording the
# test accuracy and confusion matrix obtained for each one.
k_set = [1, 3, 5, 7, 9, 11]
accuracy_set, cm_set = [], []

for k in k_set:
    # fit() returns the estimator itself, so construction and training chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # Score this k on the held-out split.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    cm = metrics.confusion_matrix(y_test, y_pred)

    # Results are appended in the same order as k_set.
    accuracy_set.append(accuracy)
    cm_set.append(cm)

In [None]:
# Test accuracy for each k, listed in the same order as k_set.
print(accuracy_set)

In [None]:
# Confusion matrix of the fifth run, i.e. k = k_set[4] = 9.
print(cm_set[4])

## 2. Naive Bayes classifier

In [None]:
# Every feature in the iris dataset is numerical, so the Gaussian variant of
# naive Bayes is used.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training chain.
gnb = GaussianNB().fit(X_train, y_train)
y_pred_2 = gnb.predict(X_test)

In [None]:
# Score the naive Bayes predictions on the test split.
# NOTE: this rebinds `accuracy` and `cm`, replacing the k-NN values stored under those names.
accuracy = metrics.accuracy_score(y_test, y_pred_2)
cm = metrics.confusion_matrix(y_test, y_pred_2)

In [None]:
# Display the accuracy and confusion matrix currently held in `accuracy` and `cm`.
print(accuracy)
print(cm)