In [21]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib notebook

In [22]:
# Seed NumPy's legacy global RNG. `np.random.seed` is a function and must be
# called; assigning to it (`np.random.seed = 2021`) replaces the function with
# an int and leaves the generator unseeded. If that assignment already ran in
# the current kernel, restart the kernel so the function is callable again.
np.random.seed(2021)

# Load the iris dataset and unpack the pieces used by the rest of the notebook.
iris = load_iris()
X, y, labels, feature_names = iris.data, iris.target, iris.target_names, iris['feature_names']

# Tabular view: one column per feature plus the integer class label.
df_iris = pd.DataFrame(X, columns=feature_names)
df_iris['label'] = y

# Integer label -> species name (the position in target_names is the label code).
features_dict = dict(enumerate(labels))
df_iris['label_names'] = df_iris['label'].map(features_dict)
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [23]:
# Train/test split: shuffled, with a fixed random_state so the partition is
# reproducible. No test_size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=0,
)

# Report the number of rows (first dimension) in each partition.
print(f'X_train.shape = {X_train.shape[0]}, y_train.shape = {y_train.shape[0]}')
print(f'X_test.shape = {X_test.shape[0]}, y_test.shape = {y_test.shape[0]}')

X_train.shape = 112, y_train.shape = 112
X_test.shape = 38, y_test.shape = 38


In [24]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Peek at the first five scaled training rows.
print(X_train[:5])

[[ 0.01543995 -0.11925475  0.22512685  0.35579762]
 [-0.09984503 -1.04039491  0.11355956 -0.02984109]
 [ 1.05300481 -0.11925475  0.95031423  1.12707506]
 [-1.36797986  0.34131533 -1.39259884 -1.31530348]
 [ 1.1682898   0.11103029  0.72717965  1.38416753]]


In [25]:
# Sweep k = 1..25: for each k, fit a KNN classifier on the training split and
# record the fraction of test samples it misclassifies (the "q rate").
q_rate = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test) != y_test
    )
    for k in range(1, 26)
]

In [31]:
# Plot the misclassification rate collected in q_rate against each tested k.
k_values = range(1, 26)

plt.figure()
plt.plot(k_values, q_rate, color="blue", marker="o", markersize=7, markerfacecolor="w")
plt.xticks(k_values)  # one tick per tested k
plt.xlabel("k- values", fontsize=15)
plt.ylabel("q rate", fontsize=15)
plt.title("q rate / k", fontsize=20)
plt.show()

<IPython.core.display.Javascript object>

In [27]:
# Final classifier.
# NOTE(review): k_best is hard-coded; presumably read off the q-rate plot —
# confirm it still matches the sweep if the data or split changes.
k_best = 3
knn = KNeighborsClassifier(n_neighbors=k_best).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
score_best = knn.score(X_test, y_test)
print(f'The best k = {k_best} , score = {score_best}')

The best k = 3 , score = 0.9736842105263158


In [28]:
# Spot-check predictions on the first ten test samples.
# Build label-code -> species-name lookup by pairing the unique values of the
# two columns in their order of first appearance.
unique_codes = df_iris['label'].unique()
unique_names = df_iris['label_names'].unique()
iris_dict = dict(zip(unique_codes, unique_names))
print(iris_dict)

pred1 = knn.predict(X_test[:10])
print(pred1)

# Translate two of the predicted codes (samples 0 and 4) back to species names.
for sample_idx in (0, 4):
    print(iris_dict[pred1[sample_idx]])

{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
[2 1 0 2 0 2 0 1 1 1]
virginica
setosa


In [29]:
# Decision boundary: keep only the first two columns of df_iris (the sepal
# measurements) so the classifier can be drawn in 2-D.
sepal_columns = df_iris.iloc[:, :2]
X_short = sepal_columns.values
def make_meshgrid(x, y, h=.02):
    """Build a rectangular evaluation grid around two coordinate arrays.

    Each axis runs from one unit below the array's minimum up to (but not
    including) one unit above its maximum, in steps of ``h``.

    Parameters
    ----------
    x, y : array-like exposing ``.min()`` / ``.max()``
        Coordinates whose extent defines the horizontal / vertical axis.
    h : float, optional
        Grid step size along both axes.

    Returns
    -------
    tuple of ndarray
        ``(xx, yy)`` coordinate matrices as produced by ``np.meshgrid``.
    """
    pad = 1  # margin added on every side of the data extent
    x_axis = np.arange(x.min() - pad, x.max() + pad, h)
    y_axis = np.arange(y.min() - pad, y.max() + pad, h)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)
    return grid_x, grid_y

def plot_contours(ax, clf, xx, yy, **params):
    """Draw a classifier's predictions over a grid as filled contours.

    Parameters
    ----------
    ax : object with a ``contourf`` method
        Axes to draw on.
    clf : object with a ``predict`` method
        Classifier evaluated at every grid point.
    xx, yy : ndarray
        Coordinate matrices of identical shape (e.g. from ``make_meshgrid``).
    **params
        Forwarded unchanged to ``ax.contourf``.

    Returns
    -------
    Whatever ``ax.contourf`` returns.
    """
    # Flatten the grid into an (n_points, 2) array of (x, y) rows for predict().
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    # Fold the flat predictions back into the grid's shape for contouring.
    predicted = clf.predict(grid_points).reshape(xx.shape)
    return ax.contourf(xx, yy, predicted, **params)

# Fit a polynomial-kernel SVM on the two sepal features only.
model = SVC(kernel='poly')
clf = model.fit(X_short, y)

# Grid spanning the first two feature columns (sepal length, sepal width).
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)

# Predicted-class regions as filled contours, with the samples drawn on top.
fig, ax = plt.subplots()
plot_contours(ax, clf, xx, yy, cmap=plt.cm.viridis, alpha=0.8)
ax.scatter(X0, X1, c=y, s=20, cmap=plt.cm.viridis, edgecolors='k')
ax.set_title('Sepal decision boundaries')
ax.set_xlabel('sepal length (cm)')
ax.set_ylabel('sepal width (cm)')
plt.show()

<IPython.core.display.Javascript object>