# Home task : KNN classification
1. `iris` dataset
    - load `iris` dataset
    - split to train and test
    - train KNN classifiers for different `K`
    - choose the best `K` determined on test set

2. synthetic dataset
    - generrate synthetic dataset
    - split to train and test
    - train KNN regressors for different `K`
    - choose the best `K` determined on test set
    - visualize the decision boundary 




## `Iris` dataset 

In [1]:
import pandas as pd 
import numpy as np 
np.random.seed = 2021
from sklearn.datasets import load_iris
iris = load_iris()
print ('data contains:',iris.keys())

iris_data_with_target = np.append(iris.data, iris.target.reshape([iris.target.shape[0], 1]), axis=1)
np.random.shuffle(iris_data_with_target) #shuffle iris data and target 

X = iris_data_with_target[:, 0:4]
y = iris_data_with_target[:, 4]

labels, feature_names  = iris.target_names, iris['feature_names']
df_iris= pd.DataFrame(X, columns= feature_names) 
df_iris['label'] =  y
features_dict = {k:v for k,v in  enumerate(labels)}
df_iris['label_names'] = df_iris.label.apply(lambda x: features_dict[x])
df_iris

data contains: dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_names
0,6.4,3.2,4.5,1.5,1.0,versicolor
1,6.1,3.0,4.6,1.4,1.0,versicolor
2,5.7,3.8,1.7,0.3,0.0,setosa
3,5.6,2.9,3.6,1.3,1.0,versicolor
4,5.2,3.4,1.4,0.2,0.0,setosa
...,...,...,...,...,...,...
145,6.1,2.6,5.6,1.4,2.0,virginica
146,4.4,3.2,1.3,0.2,0.0,setosa
147,6.9,3.1,5.4,2.1,2.0,virginica
148,6.5,3.0,5.5,1.8,2.0,virginica


In [2]:
from sklearn.preprocessing import MinMaxScaler
MM = MinMaxScaler()
X = MM.fit_transform(X)

X

array([[0.58333333, 0.5       , 0.59322034, 0.58333333],
       [0.5       , 0.41666667, 0.61016949, 0.54166667],
       [0.38888889, 0.75      , 0.11864407, 0.08333333],
       [0.36111111, 0.375     , 0.44067797, 0.5       ],
       [0.25      , 0.58333333, 0.06779661, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.94444444, 0.25      , 1.        , 0.91666667],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.66666667, 0.41666667, 0.71186441, 0.91666667],
       [0.22222222, 0.70833333, 0.08474576, 0.125     ],
       [0.47222222, 0.29166667, 0.69491525, 0.625     ],
       [0.13888889, 0.58333333, 0.15254237, 0.04166667],
       [0.61111111, 0.41666667, 0.71186441, 0.79166667],
       [0.33333333, 0.16666667, 0.45762712, 0.375     ],
       [0.33333333, 0.16666667, 0.47457627, 0.41666667],
       [0.47222222, 0.08333333, 0.6779661 , 0.58333333],
       [0.41666667, 0.29166667, 0.52542373, 0.375     ],
       [0.58333333, 0.375     ,

In [6]:
k_best = 0 # 'compute the best k'
score_best = float(0) #'compute the best score'
print ('The best k = {} , score = {}'.format(k_best,score_best ))

The best k = 0 , score = 0.0


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

k_max_value = 20 

for K in range(1, k_max_value + 1): 
    classifier = KNeighborsClassifier(K) 
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    if accuracy > score_best:
        k_best = K
        score_best = accuracy
    
print(f"Best is {K} and best acurracy {accuracy}")


Best is 20 and best acurracy 0.9555555555555556


## Synthetic dataset  

In [19]:
from sklearn.datasets import make_blobs, make_regression
import matplotlib.pyplot as plt
%matplotlib notebook
from matplotlib.colors import ListedColormap
cmap_bold = ListedColormap(['red','#FFFF00','purple','grey'])

np.random.seed= 2021
X_D2, y_D2 = make_regression(n_samples = 1000, n_features = 4, random_state = 4)
y_D2 = y_D2 
plt.figure()
plt.title('Sample binary classification problem with non-linearly separable classes')
plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,
           marker= 'o', s=30, cmap=cmap_bold)



<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1de48ee4d90>

In [20]:
k_best = 0 # 
score_best = 0 
print ('The best k = {} , score = {}'.format(k_best,score_best ))

The best k = 0 , score = 0


In [21]:
from sklearn.preprocessing import StandardScaler

Scaler = StandardScaler()
X_D2 = Scaler.fit_transform(X_D2)
X_D2

array([[ 0.45840043,  0.65911802, -2.04284406, -0.79175776],
       [-1.09598189, -0.38618439, -0.35089588,  0.61548053],
       [-1.17412776, -0.73506488,  1.10800883, -1.89312137],
       ...,
       [ 1.70536849, -0.50804104,  1.47616186, -0.64076074],
       [-1.85219013, -1.21414428, -1.24276372, -0.40230543],
       [-0.11670086,  1.3697656 ,  1.76936828, -0.04308694]])

In [22]:
#devide into sets
from sklearn.model_selection import train_test_split

X_train_D2, X_test_D2, y_train_D2, y_test_D2 = train_test_split(X_D2, y_D2, train_size=0.7, test_size=0.3)

In [23]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

k_max_value = 20 

for K in range(1, k_max_value + 1): 
    knn_r = KNeighborsRegressor(K) 
    knn_r.fit(X_train_D2, y_train_D2) 

    y_pred_D2 = knn_r.predict(X_test_D2)

    score = knn_r.score(X_test_D2, y_test_D2)
    print(score)
    if score >  score_best :
        k_best = K
        score_best = score

print(f"Best K is {k_best}, score {score_best}")

0.915544995817408
0.9443830252733955
0.9554584797761759
0.9549755240012486
0.9560230963534123
0.9531967164133548
0.9551601159904962
0.9526771309408731
0.9526484559263587
0.9517532574593203
0.9504209644156232
0.9516744152597392
0.9497274091029292
0.9474194540324016
0.9463713279365945
0.9443960282695097
0.9435138793687412
0.9416250357911686
0.9410058822681474
0.9396500117509659
Best K is 5, score 0.9560230963534123


In [24]:
import matplotlib.pyplot as plt

plt.figure()
plt.scatter(range(len(y_test_D2)), y_test_D2, s=5, color="blue", label='Original')
plt.plot(range(len(y_pred_D2)), y_pred_D2, lw=1.5, color="black", label='Predicted')
plt.legend()
plt.show()

<IPython.core.display.Javascript object>