# Classification using KNN
### Author: Prof. Sandro Camargo <github.com/sandrocamargo>
### Data Mining Course <https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213>
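#### This script introduces the basic concepts of KNN. Note that the classifier used in the code below is scikit-learn's `NearestCentroid`, a related distance-based method.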
##### In this script, we use the Iris dataset: <https://archive.ics.uci.edu/dataset/53/iris>

To open this notebook in your Google Colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/data-mining/blob/main/Python/md05_Classification_KNN.ipynb).

In [None]:
# Download the Iris archive from the UCI repository and unpack it into the
# current working directory.
# `wget -c` continues a partially downloaded file and `unzip -u` only extracts
# files that are new or newer than those already on disk, which keeps this
# cell cheap to re-run.
!wget -c https://archive.ics.uci.edu/static/public/53/iris.zip
!unzip -u iris.zip

In [None]:
# Load the raw Iris file into a DataFrame and preview it
import pandas as pd

# iris.data ships without a header row, so the column labels are supplied here
column_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']
data = pd.read_csv('iris.data', header=None, names=column_names)

# Display the first 5 samples
data.head()

In [None]:
# Getting to know your data: pairwise scatter plots of every feature, coloured by species
import seaborn as sns
import matplotlib.pyplot as plt

species_markers = ["o", "s", "D"]  # one marker shape per species
pair_grid = sns.pairplot(data=data, hue='Species', markers=species_markers)

# Persist the figure next to the notebook
plt.savefig("iris-pairplot.pdf")

In [None]:
# Split the dataset into stratified train and test sets
from sklearn import model_selection
import random

SEED = 10
# Kept for any code that relies on Python's global RNG. Note that it does NOT
# influence scikit-learn, which draws from NumPy's RNG; the split itself is
# made reproducible through `random_state` below.
random.seed(SEED)

# Store the inputs in the matrix X and the outputs in the array y
X = data.iloc[:, 0:4]
print(X.describe())

y = data.iloc[:, 4]
print("\n", y.value_counts(), "\n")

# classification_report pairs `target_names` positionally with the *sorted*
# class labels, so the names must be sorted as well. A plain set() has no
# stable order and could silently mislabel the rows of the report.
target_names = sorted(y.unique())

# 80/20 split that preserves the class proportions in both partitions
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=SEED
)

# Verifying dataset dimensions
print('The training dataset (inputs) dimensions are: ', train_x.shape)
print('The training dataset (outputs) dimensions are: ', train_y.shape)
print('The testing dataset (inputs) dimensions are: ', test_x.shape)
print('The testing dataset (outputs) dimensions are: ', test_y.shape)

In [None]:
# Fit a nearest-centroid classifier and plot the training samples together
# with the learned class centroids
from sklearn.neighbors import NearestCentroid
import numpy as np

# Column positions of the two plotted features inside the input matrix
PETAL_LENGTH_IDX, PETAL_WIDTH_IDX = 2, 3

# Slice the four feature columns explicitly so the cell can be re-run safely
clf = NearestCentroid()
clf.fit(train_x.iloc[:, 0:4], train_y)

# Build a *separate* frame for plotting. Assigning the label column through an
# alias of train_x would add 'Species' to the training inputs themselves.
tmpdata = train_x.iloc[:, 0:4].assign(Species=train_y)
ax = sns.scatterplot(data=tmpdata, x='Petal Width', y='Petal Length', hue='Species', style='Species')

# Mark each class centroid with a red star ...
for centroid in clf.centroids_:
    ax.plot(centroid[PETAL_WIDTH_IDX], centroid[PETAL_LENGTH_IDX], marker="*", color="r", markersize=10)
# ... and draw a large hollow circle around it
for centroid in clf.centroids_:
    ax.plot(centroid[PETAL_WIDTH_IDX], centroid[PETAL_LENGTH_IDX], 'o', ms=110, mec='r', mfc='none')

plt.title("Iris - Training Data")
plt.savefig("iris-train-knn.pdf")

In [None]:
# Evaluate the fitted classifier on the data it was trained on
from sklearn.metrics import classification_report

# Only the first four columns are features
train_features = train_x.iloc[:, 0:4]
predicted = clf.predict(train_features)

train_report = classification_report(train_y, predicted, target_names=target_names)
print(train_report)

In [None]:
# Plot the test samples together with the class centroids learned on the
# training set
# Column positions of the two plotted features inside the input matrix
PETAL_LENGTH_IDX, PETAL_WIDTH_IDX = 2, 3

# Build a *separate* frame for plotting. Assigning the label column through an
# alias of test_x would add 'Species' to the test inputs themselves.
tmpdata = test_x.iloc[:, 0:4].assign(Species=test_y)

# Plotting the test set
ax = sns.scatterplot(data=tmpdata, x='Petal Width', y='Petal Length', hue='Species', style='Species')

# Mark each class centroid with a red star ...
for centroid in clf.centroids_:
    ax.plot(centroid[PETAL_WIDTH_IDX], centroid[PETAL_LENGTH_IDX], marker="*", color="r", markersize=10)
# ... and draw a large hollow circle around it
for centroid in clf.centroids_:
    ax.plot(centroid[PETAL_WIDTH_IDX], centroid[PETAL_LENGTH_IDX], 'o', ms=110, mec='r', mfc='none')

plt.title("Iris - Test Data")
plt.savefig("iris-test-knn.pdf")

In [None]:
# Evaluate the fitted classifier on the held-out test set
from sklearn.metrics import classification_report

# Only the first four columns are features
test_features = test_x.iloc[:, 0:4]
predicted = clf.predict(test_features)

test_report = classification_report(test_y, predicted, target_names=target_names)
print(test_report)