# Iris Dataset

In [12]:
# import modules
from sklearn import datasets 
import numpy as np 
import math

# set the seed for the random number generator
mySeed=1234567

# load data
iris = datasets.load_iris() # load data 
X = iris.data # get features
y = iris.target # get targets
print(iris.DESCR) # print dataset description
print(X)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

# KNN Code

In [13]:
def distEuclid(x1,x2):
  '''
  returns the euclidean distance between x1 and x2
  '''
  dist = 0.0
  for dim in range(len(x1)):
    dist += (x1[dim] - x2[dim]) ** 2
  return math.sqrt(dist)

def distManhattan(x1,x2):
  '''
  returns the manhattan distance between x1 and x2
  '''
  dist = 0.0
  for dim in range(len(x1)):
    dist += abs(x1[dim] - x2[dim])
  return dist

def distMinkowski(x1,x2):
  '''
  returns the minkowski distance between x1 and x2
  '''
  dist = 0.0
  for dim in range(len(x1)):
    dist += abs(x1[dim] - x2[dim]) ** 3
  return dist ** (1/3)

def findNearest(X,x_,k,distType):
  '''
  takes training data X, and sample data x_ to find the k nearest neighbours to x_ in X using distance function distType
  '''
  dists = list()#list of distances between x_ and every value in X
  nearest = list()#list of x_'s k nearest neighbours in X
  #inserts the index of x in X and the distance (defined by distType) between x and x_ into the dists list
  if distType == 'euclidean':
    for i in range(len(X)):
      dists.append((i, distEuclid(X[i],x_)))
  elif distType == 'manhattan':
    for i in range(len(X)):
      dists.append((i, distManhattan(X[i],x_)))
  elif distType == 'minkowski':
    for i in range(len(X)):
      dists.append((i, distMinkowski(X[i],x_)))
  dists.sort(key=lambda tup: tup[1])#sorts the dists list with in ascending order of distance to x_
  for i in range(k):#inserts the k nearest neighbours into the nearest list
    nearest.append(dists[i])
  return nearest

def predictClass(X,y,x_,k,distType,c):
  '''
  Takes training data X, training outputs y, sample x_ then returns the predicted class of the sample
  '''
  nearest = findNearest(X,x_,k,distType)#list of x_'s k nearest neighbours in X
  count = list()#list where each index represents a class in X
  for i in range(c):
    count.append(0)
  for i in range(k):
    count[y[nearest[i][0]]] += 1 #increment the value in count at index y for each label y corresponding to neighbor x in X
  max = 0
  maxI = 0
  for  i in range(c):#find the most commonly occuring class among the nearest neighbours to x_
    if count[i] > max:
      max = count[i]
      maxI = i
  return maxI



def mykNN(X,y,X_,k,distType,c):
  '''
  Takes training data X, training outputs y, testing data X_ then returns the predicted outputs for X_
  k is the number of neighbours and distType is the distance type
  c is the number of classes in the data
  '''
  y_ = list()
  for sample in X_:
    y_.append(predictClass(X,y,sample,k,distType,c))
  return y_

def getAccuracy(trainX,trainy,valX,valy,k,distType,c):
  '''
  takes training and validation data to find kNN accuracy using k nearest neighboours and distance funstion distType
  '''
  y_ = mykNN(trainX,trainy,valX,k,distType,c)
  correct = 0
  for i in range(len(y_)):
    if y_[i] == valy[i]:
      correct += 1
  return (correct/len(y_))*100

def myNestedCrossVal(X,y,N,K,distTypes,mySeed,c):
  '''
  takes dataset X and corresponding labels y and performs N-fold cross validation on all k in K and all distance functions in distType
  returns a list containing the accuracy of the best parameters in each fold as well as a list describing those parameters
  '''
  accuracies_fold = list()
  best_parameters_fold = list()
  np.random.seed(mySeed)
  np.random.shuffle(X)
  np.random.seed(mySeed)
  np.random.shuffle(y)
  splitX = np.array_split(X,N)#splits X into n arrays stored in splitX
  splity = np.array_split(y,N)#splits y into n arrays stored in splity
  for n in range(N):
    #split data into a testing set, a training set, and a validation set
    testX = splitX[n]
    trainX = splitX[:n] + splitX[n+1:]
    valX = trainX[N-2]
    trainX = np.concatenate(trainX[:N-2])
    testy = splity[n]
    trainy = splity[:n] + splity[n+1:]
    valy = trainy[N-2]
    trainy = np.concatenate(trainy[:N-2])
    accuracyMax = [0.0,0,'']
    for k in K:
      for distType in distTypes:
        accuracy = getAccuracy(trainX,trainy,valX,valy,k,distType,c)#find the accuracy of this fold for every value of k and disttype
        if accuracy > accuracyMax[0]:
          accuracyMax = [accuracy,k,distType]#store the accuracy and details of the best performing parameters of this fold
    accuracies_fold.append(getAccuracy(np.concatenate((trainX,valX)),np.concatenate((trainy,valy)),testX,testy,accuracyMax[1],accuracyMax[2],c))#test the best performing parameters from this fold on the test data
    best_parameters_fold.append((accuracyMax[1],accuracyMax[2]))
  return accuracies_fold, best_parameters_fold

# KNN test

In [14]:
#KNN tested on dataset
accuracies_fold, best_parameters_fold = myNestedCrossVal(X,y,5,list(range(1,11)),['euclidean','manhattan', 'minkowski'],mySeed,3)
for i in range(len(accuracies_fold)):
  print('fold: ' + str(i + 1) + ' accuracy: ' + str(accuracies_fold[i]) + ' k: ' + str(best_parameters_fold[i][0]) + ' distance: ' + best_parameters_fold[i][1])


#KNN tested on dataset with added noise
np.random.seed(mySeed) 
XN=X+np.random.normal(0,0.5,X.shape)
accuracies_fold, best_parameters_fold = myNestedCrossVal(XN,y,5,list(range(1,11)),['euclidean','manhattan', 'minkowski'],mySeed,3)
for i in range(len(accuracies_fold)):
  print('fold: ' + str(i + 1) + ' accuracy: ' + str(accuracies_fold[i]) + ' k: ' + str(best_parameters_fold[i][0]) + ' distance: ' + best_parameters_fold[i][1])

fold: 1 accuracy: 96.66666666666667 k: 4 distance: euclidean
fold: 2 accuracy: 93.33333333333333 k: 4 distance: euclidean
fold: 3 accuracy: 93.33333333333333 k: 1 distance: euclidean
fold: 4 accuracy: 100.0 k: 4 distance: euclidean
fold: 5 accuracy: 96.66666666666667 k: 1 distance: euclidean
fold: 1 accuracy: 96.66666666666667 k: 1 distance: manhattan
fold: 2 accuracy: 80.0 k: 3 distance: minkowski
fold: 3 accuracy: 86.66666666666667 k: 10 distance: euclidean
fold: 4 accuracy: 80.0 k: 3 distance: euclidean
fold: 5 accuracy: 83.33333333333334 k: 2 distance: euclidean
