In [1]:
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
import numpy
import math
import operator
import matplotlib.pyplot as plt

## Loading Data

In [2]:
# Parse both ARFF files first; loadarff hands back a (records, metadata) pair.
data1 = arff.loadarff("./trainProdSelection.arff")
data2 = arff.loadarff("./testProdSelection.arff")

# Only the record arrays (index 0) are wrapped in DataFrames; the metadata is unused.
trainData = pd.DataFrame(data1[0])
testData = pd.DataFrame(data2[0])

In [3]:
# Show the column names together with their dtypes for the freshly loaded
# training frame. A cell only displays its last expression, so a separate
# column listing before this line was never shown; the dtypes index already
# carries the column names.
trainData.dtypes

Type          object
LifeStyle     object
Vacation     float64
eCredit      float64
salary       float64
property     float64
label         object
dtype: object

## Train Data pre-processing 

In [4]:
# Type is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
trainData.Type = trainData.Type.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [5]:
# LifeStyle is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
trainData.LifeStyle = trainData.LifeStyle.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [6]:
# The class label is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
trainData.label = trainData.label.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [7]:
# Min-max scale Vacation on the training set. The bounds stay in module scope
# because the test-set preprocessing applies the same ones.
# NOTE: re-running this cell recomputes the bounds from already-scaled data.
minValuev, maxValuev = trainData.Vacation.min(), trainData.Vacation.max()
trainData.Vacation = (trainData.Vacation - minValuev) / (maxValuev - minValuev)

In [8]:
# Min-max scale eCredit on the training set. The bounds stay in module scope
# because the test-set preprocessing applies the same ones.
# NOTE: re-running this cell recomputes the bounds from already-scaled data.
minValuec, maxValuec = trainData.eCredit.min(), trainData.eCredit.max()
trainData.eCredit = (trainData.eCredit - minValuec) / (maxValuec - minValuec)

In [9]:
# Min-max scale salary on the training set. The bounds stay in module scope
# because the test-set preprocessing applies the same ones.
# NOTE: re-running this cell recomputes the bounds from already-scaled data.
minValues, maxValues = trainData.salary.min(), trainData.salary.max()
trainData.salary = (trainData.salary - minValues) / (maxValues - minValues)

In [10]:
# Min-max scale property on the training set. The bounds stay in module scope
# because the test-set preprocessing applies the same ones.
# NOTE: re-running this cell recomputes the bounds from already-scaled data.
minValuep, maxValuep = trainData.property.min(), trainData.property.max()
trainData.property = (trainData.property - minValuep) / (maxValuep - minValuep)

In [11]:
# Preview the first rows of the decoded, min-max scaled training frame.
trainData.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,0.079365,0.107558,0.21996,0.183167,C1
1,student,spend>saving,0.15873,0.052326,0.293102,0.112797,C1
2,student,spend>saving,0.095238,0.177326,0.346023,0.1742,C1
3,student,spend>saving,0.031746,0.127907,0.309882,0.189984,C1
4,student,spend>saving,0.222222,0.020349,0.363663,0.127311,C1


## Test Data pre-processing 

In [12]:
# Type is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
testData.Type = testData.Type.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [13]:
# LifeStyle is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
testData.LifeStyle = testData.LifeStyle.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [14]:
# The class label is loaded as raw bytes; turn it into str.
# Only bytes values are decoded, so re-running this cell leaves already-decoded
# strings untouched instead of degrading them to NaN.
testData.label = testData.label.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [15]:
# Scale with the bounds taken from the training set (not the test set's own),
# so both frames live on the same scale; values may fall outside [0, 1].
testData.Vacation = (testData.Vacation - minValuev) / (maxValuev - minValuev)

In [16]:
# Scale with the bounds taken from the training set (not the test set's own),
# so both frames live on the same scale; values may fall outside [0, 1].
testData.eCredit = (testData.eCredit - minValuec) / (maxValuec - minValuec)

In [17]:
# Scale with the bounds taken from the training set (not the test set's own),
# so both frames live on the same scale; values may fall outside [0, 1].
testData.salary = (testData.salary - minValues) / (maxValues - minValues)

In [18]:
# Scale with the bounds taken from the training set (not the test set's own),
# so both frames live on the same scale; values may fall outside [0, 1].
testData.property = (testData.property - minValuep) / (maxValuep - minValuep)

In [19]:
# Preview the first rows of the decoded test frame after scaling with the training bounds.
testData.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,0.174603,0.046512,0.270299,0.210554,C1
1,student,spend>>saving,0.444444,0.020349,0.330534,0.138584,C1
2,student,spend<<saving,0.428571,0.165698,0.299126,0.066076,C1
3,engineer,spend>saving,0.222222,0.110465,0.54867,0.080036,C1
4,librarian,spend<saving,0.015873,0.017442,0.482442,0.038246,C1


## Euclidean Distance

In [20]:
def euclideanDistance(instance1, instance2, length, numCategorical=2):
    """Distance between two rows that mix categorical and numeric features.

    The first ``numCategorical`` positions are compared for equality and each
    mismatch adds 1 to the squared distance. Positions ``numCategorical`` up
    to (but excluding) ``length`` are treated as numbers and add their squared
    difference. Anything at index ``length`` or later is ignored.

    Args:
        instance1: First row; must be indexable by position.
        instance2: Second row, laid out like ``instance1``.
        length: Exclusive end index of the feature columns.
        numCategorical: How many leading columns are categorical. Defaults to
            2, which is what this function previously hard-coded.

    Returns:
        The square root of the accumulated squared distance, as a float.
    """
    distance = 0
    for i in range(numCategorical):
        if instance1[i] != instance2[i]:
            distance += 1
    # Accumulate in column order so floating-point results stay reproducible.
    for i in range(numCategorical, length):
        distance += (instance1[i] - instance2[i]) ** 2
    return math.sqrt(distance)

## Getting Neighbours

In [21]:
def getNeighbours(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    Distances come from ``euclideanDistance``; the last element of
    ``testInstance`` is excluded from the comparison (``length`` is
    ``len(testInstance) - 1``).

    Args:
        trainingSet: Iterable of rows laid out like ``testInstance``.
        testInstance: The row to find neighbours for.
        k: Number of neighbours wanted.

    Returns:
        A list of at most ``k`` rows ordered from nearest to farthest. If
        ``k`` exceeds the number of training rows, every row is returned
        rather than raising ``IndexError``; a non-positive ``k`` yields ``[]``.
    """
    length = len(testInstance) - 1
    distances = [
        (row, euclideanDistance(testInstance, row, length)) for row in trainingSet
    ]
    # list.sort is stable, so equally distant rows keep their training-set order.
    distances.sort(key=operator.itemgetter(1))
    # Clamp so a negative k cannot turn into a "drop the last rows" slice.
    return [row for row, _ in distances[:max(k, 0)]]

## Finding response for the class

In [22]:
def getResponse(neighbours):
    """Pick the class label that occurs most often among ``neighbours``.

    The label of each neighbour is its last element. When several labels
    share the highest count, the one encountered first wins, because the
    descending sort is stable.
    """
    tally = {}
    for row in neighbours:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner, _count = ranked[0]
    return winner

## Calculating Accuracy

In [23]:
def getAccuracy(testSet, predictions):
    """Percentage (0-100) of rows in ``testSet`` that were predicted correctly.

    A row counts as correct when its last element equals the entry of
    ``predictions`` at the same position.
    """
    hits = 0
    for idx, row in enumerate(testSet):
        if row[-1] == predictions[idx]:
            hits += 1
    total = float(len(testSet))
    return (hits / total) * 100.0

## Applying kNN 

In [24]:
def knn(k):
    """Classify every row of ``testData`` with k-NN and return the accuracy.

    Reads the module-level ``trainData`` and ``testData`` frames. Each test
    row is labelled by majority vote among its ``k`` nearest training rows,
    and the predictions are scored against the test rows' last column.

    Args:
        k: Number of neighbours that vote on each prediction.

    Returns:
        Accuracy as a percentage (0-100).
    """
    # Convert the frames to arrays once, not on every loop iteration.
    trainRows = trainData.values
    testRows = testData.values
    predictions = [
        getResponse(getNeighbours(trainRows, row, k)) for row in testRows
    ]
    return getAccuracy(testRows, predictions)

In [25]:
# Spot-check the classifier with a single k (3 neighbours); the value shown is accuracy in percent.
knn(3)

28.57142857142857

In [26]:
# Sweep the odd k values 1, 3, ..., 59 and keep every accuracy in `a`.
a = []
maxi, maxk = 0, 0
for i in range(1, 60, 2):
    acc = knn(i)
    a.append(acc)
    print("for k value: ",i," accuracy is: ",acc)
    # `>=` (not `>`): among equally accurate k values the largest one is kept.
    if acc >= maxi:
        maxi, maxk = acc, i

for k value:  1  accuracy is:  19.047619047619047
for k value:  3  accuracy is:  28.57142857142857
for k value:  5  accuracy is:  28.57142857142857
for k value:  7  accuracy is:  28.57142857142857
for k value:  9  accuracy is:  23.809523809523807
for k value:  11  accuracy is:  19.047619047619047
for k value:  13  accuracy is:  14.285714285714285
for k value:  15  accuracy is:  19.047619047619047
for k value:  17  accuracy is:  19.047619047619047
for k value:  19  accuracy is:  19.047619047619047
for k value:  21  accuracy is:  19.047619047619047
for k value:  23  accuracy is:  19.047619047619047
for k value:  25  accuracy is:  19.047619047619047
for k value:  27  accuracy is:  19.047619047619047
for k value:  29  accuracy is:  14.285714285714285
for k value:  31  accuracy is:  14.285714285714285
for k value:  33  accuracy is:  19.047619047619047
for k value:  35  accuracy is:  19.047619047619047
for k value:  37  accuracy is:  19.047619047619047
for k value:  39  accuracy is:  19.0476

## Maximum Accuracy

In [27]:
# Report the highest accuracy seen in the sweep and the k recorded for it
# (the largest such k when several k values tie).
print("Maximum Accuracy is",maxi,"at k=",maxk)

Maximum Accuracy is 33.33333333333333 at k= 57
