In [1]:
import pandas as pd
from scipy.io import arff
import numpy
import math
import operator

<h2>Reading Train and Test Data</h2>

In [2]:
# Training split: element 0 of the loadarff result is wrapped in a DataFrame.
trainSet = pd.DataFrame(arff.loadarff("trainProdSelection.arff")[0])

# Test split. `data` stays bound to the test file's loadarff result because the
# dtype-inspection cell further down reads data[0].
data = arff.loadarff("testProdSelection.arff")
testSet = pd.DataFrame(data[0])
testSet.head(n=5)

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend<saving',12.0,19.0,14.79,3.7697,b'C1'
1,b'student',b'spend>>saving',29.0,10.0,16.19,2.4839,b'C1'
2,b'student',b'spend<<saving',28.0,60.0,15.46,1.1885,b'C1'
3,b'engineer',b'spend>saving',15.0,41.0,21.26,1.4379,b'C1'
4,b'librarian',b'spend<saving',2.0,9.0,19.7207,0.6913,b'C1'


<h4>Datatypes</h4>

In [3]:
# `data` holds the test file's loadarff result at this point, so this shows the
# column dtypes of the raw test records. Only the last expression of a cell is
# displayed, so the dtypes are the only thing computed here.
pd.DataFrame(data[0]).dtypes

Type          object
LifeStyle     object
Vacation     float64
eCredit      float64
salary       float64
property     float64
label         object
dtype: object

<h3>Train Set</h3>

<h4>Pre-Processing</h4>

In [4]:
# Convert the bytes values of Type to str. Values that are not bytes are passed
# through unchanged, so re-running this cell leaves an already-decoded column
# intact (a second .str.decode() pass on str values would not).
trainSet.Type = trainSet.Type.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [5]:
# Convert the bytes values of LifeStyle to str. Values that are not bytes are
# passed through unchanged, so re-running this cell leaves an already-decoded
# column intact (a second .str.decode() pass on str values would not).
trainSet.LifeStyle = trainSet.LifeStyle.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [6]:
# Convert the bytes values of label to str. Values that are not bytes are
# passed through unchanged, so re-running this cell leaves an already-decoded
# column intact (a second .str.decode() pass on str values would not).
trainSet.label = trainSet.label.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)
trainSet.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,6.0,40.0,13.62,3.2804,C1
1,student,spend>saving,11.0,21.0,15.32,2.0232,C1
2,student,spend>saving,7.0,64.0,16.55,3.1202,C1
3,student,spend>saving,3.0,47.0,15.71,3.4022,C1
4,student,spend>saving,15.0,10.0,16.96,2.2825,C1


In [7]:
# Min-max scale Vacation with the training column's own minimum and maximum,
# using vectorised Series arithmetic instead of a per-element lambda.
# A constant column (high == low) would make the denominator zero.
low = trainSet.Vacation.min()
high = trainSet.Vacation.max()
trainSet.Vacation = (trainSet.Vacation - low) / (high - low)

In [8]:
# Min-max scale eCredit with the training column's own minimum and maximum,
# using vectorised Series arithmetic instead of a per-element lambda.
# A constant column (high == low) would make the denominator zero.
low = trainSet.eCredit.min()
high = trainSet.eCredit.max()
trainSet.eCredit = (trainSet.eCredit - low) / (high - low)

In [9]:
# Min-max scale salary with the training column's own minimum and maximum,
# using vectorised Series arithmetic instead of a per-element lambda.
# A constant column (high == low) would make the denominator zero.
low = trainSet.salary.min()
high = trainSet.salary.max()
trainSet.salary = (trainSet.salary - low) / (high - low)

In [10]:
# Min-max scale property with the training column's own minimum and maximum,
# using vectorised Series arithmetic instead of a per-element lambda.
# A constant column (high == low) would make the denominator zero.
low = trainSet.property.min()
high = trainSet.property.max()
trainSet.property = (trainSet.property - low) / (high - low)

In [11]:
# Show the first five training rows after the numeric columns were rescaled.
trainSet.head(n=5)

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,0.079365,0.107558,0.21996,0.183167,C1
1,student,spend>saving,0.15873,0.052326,0.293102,0.112797,C1
2,student,spend>saving,0.095238,0.177326,0.346023,0.1742,C1
3,student,spend>saving,0.031746,0.127907,0.309882,0.189984,C1
4,student,spend>saving,0.222222,0.020349,0.363663,0.127311,C1


In [12]:
# Convert the bytes values of Type (shown as b'...' in the raw testSet preview)
# to str. Values that are not bytes are passed through unchanged, so re-running
# this cell leaves an already-decoded column intact (a second .str.decode()
# pass on str values would not).
testSet.Type = testSet.Type.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [13]:
# Convert the bytes values of LifeStyle (shown as b'...' in the raw testSet
# preview) to str. Values that are not bytes are passed through unchanged, so
# re-running this cell leaves an already-decoded column intact (a second
# .str.decode() pass on str values would not).
testSet.LifeStyle = testSet.LifeStyle.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [14]:
# Convert the bytes values of label (shown as b'...' in the raw testSet
# preview) to str. Values that are not bytes are passed through unchanged, so
# re-running this cell leaves an already-decoded column intact (a second
# .str.decode() pass on str values would not).
testSet.label = testSet.label.map(
    lambda value: value.decode("UTF-8") if isinstance(value, bytes) else value
)

In [15]:
# Min-max scale Vacation using vectorised Series arithmetic instead of a
# per-element lambda.
# NOTE(review): the range is taken from testSet itself, while the training
# cells use trainSet's range, so the same raw value maps to different scaled
# values in the two frames before distances are computed. Reusing the training
# range here would need the raw training min/max, which is no longer available
# because trainSet was rescaled in place.
low = testSet.Vacation.min()
high = testSet.Vacation.max()
testSet.Vacation = (testSet.Vacation - low) / (high - low)

In [16]:
# Min-max scale eCredit using vectorised Series arithmetic instead of a
# per-element lambda.
# NOTE(review): the range is taken from testSet itself, while the training
# cells use trainSet's range, so the same raw value maps to different scaled
# values in the two frames before distances are computed. Reusing the training
# range here would need the raw training min/max, which is no longer available
# because trainSet was rescaled in place.
low = testSet.eCredit.min()
high = testSet.eCredit.max()
testSet.eCredit = (testSet.eCredit - low) / (high - low)

In [17]:
# Min-max scale salary using vectorised Series arithmetic instead of a
# per-element lambda.
# NOTE(review): the range is taken from testSet itself, while the training
# cells use trainSet's range, so the same raw value maps to different scaled
# values in the two frames before distances are computed. Reusing the training
# range here would need the raw training min/max, which is no longer available
# because trainSet was rescaled in place.
low = testSet.salary.min()
high = testSet.salary.max()
testSet.salary = (testSet.salary - low) / (high - low)

In [18]:
# Min-max scale property using vectorised Series arithmetic instead of a
# per-element lambda.
# NOTE(review): the range is taken from testSet itself, while the training
# cells use trainSet's range, so the same raw value maps to different scaled
# values in the two frames before distances are computed. Reusing the training
# range here would need the raw training min/max, which is no longer available
# because trainSet was rescaled in place.
low = testSet.property.min()
high = testSet.property.max()
testSet.property = (testSet.property - low) / (high - low)

In [19]:
# Show the first five test rows after decoding and rescaling.
testSet.head(n=5)

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,0.2,0.058824,0.104637,0.398926,C1
1,student,spend>>saving,0.54,0.021008,0.175059,0.243041,C1
2,student,spend<<saving,0.52,0.231092,0.138339,0.085992,C1
3,engineer,spend>saving,0.26,0.151261,0.430086,0.116229,C1
4,librarian,spend<saving,0.0,0.016807,0.352657,0.025714,C1


<h3>Euclidean Distance</h3>

In [20]:
def eucliDistance(instance_a, instance_b, length, n_categorical=2):
    """Euclidean-style distance between two instances with mixed feature types.

    The first ``n_categorical`` positions are compared for equality; each
    mismatch adds 1 to the squared distance. Positions ``n_categorical`` up to
    (but not including) ``length`` are treated as numbers and add their squared
    difference.

    Parameters
    ----------
    instance_a, instance_b : indexable
        The two instances to compare.
    length : int
        Exclusive upper bound of the positions that take part in the distance.
    n_categorical : int, optional
        Number of leading positions compared by equality. Defaults to 2, the
        value that used to be hard-coded.

    Returns
    -------
    float
        Square root of the accumulated squared distance.
    """
    squared = 0

    # Categorical part: 0/1 mismatch distance per position.
    for idx in range(n_categorical):
        if instance_a[idx] != instance_b[idx]:
            squared += 1

    # Numeric part: accumulated term by term, in positional order.
    for idx in range(n_categorical, length):
        squared += (instance_a[idx] - instance_b[idx]) ** 2

    return math.sqrt(squared)

<h3>Get Neighbours</h3>

In [21]:
def getNeighbors(train_Set, test_Instance, k):
    """Return the k rows of train_Set closest to test_Instance.

    Distances come from eucliDistance over every position except the last one
    of test_Instance (the last position is read as the class label elsewhere in
    this notebook). Rows are ordered by ascending distance; rows with equal
    distance keep their original relative order.

    Raises IndexError if k is larger than the number of rows in train_Set.
    """
    feature_count = len(test_Instance) - 1

    scored = []
    for idx in range(len(train_Set)):
        row = train_Set[idx]
        scored.append((row, eucliDistance(test_Instance, row, feature_count)))

    # Order by the distance component only; the rows themselves are never compared.
    scored.sort(key=lambda pair: pair[1])

    return [scored[rank][0] for rank in range(k)]

<h3>Most Frequent Class Among the Neighbours</h3>

In [22]:
def getOutput(neighbours):
    """Return the label that occurs most often among the neighbours.

    The label of a neighbour is its last element. When several labels share the
    highest count, the one seen first in ``neighbours`` is returned.
    Raises IndexError when ``neighbours`` is empty.
    """
    tally = {}
    for neighbour in neighbours:
        label = neighbour[-1]
        tally[label] = tally.get(label, 0) + 1

    # Stable descending sort by count keeps first-seen order among ties.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0]

<h3>Calculate Accuracy</h3>

In [23]:
def findAccuracy(test_Set, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    ``predictions[i]`` is compared with ``test_Set[i][-1]`` for every row index
    of ``test_Set``. The result is a float between 0.0 and 100.0.
    """
    hits = sum(
        1
        for idx in range(len(test_Set))
        if test_Set[idx][-1] == predictions[idx]
    )
    return (hits / float(len(test_Set))) * 100.0

<h3>K Nearest Neighbour</h3>

In [24]:
def knn(k):
    """Classify every row of the global testSet with k nearest neighbours.

    For each test row the k closest rows of the global trainSet are looked up
    with getNeighbors, the predicted label is taken from getOutput, and the
    resulting accuracy (from findAccuracy) is printed.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per test row.

    Returns
    -------
    None
        The accuracy is only printed, as ``Accuracy: <value>%``.
    """
    # Pull the row arrays out of the DataFrames once instead of on every
    # iteration; neither frame is modified inside this function.
    train_rows = trainSet.values
    test_rows = testSet.values

    predictions = []
    for test_row in test_rows:
        neighbors = getNeighbors(train_rows, test_row, k)
        predictions.append(getOutput(neighbors))

    accuracy = findAccuracy(test_rows, predictions)

    print('Accuracy: ' + repr(accuracy) + '%')

In [25]:
# Sweep the odd neighbourhood sizes 1, 3, ..., 99. Each pass prints k on its own
# line, then knn() prints the accuracy obtained with that k.
# For a single run, call e.g. knn(5) instead of looping.
for i in range(1,100,2):
    print(i)
    knn(i)

1
Accuracy: 23.809523809523807%
3
Accuracy: 28.57142857142857%
5
Accuracy: 28.57142857142857%
7
Accuracy: 28.57142857142857%
9
Accuracy: 23.809523809523807%
11
Accuracy: 23.809523809523807%
13
Accuracy: 19.047619047619047%
15
Accuracy: 14.285714285714285%
17
Accuracy: 19.047619047619047%
19
Accuracy: 19.047619047619047%
21
Accuracy: 19.047619047619047%
23
Accuracy: 19.047619047619047%
25
Accuracy: 19.047619047619047%
27
Accuracy: 19.047619047619047%
29
Accuracy: 19.047619047619047%
31
Accuracy: 19.047619047619047%
33
Accuracy: 19.047619047619047%
35
Accuracy: 14.285714285714285%
37
Accuracy: 14.285714285714285%
39
Accuracy: 19.047619047619047%
41
Accuracy: 19.047619047619047%
43
Accuracy: 19.047619047619047%
45
Accuracy: 19.047619047619047%
47
Accuracy: 19.047619047619047%
49
Accuracy: 19.047619047619047%
51
Accuracy: 19.047619047619047%
53
Accuracy: 23.809523809523807%
55
Accuracy: 33.33333333333333%
57
Accuracy: 28.57142857142857%
59
Accuracy: 28.57142857142857%
61
Accuracy: 28.57142