### Loading useful Modules

In [1]:
import pandas as pd
from scipy.io import arff
import numpy as np
import math
import operator

### Loading Train and Test Datasets

In [3]:
train=arff.loadarff("./product-selection/trainProdSelection.arff")
trainDataSet = pd.DataFrame(train[0])
test=arff.loadarff("./product-selection/testProdSelection.arff")
testDataSet = pd.DataFrame(test[0])

In [4]:
import seaborn as sns
sns.barplot(trainDataSet['label'].value_counts())

<matplotlib.axes._subplots.AxesSubplot at 0x118526a0>

In [5]:
trainDataSet['label'].value_counts()

C4    47
C3    41
C5    36
C1    36
C2    26
Name: label, dtype: int64

### Encoding string based series in Training Set to UTF-8


In [6]:
trainDataSet.Type=trainDataSet.Type.str.decode("UTF-8")

In [7]:
trainDataSet.LifeStyle=trainDataSet.LifeStyle.str.decode("UTF-8")

In [8]:
trainDataSet.label=trainDataSet.label.str.decode("UTF-8")

### Normalization

In [9]:
minValue=trainDataSet.Vacation.min()
maxValue=trainDataSet.Vacation.max()
trainDataSet.Vacation=trainDataSet.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))


In [10]:
minValue=trainDataSet.eCredit.min()
maxValue=trainDataSet.eCredit.max()
trainDataSet.eCredit=trainDataSet.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [11]:
minValue=trainDataSet.salary.min()
maxValue=trainDataSet.salary.max()
trainDataSet.salary=trainDataSet.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [12]:
minValue=trainDataSet.property.min()
maxValue=trainDataSet.property.max()
trainDataSet.property=trainDataSet.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [13]:
trainDataSet.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend>saving,0.079365,0.107558,0.21996,0.183167,C1
1,student,spend>saving,0.15873,0.052326,0.293102,0.112797,C1
2,student,spend>saving,0.095238,0.177326,0.346023,0.1742,C1
3,student,spend>saving,0.031746,0.127907,0.309882,0.189984,C1
4,student,spend>saving,0.222222,0.020349,0.363663,0.127311,C1


### Encoding string based series in Test Set to UTF-8


In [14]:
testDataSet.Type=testDataSet.Type.str.decode("UTF-8")

In [15]:
testDataSet.LifeStyle=testDataSet.LifeStyle.str.decode("UTF-8")

In [16]:
testDataSet.label=testDataSet.label.str.decode("UTF-8")

### Normalization


In [17]:
minValue=testDataSet.Vacation.min()
maxValue=testDataSet.Vacation.max()
testDataSet.Vacation=testDataSet.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [18]:
minValue=testDataSet.eCredit.min()
maxValue=testDataSet.eCredit.max()
testDataSet.eCredit=testDataSet.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [19]:
minValue=testDataSet.salary.min()
maxValue=testDataSet.salary.max()
testDataSet.salary=testDataSet.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [20]:
minValue=testDataSet.property.min()
maxValue=testDataSet.property.max()
testDataSet.property=testDataSet.property.apply(lambda x:(x-minValue)/(maxValue-minValue))

In [21]:
testDataSet.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,0.2,0.058824,0.104637,0.398926,C1
1,student,spend>>saving,0.54,0.021008,0.175059,0.243041,C1
2,student,spend<<saving,0.52,0.231092,0.138339,0.085992,C1
3,engineer,spend>saving,0.26,0.151261,0.430086,0.116229,C1
4,librarian,spend<saving,0.0,0.016807,0.352657,0.025714,C1


### Calculating Distance between two datapoints 

In [22]:
def eDistance(row1, row2, length):
    distance = 0
    for i in range(2):
        if (row1[i]==row2[i]):
            distance += pow((1), 2)
    for x in range(2,length):
        distance += pow((row1[x] - row2[x]), 2)
    return math.sqrt(distance)

### Calculating the K nearest Neighbours for  a test data point using all train data points

In [23]:
def getNeighbors(trainSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainSet)):
        dist = eDistance(testInstance, trainSet[x], length)
        distances.append((trainSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

### Predicting Label from a set of nearest neighbours using Majority Voting


In [24]:
def predictLabelFrom(neighbors):
    votes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in votes:
            votes[response] += 1
        else:
            votes[response] = 1
    #sortedVotes = sorted(votes.items(), key=operator.itemgetter(0,1),reverse=True)
    e=sorted(votes.items(), key=lambda x: (x[1],x[0]), reverse=True)
    d=pd.DataFrame(e)
#     display(d)
    dFilter=d[d[1]==d[1].max()]
    fg = dFilter.sort_values(by = 0,ascending=True).head(1)
    #print(e,fg[0].values[0])
    return fg[0].values[0]
    #return sortedVotes[0][0]

### Calculating Accuracy, given predicted values and actual values

In [25]:
def Accuracy(testSet, predictedLabels):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictedLabels[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0



### KNN Algorithm Driver Function

In [26]:
def knn(k):
    predictedLabels=[]
    for x in range(len(testDataSet)):
        neighbors = getNeighbors(trainDataSet.values, testDataSet.values[x], k)
        result = predictLabelFrom(neighbors)
        predictedLabels.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testDataSet.values[x][-1]))
    accuracy = Accuracy(testDataSet.values, predictedLabels)
#     print('Accuracy: ' + repr(accuracy) + '%' + 'for k: ' +repr(k))

In [27]:
from sklearn.model_selection import train_test_split

In [28]:

def knn2(k):
    train=testDataSet.reindex(np.random.permutation(trainDataSet.index))
    trainDF1=train[:150]
    testDF1=train[150:]
    predictedLabels=[]
    for x in range(len(testDataSet)):
        neighbors = getNeighbors(trainDataSet.values, testDataSet.values[x], k)
        result = predictLabelFrom(neighbors)
        predictedLabels.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testDataSet.values[x][-1]))
    accuracy = Accuracy(testDataSet.values, predictedLabels)
    print('Accuracy: ' + repr(accuracy) + '%' + 'for k: ' +repr(k))

In [29]:
knn2(5)
# for i in range(3,50,2):
#     knn(i)

> predicted=u'C2', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C2', actual=u'C1'
> predicted=u'C1', actual=u'C1'
> predicted=u'C2', actual=u'C1'
> predicted=u'C2', actual=u'C1'
> predicted=u'C1', actual=u'C1'
> predicted=u'C3', actual=u'C1'
> predicted=u'C3', actual=u'C1'
> predicted=u'C3', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C1', actual=u'C1'
> predicted=u'C1', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C4', actual=u'C1'
> predicted=u'C5', actual=u'C1'
> predicted=u'C4', actual=u'C1'
Accuracy: 19.047619047619047%for k: 5
