In [11]:
import numpy as np
import math
import operator
from classifier import classifier

class knn(classifier):
    def __init__(self, k = 2):
        self.k = k
        self.train_data = None
        self.test_data = None
        self.train_labels = None

    def fit(self, X, Y):
        self.train_data = X
        self.train_labels = Y

    def predict(self, X):
        self.test_data = X
        predictions=[]
        for x in range(len(X)):
            neighbors = self.findNeighbors(self.train_data, self.test_data[x], self.k, self.train_labels)
            result = self.decision(neighbors)
            predictions.append(result)
        return predictions
    
    def euclidDist(self, inst1, inst2, length):
        dist = 0
        for x in range(length):
            dist += pow((inst1[x] - inst2[x]), 2)
        finalDist = math.sqrt(dist)
        return finalDist

    def findNeighbors(self, trainingSet, testInst, k, train_labels):
        dist = []
        length = len(testInst) - 1
        for x in range(len(trainingSet)):
            finalDist = self.euclidDist(testInst, trainingSet[x], length)
            dist.append((trainingSet[x], train_labels[x], finalDist))
        dist.sort(key=operator.itemgetter(2))
        neighbors = []
        for x in range(k):
            neighbors.append(dist[x][:2])
        return neighbors

    def decision(self, neighbors):
        classVotes = {}
        for x in range(len(neighbors)):
            response = neighbors[x][-1][0]
            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1
        sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
        return sortedVotes[0][0]

    def findAccuracy(self, test_labels, predictions):
        correctPred = 0
        for x in range(len(test_labels)):
            if test_labels[x][0] == predictions[x]:
                correctPred += 1
        accuracy = (correctPred/float(len(test_labels))) * 100.0
        return accuracy
    

In [12]:
import arff
import pandas as pd

data = arff.load('PhishingData.arff')
df = pd.DataFrame(data)

data = df.values.tolist()

# Correct implementation
# data = np.asarray(data)
# data = data.astype(np.float)

In [13]:
training_data = df.iloc[:,:9]
training_labels = df.iloc[:,9:]
# training_data = df.values.tolist()
# training_labels = df.values.tolist()

In [14]:
split = int(len(data) * 0.8)
train_data = training_data[:split]
train_labels = training_labels[:split]
test_data = training_data[split:]
test_labels = training_labels[split:]

train_data = np.asarray(train_data)
train_data = train_data.astype(np.float)

train_labels = np.asarray(train_labels)
train_labels = train_labels.astype(np.float)

test_data = np.asarray(test_data)
test_data = test_data.astype(np.float)

test_labels = np.asarray(test_labels)
test_labels = test_labels.astype(np.float)

# print(train_data[0])
# print(train_labels[0])
# print(test_data[0])
# print(test_labels[0])
# train_data = sorted(train_data)
# test_data = sorted(test_data)

# train_x = train_data.iloc[:,:9]
# train_y = train_data.iloc[:,9:]
# test_x = test_data.iloc[:,:9]
# test_y = test_data.iloc[:,9:]

# print(len(train_labels))

# train_data[0], test_data[0], len(train_data), len(test_data)

In [17]:
for i in range(2,13):
    knnImpl = knn(k = i)
    knnImpl.fit(train_data, train_labels)
    predictions = knnImpl.predict(test_data)
    # print(predictions, test_labels)
    accuracy = knnImpl.findAccuracy(test_labels, predictions)
    print('Accuracy: ' + repr(accuracy) + '%', 'when k = ' + repr(i))

Accuracy: 87.4538745387454% when k = 2
Accuracy: 88.56088560885608% when k = 3
Accuracy: 88.56088560885608% when k = 4
Accuracy: 87.82287822878229% when k = 5
Accuracy: 88.19188191881919% when k = 6
Accuracy: 88.56088560885608% when k = 7
Accuracy: 88.19188191881919% when k = 8
Accuracy: 87.4538745387454% when k = 9
Accuracy: 88.92988929889299% when k = 10
Accuracy: 88.56088560885608% when k = 11
Accuracy: 88.19188191881919% when k = 12


In [None]:
# euclidDist(train_data[0], train_data[1], 9)

In [None]:
# predictions=[]
# k = 2
# for x in range(len(test_data)):
#     neighbors = findNeighbors(train_data, test_data[x], k)
#     result = voting(neighbors)
#     predictions.append(result)
# #     print('predicted result =' + repr(result) + ', actual result =' + repr(test_data[x][-1]))
# accuracy = findAccuracy(test_data, predictions)
# print('Accuracy: ' + repr(accuracy) + '%')