In [1]:
import numpy as np
import random

In [2]:
# Read the CSV into a NumPy array (comma-separated values).
path = 'creditcard.csv'
data = np.genfromtxt(path, delimiter=',')

print(type(data), data.shape, sep='\n')

# From here on the rows are handled as plain Python lists.
data = data.tolist()
print(len(data))

# Draw a 3% random subsample (without replacement) to work with.
portion_size = int(0.03 * len(data))
selected_portion = random.sample(data, k=portion_size)
print(type(selected_portion), len(selected_portion), sep='\n')

<class 'numpy.ndarray'>
(284807, 31)
284807
<class 'list'>
8544


In [3]:
# Data Preparation
# Module-level lists that data_preparation() appends records to.
train_set, val_set, test_set = [], [], []

def data_preparation(train_size, val_size, test_size):
    """Randomly distribute ``selected_portion`` over the three split lists.

    The module-level ``selected_portion`` list is shuffled in place, then each
    record gets its own draw from ``random.random()`` which decides the global
    list it is appended to. Because the assignment is made per record, the
    resulting split sizes only approximate the requested fractions.

    Args:
        train_size: Records whose draw is ``<= train_size`` go to ``train_set``.
        val_size: Records whose draw lies in
            ``(train_size, train_size + val_size]`` go to ``val_set``.
        test_size: Not read by this function; every remaining record goes to
            ``test_set``.

    Note:
        The target lists are only appended to, never cleared, so calling this
        function again without resetting them adds every record a second time.
    """
    # Shuffle in place so the assignment does not follow the original order.
    random.shuffle(selected_portion)

    for record in selected_portion:
        # One uniform draw per record picks its destination list.
        draw = random.random()
        if 0 <= draw <= train_size:
            bucket = train_set
        elif train_size < draw <= train_size + val_size:
            bucket = val_set
        else:
            bucket = test_set
        bucket.append(record)

In [4]:
# As the question asks: 70% train data, 20% validation data, 10% test data
data_preparation(0.7, 0.2, 0.1)

# Report how many records landed in each list
print(len(train_set), len(val_set), len(test_set), sep='\n')

5961
1722
861


In [5]:
import math

In [6]:
def euclidean_distance(v, t):
    """Return the Euclidean distance between two numeric sequences.

    Elements are paired with ``zip``, so when the lengths differ the extra
    elements of the longer sequence are ignored.

    Args:
        v: First sequence of numbers.
        t: Second sequence of numbers.

    Returns:
        The square root of the summed squared element-wise differences, as a
        float (``0.0`` when there are no pairs).
    """
    squared_diffs = ((a - b) ** 2 for a, b in zip(v, t))
    total = 0
    # Accumulate left to right, one term at a time.
    for term in squared_diffs:
        total += term
    return math.sqrt(total)

In [7]:
def sort_func(distance):
    """Sort key: return the element at index 1 of ``distance``.

    Used as ``key=`` when ordering ``[row, dist]`` pairs by their distance.
    """
    key = distance[1]
    return key

In [None]:
def knn_classifier(k, train_data, test_data):
    """Classify every test row by majority vote of its k nearest training rows.

    Each row is a sequence whose last element is the class label (0 or 1) and
    whose preceding elements are the features. Distances are computed with
    ``euclidean_distance`` on the feature part only.

    Args:
        k: Number of nearest neighbours that take part in the vote.
        train_data: Sequence of labelled training rows.
        test_data: Sequence of labelled rows to classify and score.

    Returns:
        The fraction of rows in ``test_data`` whose predicted label equals
        their actual label.

    Raises:
        ZeroDivisionError: If ``test_data`` is empty.
        KeyError: If a selected neighbour's label is neither 0 nor 1.
    """
    # Number of test rows whose predicted label matches the actual label
    correct = 0

    for test in test_data:
        # Start from an empty list for every test row. Sharing one list across
        # rows would let distances measured for earlier test rows compete in
        # this row's neighbour selection.
        distance = []
        for train in train_data:
            # '[:-1]' drops the last element of each row because it is the
            # target column, not a feature
            dist = euclidean_distance(test[:-1], train[:-1])
            distance.append([train, dist])

        # Order by distance (stable sort: ties keep training order) and keep
        # the k closest training rows
        distance.sort(key=sort_func)
        select = distance[:k]

        # Vote counts per class label. Labels stored as floats (0.0 / 1.0)
        # hash and compare equal to these integer keys.
        label = {
            0: 0,
            1: 0
        }
        for s in select:
            label[s[0][-1]] += 1

        # Majority class among the k neighbours; on a tie the first key (0) wins
        predicted_class = max(label, key=label.get)
        if test[-1] == predicted_class:
            correct += 1

    # Accuracy = correct predictions divided by the number of test rows
    return correct / len(test_data)

In [None]:
# Print the accuracy on test_set for several values of k.
# NOTE(review): val_set is not used in this cell; k is compared on test_set.
for k in (1, 3, 5, 10, 15):
    print(f"{knn_classifier(k, train_set, test_set)}")