In [2]:
import os
import sys
import copy
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2
sys.path.append("./../../src")

from load_data import gen_data

In [3]:
# Load the raw table and name the 111 candidate feature columns (col_0 .. col_110).
data = pd.read_csv("./../../data/fraud_detection_bank_dataset.csv")
col_names = [f"col_{i}" for i in range(111)]

# Keep only low-cardinality columns: fewer than 10 distinct non-null values.
col_names_naive = [name for name in col_names if data[name].nunique() < 10]

# Name of the label column.
target = "targets"

In [4]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(data, train_size=0.8, random_state=123)

# Features are restricted to the low-cardinality columns; labels come from `target`.
X_train = train_data[col_names_naive].values
y_train = train_data[target].values
X_test = test_data[col_names_naive].values
y_test = test_data[target].values

In [15]:
def accuracy(y, y_hat):
    """Return the fraction of positions where `y` and `y_hat` agree.

    Parameters
    ----------
    y : array-like
        Reference labels.
    y_hat : array-like
        Predicted labels; must have the same shape as `y`.

    Returns
    -------
    float
        Number of matching entries divided by ``len(y)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so plain lists work too (list == list is a single bool).
    y_true = np.asarray(y)
    y_pred = np.asarray(y_hat)

    # Without this check, mismatched shapes would silently broadcast.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y has shape {y_true.shape}, y_hat has shape {y_pred.shape}"
        )

    return (y_true == y_pred).sum() / len(y_true)

## KNN (original brute-force implementation)

In [10]:
class KNN():
    """Brute-force k-nearest-neighbours classifier.

    `fit` stores the training data; `predict` classifies ONE query point by
    majority vote among the `top_k` training points closest to it.
    """

    @staticmethod
    def distance(p1, p2):
        """Return the Euclidean distance between `p1` and `p2`.

        Returns 0 when either point is None.
        """
        if p1 is None or p2 is None:
            return 0
        return ((p2 - p1) ** 2).sum() ** 0.5

    def __init__(self, top_k = 3):
        """Remember how many neighbours take part in the vote."""
        self.top_k = top_k

    def fit(self, X_train, y_train):
        """Store the training points and their labels on the instance."""
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict the label of a single query point.

        Parameters
        ----------
        X_test : array-like
            One query point, compatible with the rows passed to `fit`.

        Returns
        -------
        The most frequent label among the `top_k` nearest training points.
        On a tie the smallest label wins.
        """
        # Use the data stored by fit(), not whatever module-level X_train /
        # y_train happen to be bound to when predict() is called.
        scored = [
            (self.distance(X_test, point), label)
            for point, label in zip(self.X_train, self.y_train)
        ]

        # sorted() is stable, so equally distant points keep training order.
        nearest = sorted(scored, key=lambda pair: pair[0])[0:self.top_k]

        # Majority vote via np.unique: labels come back sorted and argmax picks
        # the first maximum, so ties resolve to the smallest label. This avoids
        # stats.mode(...)[0][0], which breaks on SciPy versions where the mode
        # of 1-D input is returned as a scalar.
        labels, counts = np.unique([label for _, label in nearest], return_counts=True)
        return labels[np.argmax(counts)]
        
        

In [11]:
# Three-neighbour classifier trained on the current training split.
knn = KNN(top_k=3)
knn.fit(X_train, y_train)

In [14]:
# Classify each row of X_test separately: knn.predict handles one point per call.
y_test_hat = np.apply_along_axis(knn.predict, 1, X_test)

In [23]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Confusion matrix first, then the scalar metrics in a fixed order.
print("Test confusion matrix: \n", confusion_matrix(y_test, y_test_hat))

for label, metric in (
    ("\nTest precision: ", precision_score),
    ("Test recall: ", recall_score),
    ("Test F1: ", f1_score),
    ("Test accuracy: ", accuracy_score),
):
    print(label, metric(y_test, y_test_hat))

Test confusion matrix: 
 [[2751  296]
 [ 354  693]]

Test precision:  0.7007077856420627
Test recall:  0.66189111747851
Test F1:  0.6807465618860511
Test accuracy:  0.8412310698583293


## KNN (Kd Tree)

### kd tree

In [30]:
# NOTE: this rebinds X_train, replacing the feature matrix built in the
# train/test split cell; every cell that reads X_train afterwards sees this array.
# gen_data comes from the project-local load_data module; the meaning of its
# positional arguments is not visible here — TODO confirm against its definition.
X_train = gen_data(0, 5, 1000, 5, seed=123)

# Display the generated array.
X_train

array([[3.48234593, 1.43069667, 1.13425727, 2.75657385, 3.59734485],
       [2.1155323 , 4.90382099, 3.42414869, 2.40465951, 1.96058759],
       [1.71589008, 3.64524854, 2.19286122, 0.29838948, 1.99022128],
       ...,
       [4.62832648, 4.84491934, 3.31649527, 0.11676207, 4.03240146],
       [2.47166819, 4.86670763, 1.27713913, 0.39741698, 4.21432237],
       [2.50824211, 1.78702978, 2.15812133, 4.71354896, 0.21996535]])

In [34]:
class Node():
    """One node of a binary tree, with links to its parent and children.

    All attributes start as None and are filled in by whoever builds the tree.
    """

    def __init__(self):
        self.father = None   # parent node; stays None for the root
        self.left = None     # left child, if any
        self.right = None    # right child, if any
        self.feature = None  # index of the feature this node splits on
        self.split = None    # value of that feature at the split

    def __str__(self):
        """Return the node's split description, so str()/print() work."""
        return f"Feature:{self.feature}, Split_value:{self.split}"

    def _str_(self):
        """Print the node's split description.

        Kept for backward compatibility: this was the original (misspelled)
        hook, which printed instead of returning a string.
        """
        print(self)

    @property
    def brother(self):
        """Return the other child of this node's parent.

        Returns None for a node without a parent, or when the parent has no
        other child.
        """
        if self.father is None:
            return None
        if self.father.left is self:
            return self.father.right
        return self.father.left
    
    
class KDTree():
    """k-d tree that recursively splits points on the highest-variance feature.

    Each node records the index of the feature it splits on and the value of
    that feature at the (lower) median point. Points sorted before the median
    go to the left subtree, points after it to the right subtree.
    """

    def __init__(self):
        # Create a root INSTANCE. Storing the Node class itself would make
        # every tree share one root and write attributes onto the class.
        self.root = Node()

    @staticmethod
    def distance(p1, p2):
        """Return the Euclidean distance between `p1` and `p2`.

        Returns 0 when either point is None.
        """
        if p1 is None or p2 is None:
            return 0
        return ((p2 - p1) ** 2).sum() ** 0.5

    @staticmethod
    def _split_feat(X):
        """Split the rows of `X` around the median of its highest-variance column.

        Parameters
        ----------
        X : numpy.ndarray
            2-D array with one point per row.

        Returns
        -------
        tuple
            ``(feat_idx, split, X_left, X_right)``: the chosen column, the
            value of that column at the median row, the rows sorted before
            the median row, and the rows sorted after it. The median row
            itself is in neither half.
        """
        feat_idx = np.argmax(X.var(axis=0))

        # Lower median: n/2 - 1 for even n, (n - 1)/2 for odd n.
        mid = (len(X) - 1) // 2

        X_sort = X[X[:, feat_idx].argsort()]
        split = X_sort[:, feat_idx][mid]
        X_left = X_sort[:mid]
        X_right = X_sort[mid + 1:]

        return feat_idx, split, X_left, X_right

    def show(self):
        """Print every node's feature index and split value in breadth-first order."""
        queue = [self.root]

        while queue:
            nd = queue.pop(0)
            feature, split = nd.feature, nd.split
            print(f"Feature:{feature}, Split_value:{split}")
            if nd.left is not None:
                queue.append(nd.left)
            if nd.right is not None:
                queue.append(nd.right)

    def build_tree(self, X):
        """Build the tree from the rows of `X`, level by level.

        Parameters
        ----------
        X : numpy.ndarray
            2-D array with one point per row.
        """
        # Start from a fresh root so a rebuild cannot keep stale children.
        self.root = Node()
        queue = [(self.root, X)]

        while queue:
            nd, X = queue.pop(0)
            nd.feature, nd.split, X_left, X_right = self._split_feat(X)

            # Only non-empty halves get a child node.
            if len(X_left) != 0:
                nd.left = Node()
                nd.left.father = nd
                queue.append((nd.left, X_left))

            if len(X_right) != 0:
                nd.right = Node()
                nd.right.father = nd
                queue.append((nd.right, X_right))
            

In [35]:
# Create an empty k-d tree.
kd_tree = KDTree()

# Build the tree from whatever X_train is currently bound to.
kd_tree.build_tree(X_train)

# Print each node's split feature and split value in breadth-first order.
kd_tree.show()

Feature:1, Split_value:2.4816581308469567
Feature:2, Split_value:2.5720730716998306
Feature:0, Split_value:2.567294400801938
Feature:3, Split_value:2.605972272887657
Feature:4, Split_value:2.441251038287665
Feature:4, Split_value:2.4389334104516824
Feature:2, Split_value:2.481563667479282
Feature:0, Split_value:2.4442474036023243
Feature:0, Split_value:2.366249519351621
Feature:3, Split_value:2.7214898675277803
Feature:0, Split_value:2.458166582574732
Feature:3, Split_value:2.59581396223125
Feature:3, Split_value:2.188460543343492
Feature:4, Split_value:2.385107573825103
Feature:3, Split_value:2.601452106567488
Feature:4, Split_value:2.6285577063507395
Feature:4, Split_value:2.638086989956188
Feature:4, Split_value:2.4645686380928673
Feature:4, Split_value:1.7767985997021336
Feature:0, Split_value:2.2055380220045095
Feature:0, Split_value:2.5470086367511193
Feature:3, Split_value:2.667948001478832
Feature:3, Split_value:2.082183062002296
Feature:2, Split_value:2.3144327927482635
Featur

In [None]:
class KNN():
    """Brute-force k-nearest-neighbours classifier.

    NOTE: this duplicates the KNN class defined earlier in this notebook;
    when the cell is executed, this definition shadows the earlier one.

    `fit` stores the training data; `predict` classifies ONE query point by
    majority vote among the `top_k` training points closest to it.
    """

    @staticmethod
    def distance(p1, p2):
        """Return the Euclidean distance between `p1` and `p2`.

        Returns 0 when either point is None.
        """
        if p1 is None or p2 is None:
            return 0
        return ((p2 - p1) ** 2).sum() ** 0.5

    def __init__(self, top_k = 3):
        """Remember how many neighbours take part in the vote."""
        self.top_k = top_k

    def fit(self, X_train, y_train):
        """Store the training points and their labels on the instance."""
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict the label of a single query point.

        Parameters
        ----------
        X_test : array-like
            One query point, compatible with the rows passed to `fit`.

        Returns
        -------
        The most frequent label among the `top_k` nearest training points.
        On a tie the smallest label wins.
        """
        # Use the data stored by fit(), not whatever module-level X_train /
        # y_train happen to be bound to when predict() is called.
        scored = [
            (self.distance(X_test, point), label)
            for point, label in zip(self.X_train, self.y_train)
        ]

        # sorted() is stable, so equally distant points keep training order.
        nearest = sorted(scored, key=lambda pair: pair[0])[0:self.top_k]

        # Majority vote via np.unique: labels come back sorted and argmax picks
        # the first maximum, so ties resolve to the smallest label. This avoids
        # stats.mode(...)[0][0], which breaks on SciPy versions where the mode
        # of 1-D input is returned as a scalar.
        labels, counts = np.unique([label for _, label in nearest], return_counts=True)
        return labels[np.argmax(counts)]
        
        

In [24]:
# Display X_train as currently bound in the kernel.
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [32]:
def in_traversal(tree):
    """Print every node's `root` value in in-order: left subtree, node, right subtree.

    Parameters
    ----------
    tree : object or None
        A node exposing `root`, `leftchild` and `rightchild`; None is an
        empty subtree and prints nothing.
    """
    # Identity test: `!= None` would call a user-defined __eq__/__ne__ on the node.
    if tree is None:
        return
    in_traversal(tree.leftchild)
    print(tree.root)
    in_traversal(tree.rightchild)
        
def post_traversal(tree):
    """Print every node's `root` value in post-order: left subtree, right subtree, node.

    Parameters
    ----------
    tree : object or None
        A node exposing `root`, `leftchild` and `rightchild`; None is an
        empty subtree and prints nothing.
    """
    # Identity test: `!= None` would call a user-defined __eq__/__ne__ on the node.
    if tree is None:
        return
    post_traversal(tree.leftchild)
    post_traversal(tree.rightchild)
    print(tree.root)

class Tree():
    """Binary tree node holding a payload (`root`) and two optional subtrees."""

    def __init__(self, root, leftchild=None, rightchild=None):
        """Store the payload and the left/right subtrees (None means no subtree)."""
        self.root, self.leftchild, self.rightchild = root, leftchild, rightchild

In [35]:
def create_kdtree(X_train, y_train):
    """Placeholder for a k-d tree builder; not implemented yet.

    Both arguments are ignored and None is returned.
    """
    return None

In [59]:
# Scratch exploration of one median split along a single column.
print(X_train.shape)

# Shape of one column (value is discarded: not the last expression of the cell).
X_train[:, 1].shape

# Column used for the split.
axis = 5

# Number of columns in X_train.
kmax = X_train.shape[1]

# Rows whose value in the chosen column is strictly above that column's median.
median_val = np.median(X_train[:, axis])
mask = X_train[:, axis] > median_val
mask

# Sizes of the two halves of the features (value is discarded).
X_train[mask, :].shape, X_train[~mask, :].shape

# Sizes of the two halves of the labels (displayed as the cell output).
y_train[mask].shape, y_train[~mask].shape

(16374, 62)


((635,), (15739,))