## 0. Import Library & Dataset

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

data = unpickle('/content/drive/MyDrive/data_batch_1')    
X = data[b'data']
y = data[b'labels']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

## 1. Define Model

In [155]:
## 거리척도
def calc_l1_distance(x_train, x_test):
  return np.sum(np.abs(x_train - x_test), axis=1)

def calc_l2_distance(x_train, x_test):
  return np.sqrt(np.sum(np.square(x_train - x_test), axis=1))

## NN
class NearstNeighbor():
  def __init__(self):
    pass

  def train(self, X, y):
    self.X_tr = X
    self.y_tr = y

  def cal_distance(self, x_train, x_test, use_l1):
    if use_l1:
      distance = calc_l1_distance(x_train, x_test)
    else:
      distance = calc_l2_distance(x_train, x_test)    
    return distance

  def predict(self, X, use_l1=True):
    num_test = X.shape[0]
    y_pred = np.zeros(num_test)

    for i in tqdm(range(num_test)):
      distance = self.cal_distance(self.X_tr, X[i], use_l1)
      min_index = np.argmin(distance)

      y_pred[i] = self.y_tr[min_index]

    return y_pred

## K-NN
class K_NearstNeighbor(NearstNeighbor):
  def predict(self, k, X, use_l1=True):
    num_test = X.shape[0]
    y_pred = np.zeros(num_test)

    for i in tqdm(range(num_test)):
      distance = self.cal_distance(self.X_tr, X[i], use_l1)    
      min_indices = np.argsort(distance)[:k].tolist()

      y_pred[i] = Counter(np.array(self.y_tr)[min_indices]).most_common(1)[0][0]

    return y_pred

## 2. L1, L2 distance

In [74]:
x1 = np.array([[56, 32, 10, 18, 90, 23, 128, 133, 24, 26, 178, 200, 2, 0, 255, 220]])
x2 = np.array([[10, 20, 24, 17, 8, 10, 89, 100, 12, 16, 178, 170, 4, 32, 233, 112]])

l1_dist = calc_l1_distance(x1, x2)
l2_dist = calc_l2_distance(x1, x2)

print(l1_dist, l2_dist, sep="\n")

[456]
[162.11107303]


## 3. Train model

### 3-1) NN

In [104]:
# train
nn = NearstNeighbor()
nn.train(x_train, y_train)

# pred
l1_y_pred = nn.predict(x_test[:300])
l2_y_pred = nn.predict(x_test[:300], False)

100%|██████████| 300/300 [00:11<00:00, 25.74it/s]
100%|██████████| 300/300 [00:11<00:00, 25.87it/s]


시간이 너무 오래 걸리는 관계로 300개 데이터셋에 대해서만 pred

In [105]:
# accuracy score 계산
from sklearn.metrics import accuracy_score

print(
    accuracy_score(y_test[:300], l1_y_pred),
    accuracy_score(y_test[:300], l2_y_pred),
    sep = '\n'
)

0.21333333333333335
0.20333333333333334


L1, L2 모두 20%정도의 Accuracy 보여줌

## 3-2 K-NN

In [197]:
# train
k_nn = K_NearstNeighbor()
k_nn.train(x_train, y_train)

# pred
l2_y_pred = k_nn.predict(3, x_test[:300], False)

100%|██████████| 300/300 [00:11<00:00, 25.62it/s]


In [199]:
# accuracy score 계산
from sklearn.metrics import accuracy_score

print(
    accuracy_score(y_test[:300], l2_y_pred),
    sep = '\n'
)

0.23


그리 큰 차이는 없는 것 같다.