In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
plt.style.use("ggplot")
%matplotlib inline

In [2]:
def get_data(test_or_train, data_dir="digit"):
    """Load the ten per-digit CSV files of one split and stack them.

    For every digit d in 0..9 the file
    ``<data_dir>/digit_<test_or_train><d>.csv`` is read with no header row;
    each row of a file becomes one row of the returned sample array.

    Parameters
    ----------
    test_or_train : str
        Text inserted into the file name between ``digit_`` and the digit
        (e.g. "train" or "test").
    data_dir : str, optional
        Directory that contains the CSV files. Defaults to "digit".

    Returns
    -------
    x : numpy.ndarray
        Rows of all ten files, digit 0 first and digit 9 last.
    y : numpy.ndarray
        Float array with one entry per row of ``x`` holding that row's digit.
    """
    features = []
    labels = []
    for target_digit in range(10):
        path = data_dir + "/digit_" + test_or_train + str(target_digit) + ".csv"
        samples = np.array(pd.read_csv(path, header=None))
        features.append(samples)
        # Labels are kept as floats, one per row read from this digit's file.
        labels.append(np.full(len(samples), target_digit, dtype=float))

    # Concatenate once at the end instead of re-copying the growing array
    # on every iteration.
    return np.concatenate(features), np.concatenate(labels)

In [3]:
def calc_K(x1, x2, h):
    """Gaussian kernel value between ``x1`` and ``x2`` with bandwidth ``h``.

    Computes ``exp(-d**2 / (2 * h**2))`` where ``d`` is
    ``np.linalg.norm(x1 - x2, ord=2)`` (the Euclidean length when the
    difference is a 1-D vector).
    """
    distance = np.linalg.norm(x1 - x2, ord=2)
    denominator = 2 * (h ** 2)
    return np.exp(-(distance ** 2) / denominator)

def K_matrix(x, y, h):
    """Gaussian kernel matrix between every row of ``x`` and every row of ``y``.

    Entry ``(i, j)`` of the result is
    ``exp(-||x[i] - y[j]||**2 / (2 * h**2))``.

    Parameters
    ----------
    x, y : array-like, 2-D
        One sample per row; both must have the same number of columns.
    h : float
        Kernel bandwidth.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x), len(y))``.

    Raises
    ------
    ValueError
        If a non-empty ``x`` or ``y`` is not 2-D.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # With no rows on either side there is nothing to compute.
    if len(x) == 0 or len(y) == 0:
        return np.zeros((len(x), len(y)))
    if x.ndim != 2 or y.ndim != 2:
        raise ValueError("x and y must be 2-D arrays with one sample per row")

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once
    # so the work happens in one matrix product instead of a Python loop
    # over every (row of x, row of y) pair.
    x_sq = (x ** 2).sum(axis=1)[:, np.newaxis]
    y_sq = (y ** 2).sum(axis=1)[np.newaxis, :]
    sq_dist = x_sq + y_sq - 2.0 * (x @ y.T)

    # Round-off in the expansion can leave tiny negative values where the
    # true squared distance is zero; clip so the kernel never exceeds 1.
    np.maximum(sq_dist, 0.0, out=sq_dist)

    return np.exp(-sq_dist / (2 * (h ** 2)))

In [4]:
class GaussianKernelL2Regression:
    """L2-regularised least-squares regression on a precomputed kernel matrix.

    The coefficients are the solution of
    ``(K K + lambda * I) theta = K y`` where ``K`` is the matrix passed to
    the constructor, and predictions are ``predict_K @ theta``.
    """

    def __init__(self, K, _lambda = 0.1, h = 1):
        """Store the kernel matrix and hyper-parameters.

        Parameters
        ----------
        K : numpy.ndarray
            Square kernel matrix used by ``fit``; its size must match
            ``len(X)`` given to ``fit``.
        _lambda : float, optional
            Regularisation strength added on the diagonal (default 0.1).
        h : float, optional
            Stored on the instance only; no method of this class reads it.
        """
        self.K = K
        self._lambda = _lambda
        self.h = h

    def fit(self, X, y):
        """Compute ``self.theta`` for targets ``y``.

        Parameters
        ----------
        X : sequence
            Training samples; only ``len(X)`` is used here (to size the
            identity matrix), and the object is kept as ``self.X``.
        y : numpy.ndarray
            Target values, one per row of ``self.K``.
        """
        self.X = X
        gram = self.K
        system = np.matmul(gram, gram) + self._lambda * np.eye(len(X))
        # Solve the linear system directly instead of forming the explicit
        # inverse and multiplying: fewer operations and less round-off.
        self.theta = np.linalg.solve(system, np.matmul(gram, y))

    def predict(self, predict_K):
        """Return ``predict_K @ theta``.

        ``predict_K`` needs one column per fitted coefficient.
        """
        return np.matmul(predict_K, self.theta)

In [8]:
# Load the data
train_X, train_y = get_data("train")
test_X, test_y = get_data("test")

# Compute the kernel matrices once, up front
K = K_matrix(train_X, train_X, 1)
predict_K = K_matrix(test_X, train_X, 1)

# Training (one-vs-rest): one model per digit, with target +1 for samples
# of that digit and -1 for all others
models = []
for digit in tqdm(np.arange(10)):
    targets = np.where(train_y == digit, 1, -1)
    model = GaussianKernelL2Regression(K)
    model.fit(train_X, targets)
    models.append(model)

100%|██████████| 10/10 [02:26<00:00, 14.11s/it]


In [9]:
# Prediction: row k of the score matrix holds the output of model k
# for every test sample
n_classes = 10
ys = np.empty((n_classes, len(test_y)))
for k in tqdm(np.arange(n_classes)):
    scores = models[k].predict(predict_K)
    ys[k] = scores

# Classify each sample as the class with the highest score
y = ys.argmax(axis=0)

100%|██████████| 10/10 [00:00<00:00, 126.06it/s]


In [32]:
# Results: cross-tabulation of test_y (rows) against the predicted class y
# (columns), with "All" totals added by margins=True
result_matrix = pd.crosstab(test_y, y, margins=True)
# Accuracy comes from comparing labels directly rather than from the table's
# diagonal: crosstab only includes labels that actually occur, so if some
# class is never predicted the table is not square and its diagonal no longer
# lines up true and predicted classes.
print("Total accuracy :", np.mean(test_y == y))
result_matrix.style.background_gradient(cmap = "Reds")

Total accuracy : 0.965


col_0,0,1,2,3,4,5,6,7,8,9,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,198,0,1,1,0,0,0,0,0,0,200
1.0,0,200,0,0,0,0,0,0,0,0,200
2.0,0,0,193,1,0,0,0,2,3,1,200
3.0,0,0,0,192,0,3,0,2,2,1,200
4.0,0,1,0,0,189,1,3,0,0,6,200
5.0,2,0,3,4,0,187,0,1,1,2,200
6.0,1,0,2,0,0,2,195,0,0,0,200
7.0,0,0,0,0,3,0,0,192,2,3,200
8.0,3,0,1,4,0,3,0,0,187,2,200
9.0,0,0,0,0,2,0,0,1,0,197,200
