In [177]:
import csv 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import f1_score


In [178]:
data = []
with open('meta-dataset.csv', 'r') as file:
    reader = csv.reader(file)
    header = next(reader)
    for row in reader:
        data.append(row[1:])
    data = np.array(data)
    data = data.astype(float)

In [179]:
# Calculate U_yi
y = data[:, 8]
u = [0] * 3
u[0] = (y == 0).sum() - 1
u[1] = (y == 1).sum() - 1
u[2] = (y == 2).sum() - 1

In [180]:
# Pre-calculate NN of all <xi, yi>
d = [[] for _ in range(226)] 
for i in range(len(data)):
    a = data[i][0:8]
    for j in range(i + 1, len(data)):
        b = data[j][0:8]
        val = np.sqrt(sum(a * b))
        d[i].append(val)
        d[j].append(val)

for i in range(len(d)):
    d[i].sort()


In [181]:
# Calculate b-values for all <xi, yi>
bvalues = [0.0] * 226

invd = lambda x : 1.0 / (1.0 + float(x))
invd_func = np.vectorize(invd)

for i in range(226):
    y_val = data[i][8].astype(np.uint8)
    pos = u[y_val]
    num = d[i][0:pos].copy()
    den = d[i].copy()
    num = invd_func(num)
    den = invd_func(den)
    bvalues[i] = np.sum(num) / np.sum(den)



In [182]:
# Allocate to bins
D = pd.read_csv("meta-dataset.csv")
x = D.drop(columns=['label', 'Unnamed: 0'])
y = D['label']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)
bin = np.zeros(8)  
for val in bvalues:
    num = 7 if val == 1.0 else int(val / 0.125)
    bin[num] += 1
bin = bin / 226
bin = np.array(bin)
bin = bin.reshape(1, -1)
bin = pd.DataFrame(bin, columns=x.columns)
print(bin)

     beta_0  beta_1    beta_2  beta_3  beta_4    beta_5  beta_6  beta_7
0  0.088496     0.0  0.256637     0.0     0.0  0.654867     0.0     0.0


In [183]:
knn = KNeighborsClassifier(n_neighbors=51, weights='distance', metric='minkowski')
knn.fit(x_train, y_train)
y_predict = knn.predict(bin)
print("The prediction of H*D is: ", y_predict[0])

The prediction of H*D is:  1


In [184]:
cv = RepeatedStratifiedKFold(n_repeats=10, n_splits=10, random_state=1)
k_prime = KNeighborsClassifier()
y_train_scores = cross_val_score(k_prime, x_train, y_train, cv=cv, n_jobs=-1)
k_prime.fit(x_train, y_train)
y_test_predict = k_prime.predict(x_test)
y_test_score = metrics.accuracy_score(y_test, y_test_predict)
print(np.mean(y_train_scores))
print(y_test_score)

0.5827573529411765
0.631578947368421


In [185]:
f1 = f1_score(y_test, y_test_predict, average='weighted')
print("F1 Score: ", (f1))

F1 Score:  0.6398475273845035
