In [1]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

import numpy as np
import pandas as pd

In [2]:
# Load the pedalboard/plugin table; the `id` and `name` columns together
# become the row index, leaving only the plugin columns as data.
original_data = pd.read_csv(
    '../data/pedalboard-plugin.csv',
    sep=",",
    index_col=['id', 'name'],
)
original_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,plugin1,plugin2,plugin3,plugin4,plugin5,plugin6
id,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9467,!!*Cuda',23,27,73,109,106,61
7313,'70s*V.H**,39,30,100,60,107,107
7313,'90s*V.H**,23,99,31,42,53,60
9467,*!!*Wanted,23,27,85,49,60,107
10849,**********,107,107,107,107,107,84


In [3]:
def distance(a, b):
    """Return the number of positions at which ``a`` and ``b`` differ.

    The two sequences are walked in lockstep with ``zip``, so any surplus
    elements of the longer one are ignored.
    """
    mismatches = 0
    for left, right in zip(a, b):
        if left == right:
            continue
        mismatches += 1
    return mismatches

In [4]:
def compare(index, column, train, test):
    """Count test rows whose ``column`` value appears among their neighbours.

    Parameters
    ----------
    index : iterable of positional-index collections
        Entry ``n`` holds the positions (into ``train``) of the rows selected
        for the ``n``-th row of ``test``.
    column : label
        Column whose values are compared.
    train, test : pandas.DataFrame
        Frames addressed positionally through ``.iloc``.

    Returns
    -------
    Integer number of test rows for which the value in ``column`` is found
    in the same column of the selected ``train`` rows.
    """
    hits = [
        test.iloc[row_number][column] in train.iloc[neighbour_rows][column].values
        for row_number, neighbour_rows in enumerate(index)
    ]
    return np.array(hits, dtype=np.int32).sum()


In [5]:
def measure(data, k, n_splits=10):
    """Cross-validate how often a hidden plugin is found among the k nearest rows.

    For every fold a brute-force ``NearestNeighbors`` model using the
    ``distance`` metric is fitted on the training rows. Then, for each of the
    columns ``plugin1`` .. ``plugin6`` in turn, that column is overwritten
    with ``-1`` in a copy of the test rows, the ``k`` nearest training rows
    are looked up, and ``compare`` counts the test rows whose original value
    in that column occurs among those neighbours. The count divided by the
    number of test rows is the score for that fold and column.

    Per-column scores, per-fold averages and the overall average are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain the columns ``plugin1`` .. ``plugin6``.
        Folds are taken in the existing row order (no shuffling here).
    k : int
        Number of neighbours requested from ``NearestNeighbors``.
    n_splits : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean over folds of the mean per-column score.
    """
    # random_state is intentionally not passed: it has no effect while
    # shuffle=False, and recent scikit-learn versions raise ValueError when
    # the two are combined.
    kfolds = KFold(n_splits=n_splits, shuffle=False)
    columns = [f'plugin{i}' for i in range(1, 7)]

    print(f'k={k}')

    fold_scores = []
    for kfold, (train_index, test_index) in enumerate(kfolds.split(data)):
        train, test = data.iloc[train_index], data.iloc[test_index]

        nbrs = NearestNeighbors(n_neighbors=k, algorithm='brute', metric=distance).fit(train)

        column_scores = []
        for plugin in columns:
            # Hide the column under evaluation from the neighbour search.
            # NOTE(review): presumably -1 never occurs as a real plugin value,
            # so the masked column always counts as a mismatch - confirm.
            test_plugin = test.copy()
            test_plugin[plugin] = -1

            _, indices = nbrs.kneighbors(test_plugin)
            score = compare(indices, plugin, train, test) / len(test)
            column_scores.append(score)
            print(f'K fold={kfold}, {plugin}: {score}')

        # Average over the evaluated columns instead of a hard-coded 6.
        fold_scores.append(sum(column_scores) / len(columns))
        print('Total:', fold_scores[-1])

    # Average over the folds actually produced instead of a hard-coded 10.
    total = sum(fold_scores) / len(fold_scores)

    print()
    print('TOTAL:', total)
    return total

In [6]:
# Shuffle the rows once with a fixed seed. measure() builds its folds without
# shuffling, so the row order fixed here determines the fold membership.
data = shuffle(original_data, random_state=42)
# Evaluate with a single neighbour; the scores are only printed by measure().
measure(data, k=1)
# Evaluate with five neighbours. As the last expression of the cell, this
# call's return value is the one shown as the cell result.
measure(data, k=5)

k=1
K fold=0, plugin1: 0.4105263157894737
K fold=0, plugin2: 0.21578947368421053
K fold=0, plugin3: 0.19473684210526315
K fold=0, plugin4: 0.2578947368421053
K fold=0, plugin5: 0.3631578947368421
K fold=0, plugin6: 0.5842105263157895
Total: 0.33771929824561403
K fold=1, plugin1: 0.4
K fold=1, plugin2: 0.21578947368421053
K fold=1, plugin3: 0.21578947368421053
K fold=1, plugin4: 0.2578947368421053
K fold=1, plugin5: 0.2789473684210526
K fold=1, plugin6: 0.46842105263157896
Total: 0.306140350877193
K fold=2, plugin1: 0.4263157894736842
K fold=2, plugin2: 0.24736842105263157
K fold=2, plugin3: 0.1736842105263158
K fold=2, plugin4: 0.2631578947368421
K fold=2, plugin5: 0.3473684210526316
K fold=2, plugin6: 0.5210526315789473
Total: 0.32982456140350874
K fold=3, plugin1: 0.35789473684210527
K fold=3, plugin2: 0.19473684210526315
K fold=3, plugin3: 0.2
K fold=3, plugin4: 0.2631578947368421
K fold=3, plugin5: 0.3894736842105263
K fold=3, plugin6: 0.5368421052631579
Total: 0.32368421052631574


0.5135254803675856