In [1]:
import numpy as np
import h5py

In [2]:
from src.load_script import load_contest_train_dataset
from sklearn.model_selection import train_test_split

if False:
    X, y, samples = load_contest_train_dataset('datasets/contest_TRAIN.h5', 100)
    wavelengths = X.columns

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=samples)
    del X, y, samples
else:
    X_train = np.load(open('datasets/x_train.npy', 'rb'))
    y_train = np.load(open('datasets/y_train.npy', 'rb'))
    X_test = np.load(open('datasets/x_test.npy', 'rb'))
    y_test = np.load(open('datasets/y_test.npy', 'rb'))
    wavelengths = np.load(open('datasets/wavelengths.npy', 'rb'))

In [4]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mutual_info_score

def euclid(a, b):
    return np.linalg.norm(a - b)

def cosine(a, b):
    return 1 - np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)

# TODO maybe mutual information
def direct_mutual_information(x, y, bins=100):
    c_xy, _, _ = np.histogram2d(x, y, bins)
    return mutual_info_score(None, None, contingency=c_xy)

"""
def make_dist(x):
    return x / np.sum(x)

def kl_divergence(x, y):
    x, y = make_dist(x), make_dist(y)   # normalize to a distribution
    return np.sum(np.where(y == 0 or x == 0, 0, x * np.log(x / y)))
"""

def mutual_information(a, b, bins=100):
    hgram, _, _ = np.histogram2d(a, b)
    pxy = hgram / float(np.sum(hgram))
    px = np.sum(pxy, axis=1)
    py = np.sum(pxy, axis=0)
    px_py = px[:, None] * py[None, :]
    nzs = pxy > 0
    return np.sum(pxy[nzs] * np.log(pxy[nzs] / px_py[nzs]))


models = []
best_params = []
for metric in [euclid, cosine, mutual_information, direct_mutual_information]:
    pipe = Pipeline([
        #('scaling', RobustScaler(unit_variance=True)),
        ('pca', PCA(whiten=False)),
        ('clf', KNeighborsClassifier()),
    ])

    params = {
        'clf__n_neighbors'  : [5, 10],
        'pca__n_components' : [20, 25],
    }

    gs = GridSearchCV(pipe, params, verbose=3, cv=2).fit(X_train, y_train)
    models.append(gs.best_estimator_)
    best_params.append(gs.best_params_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END clf__n_neighbors=5, pca__n_components=20;, score=0.821 total time=  15.0s
[CV 2/2] END clf__n_neighbors=5, pca__n_components=20;, score=0.818 total time=  15.2s
[CV 1/2] END clf__n_neighbors=5, pca__n_components=25;, score=0.824 total time=  15.2s
[CV 2/2] END clf__n_neighbors=5, pca__n_components=25;, score=0.821 total time=  15.5s
[CV 1/2] END clf__n_neighbors=10, pca__n_components=20;, score=0.807 total time=  15.2s
[CV 2/2] END clf__n_neighbors=10, pca__n_components=20;, score=0.811 total time=  15.2s
[CV 1/2] END clf__n_neighbors=10, pca__n_components=25;, score=0.813 total time=  16.0s
[CV 2/2] END clf__n_neighbors=10, pca__n_components=25;, score=0.816 total time=  15.4s
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 1/2] END clf__n_neighbors=5, pca__n_components=20;, score=0.821 total time=  15.3s
[CV 2/2] END clf__n_neighbors=5, pca__n_components=20;, score=0.818 total time=  15.0s
[CV 1/2]

In [None]:
from sklearn.metrics import classification_report
for model, model_name, model_params in zip(models, ['euclid', 'cosine', 'mutual_info', 'direct_mutual_information'], best_params):
    print(model_name, best_params, sep=', best params were: ')
    print(classification_report(y_test, model.predict(X_test)))

euclid
              precision    recall  f1-score   support

           1       0.40      0.50      0.44        16
           2       0.82      0.78      0.80        23
           3       0.75      0.38      0.50         8
           4       0.24      0.56      0.33         9
           5       0.88      0.58      0.70        12
           6       0.86      0.86      0.86        14
           7       0.25      0.12      0.17         8
           8       0.71      0.42      0.53        12
           9       0.69      0.88      0.77        25
          10       0.56      0.62      0.59         8
          11       0.86      0.63      0.73        19
          12       1.00      0.91      0.95        11

    accuracy                           0.65       165
   macro avg       0.67      0.60      0.61       165
weighted avg       0.70      0.65      0.66       165

cosine
              precision    recall  f1-score   support

           1       0.58      0.44      0.50        16
          