In [1]:
import numpy as np
import h5py

In [5]:
from src.load_script import load_contest_train_dataset
from sklearn.model_selection import train_test_split

if True:
    X, y, samples = load_contest_train_dataset('datasets/contest_TRAIN.h5', 10)
    wavelengths = X.columns

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=samples)
    del X, y, samples
else:
    X_train = np.load(open('datasets/x_train.npy', 'rb'))
    y_train = np.load(open('datasets/y_train.npy', 'rb'))
    X_test = np.load(open('datasets/x_test.npy', 'rb'))
    y_test = np.load(open('datasets/y_test.npy', 'rb'))
    wavelengths = np.load(open('datasets/wavelengths.npy', 'rb'))

100%|██████████| 12/12 [01:55<00:00,  9.64s/it]


In [6]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import Normalizer

def euclid(a, b):
    return np.linalg.norm(a - b)

def cosine(a, b):
    return 1 - np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)

def direct_mutual_information(x, y, bins=100):
    c_xy, _, _ = np.histogram2d(x, y, bins)
    return - mutual_info_score(None, None, contingency=c_xy)

def mutual_information(a, b, bins=100):
    hgram, _, _ = np.histogram2d(a, b)
    pxy = hgram / float(np.sum(hgram))
    px = np.sum(pxy, axis=1)
    py = np.sum(pxy, axis=0)
    px_py = px[:, None] * py[None, :]
    nzs = pxy > 0
    return - np.sum(pxy[nzs] * np.log(pxy[nzs] / px_py[nzs]))


models = []
for metric in [euclid, cosine, mutual_information]: # cosine
    pipe = Pipeline([
        #('scaling', RobustScaler(unit_variance=True)),
        #('pca', PCA(whiten=False)),
        ('normalize', Normalizer(norm='max'))
        ('clf', KNeighborsClassifier(metric=metric, n_neighbors=5)),
    ])

    models.append(pipe.fit(X_train, y_train))

In [7]:
from sklearn.metrics import classification_report
for model, model_name in zip(models, ['euclid', 'cosine', 'mutual_information']):
    print(model_name, sep=', best params were: ')
    print(classification_report(y_test, model.predict(X_test)))
    print('=' * 80)

euclid
              precision    recall  f1-score   support

           1       0.46      0.86      0.60        29
           2       0.62      0.69      0.65        49
           3       0.70      0.47      0.56        15
           4       0.72      0.65      0.68        20
           5       0.71      0.74      0.72        23
           6       1.00      0.86      0.92        28
           7       0.73      0.47      0.57        17
           8       0.79      0.68      0.73        22
           9       0.76      0.90      0.82        49
          10       0.77      0.62      0.69        16
          11       0.83      0.69      0.75        42
          12       1.00      0.45      0.62        20

    accuracy                           0.71       330
   macro avg       0.76      0.67      0.69       330
weighted avg       0.75      0.71      0.71       330

cosine


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.57      0.86      0.68        29
           2       0.67      0.82      0.73        49
           3       0.60      0.80      0.69        15
           4       0.80      0.80      0.80        20
           5       0.64      0.91      0.75        23
           6       1.00      0.86      0.92        28
           7       0.00      0.00      0.00        17
           8       1.00      0.50      0.67        22
           9       0.85      0.90      0.87        49
          10       0.86      0.75      0.80        16
          11       0.89      0.79      0.84        42
          12       1.00      0.75      0.86        20

    accuracy                           0.77       330
   macro avg       0.74      0.73      0.72       330
weighted avg       0.76      0.77      0.75       330

mutual_information
              precision    recall  f1-score   support

           1       0.49      0.76      0.59        29
     