In [None]:
# Mount Google Drive at /content/gdrive so the Drive paths used below are reachable
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 27.5 MB/s eta 0:00:01[K     |▋                               | 20 kB 29.7 MB/s eta 0:00:01[K     |▉                               | 30 kB 35.8 MB/s eta 0:00:01[K     |█▏                              | 40 kB 26.0 MB/s eta 0:00:01[K     |█▍                              | 51 kB 20.1 MB/s eta 0:00:01[K     |█▊                              | 61 kB 14.8 MB/s eta 0:00:01[K     |██                              | 71 kB 13.3 MB/s eta 0:00:01[K     |██▎                             | 81 kB 14.5 MB/s eta 0:00:01[K     |██▌                             | 92 kB 15.9 MB/s eta 0:00:01[K     |██▉                             | 102 kB 16.6 MB/s eta 0:00:01[K     |███                             | 112 kB 16.6 MB/s eta 0:00:01[K     |███▍                            | 122 kB 16.6 MB/s eta 0:00:01[K     |███▋                         

In [None]:
import re
import os 
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

# Work from the L2 data folder: the relative 'Listmodes/...' paths used below resolve against it
os.chdir('/content/gdrive/My Drive/data/SSLAMM/L2')

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Names of the particle classes handled in this notebook
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]



In [None]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt.

    Builds the classifier named by ``params['classif']`` from the remaining
    entries of ``params``, fits it on the notebook-level ``X_train`` /
    ``y_train`` and scores it on ``X_valid`` / ``y_valid``.

    Parameters
    ----------
    params : dict
        One point of the hyperopt search space. Must contain the key
        'classif' ('knn', 'svm', 'lgbm' or 'lda'). The other entries are
        forwarded to the classifier constructor, except 'class_weight' for
        'lda', which is only used to weight the validation score.
        The dict passed in is not modified.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}``. The loss is minus the
        sample-weighted validation accuracy for 'svm' and 'lda', and the
        validation log-loss for 'knn' and 'lgbm'.

    Raises
    ------
    KeyError
        If 'classif' is missing from ``params``.
    ValueError
        If 'classif' is not one of the supported classifier names.
    '''
    # Work on a copy so that the caller's dict is left untouched
    params = dict(params)
    classif = params.pop('classif')

    if classif == 'knn':
        # Imported locally: the notebook's import cell does not provide this name
        from sklearn.neighbors import KNeighborsClassifier
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        # Imported locally: the notebook's import cell does not provide this name
        from sklearn import svm
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        # NOTE(review): LGBMClassifier is not imported in the visible notebook;
        # this branch needs `from lightgbm import LGBMClassifier` to run.
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
    elif classif == 'lda':
        params['n_components'] = int(params['n_components'])
        # LDA has no class_weight argument: keep it for the scoring step only
        fit_params = {key: value for key, value in params.items() if key != 'class_weight'}
        clf = LDA(**fit_params)
    else:
        # Fail early with a clear message instead of an UnboundLocalError on `clf`
        raise ValueError("Unknown classifier '%s': expected 'knn', 'svm', 'lgbm' or 'lda'" % classif)

    clf.fit(X_train, y_train)

    if classif in ('svm', 'lda'):
        pred_valid = clf.predict(X_valid)
        # A missing 'class_weight' means uniform sample weights
        w = compute_sample_weight(params.get('class_weight'), y_valid)
        # hyperopt minimises the loss, hence the sign flip on the accuracy
        ll = -accuracy_score(y_valid, pred_valid, sample_weight = w)
    else:
        ll = log_loss(y_valid, clf.predict_proba(X_valid)) # No sample weight in KNN

    return {'loss': ll, 'status': STATUS_OK}

In [None]:
# Import Pulse data
# The archives are opened in `with` blocks so that the underlying files are
# closed once the arrays have been read into memory.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# use it on trusted files (the test set is loaded without it further down).
with np.load('Listmodes/train.npz', allow_pickle = True) as train:
    X_train = train['X']
    y_train = train['y']

with np.load('Listmodes/valid.npz', allow_pickle = True) as valid:
    X_valid = valid['X']
    y_valid = valid['y']

In [None]:
# Load nomenclature: class names and the ids used to encode them
tn = pd.read_csv('Listmodes/train_test_nomenclature.csv').set_axis(['name', 'id'], axis=1)

In [None]:
# Collapse the 2-D label arrays to one class index per observation
# (presumably one-hot encoded rows - TODO confirm).
# The ndim guard keeps the cell re-runnable: once the labels are 1-D,
# argmax(1) would raise.
if y_train.ndim > 1:
    y_train = y_train.argmax(1)
if y_valid.ndim > 1:
    y_valid = y_valid.argmax(1)

In [None]:
# Flag the observations holding at least one NaN feature
nan_train = np.isnan(X_train).any(axis=1)
nan_valid = np.isnan(X_valid).any(axis=1)

# Keep the fully observed rows only, for features and labels alike
X_train, y_train = X_train[~nan_train], y_train[~nan_train]
X_valid, y_valid = X_valid[~nan_valid], y_valid[~nan_valid]

In [None]:
# Load the test set; the archive is closed as soon as both arrays are read
with np.load('/content/gdrive/My Drive/data/SSLAMM/L2/Listmodes/test.npz') as test:
    X_test = test['X']
    y_test = test['y']
# 2-D labels -> one class index per observation
y_test = y_test.argmax(1)

In [None]:
# Flag the test observations holding at least one NaN feature
nan_test = np.isnan(X_test).any(axis=1)

# Keep the fully observed rows only
X_test, y_test = X_test[~nan_test], y_test[~nan_test]

In [None]:
# First draft of the LDA search ranges.
n_classes = len(tn)
# n_features was used below without ever being defined (NameError on a fresh
# kernel); presumably it is the number of columns of X_train - TODO confirm.
n_features = X_train.shape[1]
solver = ('svd', 'lsqr', 'eigen')
shrinkage = (0, 1)
n_components = (2, n_features -1)
tolfloat = (1.0e-5, 1.0e-2)

In [None]:
#************************************
# LDA search space ('lsqr' / 'eigen' solvers, with shrinkage)
#************************************
algo = tpe.suggest
nb_evals = 30

n_classes = len(tn)

# Classes present in the validation labels and their counts
class_names, nb_samples = np.unique(y_valid, return_counts=True)
# Weights inversely proportional to the class counts
reweighted = dict(zip(class_names, 1 / nb_samples))
# Same weight for every class
equal_weights = dict(zip(class_names, np.full(class_names.shape, 1 / class_names.size)))
class_weight = reweighted
# Class frequencies in the validation set
priors = nb_samples / nb_samples.sum()

# Ranges explored by hyperopt
solver = ('lsqr', 'eigen')
shrinkage = (0, 1)
n_components = (1, n_classes - 1, 1)  # (low, high, step) for hp.quniform
tol = (1.0e-5, 1.0e-2)

lda_params = dict(
    classif='lda',
    solver=hp.choice('solver', solver),
    shrinkage=hp.uniform('shrinkage', *shrinkage),
    n_components=hp.quniform('n_components', *n_components),
    tol=hp.uniform('tol', *tol),
    class_weight=equal_weights,
    priors=tuple(priors),
)

In [None]:
#************************************
# LDA search space ('svd' solver only, no shrinkage)
#************************************
# Relies on `n_classes` and `equal_weights` defined by an earlier cell.
algo = tpe.suggest
nb_evals = 30

# Classes present in the validation labels and their counts
class_names, nb_samples = np.unique(y_valid, return_counts=True)
reweighted = dict(zip(class_names, 1 / nb_samples))

# Class frequencies in the validation set
priors = nb_samples / nb_samples.sum()

solver = 'svd'
n_components = (1, n_classes - 1, 1)  # (low, high, step) for hp.quniform
tol = (1.0e-5, 1.0e-1)

lda_params = dict(
    classif='lda',
    solver=solver,
    n_components=hp.quniform('n_components', *n_components),
    tol=hp.uniform('tol', *tol),
    class_weight=equal_weights,
    priors=tuple(priors),
)

In [None]:
# Run the hyperopt search: minimise the loss returned by clf_eval over lda_params
lda_best = fmin(clf_eval, lda_params, algo=algo, max_evals=nb_evals)

100%|██████████| 30/30 [00:19<00:00,  1.53it/s, best loss: -0.9483784913698712]


In [None]:
# Switch to the models folder; relative output paths used afterwards resolve against it
os.chdir('/content/gdrive/My Drive/Models/SSLAMM/')

In [None]:
# Persist the best hyperparameters returned by the search
with open('lda_best.pickle', 'wb') as out_file:
    pickle.dump(lda_best, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# LDA refitted on the training set with the hyperparameters kept in lda_best.
# NOTE(review): during the search clf_eval also passed `priors` to LDA; this
# refit leaves priors at its default - confirm this is intended.
lda = LDA(
    solver='svd',
    tol=lda_best['tol'],
    n_components=int(lda_best['n_components']),
)

lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=4, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False,
                           tol=0.04083568418821131)

In [None]:
# LDA with default hyperparameters and class priors set to the class
# frequencies held in nb_samples.
# NOTE(review): this rebinds `lda`, so the tuned model fitted in the previous
# cell is no longer the one evaluated below - confirm this is intended.
lda = LDA(priors=nb_samples / nb_samples.sum(), store_covariance=True)
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None,
                           priors=array([0.00266768, 0.00320519, 0.35503972, 0.04656487, 0.16346479,
       0.02120205, 0.26746431, 0.14039139]),
                           shrinkage=None, solver='svd', store_covariance=True,
                           tol=0.0001)

In [None]:
# Predict the class of every test observation with the most recently fitted `lda`
preds = lda.predict(X_test)

In [None]:
print("Evaluation of best performing model:")
# Score only the classes present in y_test. np.unique returns them sorted,
# which fixes the order of the per-class scores (set order is not guaranteed).
labels = np.unique(y_test)
# Variable name kept for compatibility; the values are per-class precisions.
class_accuracy = precision_score(y_test, preds, average = None, labels = labels)

# Name each score through its class id. Zipping with the full tn['name']
# column shifts the names as soon as one class is absent from y_test.
id_to_name = dict(zip(tn['id'], tn['name']))
present_names = [id_to_name.get(label, label) for label in labels]

# The figures come from precision_score, so they are reported as precision
print('Micro precision: ', precision_score(y_test, preds,
                                average = 'micro', labels = labels))
print('Classes precision: ', dict(zip(present_names, class_accuracy)))
print('Macro precision: ', precision_score(y_test, preds,
                                average = 'macro', labels = labels))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Confusion matrix over every class of the nomenclature (rows: true, columns: predicted)
print(pd.DataFrame(confusion_matrix(y_test, preds,
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.9593910815408134
Classes accuracy:  {'MICRO': 0.5909090909090909, 'ORGNANO': 0.6518987341772152, 'ORGPICOPRO': 0.9827736405788956, 'REDNANO': 0.9504643962848297, 'REDPICOEUK': 0.6242138364779874, 'REDPICOPRO': 0.8279773156899811, 'inf1microm': 0.9953703703703703}
Macro accuracy:  0.8033724834983386


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  REDPICOPRO  \
name                                                                      
MICRO          39        0           0        0           0           0   
ORGNANO         0      618           0        6           0           0   
ORGPICOPRO      0        0      104916        0           0         102   
REDNANO         0      141           3     4605         219           0   
REDPICOEUK      0        0           9       78        4764          39   
REDPICOPRO      0        0           0        0           0           0   
inf1microm      0        0          78    

# Evaluation on the validation set

In [None]:
# Predict the class of every validation observation with the current `lda`
preds = lda.predict(X_valid)
print("Evaluation of best performing model:")
# Score only the classes present in y_valid. np.unique returns them sorted,
# which fixes the order of the per-class scores (set order is not guaranteed).
labels = np.unique(y_valid)
# Variable name kept for compatibility; the values are per-class precisions.
class_accuracy = precision_score(y_valid, preds, average = None, labels = labels)

# Name each score through its class id. Zipping with the full tn['name']
# column shifts the names as soon as one class is absent from y_valid.
id_to_name = dict(zip(tn['id'], tn['name']))
present_names = [id_to_name.get(label, label) for label in labels]

# The figures come from precision_score, so they are reported as precision
print('Micro precision: ', precision_score(y_valid, preds,
                                average = 'micro', labels = labels))
print('Classes precision: ', dict(zip(present_names, class_accuracy)))
print('Macro precision: ', precision_score(y_valid, preds,
                                average = 'macro', labels = labels))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Confusion matrix over every class of the nomenclature (rows: true, columns: predicted)
print(pd.DataFrame(confusion_matrix(y_valid, preds,
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.9056683228556819
Classes accuracy:  {'MICRO': 0.8149882903981265, 'ORGNANO': 0.83492296404989, 'ORGPICOPRO': 0.6984195948514637, 'REDNANO': 0.8717654264278059, 'REDPICOEUK': 0.948434130814252, 'inf1microm': 0.3978095051828672, 'sup1microm': 0.959341222164129}
Macro accuracy:  0.8143265539498072


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  inf1microm  \
name                                                                      
MICRO         696       22           0       51           2           0   
ORGNANO        10     1138           0       90          24           0   
ORGPICOPRO      0        0       38580        0         126         289   
REDNANO       147      139           1    11387        1713           0   
REDPICOEUK      0        1        3420     1494       92368          22   
inf1microm      0        0         776        0           3        2034   
sup1microm      0        0       10888        