In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.5 MB/s 
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 40.9 MB/s 
Collecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 28.2 MB/s 
[?25hCollecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 4.3 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348183 sha256=e1228bbc7498ee24c5d84df5d80ea5f76fdb4a4f1668eb9504318417f52d9721
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successfu

In [3]:
import re
import os 
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

os.chdir('/content/gdrive/My Drive/data/SWINGS/L2')

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

cluster_classes = ['MICRO', 'ORGNANO', 'ORGPICOPRO', 'REDNANO', 'REDPICOEUK',\
                   'REDPICOPRO', 'inf1microm','sup1microm']



In [4]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt '''
    classif = params['classif']
    del params['classif']
    
    if classif == 'knn':
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
    elif classif == 'lda':
        params['n_components'] = int(params['n_components'])
        fit_params = {key: value for key, value in params.items() if key != 'class_weight'}
        clf = LDA(**fit_params)
        
    clf.fit(X_train, y_train)

    if classif in ['svm', 'lda']:
        pred_valid = clf.predict(X_valid)
        w = compute_sample_weight(params['class_weight'], y_valid)
        accuracy = accuracy_score(y_valid, pred_valid, sample_weight = w)
        ll = -accuracy_score(y_valid, pred_valid, sample_weight = w)
    else:
        ll = log_loss(y_valid, clf.predict_proba(X_valid)) # No sample weight in KNN

    
    return {'loss': ll, 'status': STATUS_OK}

In [5]:
# Import Pulse data 
train = np.load('Listmodes/train.npz', allow_pickle = True)
X_train = train['X']
y_train = train['y']

valid = np.load('Listmodes/valid.npz', allow_pickle = True)
X_valid = valid['X']
y_valid = valid['y']

In [6]:
# Load nomenclature
tn = pd.read_csv('Listmodes/train_test_nomenclature.csv')
tn.columns = ['name', 'id']

In [7]:
y_train = y_train.argmax(1)
y_valid = y_valid.argmax(1)

In [8]:
# Fetch the NaN indices
nan_train = np.isnan(X_train).any(1)
nan_valid = np.isnan(X_valid).any(1)

# Delete NaNs observations
X_train = X_train[~nan_train]
y_train = y_train[~nan_train]

X_valid = X_valid[~nan_valid]
y_valid = y_valid[~nan_valid]

In [9]:
test = np.load('/content/gdrive/My Drive/data/SWINGS/L2/Listmodes/test.npz')
X_test = test['X']
y_test = test['y']
y_test = y_test.argmax(1)

In [10]:
# Fetch the NaN indices
nan_test = np.isnan(X_test).any(1)

X_test = X_test[~nan_test]
y_test = y_test[~nan_test]

In [None]:
n_classes = len(tn)
solver = ('svd', 'lsqr', 'eigen')
shrinkage = (0, 1)
n_components = (2, n_features -1)
tolfloat = (1.0e-5, 1.0e-2)

In [13]:
#************************************
# Looking for the best hyperparams 
#************************************
algo=tpe.suggest
nb_evals = 30

n_classes = len(tn)
class_names, nb_samples  = np.unique(y_valid, return_counts = True)
reweighted = dict(zip(class_names, 1/ nb_samples))
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))
class_weight = (reweighted)
priors = nb_samples/ nb_samples.sum()

solver = ('lsqr', 'eigen')
shrinkage = (0,1)
n_components = (1, n_classes -1, 1)
tol = (1.0e-5, 1.0e-2)


lda_params = {
    'classif': 'lda',
    'solver': hp.choice('solver', solver), # large num_leaves helps improve accuracy but might lead to over-fitting
    'shrinkage': hp.uniform('shrinkage', *shrinkage),
    'n_components': hp.quniform('n_components', *n_components),
    'tol': hp.uniform('tol', *tol),
    'class_weight': equal_weights, 
    'priors': tuple(priors)}

In [15]:
#************************************
# Looking for the best hyperparams 
#************************************
algo=tpe.suggest
nb_evals = 30

class_names, nb_samples  = np.unique(y_valid, return_counts = True)
reweighted = dict(zip(class_names, 1/ nb_samples))

priors = nb_samples/ nb_samples.sum()

solver = ('svd')
n_components = (1, n_classes -1, 1)
tol = (1.0e-5, 1.0e-1)


lda_params = {
    'classif': 'lda',
    'solver': 'svd', # large num_leaves helps improve accuracy but might lead to over-fitting
    'n_components': hp.quniform('n_components', *n_components),
    'tol': hp.uniform('tol', *tol),
    'class_weight': equal_weights, 
    'priors': tuple(priors)}

In [16]:
lda_best = fmin(
    fn=clf_eval, 
    space=lda_params,
    algo=algo,
    max_evals = nb_evals)

100%|██████████| 30/30 [00:34<00:00,  1.16s/it, best loss: -0.9056683228556819]


In [17]:
os.chdir('/content/gdrive/My Drive/Models/SWINGS/')

In [18]:
# Store the best specification
with open('lda_best.pickle', 'wb') as handle:
    pickle.dump(lda_best, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# LDA
lda = LDA(
    solver = 'svd',
    n_components = int(lda_best['n_components']),
    tol = lda_best['tol'])

lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=1, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False,
                           tol=0.03596860877191493)

In [20]:
lda = LDA(store_covariance = True, priors = nb_samples/ nb_samples.sum())
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None,
                           priors=array([0.00211649, 0.00345639, 0.11631971, 0.03661755, 0.27097695,
       0.01050317, 0.53551965, 0.02449009]),
                           shrinkage=None, solver='svd', store_covariance=True,
                           tol=0.0001)

In [21]:
preds = lda.predict(X_test)

In [22]:
print("Evaluation of best performing model:")
class_accuracy = precision_score(y_test, preds, average = None, labels = list(set(y_test)))

print('Micro accuracy: ', precision_score(y_test, preds,\
                                average = 'micro', labels = list(set(y_test))))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_test, preds,\
                                average = 'macro', labels = list(set(y_test))))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None) 
print(pd.DataFrame(confusion_matrix(y_test, preds,\
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.8897740291813963
Classes accuracy:  {'MICRO': 0.7311827956989247, 'ORGNANO': 0.509090909090909, 'ORGPICOPRO': 0.5428938712238648, 'REDNANO': 0.9219324908014718, 'REDPICOEUK': 0.9611028260842772, 'inf1microm': 0.5992698680146027, 'sup1microm': 0.9279406742450115}
Macro accuracy:  0.7734255527048477


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  inf1microm  \
name                                                                      
MICRO         204        6           1        7           0           0   
ORGNANO         1       84           2       25           6           0   
ORGPICOPRO      0        0       14359        0         189          87   
REDNANO        73       13           0     5763         782           0   
REDPICOEUK      0        0        2997      442       77981          11   
inf1microm      0        0        1891        0          30        2134   
sup1microm      0        0        5680     

# Valid

In [23]:
preds = lda.predict(X_valid)
print("Evaluation of best performing model:")
class_accuracy = precision_score(y_valid, preds, average = None, labels = list(set(y_valid)))

print('Micro accuracy: ', precision_score(y_valid, preds,\
                                average = 'micro', labels = list(set(y_valid))))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_valid, preds,\
                                average = 'macro', labels = list(set(y_valid))))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None) 
print(pd.DataFrame(confusion_matrix(y_valid, preds,\
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.9056683228556819
Classes accuracy:  {'MICRO': 0.8149882903981265, 'ORGNANO': 0.83492296404989, 'ORGPICOPRO': 0.6984195948514637, 'REDNANO': 0.8717654264278059, 'REDPICOEUK': 0.948434130814252, 'inf1microm': 0.3978095051828672, 'sup1microm': 0.959341222164129}
Macro accuracy:  0.8143265539498072


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  inf1microm  \
name                                                                      
MICRO         696       22           0       51           2           0   
ORGNANO        10     1138           0       90          24           0   
ORGPICOPRO      0        0       38580        0         126         289   
REDNANO       147      139           1    11387        1713           0   
REDPICOEUK      0        1        3420     1494       92368          22   
inf1microm      0        0         776        0           3        2034   
sup1microm      0        0       10888        