In [None]:
# Colab only: expose Google Drive under /content/gdrive so that the data and
# model folders used by the following cells can be reached.
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os 
# Make the data folder of the mounted Drive the current working directory.
os.chdir('/content/gdrive/My Drive/data')

In [None]:
 #!git clone https://github.com/RobeeF/phyto_curves_reco

In [None]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas
!pip install tensorflow_addons --upgrade

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 66.4 MB/s 
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 34.6 MB/s 
Collecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 6.2 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348187 sha256=5e4eaa3ad81fe77c8ad5b1b336ed6404847463591096e10cf2e857aba2646c14
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successfu

In [None]:
import re
import os 
import numpy as np
import pandas as pd
import pickle
import fastparquet as fp

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow_addons.optimizers import RectifiedAdam, Lookahead
from tensorflow.keras.models import load_model, model_from_json


from sklearn import svm
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

############################################################
# Training of other algorithms on the unbiased dataset
############################################################
# Move to the phyto_curves_reco folder of the mounted Drive.
os.chdir('/content/gdrive/My Drive/phyto_curves_reco')

# Class names; their order is the one used to label the per-class scores.
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]

Define the evaluation utilities (per-class precision and recall)

In [None]:
def prec_rec_function(y_test, preds, cluster_classes, algo):
    ''' Compute the per-class precision and recall of one algorithm.

    y_test (array-like): true labels, encoded as integer class indices
        (assumed to be positions in cluster_classes, as produced by the
        argmax of the one-hot labels in this notebook)
    preds (array-like): predicted labels, with the same encoding
    cluster_classes (list of str): class names; entry i names class index i
    algo (str): name of the algorithm, stored under the 'algorithm' key
    ---------------------------------------------------------------------
    returns (dict, dict): the precision and the recall, each mapping every
        class name to its score, plus the 'algorithm' entry
    '''
    # Pin the label set. Without it scikit-learn only scores the labels that
    # occur in y_test or preds, so a class missing from both would shift all
    # the following scores onto the wrong class names in the zip below.
    labels = list(range(len(cluster_classes)))

    prec = precision_score(y_test, preds, labels=labels, average=None)
    prec = dict(zip(cluster_classes, prec))
    prec['algorithm'] = algo

    recall = recall_score(y_test, preds, labels=labels, average=None)
    recall = dict(zip(cluster_classes, recall))
    recall['algorithm'] = algo

    return prec, recall


# Load the data

In [None]:
# All the relative data and model paths used below are resolved from this folder.
os.chdir('/content/gdrive/My Drive/data/SSLAMM/')

In [None]:
# Load nomenclature
tn = pd.read_csv('L2/Pulse/train_test_nomenclature.csv')
tn.columns = ['name', 'id']

In [None]:
# Import Listmode data (for benchmark models)
train = np.load('L2/Listmodes/train.npz')
X_train_na = train['X']
y_train_oh = train['y']

valid = np.load('L2/Listmodes/valid.npz')
X_valid_na = valid['X']
y_valid_oh = valid['y']

test = np.load('L2/Listmodes/test.npz')
X_test_na = test['X']
y_test_oh = test['y']

In [None]:
# Turn each row of label scores into the index of its largest entry
# (presumably one-hot rows, so this is the class index).
y_train_na = np.argmax(y_train_oh, axis=1)
y_valid_na = np.argmax(y_valid_oh, axis=1)
y_test_na = np.argmax(y_test_oh, axis=1)

In [None]:
# Import Pulse data (for the CNN)
train_Pulses = np.load('L2/Pulse/train.npz')
X_train_Pulses = train_Pulses['X']
y_train_Pulses = train_Pulses['y']

valid_Pulses = np.load('L2/Pulse/valid.npz')
X_valid_Pulses = valid_Pulses['X']
y_valid_Pulses = valid_Pulses['y']

test_Pulses = np.load('L2/Pulse/test.npz')
X_test_Pulses = test_Pulses['X']
y_test_Pulses = test_Pulses['y']

In [None]:
# Number of observations per split, one per line:
# Listmode train / valid / test, then Pulse train / valid / test.
print(*map(len, (X_train_na, X_valid_na, X_test_na,
                 X_train_Pulses, X_valid_Pulses, X_test_Pulses)), sep='\n')

33791
50682
134313
33791
50682
134313


In [None]:
# Fetch the NaN indices (for LDA and KNN, no need for LGBM)
nan_train = np.isnan(X_train_na).any(1)
nan_valid = np.isnan(X_valid_na).any(1)
nan_test = np.isnan(X_test_na).any(1)

# Delete NaNs observations
X_train = X_train_na[~nan_train]
y_train = y_train_na[~nan_train]

X_valid = X_valid_na[~nan_valid]
y_valid = y_valid_na[~nan_valid]

X_test = X_test_na[~nan_test]
y_test = y_test_na[~nan_test]

# Predict with the best specifications of the benchmark models 

In [1]:
#************************************
# Definition of the parameter spaces
#************************************
# This cell needs tn, y_valid, y_train, np and hp from the cells above.
# The tuples of options defined here are reused by the fitting cell, which
# indexes them with the values stored in the *_best dictionaries.

# kNN
# presumably (low, high, step) for the number of neighbours; not used below - TODO confirm
nn = (1, 50, 1)
w = ('uniform','distance')
algs = ('ball_tree', 'kd_tree', 'brute')
p_knn = (1, 2)

# LDA
n_classes = len(tn)
# Class frequencies of the validation labels, stored as 'priors' in lda_params
class_names, nb_samples  = np.unique(y_valid, return_counts = True)
priors = nb_samples/ nb_samples.sum()
# NOTE(review): parentheses without a trailing comma, so this is the string
# 'svd' and not a 1-tuple.
solver = ('svd')
# Unpacked below as the arguments following the label of hp.quniform / hp.uniform
n_components = (1, n_classes -1, 1)
tol = (1.0e-5, 1.0e-1)

# Same weight 1 / (number of classes) for every class found in y_valid
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))

lda_params = {
    'classif': 'lda',
    'solver': 'svd', 
    'n_components': hp.quniform('n_components', *n_components),
    'tol': hp.uniform('tol', *tol),
    'class_weight': equal_weights, 
    'priors': tuple(priors)}

# Lgbm
# class_names, nb_samples and equal_weights are rebound here from the training
# labels; lda_params above keeps the dictionary built from y_valid.
class_names, nb_samples  = np.unique(y_train, return_counts = True)
# Weight of each class = inverse of its number of training observations
reweighted = dict(zip(class_names, 1/ nb_samples))
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))

lr = (1e-3, 1e-2)
n_est = (10, 1200, 1) 
num_leaves = (6,8,12,16)
bt = ('gbdt', 'dart')
# NOTE(review): parentheses without a trailing comma, so this is the string
# 'binary'; the fitting cell indexes it and therefore gets a single character.
# Confirm how the stored index was produced before turning it into a tuple.
objective = ('binary')
max_bin = (255, 510)
colsample_bytree = (0.64, 0.65, 0.66)
subsample = (0.7,0.75)
reg_alpha = (1,1.2)
reg_lambda = (1,1.2,1.4)
is_unbalance = (True, False)
class_weight = (reweighted, equal_weights)

NameError: ignored

In [None]:
# Load the best specifications for each benchmark model

# Best hyperparameters found for each benchmark model. The fitting cell uses
# several of the stored values as indices into the tuples of options.
# NOTE: pickle can execute code while loading; only load files you trust.
with open('../../Models/SSLAMM/knn_best.pickle', mode='rb') as handle:
    knn_best = pickle.load(handle)

with open('../../Models/SSLAMM/lda_best.pickle', mode='rb') as handle:
    lda_best = pickle.load(handle)

with open('../../Models/SSLAMM/lgbm_best.pickle2', mode='rb') as handle:
    lgbm_best = pickle.load(handle)

In [None]:
#********************************
# Fitting of the models
#********************************
# For the hyperparameters chosen among a tuple of options, the *_best
# dictionaries hold the index of the selected option, hence the lookups below.
# Only the calls to fit() are timed, for the three models alike.

from time import time

# KNN
knn = KNeighborsClassifier(
    n_neighbors = int(knn_best['n_neighbors']),
    weights = w[knn_best['weights']],
    algorithm = algs[knn_best['algorithm']],
    p = p_knn[knn_best['p']])
start = time()
knn.fit(X_train, y_train)
end = time()
print('knn training time', end - start)


# LDA
lda = LDA(
    solver = 'svd',
    n_components = int(lda_best['n_components']),
    tol = lda_best['tol'])
start = time()
lda.fit(X_train, y_train)
end = time()
print('lda training time', end - start)

# LGBM
# `objective` is intentionally not forwarded: the search-space variable is the
# plain string 'binary' (parentheses without a comma), so indexing it with the
# stored choice only produced a single character. With more than two classes
# LGBMClassifier uses its multiclass objective anyway, so the fitted model is
# expected to be unchanged.
lgbm = LGBMClassifier(learning_rate = lgbm_best['learning_rate'],
    n_estimators = int(lgbm_best['n_estimators']),
    num_leaves = num_leaves[lgbm_best['num_leaves']], # large num_leaves helps improve accuracy but might lead to over-fitting
    boosting_type = bt[lgbm_best['boosting_type']], # for better accuracy -> try dart
    max_bin = max_bin[lgbm_best['max_bin']], # large max_bin helps improve accuracy but might slow down training progress
    colsample_bytree = colsample_bytree[lgbm_best['colsample_bytree']],
    subsample = subsample[lgbm_best['subsample']],
    reg_alpha = reg_alpha[lgbm_best['reg_alpha']],
    reg_lambda = reg_lambda[lgbm_best['reg_lambda']],
    is_unbalance = is_unbalance[lgbm_best['is_unbalance']],
    class_weight = class_weight[lgbm_best['class_weight']])

# LightGBM is trained on the unfiltered arrays (NaN rows included)
start = time()
lgbm.fit(X_train_na, y_train_na)
end = time()
print('lgbm training time', end - start)


knn training time 0.007496833801269531
lda training time 0.5334179401397705
lgbm training time 78.55490398406982


# Predict with the neural network (only the CNN is used; the FFNN predictions are commented out)

## CNN

In [None]:
# Load the saved CNN; the path is relative to the current working directory.
cnn = load_model('../../Models/SSLAMM/cnn_small')
# Kept for reference: restoring the weights of another checkpoint instead.
#cnn.load_weights('../../Models/SSLAMM/weights_categorical_crossentropy_cnn_VGGBig_3.hdf5')

# Final output of the prediction of the test set

In [None]:
#********************************
# Prediction of the models
#********************************

# kNN and LDA predict the NaN-free test rows, LightGBM the unfiltered test set
knn_preds = knn.predict(X_test)
lda_preds = lda.predict(X_test)
lgbm_preds = lgbm.predict(X_test_na)

# FFNN predictions are disabled (no ffnn model is loaded in this notebook)
#ffnn_preds = ffnn.predict(X_test)
#ffnn_preds = ffnn_preds.argmax(1)

# The CNN outputs one score per class; keep the index of the largest one
cnn_preds = np.argmax(cnn.predict(X_test_Pulses), axis=1)

In [None]:
#********************************
# Accuracy computations
#********************************
# Each model is scored against the labels of the rows it predicted:
# y_test (NaN-free rows) for kNN and LDA, y_test_na for LightGBM and the
# argmax of the Pulse labels for the CNN.

# KNN
prec_knn, recall_knn = prec_rec_function(y_test, knn_preds, cluster_classes, 'knn')

# LDA
prec_lda, recall_lda = prec_rec_function(y_test, lda_preds, cluster_classes, 'lda')

# LGBM
prec_lgbm, recall_lgbm = prec_rec_function(y_test_na, lgbm_preds, cluster_classes, 'lgbm')

# CNN
prec_cnn, recall_cnn = prec_rec_function(y_test_Pulses.argmax(1), cnn_preds, cluster_classes, 'cnn')

# One row per algorithm, built in a single call: DataFrame.append is
# deprecated since pandas 1.4 and no longer exists in pandas 2.
prec = pd.DataFrame([prec_knn, prec_lda, prec_lgbm, prec_cnn],
                    columns = cluster_classes + ['algorithm'])
recall = pd.DataFrame([recall_knn, recall_lda, recall_lgbm, recall_cnn],
                      columns = cluster_classes + ['algorithm'])

In [None]:
# Reshape both tables to one row per class and one column per algorithm
precision = prec.set_index('algorithm').transpose()
rec = recall.set_index('algorithm').transpose()

In [None]:
# Precision and recall side by side, in percent rounded to two decimals
bench_res = (
    precision
    .join(rec, lsuffix=' precision', rsuffix= ' recall')
    .mul(100)
    .round(2)
)
bench_res

algorithm,knn precision,lda precision,lgbm precision,cnn precision,knn recall,lda recall,lgbm recall,cnn recall
MICRO,73.68,96.54,97.13,98.0,72.2,93.95,98.65,98.88
ORGNANO,27.8,50.3,89.74,96.59,35.43,94.86,100.0,97.14
ORGPICOPRO,97.41,98.74,99.91,99.84,76.36,98.97,99.35,99.31
REDNANO,79.0,94.18,98.04,97.33,90.78,85.58,99.32,99.08
REDPICOEUK,71.45,83.8,99.02,99.32,83.26,99.45,98.33,97.6
REDPICOPRO,4.67,28.72,73.73,79.51,54.08,96.65,98.62,95.34
inf1microm,91.95,99.41,99.97,99.67,85.66,96.11,99.47,99.5
sup1microm,91.06,97.59,97.23,96.22,71.17,78.38,98.22,97.39


In [None]:
# Per-class F1 score of each algorithm: 2 * precision * recall / (precision + recall).
# A cell is NaN when both the precision and the recall are zero.
F1 = precision.mul(rec).mul(2).div(precision.add(rec))

In [None]:
# Per-class F1 of the CNN as computed by scikit-learn (presumably a cross-check
# of the 'cnn' column of the F1 table).
from sklearn.metrics import f1_score
f1_score(y_true=y_test_Pulses.argmax(1), y_pred=cnn_preds, average=None)

array([0.984375  , 0.96866097, 0.99570264, 0.98195814, 0.98450466,
       0.86708651, 0.99585419, 0.96800782])

In [None]:
# Display the per-class F1 table (rows: classes, columns: algorithms)
F1

algorithm,knn,lda,lgbm,cnn
MICRO,0.729332,0.952273,0.978865,0.984375
ORGNANO,0.311558,0.657426,0.945946,0.968661
ORGPICOPRO,0.856066,0.988584,0.99632,0.995703
REDNANO,0.844816,0.896745,0.986737,0.981958
REDPICOEUK,0.769014,0.909575,0.986704,0.984505
REDPICOPRO,0.085954,0.442812,0.843779,0.867087
inf1microm,0.886918,0.977335,0.997194,0.995854
sup1microm,0.798944,0.869351,0.977247,0.968008


In [None]:
# Export the precision/recall table and the F1 table as LaTeX files in the
# SSLAMM results folder of the mounted Drive.
bench_res.to_latex(buf='/content/gdrive/My Drive/Results/SSLAMM/precision_recall_SSLAMM.tex')
F1.to_latex(buf='/content/gdrive/My Drive/Results/SSLAMM/F1_SSLAMM.tex')