In [1]:
# Mount Google Drive under /content/gdrive; the data, model and result paths
# used further down in this notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install --upgrade numpy
!pip install tensorflow_addons

Collecting numpy
  Downloading numpy-1.21.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 4.2 MB/s 
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.6.0 requires numpy~=1.19.2, but you have numpy 1.21.3 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed numpy-1.21.3


Collecting tensorflow_addons
  Downloading tensorflow_addons-0.14.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.2 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.14.0


In [3]:
import os
import pickle
import re
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model, model_from_json
from tensorflow_addons.optimizers import RectifiedAdam, Lookahead

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

# Names of the eight classes; they label the per-class score columns.
# NOTE(review): the scoring helper pairs these names positionally with the
# per-class scores -- confirm the order matches the integer label encoding.
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]

# Read the nomenclature table and give its two columns explicit names
tn = pd.read_csv(
    '/content/gdrive/My Drive/data/SWINGS/L2/Pulse/train_test_nomenclature.csv'
)
tn.columns = ['name', 'id']

In [28]:
def prec_rec_function(y_test, preds, cluster_classes, algo):
    '''Compute the per-class precision and recall of one algorithm.

    Parameters
    ----------
    y_test : array-like
        True labels.
    preds : array-like
        Predicted labels, same encoding as ``y_test``.
    cluster_classes : sequence of str
        Class names, one per class. The i-th name is paired with the score
        of integer label ``i``.
        NOTE(review): this assumes the labels are the integers
        0..len(cluster_classes)-1 (the callers in this notebook pass
        argmax'ed labels) -- confirm against the nomenclature.
    algo : str
        Name of the algorithm, stored under the 'algorithm' key.

    Returns
    -------
    (prec, recall) : tuple of dict
        Each dict maps every class name to its score and additionally holds
        the 'algorithm' entry.
    '''
    # Score every class explicitly. Without `labels`, scikit-learn only
    # reports the classes that occur in y_test or preds, so a class missing
    # from both would shorten the score array and zip() would silently pair
    # the remaining scores with the wrong class names.
    labels = np.arange(len(cluster_classes))

    prec = precision_score(y_test, preds, labels=labels, average=None)
    prec = dict(zip(cluster_classes, prec))
    prec['algorithm'] = algo

    recall = recall_score(y_test, preds, labels=labels, average=None)
    recall = dict(zip(cluster_classes, recall))
    recall['algorithm'] = algo

    return prec, recall


# Import SWINGS data

In [4]:
# Every SWINGS archive below is addressed relative to this directory
os.chdir('/content/gdrive/My Drive/data/SWINGS/L2')

# Listmode data (input of the benchmark models).
# Labels are obtained from the 'y' array with an argmax over axis 1.
train = np.load('Listmodes/train.npz')
valid = np.load('Listmodes/valid.npz')
test = np.load('Listmodes/test.npz')

X_L_train_SWINGS, y_L_train_SWINGS = train['X'], train['y'].argmax(1)
X_L_valid_SWINGS, y_L_valid_SWINGS = valid['X'], valid['y'].argmax(1)
X_L_test_SWINGS, y_L_test_SWINGS = test['X'], test['y'].argmax(1)

# Pulse data (input of the CNN); the three archive names are rebound here
train = np.load('Pulse/train.npz')
valid = np.load('Pulse/valid.npz')
test = np.load('Pulse/test.npz')

X_P_train_SWINGS, y_P_train_SWINGS = train['X'], train['y'].argmax(1)
X_P_valid_SWINGS, y_P_valid_SWINGS = valid['X'], valid['y'].argmax(1)
X_P_test_SWINGS, y_P_test_SWINGS = test['X'], test['y'].argmax(1)


In [None]:
# Flag the SWINGS Listmode rows that contain at least one NaN
nan_train = np.isnan(X_L_train_SWINGS).any(axis=1)
nan_valid = np.isnan(X_L_valid_SWINGS).any(axis=1)
nan_test = np.isnan(X_L_test_SWINGS).any(axis=1)

# Drop the flagged rows, applying the same mask to features and labels so
# that both stay aligned
X_L_train_SWINGS, y_L_train_SWINGS = (
    X_L_train_SWINGS[np.logical_not(nan_train)],
    y_L_train_SWINGS[np.logical_not(nan_train)],
)
X_L_valid_SWINGS, y_L_valid_SWINGS = (
    X_L_valid_SWINGS[np.logical_not(nan_valid)],
    y_L_valid_SWINGS[np.logical_not(nan_valid)],
)
X_L_test_SWINGS, y_L_test_SWINGS = (
    X_L_test_SWINGS[np.logical_not(nan_test)],
    y_L_test_SWINGS[np.logical_not(nan_test)],
)

# Import SSLAMM data

In [5]:
# Every SSLAMM archive below is addressed relative to this directory
os.chdir('/content/gdrive/My Drive/data/SSLAMM/L2')

# Listmode data (input of the benchmark models).
# Labels are obtained from the 'y' array with an argmax over axis 1.
train = np.load('Listmodes/train.npz')
valid = np.load('Listmodes/valid.npz')
test = np.load('Listmodes/test.npz')

X_L_train_SSLAMM, y_L_train_SSLAMM = train['X'], train['y'].argmax(1)
X_L_valid_SSLAMM, y_L_valid_SSLAMM = valid['X'], valid['y'].argmax(1)
X_L_test_SSLAMM, y_L_test_SSLAMM = test['X'], test['y'].argmax(1)

# Pulse data (input of the CNN); the three archive names are rebound here
train = np.load('Pulse/train.npz')
valid = np.load('Pulse/valid.npz')
test = np.load('Pulse/test.npz')

X_P_train_SSLAMM, y_P_train_SSLAMM = train['X'], train['y'].argmax(1)
X_P_valid_SSLAMM, y_P_valid_SSLAMM = valid['X'], valid['y'].argmax(1)
X_P_test_SSLAMM, y_P_test_SSLAMM = test['X'], test['y'].argmax(1)

In [27]:
# Flag the SSLAMM Listmode rows that contain at least one NaN
nan_train = np.isnan(X_L_train_SSLAMM).any(axis=1)
nan_valid = np.isnan(X_L_valid_SSLAMM).any(axis=1)
nan_test = np.isnan(X_L_test_SSLAMM).any(axis=1)

# Drop the flagged rows, applying the same mask to features and labels so
# that both stay aligned
X_L_train_SSLAMM, y_L_train_SSLAMM = (
    X_L_train_SSLAMM[np.logical_not(nan_train)],
    y_L_train_SSLAMM[np.logical_not(nan_train)],
)
X_L_valid_SSLAMM, y_L_valid_SSLAMM = (
    X_L_valid_SSLAMM[np.logical_not(nan_valid)],
    y_L_valid_SSLAMM[np.logical_not(nan_valid)],
)
X_L_test_SSLAMM, y_L_test_SSLAMM = (
    X_L_test_SSLAMM[np.logical_not(nan_test)],
    y_L_test_SSLAMM[np.logical_not(nan_test)],
)

In [6]:
# Unbind the archive handles; the arrays extracted from them stay available
del train, valid, test

# Train on SWINGS, predict on SSLAMM

In [13]:
#************************************
# Defining parameters spaces
#************************************
# The option tuples below are looked up by position when the models are
# built: the stored best configurations are used as indices into them, so
# neither the values nor their order may change.

# kNN
nn = (1,50,1) # Could go beyond 50
w = ('uniform','distance')
algs = ('ball_tree', 'kd_tree', 'brute')
p_knn = (1, 2)

# LDA
n_classes = len(tn)
# Class frequencies are taken from the SWINGS validation labels.
# NOTE(review): `priors` is not passed to the LDA built in this notebook
# (the recorded model shows priors=None) -- confirm whether that is intended.
class_names, nb_samples  = np.unique(y_L_valid_SWINGS, return_counts = True)
priors = nb_samples/ nb_samples.sum()

# Lgbm

lr = (1e-3, 1e-2)
n_est = (10, 1200, 1) 
num_leaves = (6,8,12,16)
bt = ('gbdt', 'dart')
# NOTE(review): without a trailing comma this is the plain string 'binary',
# not a 1-tuple, so objective[i] yields a single character (the recorded
# LGBMClassifier repr shows objective='r'). The stored best index was
# presumably found against this string, so turning it into a tuple needs the
# pickled configurations to be regenerated -- confirm before changing.
objective = ('binary')
max_bin = (255, 510)
colsample_bytree = (0.64, 0.65, 0.66)
subsample = (0.7,0.75)
reg_alpha = (1,1.2)
reg_lambda = (1,1.2,1.4)
is_unbalance = (True, False)
# NOTE(review): `equal_weights` is not defined in any visible cell, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere;
# `class_weight` is also not used by the models built in the visible cells.
class_weight = (equal_weights)

## Load the best configuration determined on SWINGS data

In [15]:
# The pickles below are opened relative to the SWINGS model directory
os.chdir('/content/gdrive/My Drive/Models/SWINGS/')

# Load the best specifications for each benchmark model
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('knn_best.pickle', 'rb') as handle:
    knn_best = pickle.load(handle)

with open('lda_best.pickle', 'rb') as handle:
    lda_best = pickle.load(handle)

with open('lgbm_best_trainweights.pickle', 'rb') as handle:
    lgbm_best = pickle.load(handle)

# Load the saved CNN (absolute path, so it does not depend on the chdir above)
cnn = load_model('/content/gdrive/My Drive/Models/SWINGS/cnn_small')

## Fit the models on SWINGS train

In [19]:
# KNN
# The stored best configuration is used as an index into the option tuples
# (w, algs, p_knn) for the categorical settings; n_neighbors is cast to int.
knn = KNeighborsClassifier(
    n_neighbors=int(knn_best['n_neighbors']),
    weights=w[knn_best['weights']],
    algorithm=algs[knn_best['algorithm']],
    p=p_knn[knn_best['p']],
)
start = time()
knn.fit(X_L_train_SWINGS, y_L_train_SWINGS)
end = time()
print('knn training time', end - start)

# LDA (this section was previously mislabelled as SVM)
lda = LDA(
    solver='svd',
    n_components=int(lda_best['n_components']),
    tol=lda_best['tol'],
)

# Last expression of the cell: fits the model and displays its repr
lda.fit(X_L_train_SWINGS, y_L_train_SWINGS)

knn training time 0.7228775024414062


LinearDiscriminantAnalysis(n_components=1, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False,
                           tol=0.03596860877191493)

In [20]:
# LGBM
# Translate the stored best configuration into constructor arguments. The
# numeric settings are used directly; the categorical ones are indices into
# the option tuples defined with the parameter spaces.
# NOTE(review): `objective` is a plain string in the parameter-space cell, so
# the lookup below yields a single character -- confirm how LightGBM treats it.
lgbm_params = dict(
    learning_rate=lgbm_best['learning_rate'],
    n_estimators=int(lgbm_best['n_estimators']),
    # more leaves can improve accuracy but may over-fit
    num_leaves=num_leaves[lgbm_best['num_leaves']],
    # 'dart' is worth trying for better accuracy
    boosting_type=bt[lgbm_best['boosting_type']],
    objective=objective[lgbm_best['objective']],
    # more bins can improve accuracy but may slow training down
    max_bin=max_bin[lgbm_best['max_bin']],
    colsample_bytree=colsample_bytree[lgbm_best['colsample_bytree']],
    subsample=subsample[lgbm_best['subsample']],
    reg_alpha=reg_alpha[lgbm_best['reg_alpha']],
    reg_lambda=reg_lambda[lgbm_best['reg_lambda']],
    is_unbalance=is_unbalance[lgbm_best['is_unbalance']],
)
lgbm = LGBMClassifier(**lgbm_params)

# Time the fit on the SWINGS training set
start = time()
lgbm.fit(X_L_train_SWINGS, y_L_train_SWINGS)
end = time()
print('lgbm training time', end - start)

lgbm training time 70.91164875030518


# Predict on SSLAMM data

In [30]:
#********************************
# Prediction of the models
#********************************

# The benchmark models predict from the SSLAMM Listmode test features
knn_preds = knn.predict(X_L_test_SSLAMM)
lda_preds = lda.predict(X_L_test_SSLAMM)
lgbm_preds = lgbm.predict(X_L_test_SSLAMM)

# The CNN predicts from the Pulse test data; its output is reduced to one
# label per row with an argmax over axis 1
cnn_preds = np.argmax(cnn.predict(X_P_test_SSLAMM), axis=1)

In [34]:
#********************************
# Accuracy computations
#********************************

# Per-class precision and recall of each model on the SSLAMM test set.
# The benchmark models are scored against the Listmode labels, the CNN
# against the Pulse labels, matching the data each one predicted from.
prec_knn, recall_knn = prec_rec_function(y_L_test_SSLAMM, knn_preds, cluster_classes, 'knn')
prec_lda, recall_lda = prec_rec_function(y_L_test_SSLAMM, lda_preds, cluster_classes, 'lda')
prec_lgbm, recall_lgbm = prec_rec_function(y_L_test_SSLAMM, lgbm_preds, cluster_classes, 'lgbm')
prec_cnn, recall_cnn = prec_rec_function(y_P_test_SSLAMM, cnn_preds, cluster_classes, 'cnn')

# Build each table in one go from the score dicts (one row per algorithm).
# DataFrame.append, used previously, is deprecated and no longer exists in
# pandas 2.0; constructing from the list of records gives the same rows and
# the same column order.
score_columns = cluster_classes + ['algorithm']
prec = pd.DataFrame([prec_knn, prec_lda, prec_lgbm, prec_cnn], columns=score_columns)
recall = pd.DataFrame([recall_knn, recall_lda, recall_lgbm, recall_cnn], columns=score_columns)

In [35]:
# Put the classes in rows and the algorithms in columns
precision = prec.set_index('algorithm').transpose()
rec = recall.set_index('algorithm').transpose()

# Precision and recall side by side, in percent with two decimals
bench_res = precision.join(rec, lsuffix=' precision', rsuffix=' recall')
bench_res = bench_res.mul(100).round(2)

# F1 score from the unscaled precision and recall
F1 = 2 * precision.mul(rec) / precision.add(rec)

print(bench_res)
print(F1)

algorithm,knn precision,lda precision,lgbm precision,cnn precision,knn recall,lda recall,lgbm recall,cnn recall
MICRO,66.21,66.99,75.47,72.82,75.11,31.39,98.65,93.72
ORGNANO,24.61,23.56,83.25,92.74,72.0,98.29,96.57,94.86
ORGPICOPRO,96.15,84.12,99.81,99.91,67.78,94.9,94.63,82.01
REDNANO,90.27,93.92,100.0,99.86,63.93,51.02,60.53,69.03
REDPICOEUK,54.4,62.24,89.18,91.76,85.96,99.93,98.21,88.73
REDPICOPRO,3.19,1.05,54.3,61.66,62.17,4.52,64.94,11.37
inf1microm,81.82,94.75,94.95,92.86,65.96,84.81,99.06,99.87
sup1microm,93.86,96.99,83.04,61.3,62.11,46.59,74.74,80.95


In [37]:
# Export both tables as LaTeX files. bench_res holds precision/recall in
# percent (multiplied by 100 and rounded when it was built), whereas F1 is
# written unscaled.
bench_res.to_latex('/content/gdrive/My Drive/Results/precision_recall_SWINGStoSSLAMM.tex')
F1.to_latex('/content/gdrive/My Drive/Results/F1_SWINGStoSSLAMM.tex')

# Train on SSLAMM data and predict on SWINGS data

## Load the best configuration determined on SSLAMM data

In [39]:
# The pickles below are opened relative to the SSLAMM model directory
os.chdir('/content/gdrive/My Drive/Models/SSLAMM/')

# Load the best specifications for each benchmark model
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open('knn_best.pickle', 'rb') as handle:
    knn_best = pickle.load(handle)

with open('lda_best.pickle', 'rb') as handle:
    lda_best = pickle.load(handle)

# NOTE(review): the '.pickle2' extension differs from the other files and
# from the SWINGS counterpart ('lgbm_best_trainweights.pickle') -- confirm
# this is the intended file.
with open('lgbm_best.pickle2', 'rb') as handle:
    lgbm_best = pickle.load(handle)

# Load the saved CNN (absolute path, so it does not depend on the chdir above)
cnn = load_model('/content/gdrive/My Drive/Models/SSLAMM/cnn_small')

## Fit the models on SSLAMM train

In [40]:
# KNN
# The stored best configuration is used as an index into the option tuples
# (w, algs, p_knn) for the categorical settings; n_neighbors is cast to int.
knn = KNeighborsClassifier(
    n_neighbors=int(knn_best['n_neighbors']),
    weights=w[knn_best['weights']],
    algorithm=algs[knn_best['algorithm']],
    p=p_knn[knn_best['p']],
)
start = time()
knn.fit(X_L_train_SSLAMM, y_L_train_SSLAMM)
end = time()
print('knn training time', end - start)

# LDA (this section was previously mislabelled as SVM)
lda = LDA(
    solver='svd',
    n_components=int(lda_best['n_components']),
    tol=lda_best['tol'],
)

lda.fit(X_L_train_SSLAMM, y_L_train_SSLAMM)

# LGBM
# NOTE(review): `objective` is a plain string in the parameter-space cell, so
# the lookup below yields a single character (the recorded repr shows
# objective='r') -- confirm how LightGBM treats it.
lgbm = LGBMClassifier(
    learning_rate=lgbm_best['learning_rate'],
    n_estimators=int(lgbm_best['n_estimators']),
    # more leaves can improve accuracy but may over-fit
    num_leaves=num_leaves[lgbm_best['num_leaves']],
    # 'dart' is worth trying for better accuracy
    boosting_type=bt[lgbm_best['boosting_type']],
    objective=objective[lgbm_best['objective']],
    # more bins can improve accuracy but may slow training down
    max_bin=max_bin[lgbm_best['max_bin']],
    colsample_bytree=colsample_bytree[lgbm_best['colsample_bytree']],
    subsample=subsample[lgbm_best['subsample']],
    reg_alpha=reg_alpha[lgbm_best['reg_alpha']],
    reg_lambda=reg_lambda[lgbm_best['reg_lambda']],
    is_unbalance=is_unbalance[lgbm_best['is_unbalance']],
)

# Last expression of the cell: fits the model and displays its repr
lgbm.fit(X_L_train_SSLAMM, y_L_train_SSLAMM)

knn training time 0.006409645080566406


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.66,
               importance_type='split', is_unbalance=True,
               learning_rate=0.007732539689192715, max_bin=510, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1176, n_jobs=-1, num_leaves=8, objective='r',
               random_state=None, reg_alpha=1, reg_lambda=1.4, silent=True,
               subsample=0.7, subsample_for_bin=200000, subsample_freq=0)

## Predictions on SWINGS data 

In [41]:
#********************************
# Prediction of the models
#********************************

# The benchmark models predict from the SWINGS Listmode test features
knn_preds = knn.predict(X_L_test_SWINGS)
lda_preds = lda.predict(X_L_test_SWINGS)
lgbm_preds = lgbm.predict(X_L_test_SWINGS)

# The CNN predicts from the Pulse test data; its output is reduced to one
# label per row with an argmax over axis 1
cnn_preds = np.argmax(cnn.predict(X_P_test_SWINGS), axis=1)

In [42]:
#********************************
# Accuracy computations
#********************************

# Per-class precision and recall of each model on the SWINGS test set.
# The benchmark models are scored against the Listmode labels, the CNN
# against the Pulse labels, matching the data each one predicted from.
prec_knn, recall_knn = prec_rec_function(y_L_test_SWINGS, knn_preds, cluster_classes, 'knn')
prec_lda, recall_lda = prec_rec_function(y_L_test_SWINGS, lda_preds, cluster_classes, 'lda')
prec_lgbm, recall_lgbm = prec_rec_function(y_L_test_SWINGS, lgbm_preds, cluster_classes, 'lgbm')
prec_cnn, recall_cnn = prec_rec_function(y_P_test_SWINGS, cnn_preds, cluster_classes, 'cnn')

# Build each table in one go from the score dicts (one row per algorithm).
# DataFrame.append, used previously, is deprecated and no longer exists in
# pandas 2.0; constructing from the list of records gives the same rows and
# the same column order.
score_columns = cluster_classes + ['algorithm']
prec = pd.DataFrame([prec_knn, prec_lda, prec_lgbm, prec_cnn], columns=score_columns)
recall = pd.DataFrame([recall_knn, recall_lda, recall_lgbm, recall_cnn], columns=score_columns)

In [43]:
# Put the classes in rows and the algorithms in columns
precision = prec.set_index('algorithm').transpose()
rec = recall.set_index('algorithm').transpose()

# Precision and recall side by side, in percent with two decimals
bench_res = precision.join(rec, lsuffix=' precision', rsuffix=' recall')
bench_res = bench_res.mul(100).round(2)

# F1 score from the unscaled precision and recall
F1 = 2 * precision.mul(rec) / precision.add(rec)

print(bench_res)
print(F1)

algorithm   knn precision  lda precision  lgbm precision  cnn precision  \
MICRO               38.51          20.28           91.16          37.36   
ORGNANO             41.84          12.36           64.03          57.26   
ORGPICOPRO          52.78          38.36           97.37          96.23   
REDNANO             33.47          19.04           16.26          14.21   
REDPICOEUK          98.46          92.34           99.74          99.35   
REDPICOPRO          10.08           6.32           88.49          93.42   
inf1microm          88.85          93.37           98.91          98.01   
sup1microm          29.52          12.54           54.13          53.70   

algorithm   knn recall  lda recall  lgbm recall  cnn recall  
MICRO            87.21       98.17        89.50       77.63  
ORGNANO          34.45       45.38        74.79       59.66  
ORGPICOPRO       32.84       99.99        99.63       99.97  
REDNANO          95.54       87.75        99.83       95.84  
REDPICOEUK    

In [44]:
# Export both tables as LaTeX files. bench_res holds precision/recall in
# percent (multiplied by 100 and rounded when it was built), whereas F1 is
# written unscaled.
bench_res.to_latex('/content/gdrive/My Drive/Results/precision_recall_SSLAMMtoSWINGS.tex')
F1.to_latex('/content/gdrive/My Drive/Results/F1_SSLAMMtoSWINGS.tex')