In [1]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the paths used in the cells below
# all live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os 
# Make the Drive data folder the working directory.
os.chdir('/content/gdrive/My Drive/data')

In [3]:
# Display the current working directory to confirm the chdir above took effect.
os.getcwd()

'/content/gdrive/My Drive/data'

In [None]:
 #!git clone https://github.com/RobeeF/phyto_curves_reco

In [4]:
# Install the packages this notebook imports but that are not installed by
# this cell's absence (Parquet/Snappy support, hyperas, TensorFlow Addons).
# NOTE(review): no versions are pinned, so a re-run may resolve to different
# releases than the ones that produced the saved outputs.
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas
!pip install tensorflow_addons --upgrade

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.9 MB/s 
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 44.8 MB/s 
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 54.0 MB/s 
Collecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 5.6 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348183 sha256=1d0145712db036faaaf994d15f43ca545d071999a8105467e59be1c84dc2c63a
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successfu

In [None]:
# Leftover working-directory check; its result is only displayed, not stored.
%pwd

'/content/gdrive/My Drive/cyto_classif'

In [5]:
import re
import os 
import numpy as np
import pandas as pd
import pickle
import fastparquet as fp

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow_addons.optimizers import RectifiedAdam, Lookahead
from tensorflow.keras.models import load_model, model_from_json

#from sklearn import svm
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

############################################################
# Training of other algorithms on the unbiased dataset
############################################################
# Work from the project checkout on Drive.
os.chdir('/content/gdrive/My Drive/phyto_curves_reco')

# Names of the classes, in the order used for the columns of the
# precision / recall tables built further down.
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]

Define utilities for hyperoptimisation

In [6]:
def prec_rec_function(y_test, preds, cluster_classes, algo):
    ''' Compute the precision and recall for all classes.

    y_test (array-like): true class labels
    preds (array-like): predicted class labels, aligned with y_test
    cluster_classes (list of str): class names, in the same order as the
        per-class scores returned by scikit-learn with average=None
    algo (str): algorithm name, stored under the 'algorithm' key
    ----------------------------------------------------------------
    returns (dict, dict): the precision dict and the recall dict, each mapping
        every class name to its score, plus the 'algorithm' entry

    Raises ValueError when the number of per-class scores differs from
    len(cluster_classes): zip() would otherwise silently truncate and attach
    scores to the wrong class names.
    '''
    prec_scores = precision_score(y_test, preds, average=None)
    recall_scores = recall_score(y_test, preds, average=None)

    for metric_name, scores in (('precision', prec_scores), ('recall', recall_scores)):
        if len(scores) != len(cluster_classes):
            raise ValueError(
                'Got %d per-class %s scores for %d class names: the labels found '
                'in y_test/preds do not match cluster_classes'
                % (len(scores), metric_name, len(cluster_classes)))

    prec = dict(zip(cluster_classes, prec_scores))
    prec['algorithm'] = algo

    recall = dict(zip(cluster_classes, recall_scores))
    recall['algorithm'] = algo

    return prec, recall


# Load the data

In [7]:
# All relative paths in the loading cells below ('L2/...', '../../Models/...')
# are resolved from this SWINGS data folder.
os.chdir('/content/gdrive/My Drive/data/SWINGS/')

In [8]:
# Read the train/test nomenclature table and relabel its columns
tn = pd.read_csv('L2/Pulse/train_test_nomenclature.csv')
tn = tn.set_axis(['name', 'id'], axis=1)

In [13]:
# Import Listmode data (for benchmark models)
# Each .npz archive is opened in a `with` block so that the underlying file is
# closed as soon as the 'X' (features) and 'y' (labels) arrays have been read.
with np.load('L2/Listmodes/train.npz') as train:
    X_train_na = train['X']
    y_train_oh = train['y']

with np.load('L2/Listmodes/valid.npz') as valid:
    X_valid_na = valid['X']
    y_valid_oh = valid['y']

with np.load('L2/Listmodes/test.npz') as test:
    X_test_na = test['X']
    y_test_oh = test['y']

In [14]:
# Turn each label matrix into a vector holding, per row, the column index of
# the largest entry.
y_train_na = np.argmax(y_train_oh, axis=1)
y_valid_na = np.argmax(y_valid_oh, axis=1)
y_test_na = np.argmax(y_test_oh, axis=1)

In [15]:
# Import Pulse data (for the CNN)
# Each .npz archive is opened in a `with` block so that the underlying file is
# closed as soon as the 'X' and 'y' arrays have been read.
with np.load('L2/Pulse/train.npz') as train_Pulses:
    X_train_Pulses = train_Pulses['X']
    y_train_Pulses = train_Pulses['y']

with np.load('L2/Pulse/valid.npz') as valid_Pulses:
    X_valid_Pulses = valid_Pulses['X']
    y_valid_Pulses = valid_Pulses['y']

with np.load('L2/Pulse/test.npz') as test_Pulses:
    X_test_Pulses = test_Pulses['X']
    y_test_Pulses = test_Pulses['y']

In [16]:
# Number of observations per split: Listmode train/valid/test first, then the
# Pulse train/valid/test, one count per line.
print(*map(len, (X_train_na, X_valid_na, X_test_na,
                 X_train_Pulses, X_valid_Pulses, X_test_Pulses)), sep='\n')

57241
365863
224426
57241
365863
224426


In [17]:
# Flag the rows containing at least one NaN (needed for LDA and KNN, not for LGBM)
nan_train, nan_valid, nan_test = (
    np.isnan(features).any(axis=1)
    for features in (X_train_na, X_valid_na, X_test_na)
)

# Keep only the complete rows, together with their labels
X_train, y_train = X_train_na[~nan_train], y_train_na[~nan_train]
X_valid, y_valid = X_valid_na[~nan_valid], y_valid_na[~nan_valid]
X_test, y_test = X_test_na[~nan_test], y_test_na[~nan_test]

# Predict with the best specifications of the benchmark models 

In [20]:
#************************************
# Defining parameters spaces
#************************************
# The fitting cell indexes several of these tuples with the integer choices
# stored in the pickled best-parameter dicts, so the order of entries matters.

# kNN
nn = (1, 50, 1)
w = ('uniform','distance')
algs = ('ball_tree', 'kd_tree', 'brute')
p_knn = (1, 2)

# LDA
n_classes = len(tn)
# Class labels and their frequencies in the (NaN-free) validation set
class_names, nb_samples  = np.unique(y_valid, return_counts = True)
priors = nb_samples/ nb_samples.sum()

# Same weight for every class. Built only once class_names exists: the
# original cell referenced class_names before defining it, which raises a
# NameError on a fresh kernel.
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))

solver = ('svd') # NOTE: plain string 'svd' (no trailing comma), not a 1-tuple
n_components = (1, n_classes -1, 1)
tol = (1.0e-5, 1.0e-1)

lda_params = {
    'classif': 'lda',
    'solver': 'svd', 
    'n_components': hp.quniform('n_components', *n_components),
    'tol': hp.uniform('tol', *tol),
    'class_weight': equal_weights, 
    'priors': tuple(priors)}

# Weights inversely proportional to the class counts of the training set
class_names, nb_samples  = np.unique(y_train, return_counts = True)
reweighted = dict(zip(class_names, 1/ nb_samples))

# Lgbm
lr = (1e-3, 1e-2)
n_est = (10, 1200, 1) 
num_leaves = (6,8,12,16)
bt = ('gbdt', 'dart')
objective = ('binary') # NOTE: plain string 'binary', not a 1-tuple; indexing it yields single characters
max_bin = (255, 510)
colsample_bytree = (0.64, 0.65, 0.66)
subsample = (0.7,0.75)
reg_alpha = (1,1.2)
reg_lambda = (1,1.2,1.4)
is_unbalance = (True, False)
class_weight = (reweighted, equal_weights)


In [None]:
# Leftover working-directory check before loading files through relative paths.
%pwd

'/content/gdrive/My Drive/data/SWINGS'

In [24]:
# Read back the best hyper-parameter set saved for each benchmark model.
# pickle can execute arbitrary code while loading: only use trusted files here.
with open('../../Models/SWINGS/knn_best.pickle', 'rb') as handle:
    knn_best = pickle.loads(handle.read())

with open('../../Models/SWINGS/lda_best.pickle', 'rb') as handle:
    lda_best = pickle.loads(handle.read())

with open('../../Models/SWINGS/lgbm_best_trainweights.pickle', 'rb') as handle:
    lgbm_best = pickle.loads(handle.read())

In [22]:
# Inspect the stored LGBM specification. Several entries are integer indices
# that the fitting cell maps back to values through the parameter-space tuples.
lgbm_best

{'boosting_type': 0,
 'colsample_bytree': 1,
 'is_unbalance': 1,
 'learning_rate': 0.009434479039397122,
 'max_bin': 1,
 'n_estimators': 1092.0,
 'num_leaves': 2,
 'objective': 4,
 'reg_alpha': 1,
 'reg_lambda': 0,
 'subsample': 0}

In [30]:
#********************************
# Fitting of the models
#********************************
# Each model is built from its stored best specification; categorical choices
# are stored as indices into the parameter-space tuples defined above.
# Only the call to fit() is timed for every model.

from time import time

# KNN
knn = KNeighborsClassifier(
    n_neighbors = int(knn_best['n_neighbors']),
    weights = w[knn_best['weights']],
    algorithm = algs[knn_best['algorithm']],
    p = p_knn[knn_best['p']])

start = time()
knn.fit(X_train, y_train)
end = time()
print('knn training time', end - start)

# LDA
lda = LDA(
    solver = 'svd',
    n_components = int(lda_best['n_components']),
    tol = lda_best['tol'])

start = time()
lda.fit(X_train, y_train)
end = time()
print('lda training time', end - start)


# LGBM
# The stored 'objective' index refers to a choice list that is not defined in
# this notebook: `objective` is the plain string 'binary', so
# objective[lgbm_best['objective']] would pass a single character ('r' for
# index 4) as the objective. The objective is therefore set explicitly.
# NOTE(review): LightGBM's scikit-learn wrapper is understood to switch to a
# multiclass objective whenever the target has more than two classes, which is
# why the original call trained without error - confirm against the installed
# LightGBM version.
lgbm = LGBMClassifier(learning_rate = lgbm_best['learning_rate'],
    n_estimators = int(lgbm_best['n_estimators']),
    num_leaves = num_leaves[lgbm_best['num_leaves']], # large num_leaves helps improve accuracy but might lead to over-fitting
    boosting_type = bt[lgbm_best['boosting_type']], # for better accuracy -> try dart
    objective = 'multiclass',
    max_bin = max_bin[lgbm_best['max_bin']], # large max_bin helps improve accuracy but might slow down training progress
    colsample_bytree = colsample_bytree[lgbm_best['colsample_bytree']],
    subsample = subsample[lgbm_best['subsample']],
    reg_alpha = reg_alpha[lgbm_best['reg_alpha']],
    reg_lambda = reg_lambda[lgbm_best['reg_lambda']],
    is_unbalance = is_unbalance[lgbm_best['is_unbalance']])
    #class_weight = class_weight[lgbm_best['class_weight']])

# LGBM is fitted on the data that still contains the NaN rows
start = time()
lgbm.fit(X_train_na, y_train_na)
end = time()
print('lgbm training time', end - start)


knn training time 0.7442500591278076
lda training time 0.765902042388916
lgbm training time 140.38012981414795


# Predict with the two neural nets

## CNN

In [27]:
# Load the saved CNN from the Models folder (path relative to the SWINGS data
# directory set earlier).
cnn = load_model('../../Models/SWINGS/cnn_small')
#cnn.load_weights('/content/gdrive/My Drive/Models/SWINGS/weights_categorical_crossentropy_cnn_HyperCateg41_9.hdf5')

# Final output of the prediction of the test set

In [None]:
# NOTE(review): this points at a 'cyto_classif' folder, unlike the SWINGS paths
# used in the rest of the notebook, and the later to_latex calls use absolute
# paths - confirm whether this change of directory is still needed.
os.chdir('/content/gdrive/My Drive/cyto_classif/data/XP_Listmodes')

In [32]:
#********************************
# Prediction of the models
#********************************

# KNN and LDA are scored on the NaN-free test set, LGBM on the full one
knn_preds, lda_preds = (model.predict(X_test) for model in (knn, lda))
lgbm_preds = lgbm.predict(X_test_na)

#ffnn_preds = ffnn.predict(X_test) 
#ffnn_preds = ffnn_preds.argmax(1)

# The CNN returns one score per class: keep the index of the highest one
cnn_preds = cnn.predict(X_test_Pulses).argmax(axis=1)

In [33]:
#********************************
# Accuracy computations
#********************************
# Per-class precision and recall of every model on the test set. The tables
# are built in one go from the per-model dicts (DataFrame.append is deprecated
# and no longer exists in pandas 2.x).

# KNN 
prec_knn, recall_knn = prec_rec_function(y_test, knn_preds, cluster_classes, 'knn')

# LDA
prec_lda, recall_lda = prec_rec_function(y_test, lda_preds, cluster_classes, 'lda')

# LGBM (evaluated on the test set that still contains the NaN rows)
prec_lgbm, recall_lgbm = prec_rec_function(y_test_na, lgbm_preds, cluster_classes, 'lgbm')

# CNN
prec_cnn, recall_cnn = prec_rec_function(y_test_Pulses.argmax(1), cnn_preds, cluster_classes, 'cnn')

# One row per algorithm, one column per class plus the 'algorithm' column
prec = pd.DataFrame([prec_knn, prec_lda, prec_lgbm, prec_cnn],
                    columns = cluster_classes + ['algorithm'])
recall = pd.DataFrame([recall_knn, recall_lda, recall_lgbm, recall_cnn],
                      columns = cluster_classes + ['algorithm'])

In [34]:
# Reshape both tables so that classes are rows and algorithms are columns
precision = prec.set_index('algorithm').transpose()
rec = recall.set_index('algorithm').transpose()

In [35]:
# Side-by-side precision / recall table, expressed in percent with 2 decimals
bench_res = (
    precision
    .join(rec, lsuffix=' precision', rsuffix= ' recall')
    .mul(100)
    .round(2)
)
bench_res

algorithm,knn precision,lda precision,lgbm precision,cnn precision,knn recall,lda recall,lgbm recall,cnn recall
MICRO,24.2,67.66,95.22,75.26,93.15,93.61,100.0,100.0
ORGNANO,10.74,31.68,86.18,96.3,45.38,80.67,89.08,65.55
ORGPICOPRO,67.93,48.54,99.58,99.24,49.04,90.78,99.3,99.16
REDNANO,62.02,83.02,75.56,85.04,82.82,92.58,99.05,96.08
REDPICOEUK,97.19,97.11,99.77,99.65,79.99,91.74,96.93,98.23
REDPICOPRO,12.04,34.13,98.24,94.53,53.75,65.7,95.88,95.8
inf1microm,87.01,97.11,99.63,99.59,75.32,83.6,99.79,99.38
sup1microm,53.55,98.88,93.65,92.02,77.75,61.04,98.1,97.26


In [36]:
# Per-class, per-algorithm F1: harmonic mean of precision and recall
F1 = (precision * rec).mul(2).div(precision + rec)

In [37]:
# Per-class F1 of the CNN as computed by scikit-learn; the values can be
# compared with the 'cnn' column of the F1 table.
from sklearn.metrics import f1_score
f1_score(y_test_Pulses.argmax(1), cnn_preds, average= None)

array([0.85882353, 0.78      , 0.99201116, 0.90221624, 0.98933069,
       0.95161185, 0.99482919, 0.94567642])

In [38]:
# Display the F1 table (classes as rows, algorithms as columns).
F1

algorithm,knn,lda,lgbm,cnn
MICRO,0.384181,0.785441,0.975501,0.858824
ORGNANO,0.173633,0.454976,0.876033,0.78
ORGPICOPRO,0.569588,0.632602,0.994415,0.992011
REDNANO,0.709286,0.875374,0.857217,0.902216
REDPICOEUK,0.877571,0.943477,0.98329,0.989331
REDPICOPRO,0.196788,0.449227,0.970469,0.951612
inf1microm,0.807465,0.898493,0.997086,0.994829
sup1microm,0.634187,0.754862,0.958212,0.945676


In [40]:
# Export the precision/recall table and the F1 table as LaTeX files in the
# SWINGS results folder on Drive (absolute paths, independent of the cwd).
bench_res.to_latex('/content/gdrive/My Drive/Results/SWINGS/precision_recall_SWINGS.tex')
F1.to_latex('/content/gdrive/My Drive/Results/SWINGS/F1_SWINGS.tex')