In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.0 MB/s 
[?25hCollecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 6.4 MB/s 
Collecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 60.1 MB/s 
Collecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 40.7 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348186 sha256=5e9419d31b72b131a2749f8d32ba7a5f16f280da7839a5b559bdad8462afcb68
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successfully bu

In [3]:
import re
import os 
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

os.chdir('/content/gdrive/My Drive/data/SSLAMM/L2')

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight

from lightgbm import LGBMClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

cluster_classes = ['MICRO', 'ORGNANO', 'ORGPICOPRO', 'REDNANO', 'REDPICOEUK',\
                   'REDPICOPRO', 'inf1microm','sup1microm']



In [4]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt '''
    classif = params['classif']
    del params['classif']
    
    if classif == 'knn':
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
        
    clf.fit(X_train, y_train)

    if classif == 'svm':
        pred_valid = clf.predict(X_valid)
        w = compute_sample_weight(params['class_weight'], y_valid)
        accuracy = accuracy_score(y_valid, pred_valid, sample_weight = w)
        ll = -accuracy_score(y_valid, pred_valid, sample_weight = w)
    else:
        ll = log_loss(y_valid, clf.predict_proba(X_valid)) # No sample weight in KNN
    
    return {'loss': ll, 'status': STATUS_OK}

In [5]:
# Import Pulse data 
train = np.load('Listmodes/train.npz', allow_pickle = True)
X_train = train['X']
y_train = train['y']

valid = np.load('Listmodes/valid.npz', allow_pickle = True)
X_valid = valid['X']
y_valid = valid['y']

In [None]:
y_valid.shape

(50682, 8)

In [6]:
# Load nomenclature
tn = pd.read_csv('Listmodes/train_test_nomenclature.csv')
tn.columns = ['name', 'id']

In [7]:
y_train = y_train.argmax(1)
y_valid = y_valid.argmax(1)

In [8]:
#************************************
# Looking for the best hyperparams 
#************************************
algo=tpe.suggest
nb_evals = 30

class_names, nb_samples  = np.unique(y_train, return_counts = True)
reweighted = dict(zip(class_names, 1/ nb_samples))
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))
class_weight = (reweighted, equal_weights)

# Lgbm
lr = (1e-3, 1e-2)
n_est = (10, 1200, 1) # Peut mettre encore plus que 1000 ?
num_leaves = (6,8,12,16)
bt = ('gbdt', 'dart')
objective = ('binary')
max_bin = (255, 510)
colsample_bytree = (0.64, 0.65, 0.66)
subsample = (0.7,0.75)
reg_alpha = (1,1.2)
reg_lambda = (1,1.2,1.4)
is_unbalance = (True, False)
class_weight = (reweighted, equal_weights)


lgbm_params = {
    'classif': 'lgbm',
    'eval_metric': 'logloss', 
    'learning_rate': hp.uniform('learning_rate', *lr),
    'n_estimators': hp.quniform('n_estimators', *n_est),
    'num_leaves': hp.choice('num_leaves', num_leaves), # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type': hp.choice('boosting_type', bt), # for better accuracy -> try dart
    'objective': hp.choice('objective', objective),
    'max_bin': hp.choice('max_bin', max_bin), # large max_bin helps improve accuracy but might slow down training progress
    'colsample_bytree': hp.choice('colsample_bytree', colsample_bytree),
    'subsample': hp.choice('subsample', subsample),
    'reg_alpha': hp.choice('reg_alpha', reg_alpha),
    'reg_lambda':  hp.choice('reg_lambda', reg_lambda),
    'is_unbalance': hp.choice('is_unbalance', is_unbalance),
    'class_weight': hp.choice('class_weight', class_weight)
    }


In [9]:
lgbm_best = fmin(
    fn=clf_eval, 
    space=lgbm_params,
    algo=algo,
    max_evals = nb_evals)

100%|██████████| 30/30 [23:08<00:00, 46.27s/it, best loss: 0.029893922713944657]


In [10]:
os.chdir('/content/gdrive/My Drive/Models/SSLAMM/')

In [11]:
# Store the best specification
with open('lgbm_best.pickle2', 'wb') as handle:
    pickle.dump(lgbm_best, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
lgbm_best

{'boosting_type': 0,
 'class_weight': 1,
 'colsample_bytree': 2,
 'is_unbalance': 0,
 'learning_rate': 0.007732539689192715,
 'max_bin': 1,
 'n_estimators': 1176.0,
 'num_leaves': 1,
 'objective': 4,
 'reg_alpha': 0,
 'reg_lambda': 2,
 'subsample': 0}

In [None]:
# LGBM
lgbm = LGBMClassifier(eval_metric = 'logloss',
    learning_rate = lgbm_best['learning_rate'],
    n_estimators = int(lgbm_best['n_estimators']),
    num_leaves = num_leaves[lgbm_best['num_leaves']], # large num_leaves helps improve accuracy but might lead to over-fitting
    boosting_type = bt[lgbm_best['boosting_type']], # for better accuracy -> try dart
    objective = objective[lgbm_best['objective']],
    max_bin = max_bin[lgbm_best['max_bin']], # large max_bin helps improve accuracy but might slow down training progress
    colsample_bytree = colsample_bytree[lgbm_best['colsample_bytree']],
    subsample = subsample[lgbm_best['subsample']],
    reg_alpha = reg_alpha[lgbm_best['reg_alpha']],
    reg_lambda = reg_lambda[lgbm_best['reg_lambda']],
    is_unbalance = is_unbalance[lgbm_best['is_unbalance']],
    class_weight = class_weight[lgbm_best['class_weight']])

lgbm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt',
               class_weight={0: 0.125, 1: 0.125, 2: 0.125, 3: 0.125, 4: 0.125,
                             5: 0.125, 6: 0.125, 7: 0.125},
               colsample_bytree=0.65, eval_metric='logloss',
               importance_type='split', is_unbalance=False,
               learning_rate=0.009897325697918865, max_bin=510, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=1175, n_jobs=-1, num_leaves=12, objective='n',
               random_state=None, reg_alpha=1.2, reg_lambda=1, silent=True,
               subsample=0.75, subsample_for_bin=200000, subsample_freq=0)

In [None]:
test = np.load('/content/gdrive/My Drive/data/SSLAMM/L2/Listmodes/test.npz')
X_test = test['X']
y_test = test['y']
y_test = y_test.argmax(1)

In [None]:
preds = lgbm.predict(X_test)

In [None]:
print("Evaluation of best performing model:")
class_accuracy = precision_score(y_test, preds, average = None, labels = list(set(y_test)))

print('Micro accuracy: ', precision_score(y_test, preds,\
                                average = 'micro', labels = list(set(y_test))))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_test, preds,\
                                average = 'macro', labels = list(set(y_test))))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None) 
print(pd.DataFrame(confusion_matrix(y_test, preds,\
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.9925621496057716
Classes accuracy:  {'MICRO': 0.9692307692307692, 'ORGNANO': 0.9067357512953368, 'ORGPICOPRO': 0.9993061922343088, 'REDNANO': 0.9869753979739508, 'REDPICOEUK': 0.9901267315060418, 'REDPICOPRO': 0.7360655737704918, 'inf1microm': 0.9996891733423756, 'sup1microm': 0.9712837837837838}
Macro accuracy:  0.9449266716421323


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  REDPICOPRO  \
name                                                                      
MICRO         441        0           0        1           0           0   
ORGNANO         0      175           0        0           0           0   
ORGPICOPRO      0        0       40329        0           0         103   
REDNANO         1        1           0     2046           9           0   
REDPICOEUK      0        0           1        9        6719           0   
REDPICOPRO      0        0           2        0           0        1347   
inf1micr

# Valid

In [None]:
preds = lgbm.predict(X_valid)
print("Evaluation of best performing model:")
class_accuracy = precision_score(y_valid, preds, average = None, labels = list(set(y_valid)))

print('Micro accuracy: ', precision_score(y_valid, preds,\
                                average = 'micro', labels = list(set(y_valid))))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_valid, preds,\
                                average = 'macro', labels = list(set(y_valid))))

print('\n')
pd.set_option("display.max_rows", None, "display.max_columns", None) 
print(pd.DataFrame(confusion_matrix(y_valid, preds,\
                    labels = tn['id']), index = tn['name'], columns =  tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.9920484590189811
Classes accuracy:  {'MICRO': 0.9849624060150376, 'ORGNANO': 0.8932584269662921, 'ORGPICOPRO': 0.9982038616973506, 'REDNANO': 0.9822784810126582, 'REDPICOEUK': 0.9921587846116148, 'REDPICOPRO': 0.905982905982906, 'inf1microm': 0.9996378386208895, 'sup1microm': 0.9816943380161771}
Macro accuracy:  0.9672721303653657


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  REDPICOPRO  \
name                                                                      
MICRO         131        0           0        1           0           0   
ORGNANO         0      159           0        1           0           0   
ORGPICOPRO      0        0       17784        0           0          16   
REDNANO         0        4           0     2328           5           0   
REDPICOEUK      0        0           2       26        8098           0   
REDPICOPRO      0        0           1        0           0        1060   
inf1micro