In [1]:
from google.colab import drive

# Mount Google Drive; the data and model folders used below live under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 3.9 MB/s 
[?25hCollecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 9.9 MB/s 
Collecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 3.8 MB/s 
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 10.7 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348195 sha256=a81fad2dcefc7ccef68c19d3742204ba04a9d18e38917285d162c2e036759a4b
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successful

In [3]:
import re
import os 
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

# Work from the L2 data folder so that the relative 'Listmodes/...' paths resolve.
DATA_DIR = '/content/gdrive/My Drive/data/SWINGS/L2'
os.chdir(DATA_DIR)

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.class_weight import compute_sample_weight

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Names of the cluster classes handled by the project.
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]
    



In [4]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt.

    Parameters
    ----------
    params : dict
        Sampled hyper-parameters. Must hold a 'classif' key equal to 'knn',
        'svm' or 'lgbm'; every other entry is forwarded to the constructor
        of the matching classifier.

    Returns
    -------
    dict
        {'loss': value, 'status': STATUS_OK}. For 'svm' the loss is minus the
        sample-weighted accuracy on the validation set; for the other
        classifiers it is the validation log-loss.

    Raises
    ------
    KeyError
        If 'classif' is missing from params.
    ValueError
        If 'classif' is not one of the supported names.

    Notes
    -----
    Reads X_train, y_train, X_valid and y_valid from the notebook globals.
    '''
    # Work on a copy so the caller's dict keeps its 'classif' entry.
    params = dict(params)
    classif = params.pop('classif')

    if classif == 'knn':
        # hp.quniform yields floats while the estimator expects an integer
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        # Imported here because the notebook's import cell does not provide `svm`.
        from sklearn import svm
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        # NOTE(review): LGBMClassifier is not imported in the visible cells;
        # it must be in scope before this branch is used.
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
    else:
        raise ValueError("Unknown classifier %r: expected 'knn', 'svm' or 'lgbm'" % (classif,))

    clf.fit(X_train, y_train)

    if classif == 'svm':
        # This branch scores hard predictions with weighted accuracy rather than log-loss.
        pred_valid = clf.predict(X_valid)
        # A missing class_weight falls back to None, i.e. uniform sample weights.
        w = compute_sample_weight(params.get('class_weight'), y_valid)
        ll = -accuracy_score(y_valid, pred_valid, sample_weight = w)
    else:
        # No sample weight in KNN.
        # labels ties the probability columns to the classes seen at fit time,
        # even when one of them is absent from y_valid.
        ll = log_loss(y_valid, clf.predict_proba(X_valid), labels = clf.classes_)

    return {'loss': ll, 'status': STATUS_OK}

In [5]:
# Import Listmode data: each archive exposes the inputs under 'X' and the targets under 'y'
train = np.load('Listmodes/train.npz')
X_train, y_train = train['X'], train['y']

valid = np.load('Listmodes/valid.npz')
X_valid, y_valid = valid['X'], valid['y']

test = np.load('Listmodes/test.npz')
X_test, y_test = test['X'], test['y']

In [6]:
# The arrays have been extracted; drop the archive objects themselves
del train, valid, test

In [7]:
# Load nomenclature (class name <-> class id table)
NOMENCLATURE_PATH = 'Listmodes/train_test_nomenclature.csv'
tn = pd.read_csv(NOMENCLATURE_PATH)
# Normalise the two column headers to the names used by the evaluation cells
tn.columns = ['name', 'id']

In [10]:
#************************************
# Looking for the best hyperparams
#************************************
algo = tpe.suggest
nb_evals = 30

# kNN: candidate values for each hyper-parameter
nn = (1, 50, 1)  # (low, high, step) for n_neighbors; the upper bound could be raised above 50
w = ('uniform', 'distance')
algs = ('ball_tree', 'kd_tree', 'brute')
p_knn = (1, 2)

# Search space handed to hyperopt; 'classif' selects the branch taken in clf_eval
knn_params = {
    'classif': 'knn',
    'n_neighbors': hp.quniform('n_neighbors', *nn),
    'weights': hp.choice('weights', w),
    'algorithm': hp.choice('algorithm', algs),
    'p': hp.choice('p', p_knn),
    'n_jobs': -1,
}


In [11]:
# Collapse the 2-D label arrays (presumably one-hot encoded - TODO confirm) into
# integer class ids by taking the position of the maximum along axis 1.
# The ndim guard makes the cell safe to re-run: calling argmax(1) on the
# already 1-D labels would raise an AxisError.
if y_train.ndim > 1:
    y_train = y_train.argmax(1)
if y_valid.ndim > 1:
    y_valid = y_valid.argmax(1)
if y_test.ndim > 1:
    y_test = y_test.argmax(1)

In [12]:
# Flag every observation holding at least one NaN feature
nan_train = np.isnan(X_train).any(axis=1)
nan_valid = np.isnan(X_valid).any(axis=1)
nan_test = np.isnan(X_test).any(axis=1)

# Keep the complete observations only, filtering inputs and labels with the same mask
X_train, y_train = X_train[~nan_train], y_train[~nan_train]
X_valid, y_valid = X_valid[~nan_valid], y_valid[~nan_valid]
X_test, y_test = X_test[~nan_test], y_test[~nan_test]

In [14]:
# Search the kNN space: clf_eval supplies the validation loss that fmin minimises
knn_best = fmin(clf_eval, knn_params, algo=algo, max_evals=nb_evals)

100%|██████████| 30/30 [1:34:57<00:00, 189.90s/it, best loss: 0.7490248262749302]


In [15]:
# Best point found by fmin. Entries drawn with hp.choice are reported as indices
# into their option tuples (w, algs, p_knn), not as the option values themselves.
knn_best

{'algorithm': 1, 'n_neighbors': 50.0, 'p': 1, 'weights': 1}

In [16]:
# Switch to the models folder so that the files written below land there.
MODELS_DIR = '/content/gdrive/My Drive/Models/SWINGS/'
os.chdir(MODELS_DIR)

In [17]:
# Store the best specification (the raw fmin result) in the current directory
with open('knn_best.pickle', 'wb') as out_file:
    pickle.dump(knn_best, out_file, pickle.HIGHEST_PROTOCOL)

In [18]:
# Rebuild the winning configuration. hp.choice results are indices, so they are
# mapped back to the actual option values through the w / algs / p_knn tuples.
knn = KNeighborsClassifier(
    n_neighbors = int(knn_best['n_neighbors']),  # hp.quniform returns a float
    weights = w[knn_best['weights']],
    algorithm = algs[knn_best['algorithm']],
    p = p_knn[knn_best['p']],
    n_jobs = -1,  # same parallelism as in the search space; predictions are unaffected
)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                     weights='distance')

In [19]:
# Class predictions of the refitted kNN on the test split
preds = knn.predict(X_test)

In [20]:
print("Evaluation of best performing model:")
# NOTE: despite the "accuracy" wording, every figure below comes from precision_score.
# Per-class scores are requested in the nomenclature order so that pairing them
# with tn['name'] cannot mislabel a class: a set has no guaranteed order, and a
# class absent from y_test would shift every following name.
class_ids = tn['id'].tolist()
present_ids = list(set(y_test))  # classes actually observed; used for the averages

class_accuracy = precision_score(y_test, preds, average = None, labels = class_ids)

print('Micro accuracy: ', precision_score(y_test, preds,
                                average = 'micro', labels = present_ids))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_test, preds,
                                average = 'macro', labels = present_ids))

print('\n')
# Show the whole confusion matrix without pandas truncating rows or columns
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(pd.DataFrame(confusion_matrix(y_test, preds,
                    labels = class_ids), index = tn['name'], columns = tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.7477899973698405
Classes accuracy:  {'MICRO': 0.24199288256227758, 'ORGNANO': 0.1073558648111332, 'ORGPICOPRO': 0.6793397137588901, 'REDNANO': 0.620214568040655, 'REDPICOEUK': 0.9718817486721532, 'inf1microm': 0.12044192401492669, 'sup1microm': 0.8700913778254985}
Macro accuracy:  0.5183479197636107


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  inf1microm  \
name                                                                      
MICRO         204        5           0        7           0           0   
ORGNANO        13       54           0       45           3           0   
ORGPICOPRO      0        0        7737        0         166        4630   
REDNANO       557      416           0     5492         136           0   
REDPICOEUK      2        5        2527     3257       66605        1224   
inf1microm      0        0          58        0           6        4099   
sup1microm     44        0         988   

In [21]:
# Same report as for the test split, this time on the validation split
preds = knn.predict(X_valid)
print("Evaluation of best performing model:")
# NOTE: despite the "accuracy" wording, every figure below comes from precision_score.
# Per-class scores are requested in the nomenclature order so that pairing them
# with tn['name'] cannot mislabel a class: a set has no guaranteed order, and a
# class absent from y_valid would shift every following name.
class_ids = tn['id'].tolist()
present_ids = list(set(y_valid))  # classes actually observed; used for the averages

class_accuracy = precision_score(y_valid, preds, average = None, labels = class_ids)

print('Micro accuracy: ', precision_score(y_valid, preds,
                                average = 'micro', labels = present_ids))
print('Classes accuracy: ', dict(zip(tn['name'], class_accuracy)))
print('Macro accuracy: ', precision_score(y_valid, preds,
                                average = 'macro', labels = present_ids))

print('\n')
# Show the whole confusion matrix without pandas truncating rows or columns
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(pd.DataFrame(confusion_matrix(y_valid, preds,
                    labels = class_ids), index = tn['name'], columns = tn['name']))

Evaluation of best performing model:
Micro accuracy:  0.7285773272554751
Classes accuracy:  {'MICRO': 0.2963325183374083, 'ORGNANO': 0.4778430325680726, 'ORGPICOPRO': 0.7703803837632834, 'REDNANO': 0.6820249919897469, 'REDPICOEUK': 0.9721985673495968, 'inf1microm': 0.03842079922965816, 'sup1microm': 0.8834571716574215}
Macro accuracy:  0.5939570763655279


name        MICRO  ORGNANO  ORGPICOPRO  REDNANO  REDPICOEUK  inf1microm  \
name                                                                      
MICRO         606       33           0      100           1           0   
ORGNANO       107      895           0      242          10           0   
ORGPICOPRO      0        0       20516        0         290       11109   
REDNANO      1205      893           0    10643         371           0   
REDPICOEUK      0       13        4101     4526       77632        1827   
inf1microm      0        0          23        0           1        2394   
sup1microm     84        0        1905   