In [None]:
# Mount Google Drive inside the Colab VM; the data and model folders used by
# the os.chdir calls in this notebook live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.0 MB/s 
Collecting fsspec
  Downloading fsspec-2021.10.1-py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 66.0 MB/s 
[?25hCollecting thrift>=0.11.0
  Downloading thrift-0.15.0.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 6.0 MB/s 
[?25hCollecting cramjam>=2.3.0
  Downloading cramjam-2.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 40.1 MB/s 
Building wheels for collected packages: thrift
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.15.0-cp37-cp37m-linux_x86_64.whl size=348202 sha256=47782c25c5a16c5ded2d12236b548886e57a156666aee6dac32a64297cbb1680
  Stored in directory: /root/.cache/pip/wheels/ba/1f/8e/e6fd36837eecf3d1f2b23f1729477e8e06558d8d60b7093f51
Successfu

In [None]:
import re
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

# Work from the L2 data folder on the mounted Drive: the relative
# 'Listmodes/...' paths used by the loading cells resolve against it.
os.chdir('/content/gdrive/My Drive/data/SSLAMM/L2')

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight

from sklearn import svm
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Names of the cluster classes, kept in this exact order.
cluster_classes = [
    'MICRO',
    'ORGNANO',
    'ORGPICOPRO',
    'REDNANO',
    'REDPICOEUK',
    'REDPICOPRO',
    'inf1microm',
    'sup1microm',
]





In [None]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt.

    Builds the classifier named by params['classif'] from the remaining
    entries of params, fits it on the module-level X_train / y_train and
    scores it on the module-level X_valid / y_valid.

    params (dict): one sample of a hyperopt search space. Must contain the key
        'classif' ('knn', 'svm' or 'lgbm'); every other entry is forwarded as a
        keyword argument to the classifier constructor. The dict passed in is
        left unmodified.
    ---------------------------------------------------------------
    returns (dict): {'loss': value to minimise, 'status': STATUS_OK}. For
        'svm' the loss is minus the sample-weighted validation accuracy; for
        the other classifiers it is the validation log-loss.
    raises ValueError: if params['classif'] is not a supported classifier name.
    '''
    # Work on a shallow copy: popping 'classif' (and casting values below)
    # must not alter the dict owned by the caller.
    params = dict(params)
    classif = params.pop('classif')

    # NOTE(review): KNeighborsClassifier and LGBMClassifier are not imported in
    # the cells visible here; they must be in scope before the 'knn' / 'lgbm'
    # branches are used.
    if classif == 'knn':
        # hp.quniform samples floats, the constructor is given an integer
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
    else:
        # Fail with an explicit message instead of a NameError on `clf`
        raise ValueError("Unknown classifier: {!r}".format(classif))

    clf.fit(X_train, y_train)

    if classif == 'svm':
        # LinearSVC exposes no predict_proba, so the log-loss cannot be used:
        # minimise minus the class-weighted accuracy instead.
        pred_valid = clf.predict(X_valid)
        w = compute_sample_weight(params['class_weight'], y_valid)
        loss_value = -accuracy_score(y_valid, pred_valid, sample_weight=w)
    else:
        loss_value = log_loss(y_valid, clf.predict_proba(X_valid)) # No sample weight in KNN

    return {'loss': loss_value, 'status': STATUS_OK}

In [None]:
# Import Pulse data: each .npz archive is opened once and the arrays stored
# under its 'X' and 'y' keys are pulled out together.
train = np.load('Listmodes/train.npz')
X_train, y_train = train['X'], train['y']

valid = np.load('Listmodes/valid.npz')
X_valid, y_valid = valid['X'], valid['y']

test = np.load('Listmodes/test.npz')
X_test, y_test = test['X'], test['y']

In [None]:
# The X / y arrays have been extracted: drop the objects returned by np.load.
del train, valid, test

In [None]:
# Load nomenclature: read the CSV and relabel its columns as 'name' and 'id'
# (raises if the file does not have exactly two columns).
tn = pd.read_csv('Listmodes/train_test_nomenclature.csv').set_axis(['name', 'id'], axis=1)

In [None]:
#************************************
# Looking for the best hyperparams 
#************************************
algo = tpe.suggest  # search strategy handed to hyperopt's fmin
nb_evals = 30       # number of candidate configurations evaluated by fmin


# SVM
loss = ('hinge', 'squared_hinge')
C = (1, 1000, 1)  # (low, high, step) unpacked into hp.quniform below

# The class weights must be keyed by class label. y_train is turned from
# one-hot rows into class indices by a later cell (argmax over axis 1), so
# when it is still 2-D here np.unique would only see the one-hot values 0/1
# and the weights would be meaningless. Derive the labels locally, leaving
# y_train itself untouched.
train_labels = y_train.argmax(1) if y_train.ndim > 1 else y_train
class_names, nb_samples = np.unique(train_labels, return_counts=True)

# Candidate 1: inverse-frequency weights; candidate 2: same weight everywhere
reweighted = dict(zip(class_names, 1 / nb_samples))
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))
class_weight = (reweighted, equal_weights)


# 'classif' is consumed by clf_eval; the other entries are forwarded to
# svm.LinearSVC as keyword arguments.
svm_params = {
    'classif': 'svm',
    'penalty': 'l2',
    'loss': hp.choice('loss', loss),
    'C': hp.quniform('C', *C),
    'max_iter': 2000,
    'class_weight': hp.choice('class_weight', class_weight),
}


In [None]:
# Collapse the 2-D label matrices to class indices (argmax over axis 1).
# The ndim guard makes the cell idempotent: re-running it on labels that are
# already 1-D is a no-op instead of an axis error.
if y_train.ndim > 1:
    y_train = y_train.argmax(1)
if y_valid.ndim > 1:
    y_valid = y_valid.argmax(1)
if y_test.ndim > 1:
    y_test = y_test.argmax(1)

In [None]:
# Boolean masks flagging the observations that hold at least one NaN along axis 1
nan_train = np.isnan(X_train).any(axis=1)
nan_valid = np.isnan(X_valid).any(axis=1)
nan_test = np.isnan(X_test).any(axis=1)

# Keep only the NaN-free observations; X and y are filtered with the same
# mask so that they stay aligned.
X_train, y_train = X_train[~nan_train], y_train[~nan_train]
X_valid, y_valid = X_valid[~nan_valid], y_valid[~nan_valid]
X_test, y_test = X_test[~nan_test], y_test[~nan_test]

In [None]:
# Minimise the loss returned by clf_eval over the SVM search space,
# evaluating nb_evals candidate configurations with the chosen algo.
svm_best = fmin(clf_eval, svm_params, algo=algo, max_evals=nb_evals)

  0%|          | 0/30 [00:00<?, ?it/s, best loss: ?]




  3%|▎         | 1/30 [05:43<2:46:05, 343.64s/it, best loss: -0.4842113685004586]




  7%|▋         | 2/30 [11:31<2:41:24, 345.87s/it, best loss: -0.6112047707355265]




 10%|█         | 3/30 [17:23<2:37:01, 348.95s/it, best loss: -0.7109398886180697]




 13%|█▎        | 4/30 [23:14<2:31:34, 349.80s/it, best loss: -0.7109398886180697]




 17%|█▋        | 5/30 [28:52<2:23:55, 345.41s/it, best loss: -0.7109398886180697]




 20%|██        | 6/30 [34:27<2:16:41, 341.74s/it, best loss: -0.7109398886180697]




 23%|██▎       | 7/30 [39:49<2:08:33, 335.38s/it, best loss: -0.7109398886180697]




 27%|██▋       | 8/30 [45:18<2:02:15, 333.41s/it, best loss: -0.7555698108678818]




 30%|███       | 9/30 [50:47<1:56:13, 332.06s/it, best loss: -0.7555698108678818]




 33%|███▎      | 10/30 [56:12<1:49:56, 329.85s/it, best loss: -0.7555698108678818]




 37%|███▋      | 11/30 [1:01:32<1:43:31, 326.93s/it, best loss: -0.7555698108678818]




 40%|████      | 12/30 [1:06:51<1:37:21, 324.50s/it, best loss: -0.7555698108678818]




 43%|████▎     | 13/30 [1:12:10<1:31:27, 322.77s/it, best loss: -0.7555698108678818]




 47%|████▋     | 14/30 [1:17:38<1:26:28, 324.30s/it, best loss: -0.7555698108678818]




 50%|█████     | 15/30 [1:22:56<1:20:38, 322.55s/it, best loss: -0.7555698108678818]




 53%|█████▎    | 16/30 [1:28:17<1:15:07, 321.94s/it, best loss: -0.7555698108678818]




 57%|█████▋    | 17/30 [1:33:42<1:09:58, 322.95s/it, best loss: -0.7555698108678818]




 60%|██████    | 18/30 [1:38:57<1:04:04, 320.39s/it, best loss: -0.7555698108678818]




 63%|██████▎   | 19/30 [1:44:20<58:53, 321.21s/it, best loss: -0.8008790548182727]  




 67%|██████▋   | 20/30 [1:49:47<53:50, 323.05s/it, best loss: -0.8008790548182727]




 70%|███████   | 21/30 [1:55:18<48:48, 325.42s/it, best loss: -0.8008790548182727]




 73%|███████▎  | 22/30 [2:00:52<43:43, 327.96s/it, best loss: -0.8008790548182727]




 77%|███████▋  | 23/30 [2:06:15<38:05, 326.54s/it, best loss: -0.8008790548182727]




 80%|████████  | 24/30 [2:11:42<32:40, 326.76s/it, best loss: -0.8008790548182727]




 83%|████████▎ | 25/30 [2:16:57<26:55, 323.15s/it, best loss: -0.8008790548182727]




 87%|████████▋ | 26/30 [2:22:08<21:17, 319.39s/it, best loss: -0.8008790548182727]




 90%|█████████ | 27/30 [2:27:23<15:54, 318.09s/it, best loss: -0.8008790548182727]




 93%|█████████▎| 28/30 [2:32:41<10:36, 318.24s/it, best loss: -0.8008790548182727]




 97%|█████████▋| 29/30 [2:37:53<05:16, 316.38s/it, best loss: -0.8008790548182727]




100%|██████████| 30/30 [2:43:17<00:00, 326.60s/it, best loss: -0.8008790548182727]


In [None]:
# Switch to the models folder on the mounted Drive so that files opened with
# a relative path from here on are created there.
os.chdir('/content/gdrive/My Drive/Models/SSLAMM/')

In [None]:
# Store the best specification: serialise the dict returned by fmin, as is,
# into the current working directory.
with open('svm_best.pickle', mode='wb') as out_file:
    pickle.dump(svm_best, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Display the result of the search. For the hp.choice parameters
# ('class_weight', 'loss') the value shown is the index of the selected
# option, not the option itself.
svm_best

{'C': 731.0, 'class_weight': 0, 'loss': 0}