In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install fastparquet
!sudo apt-get install libsnappy-dev
!python3 -m pip install  python-snappy
!python3 -m pip install pyarrow
! pip install hyperas

Collecting fastparquet
  Downloading fastparquet-0.7.1-cp37-cp37m-manylinux2010_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 18.0 MB/s eta 0:00:01[K     |▋                               | 20 kB 21.5 MB/s eta 0:00:01[K     |▉                               | 30 kB 23.9 MB/s eta 0:00:01[K     |█▏                              | 40 kB 17.2 MB/s eta 0:00:01[K     |█▍                              | 51 kB 9.0 MB/s eta 0:00:01[K     |█▊                              | 61 kB 9.1 MB/s eta 0:00:01[K     |██                              | 71 kB 8.3 MB/s eta 0:00:01[K     |██▎                             | 81 kB 9.2 MB/s eta 0:00:01[K     |██▌                             | 92 kB 8.9 MB/s eta 0:00:01[K     |██▉                             | 102 kB 8.1 MB/s eta 0:00:01[K     |███                             | 112 kB 8.1 MB/s eta 0:00:01[K     |███▍                            | 122 kB 8.1 MB/s eta 0:00:01[K     |███▋                            | 133

In [3]:
import re
import os 
import pickle
import numpy as np
import pandas as pd

from sklearn.metrics import precision_score, recall_score, accuracy_score
#from imblearn.under_sampling import EditedNearestNeighbours

os.chdir('/content/gdrive/My Drive/data/SWINGS/L2')

############################################################
# Training of other algorithms on the unbiased dataset
############################################################

import pickle
import fastparquet as fp
from imblearn.under_sampling import RandomUnderSampler

from tensorflow.keras.models import load_model, model_from_json

from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_sample_weight

from sklearn import svm
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

cluster_classes = ['MICRO', 'ORGNANO', 'ORGPICOPRO', 'REDNANO', 'REDPICOEUK',\
                   'REDPICOPRO', 'inf1microm','sup1microm']





In [4]:
def clf_eval(params):
    ''' Wrapper around classifiers for them to be fed into hyperopt '''
    classif = params['classif']
    del params['classif']
    
    if classif == 'knn':
        params['n_neighbors'] = int(params['n_neighbors'])
        clf = KNeighborsClassifier(**params)
    elif classif == 'svm':
        params['C'] = int(params['C'])
        clf = svm.LinearSVC(**params)
    elif classif == 'lgbm':
        params['n_estimators'] = int(params['n_estimators'])
        clf = LGBMClassifier(**params)
        
    clf.fit(X_train, y_train)

    if classif == 'svm':
        pred_valid = clf.predict(X_valid)
        w = compute_sample_weight(params['class_weight'], y_valid)
        accuracy = accuracy_score(y_valid, pred_valid, sample_weight = w)
        ll = -accuracy_score(y_valid, pred_valid, sample_weight = w)
    else:
        ll = log_loss(y_valid, clf.predict_proba(X_valid)) # No sample weight in KNN
    
    return {'loss': ll, 'status': STATUS_OK}

In [5]:
# Import Pulse data 
train = np.load('Listmodes/train.npz')
X_train = train['X']
y_train = train['y']

valid = np.load('Listmodes/valid.npz')
X_valid = valid['X']
y_valid = valid['y']

test = np.load('Listmodes/test.npz')
X_test = test['X']
y_test = test['y']

In [6]:
del(train)
del(valid)
del(test)

In [7]:
# Load nomenclature
tn = pd.read_csv('Listmodes/train_test_nomenclature.csv')
tn.columns = ['name', 'id']

In [8]:
#************************************
# Looking for the best hyperparams 
#************************************
#from sklearn.model_selection import GridSearchCV
algo=tpe.suggest
nb_evals = 30


# SVM
loss = ('hinge', 'squared_hinge')
C = (1, 1000, 1)
class_names, nb_samples  = np.unique(y_train, return_counts = True)
reweighted = dict(zip(class_names, 1/ nb_samples))
equal_weights = dict(zip(class_names, np.full(len(class_names), 1 / len(class_names))))
class_weight = (reweighted, equal_weights)


svm_params = {'classif': 'svm',\
            'penalty': 'l2',\
            'loss': hp.choice('loss', loss),\
            'C': hp.quniform('C', *C),\
            'max_iter': 2000,\
            'class_weight': hp.choice('class_weight', class_weight)}


In [9]:
y_train = y_train.argmax(1)
y_valid = y_valid.argmax(1)
y_test = y_test.argmax(1)

In [10]:
# Fetch the NaN indices
nan_train = np.isnan(X_train).any(1)
nan_valid = np.isnan(X_valid).any(1)
nan_test = np.isnan(X_test).any(1)

# Delete NaNs observations
X_train = X_train[~nan_train]
y_train = y_train[~nan_train]

X_valid = X_valid[~nan_valid]
y_valid = y_valid[~nan_valid]

X_test = X_test[~nan_test]
y_test = y_test[~nan_test]

In [11]:
svm_best = fmin(
    fn=clf_eval, 
    space=svm_params,
    algo=algo,
    max_evals = nb_evals)

  0%|          | 0/30 [00:00<?, ?it/s, best loss: ?]




  3%|▎         | 1/30 [05:53<2:51:05, 353.99s/it, best loss: -0.4376161890555284]




  7%|▋         | 2/30 [12:10<2:51:18, 367.09s/it, best loss: -0.7472557413389962]




 10%|█         | 3/30 [18:03<2:42:22, 360.83s/it, best loss: -0.7472557413389962]




 13%|█▎        | 4/30 [23:55<2:34:50, 357.32s/it, best loss: -0.7472557413389962]




 17%|█▋        | 5/30 [29:49<2:28:20, 356.02s/it, best loss: -0.7472557413389962]




 20%|██        | 6/30 [36:05<2:25:06, 362.79s/it, best loss: -0.7472557413389962]




 23%|██▎       | 7/30 [42:21<2:20:46, 367.22s/it, best loss: -0.7472557413389962]




 27%|██▋       | 8/30 [48:15<2:13:07, 363.06s/it, best loss: -0.7472557413389962]




 30%|███       | 9/30 [54:16<2:06:51, 362.44s/it, best loss: -0.7472557413389962]




 33%|███▎      | 10/30 [1:00:20<2:00:55, 362.79s/it, best loss: -0.7472557413389962]




 37%|███▋      | 11/30 [1:06:18<1:54:23, 361.24s/it, best loss: -0.7472557413389962]




 40%|████      | 12/30 [1:12:33<1:49:38, 365.48s/it, best loss: -0.7472557413389962]




 43%|████▎     | 13/30 [1:18:51<1:44:37, 369.24s/it, best loss: -0.7472557413389962]




 47%|████▋     | 14/30 [1:24:45<1:37:13, 364.60s/it, best loss: -0.7472557413389962]




 50%|█████     | 15/30 [1:31:00<1:31:56, 367.75s/it, best loss: -0.7472557413389962]




 53%|█████▎    | 16/30 [1:37:07<1:25:46, 367.64s/it, best loss: -0.7472557413389962]




 57%|█████▋    | 17/30 [1:43:18<1:19:50, 368.51s/it, best loss: -0.7472557413389962]




 60%|██████    | 18/30 [1:49:09<1:12:41, 363.48s/it, best loss: -0.7472557413389962]




 63%|██████▎   | 19/30 [1:55:01<1:05:57, 359.80s/it, best loss: -0.7472557413389962]




 67%|██████▋   | 20/30 [2:00:49<59:23, 356.35s/it, best loss: -0.7472557413389962]  




 70%|███████   | 21/30 [2:06:59<54:04, 360.48s/it, best loss: -0.7472557413389962]




 73%|███████▎  | 22/30 [2:13:10<48:29, 363.69s/it, best loss: -0.7472557413389962]




 77%|███████▋  | 23/30 [2:19:28<42:55, 367.91s/it, best loss: -0.7472557413389962]




 80%|████████  | 24/30 [2:25:47<37:07, 371.25s/it, best loss: -0.7803637687368028]




 83%|████████▎ | 25/30 [2:32:02<31:02, 372.49s/it, best loss: -0.8087326796569297]




 87%|████████▋ | 26/30 [2:38:19<24:54, 373.70s/it, best loss: -0.8087326796569297]




 90%|█████████ | 27/30 [2:44:34<18:42, 374.05s/it, best loss: -0.8087326796569297]




 93%|█████████▎| 28/30 [2:50:43<12:25, 372.54s/it, best loss: -0.8087326796569297]




 97%|█████████▋| 29/30 [2:56:55<06:12, 372.32s/it, best loss: -0.8087326796569297]




100%|██████████| 30/30 [3:03:06<00:00, 366.22s/it, best loss: -0.8244091585778087]


In [12]:
os.chdir('/content/gdrive/My Drive/Models/SWINGS/')

In [13]:
# Store the best specification
with open('svm_best.pickle', 'wb') as handle:
    pickle.dump(svm_best, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
svm_best

{'C': 3, 'class_weight': 1, 'loss': 1}