# Tuning the hyper-parameters of the SVC model

In [1]:
%matplotlib inline

# Reading files
import h5py
import toml

# Scientific computing
import numpy as np
import pandas as pd
from scipy import interp

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
#sns.set_context('paper')

# Machine Learning
## Model
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

from sklearn.preprocessing import normalize
## Splitter Classes
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
# Splitter Functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
# Hyper-parameter optimizers
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Model validation
from sklearn.model_selection import learning_curve
# Training metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Other
import os
import time
import requests

Intel(R) Data Analytics Acceleration Library (Intel(R) DAAL) solvers for sklearn enabled: https://intelpython.github.io/daal4py/sklearn.html


# NO MULTI GLITCH

## Preparation

Load the data and targets from `classification/ris/OUT-classified-merged.h5` into NumPy arrays.

- **Label `0` = NO GLITCH**

- **Label `1` = GLITCH**

In [2]:
# Build the sample matrix `data` and the label vector `target` from the
# GLITCH and NO_GLITCH groups of the merged classification file.
# Label convention: 1 = GLITCH, 0 = NO_GLITCH.
group_labels = (('GLITCH', 1.0), ('NO_GLITCH', 0.0))

data_parts = []
target_parts = []
with pd.HDFStore('../../classification/ris/OUT-classified-merged.h5', mode='r') as in_data:
    for group, label in group_labels:
        # Read each group from the store exactly once; the same array provides
        # both the samples and the number of labels to generate.
        samples = in_data[group].to_numpy()
        data_parts.append(samples)
        target_parts.append(np.full(len(samples), label))

# Rows follow the order of `group_labels`: GLITCH rows first, then NO_GLITCH rows.
data = np.concatenate(data_parts)
target = np.concatenate(target_parts)

## Tuning the hyper-parameters

### Kernel: `rbf`

Best parameters set found on development set: `{'C': 0.8, 'gamma': 0.0145}`

Score: `0.9786749394039134`

#### Randomized Search

### Kernel: `sigmoid`

Best parameters set found on development set: `{'C': 0.38000000000000006, 'coef0': -1.55, 'gamma': 0.0029500000000000012}`

Score: `0.7365370516420923`

#### Randomized Search

#### Grid Search

### Bagging Classifier

Best parameters set found on development set: `{'n_estimators': 4, 'max_samples': 0.97}`

### Sorted data

Best parameters set found on development set: `{'kernel': 'poly', 'gamma': 0.012, 'coef0': 2.4499999999997346, 'C': 7.3}`

Score: `0.9982725496041605`

# YES MULTI GLITCH

## Preparation

Append the `MULTI_GLITCH` samples from `classification/ris/OUT-classified-merged.h5` to the data and target NumPy arrays built in the previous section.

- **Label `0` = NO GLITCH**

- **Label `1` = GLITCH and MULTI GLITCH**

In [3]:
# Extend `data` and `target` with the MULTI_GLITCH group, labelled 1.
# NOTE: this cell grows `data` and `target` from their current values, so it
# must be run exactly once after the cell that first builds them; running it
# again appends the MULTI_GLITCH rows a second time.
with pd.HDFStore('../../classification/ris/OUT-classified-merged.h5', mode='r') as in_data:
    # Read the group from the store once and reuse it for samples and labels.
    multi_glitch = in_data['MULTI_GLITCH'].to_numpy()

data = np.concatenate((data, multi_glitch))
target = np.concatenate((target, np.ones(len(multi_glitch))))

## Tuning the hyper-parameters

### Kernel: `rbf`

Best parameters set found on development set: `{'gamma': 0.0151, 'C': 1.45}`

#### Randomized Search

#### Grid Search

### Sorted

In [5]:
# Sort the values along axis 1 (within each row of `data`) before fitting.
data_s = np.sort(data, axis=1)

# Grid
parameters = {
    'C': np.arange(0.01, 10, 0.01)
}

# Start search
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
# From 0.22 on, scores are plainly averaged across folds, which is what
# `iid=False` used to request.
GSCV = GridSearchCV(SVC(kernel='linear'), parameters, n_jobs=-1, cv=5)
GSCV.fit(data_s, target)

# Print best parameters
print('Best parameters set found on development set:', GSCV.best_params_)
print('Score:', GSCV.best_score_)

# Collect one line per grid point: mean test score, two standard deviations
# and the parameter set that produced them.
means = GSCV.cv_results_['mean_test_score']
stds = GSCV.cv_results_['std_test_score']
score_lines = [
    '%0.3f (+/-%0.03f) for %r' % (mean, std * 2, params)
    for mean, std, params in zip(means, stds, GSCV.cv_results_['params'])
]

# Markdown report for this run; it is appended so earlier runs are preserved.
report_lines = [
    '# ' + time.ctime(),
    '',
    '### GridSearchCV parameters:',
    '',
    '```python',
    # get_params must be *called*; without the parentheses only the repr of
    # the bound method is written instead of the parameter dictionary.
    str(GSCV.get_params()),
    '```',
    '',
    '### Best SVC parameters:',
    '',
    '```python',
    str(GSCV.best_estimator_),
    '```',
    '',
    '### Best parameters set found on development set:',
    '',
    '```python',
    str(GSCV.best_params_),
    '```',
    '',
    '### Grid scores on development set:',
    '',
    '```',
    *score_lines,
    '```',
    '',
    '',
]

# Print into a file the grid score
# Create the output directory first so a missing folder cannot discard the
# results of the search that has just finished.
os.makedirs('ris', exist_ok=True)
with open('ris/GridSearch_out.md', mode='a') as f:
    f.write('\n'.join(report_lines) + '\n')

# Send telegram message
#telegram_bot_id = toml.load('../telegram_bot_id.toml')
#params = {'chat_id': telegram_bot_id['chat_id'], 'text': '[python] SVC grid search terminated.'}
#requests.post('https://api.telegram.org/' + telegram_bot_id['bot_id'] + '/sendMessage', params=params)

Best parameters set found on development set: {'C': 0.060000000000000005}
Score: 0.9995
