# REAL LIFE TEST

Test the algorithm in a real-life application: scan an entire OD.

**N.B.**: the first step is data cleaning; here, the data has already been cleaned with the code in the `cleaning` folder.

In [1]:
%matplotlib inline

# Scientific computing
import numpy as np
import pandas as pd

# Plot
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
#sns.set_context('paper')

# Machine Learning
# Model
from sklearn.svm import SVC
# Ensemble model
from sklearn.ensemble import BaggingClassifier

# Other
import time
#import requests
#import threading

## Load data
### Train data

In [2]:
# Build the training set from the classified HDF5 store: every group holds the
# samples of one class, one sample per row.
#
# (group name, label) pairs, listed in the order they are stacked. Both glitch
# groups are labelled 1, the glitch-free group is labelled 0.
labelled_groups = (('GLITCH', 1.0), ('NO_GLITCH', 0.0), ('MULTI_GLITCH', 1.0))

features, labels = [], []
with pd.HDFStore('../../classification/ris/OUT-classified-merged.h5', mode='r') as in_data:
    for group, label in labelled_groups:
        # Read each group from disk exactly once and reuse it for the labels.
        samples = in_data[group].to_numpy()
        features.append(samples)
        labels.append(np.full(len(samples), label))

# One concatenation at the end instead of growing the arrays group by group.
train_data = np.concatenate(features)
train_target = np.concatenate(labels)

### Test data

In [3]:
# Cleaned data store to scan.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
filename = 'C:/Users/paolo/OUT-cleaned.h5'

# The frame to scan is stored under the key '<OD>/<detector>'.
OD, detector = '097', '143-5'
curr_df = '/'.join((OD, detector))

with pd.HDFStore(filename, mode='r') as in_data:
    data_df = in_data[curr_df]

# Number of 100-row windows, counting a trailing partial window if there is one.
n_sequences = len(data_df.index[::100])

## Normal model

### Train
Train the model on the classified data.

In [4]:
# Hyper-parameters of the SVM trained on the unsorted samples.
best_kernel, best_gamma, best_C = 'rbf', 0.0151, 1.45

# probability=True is required for the predict_proba calls made in the test cell.
clf = SVC(C=best_C, gamma=best_gamma, kernel=best_kernel, probability=True)
clf.fit(train_data, train_target)

SVC(C=1.45, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0151, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

### Test

In [5]:
# Scan the whole frame in windows of 100 rows and accumulate the time spent in
# predict_proba only (slicing and transposing are excluded from the total).
#
# Floor division counts complete windows only: a trailing window with fewer
# than 100 rows is skipped, but the last window is kept when the number of
# rows is an exact multiple of 100.
n_windows = len(data_df) // 100

t = 0
for i in range(n_windows):
    # Transpose so that every column of the window becomes one sample (row).
    d = data_df.iloc[i*100 : i*100 + 100].to_numpy().transpose()
    # perf_counter is the monotonic, high-resolution clock meant for timing.
    t_b = time.perf_counter()
    clf.predict_proba(d)
    t_e = time.perf_counter()
    t += (t_e - t_b)
    # Fixed-width progress so a shorter string never leaves stale characters.
    print(f'{100 * i / n_windows:6.2f}%', end='\r')
print('Time [s]:', t)

Time [s]: 44.40004801750183


## Sorted model

In [6]:
# Sorted copy of the training set: the values inside each row are put in
# ascending order, while train_data itself is left untouched.
train_data_s = train_data.copy()
train_data_s.sort(axis=1)

### Train
Train the model on the sorted classified data.

In [7]:
# Hyper-parameters of the SVM trained on the row-sorted samples.
best_kernel, best_C = 'linear', 0.15

# probability=True is required for the predict_proba calls made in the test cell.
clf = SVC(C=best_C, kernel=best_kernel, probability=True)
clf.fit(train_data_s, train_target)

SVC(C=0.15, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

### Test

In [8]:
# Scan the whole frame in windows of 100 rows and accumulate the time spent in
# predict_proba only. The per-window sort is done before the clock starts, so
# its cost is NOT part of the reported total.
#
# Floor division counts complete windows only: a trailing window with fewer
# than 100 rows is skipped, but the last window is kept when the number of
# rows is an exact multiple of 100.
n_windows = len(data_df) // 100

t = 0
for i in range(n_windows):
    # Transpose so that every column of the window becomes one sample (row),
    # then sort the values of each sample as was done for the training data.
    d = np.sort(data_df.iloc[i*100 : i*100 + 100].to_numpy().transpose(), axis=1)
    # perf_counter is the monotonic, high-resolution clock meant for timing.
    t_b = time.perf_counter()
    clf.predict_proba(d)
    t_e = time.perf_counter()
    t += (t_e - t_b)
    # Fixed-width progress so a shorter string never leaves stale characters.
    print(f'{100 * i / n_windows:6.2f}%', end='\r')
print('Time [s]:', t)

Time [s]: 15.12482237815857
