# Imports

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from time import perf_counter
import sys; sys.path.append("..")
from scipy.stats import reciprocal
from modules.split import splitting_train_test
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

# Reading the processed dataset

In [2]:
dataset = pd.read_csv('../dataset/processed/SUEE1.csv', index_col=[0, 1])

In [3]:
dataset.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_requisitions,number_different_destinations,mean_frame_length,flag_2,flag_4,flag_16,flag_17,flag_18,flag_20,flag_24,flag_25,flag_82,flag_144,flag_152,flag_194,y
date,source_ip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1970-01-05 01:38:06+00:00,3232235521,0.607143,0.545455,62.142857,0.5,1.0,0.35,0.714286,0.5,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235529,0.142857,0.090909,55.0,0.25,0.0,0.1,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235531,0.214286,0.090909,54.666667,0.25,0.0,0.15,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235532,0.0,0.090909,66.8,0.0,0.0,0.2,0.142857,0.25,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235547,0.0,0.090909,67.333333,0.0,0.0,0.15,0.142857,0.25,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 229324 entries, (1970-01-05 01:38:06+00:00, 3232235521) to (1970-01-05 02:49:24+00:00, 3232235527)
Data columns (total 16 columns):
number_requisitions              229324 non-null float64
number_different_destinations    229324 non-null float64
mean_frame_length                229324 non-null float64
flag_2                           229324 non-null float64
flag_4                           229324 non-null float64
flag_16                          229324 non-null float64
flag_17                          229324 non-null float64
flag_18                          229324 non-null float64
flag_20                          229324 non-null float64
flag_24                          229324 non-null float64
flag_25                          229324 non-null float64
flag_82                          229324 non-null float64
flag_144                         229324 non-null float64
flag_152                         229324 non-null float64
flag_194            

In [6]:
dataset.y.value_counts(normalize=True)

0    0.976439
1    0.023561
Name: y, dtype: float64

# Splitting the whole dataset into a traing and testing dataset

In [26]:
X_train, X_test, y_train, y_test = splitting_train_test(dataset)

# Fine-Tuning SVM

In [None]:
# conjunto de parâmetros
parameters = {
    'C'      : reciprocal(1, 1000),
    'kernel' : ['rbf'],
    'gamma'  : ['scale'],
    'coef0'  : np.arange(0, 10, 0.001),
    'degree' : range(1, 10),
}

# criado 10 folds para classes desbalanceadas
cross_val = StratifiedKFold(n_splits=5)

# instanciando o classificador
svm_clf = SVC(class_weight='balanced', 
              decision_function_shape='ovo')

# implementando o fine tuning
ran_search = RandomizedSearchCV(svm_clf, param_distributions=parameters, 
                                n_iter=5, cv=cross_val, scoring='f1', 
                                verbose=3, n_jobs=4, iid=False, refit=True)

In [None]:
ran_search.fit(X_train, y_train)

In [None]:
ran_search.best_score_

In [None]:
ran_search.best_params_

# Fine-Tuning SGDClassifier


In [22]:
# conjunto de parâmetros
parameters = {
    'alpha'    : [1e-06],
    'l1_ratio' : np.arange(0.00, 1.01, 0.01),
}

# criado 10 folds para classes desbalanceadas
cross_val = StratifiedKFold(n_splits=5)

# the size of training set
n = X_train.shape[0]

# instanciando o classificador
sgd_clf = SGDClassifier(loss='hinge',
                        alpha=0.001,
                        class_weight='balanced', 
                        max_iter = np.ceil(10**6 / n),
                        shuffle = True
                       )
# implementando o fine tuning
grid_search_sgdc = GridSearchCV(sgd_clf, param_grid=parameters, cv=cross_val, 
                               scoring='f1', verbose=3, n_jobs=4, 
                               iid=False, refit=True)

In [23]:
grid_search_sgdc.fit(X_train, y_train)

Fitting 5 folds for each of 101 candidates, totalling 505 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   13.4s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   26.0s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:   46.6s
[Parallel(n_jobs=4)]: Done 505 out of 505 | elapsed:  1.2min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             error_score='raise-deprecating',
             estimator=SGDClassifier(alpha=0.001, average=False,
                                     class_weight='balanced',
                                     early_stopping=False, epsilon=0.1,
                                     eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=6.0,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2'...
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.9

In [24]:
grid_search_sgdc.best_score_

0.7851785973343857

In [25]:
grid_search_sgdc.best_params_

{'alpha': 1e-06, 'l1_ratio': 0.26}

In [11]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [27]:
y_pred = grid_search_sgdc.predict(X_test)

In [17]:
acc = balanced_accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred, labels=np.unique(y_pred))
rec = recall_score(y_test, y_pred, labels=np.unique(y_pred))
f1s = f1_score(y_test, y_pred, labels=np.unique(y_pred))

In [18]:
acc

0.5

In [19]:
pre

0.0

In [20]:
rec

0.0

In [21]:
f1s

0.0

## Saving the optimized classifier

In [34]:
joblib.dump(grid_search_sgdc.best_estimator_, '../classifiers/svm_lin_4.plk')

['../classifiers/svm_lin_4.plk']

# Foo 

In [5]:
sig = SVC(C=9.829998870668184, coef0=5.811, degree=3, gamma='scale', kernel='sigmoid')

In [6]:
sig.fit(X_train, y_train)

SVC(C=9.829998870668184, cache_size=200, class_weight=None, coef0=5.811,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [8]:
joblib.dump(sig, '../classifiers/svm_sig.plk')

['../classifiers/svm_sig.plk']

In [9]:
y_pred = sig.predict(X_test)