In [None]:
import sys
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import roc_curve, auc, fbeta_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import xgboost as xgb
from itertools import product
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import time
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

sys.path.insert(0, '../../scripts/modeling_toolbox/')
# load the autoreload extension
%load_ext autoreload
# Set extension to reload modules every time before executing code
%autoreload 2

from metric_processor import MetricProcessor
import evaluation

%matplotlib inline

In [None]:
# Metrics used as model inputs in this notebook (three are active).
features = [
    'dimension',
    'temporal_dct-mean',
    'temporal_gaussian_mse-mean',
]

# Candidate metrics that are currently switched off; move a name into
# `features` above to enable it:
#   'size', 'fps',
#   'temporal_difference-euclidean', 'temporal_difference-manhattan',
#   'temporal_difference-max', 'temporal_difference-mean',
#   'temporal_difference-std',
#   'temporal_dct-euclidean', 'temporal_dct-manhattan',
#   'temporal_dct-max', 'temporal_dct-std',
#   'temporal_gaussian_mse-euclidean', 'temporal_gaussian_mse-manhattan',
#   'temporal_gaussian_mse-max', 'temporal_gaussian_mse-std',
#   'temporal_gaussian_difference-max', 'temporal_gaussian_difference-mean',
#   'temporal_gaussian_difference-std',
#   'temporal_threshold_gaussian_difference-max',
#   'temporal_threshold_gaussian_difference-mean',
#   'temporal_threshold_gaussian_difference-std',
#   'temporal_histogram_distance-euclidean',
#   'temporal_histogram_distance-manhattan',
#   'temporal_histogram_distance-max', 'temporal_histogram_distance-mean',
#   'temporal_histogram_distance-std'

# CSV consumed by the processor, relative to this notebook.
path = '../../machine_learning/cloud_functions/data-large.csv'

# 'SL' is the processor mode (presumably "supervised learning" -- TODO confirm
# against MetricProcessor); scaling is switched on.
metric_processor = MetricProcessor(features, 'SL', path, scale=True)
df = metric_processor.read_and_process_data()

# Display (rows, columns) of the processed frame.
df.shape

In [None]:
# The splitter hands back three (features, labels) pairs; unpack them in the
# order it returns them.
sl_splits = metric_processor.split_test_and_train(df)
(x_test_all, y_test_all), (x_train, y_train), (x_test, y_test) = sl_splits

In [None]:
# Small random-forest baseline (5 trees, 7 worker processes).
# random_state is pinned so that the fitted forest -- and therefore the
# scores and feature importances computed from it further down -- are the
# same on every Restart & Run All.
classifier = RandomForestClassifier(n_estimators=5, n_jobs=7, random_state=42)

classifier.fit(x_train, y_train)

In [None]:
# Score `classifier` on the test split. The helper returns four values that
# are unpacked as fb, area, tnr, tpr (presumably F-beta, ROC area, true
# negative rate and true positive rate -- TODO confirm in evaluation module).
rf_scores = evaluation.supervised_evaluation(classifier, x_test, y_test)
fb, area, tnr, tpr = rf_scores
print(fb, area, tnr, tpr)

In [None]:
# Importance of each input column according to the fitted `classifier`,
# and the column positions ordered from most to least important.
importances = classifier.feature_importances_
indices = importances.argsort()[::-1]

print('Feature ranking:')
# One line per column of x_test: rank, feature name, importance value.
for rank in range(1, x_test.shape[1] + 1):
    feature_pos = indices[rank - 1]
    print('{}º: {} -- ({})'.format(rank, features[feature_pos], importances[feature_pos]))

# Visualizing the Trees

In [None]:
from sklearn.tree import export_graphviz

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz


# Shallow (depth-2) tree so the exported diagram stays readable.
# random_state is pinned: scikit-learn breaks ties between equally good
# splits at random, so without it the learned split values can differ between
# runs.
clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(x_train, y_train)

In [None]:
# Evaluate the depth-2 decision tree (`clf`) on the test split.
# The original cell passed `classifier` (the random forest) here, which only
# repeated the earlier forest evaluation and never scored the tree itself.
fb, area, tnr, tpr = evaluation.supervised_evaluation(clf, x_test, y_test)
print(fb, tnr, tpr)

In [None]:
# Write the fitted tree `clf` to tree.dot in Graphviz format, labelling
# nodes with the names from `features`.
graphviz_options = dict(
    out_file='tree.dot',
    feature_names=features,
    class_names=True,
    rounded=True,
    proportion=False,
    precision=6,
    filled=True,
)
export_graphviz(clf, **graphviz_options)

In [None]:
from subprocess import check_call

# Render tree.dot to tree.png with the Graphviz `dot` executable at 100 dpi.
# check_call raises CalledProcessError when `dot` exits with a non-zero
# status, so a failed render stops the notebook instead of passing unnoticed;
# on success it returns 0, exactly like `call` did.
check_call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=100'])

# Understanding the Classification

It looks like a very simple threshold on a single feature (`temporal_gaussian_mse-mean`) is enough to achieve outstanding results. Let's check it out:

In [None]:
# Cut-off applied to 'temporal_gaussian_mse-mean' in the next cell.
# NOTE(review): presumably copied by hand from a split of the exported
# decision tree (six decimals, matching precision=6) -- confirm, and update
# it whenever the tree is retrained.
threshold = 0.062695

In [None]:
# Predict positive whenever the mean temporal Gaussian MSE is at or below
# the hand-picked threshold.
mse_mean = x_test['temporal_gaussian_mse-mean']
y_pred = mse_mean <= threshold

In [None]:
from sklearn.metrics import confusion_matrix

# 2x2 confusion matrix of the threshold rule: row 0 holds (tn, fp),
# row 1 holds (fn, tp).
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# True positive rate and true negative rate of the threshold rule.
positives = tp + fn
negatives = tn + fp
TPR = tp / positives
TNR = tn / negatives
print('TPR: {}. TNR: {}'.format(TPR, TNR))

# Training with only one attack

In [None]:
# Re-read the data in 'UL' mode (presumably "unsupervised learning" -- TODO
# confirm). NOTE: this rebinds `metric_processor` and `df` from the earlier
# cells.
metric_processor = MetricProcessor(features, 'UL', path, scale=True)
df = metric_processor.read_and_process_data()

# In this mode the splitter returns two triples: arrays first, then the
# matching data frames.
ul_arrays, ul_frames = metric_processor.split_test_and_train(df)
X_train, X_test, X_attacks = ul_arrays
df_train, df_test, df_attacks = ul_frames

In [None]:
# Back to 'SL' mode to build reference splits; the trailing underscore keeps
# them apart from the x_train / x_test names used elsewhere in the notebook.
metric_processor = MetricProcessor(features, 'SL', path, scale=True)
df = metric_processor.read_and_process_data()

sl_reference_splits = metric_processor.split_test_and_train(df)
(x_test_all_, y_test_all_), (x_train_, y_train_), (x_test_, y_test_) = sl_reference_splits

In [None]:
# Keep only the attack rows whose 'attack' label contains this substring.
attack_type = 'low_bitrate_8'
df_attacks_sel = df_attacks[df_attacks['attack'].str.contains(attack_type)]

# Random ~80/20 train/test partition of the selected attack rows.
# A dedicated, seeded generator makes the partition -- and every score
# computed from it in the following cells -- reproducible across runs.
split_rng = np.random.RandomState(42)
mask = split_rng.rand(len(df_attacks_sel)) < 0.8
df_attacks_sel_train = df_attacks_sel[mask]
df_attacks_sel_test = df_attacks_sel[~mask]

# Convert to arrays and keep only the first X_train.shape[1] columns so the
# width matches X_train (assumes the feature columns come first in
# df_attacks -- TODO confirm).
attacks_sel_train = np.array(df_attacks_sel_train)[:, :X_train.shape[1]]
attacks_sel_test = np.array(df_attacks_sel_test)[:, :X_train.shape[1]]

In [None]:
# Build the single-attack data sets. Label convention in this cell:
# 1.0 for rows taken from X_train / X_test, 0.0 for the selected attack rows.
# NOTE: x_train, y_train, x_test and y_test are rebound here and no longer
# refer to the splits created near the top of the notebook.
train_parts = [X_train, attacks_sel_train]
train_labels = [np.ones(len(X_train)), np.zeros(len(attacks_sel_train))]
x_train = np.concatenate(train_parts)
y_train = np.concatenate(train_labels)

test_parts = [X_test, attacks_sel_test]
test_labels = [np.ones(len(X_test)), np.zeros(len(attacks_sel_test))]
x_test = np.concatenate(test_parts)
y_test = np.concatenate(test_labels)

# Alternative models tried here:
# clf = DecisionTreeClassifier(max_depth=5)
# clf = RandomForestClassifier(n_estimators=50, n_jobs=7)
clf = xgb.XGBClassifier(max_depth=8, n_jobs=-1, n_estimators=200)
clf.fit(x_train, y_train)

In [None]:
# Score `clf` on the data it was fitted on.
train_scores = evaluation.supervised_evaluation(clf, x_train, y_train)
fb, area, tnr, tpr = train_scores
print(fb, tnr, tpr)

In [None]:
# Score `clf` on the held-out rows of the single-attack data set.
single_attack_scores = evaluation.supervised_evaluation(clf, x_test, y_test)
fb, area, tnr, tpr = single_attack_scores
print(fb, tnr, tpr)

In [None]:
# Score `clf` on the 'SL' test split, converted to plain arrays first.
sl_test_inputs = np.array(x_test_)
sl_test_labels = np.array(y_test_)
fb, area, tnr, tpr = evaluation.supervised_evaluation(clf, sl_test_inputs, sl_test_labels)
print(fb, tnr, tpr)

In [None]:
# Score `clf` on the complete 'SL' test set.
# The inputs are converted to plain arrays, as in the x_test_ / y_test_
# evaluation: `clf` was fitted on NumPy arrays, so it is fed the same kind of
# input here instead of whatever container the splitter returned.
fb, area, tnr, tpr = evaluation.supervised_evaluation(clf, np.array(x_test_all_), np.array(y_test_all_))
print(fb, tnr, tpr)