In [1]:
%matplotlib inline

import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
from statistics import mean
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
with open('../nonzero_traces.npy', 'rb') as f:
    traces = pickle.load(f)
    
with open('../shapelets.pkl', 'rb') as f:
    shapelets = pickle.load(f)

In [3]:
'''
Compute the minimum distance beteen data samples and shapelets
Input:
    data = list of individual packet traces
    shapelets = list of shapelets
Output:
    minimum distance between each sample in data compared with each sample in shapelet
    shape = (len(data),len(shapelets))
'''
def distance_to_shapelet(data, shapelets):
    #data = np.asarray(data)
    #print(len(data))
    
    # processed output data
    data_out = np.zeros((len(data),len(shapelets)))
    
    # loop over each sample in the dataset
    for i,sample in enumerate(tqdm(data)):
        shapelet_score = np.empty(len(shapelets))
        # for each shapelet, calculate distance and assign a score
        for j,shapelet in enumerate(shapelets):
            dist = stumpy.mass(shapelet, sample)
            shapelet_score[j] = dist.min()
        data_out[i] = shapelet_score
    
    return data_out

'\nbuild an average "representative trace" from a collection\n5 shapelets of A vs B,C\n\n'

In [11]:
shapelet_length = 400

# get X values for input

X, y = [], []

for i in range(450000):
    random_id = random.randrange(100)
    random_trace = random.choice(traces[random_id])
    X.append([random_trace])
    y.append(random_id)

# process and remove useless entries (too short)
X = [np.asarray(trace).astype('float64') for trace in X]
X = [trace[~np.isnan(trace)] for trace in X]    
removals = [i for i,x in enumerate(X) if len(x) < shapelet_length]
for idx in removals:
    X[idx] = None
    y[idx] = None
X = [trace for trace in X if trace is not None]
y = [value for value in y if value is not None]

print("pre-processing done")

# compute distance between input trace and shapelet arrays
# return as new X

X = distance_to_shapelet(X, shapelets)

print(X.shape)
print(X[0].shape)

with open('../X.pkl', 'wb') as f:
    pickle.dump(X, f)

with open('../y.pkl', 'wb') as f:
    pickle.dump(y, f)

pre-processing done


  0%|          | 0/388486 [00:00<?, ?it/s]

(388486, 100)
(100,)


In [None]:
with open('../X.pkl', 'rb') as f:
    X = pickle.load(f)

with open('../y.pkl', 'rb') as f:
    y = pickle.load(f)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

#print("Training and testing samples collected ...")

clf = RandomForestClassifier(verbose=1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 13.0min finished


RandomForestClassifier(verbose=1)

In [13]:
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    4.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.2s finished


In [20]:
print(len(y_prob))

correct = 0

for i in range(len(y_prob)):
    k = 1
    ind = np.argpartition(y_prob[i], -k)[-k:]
    if y_test[i] in ind:
        correct += 1

print(correct/len(y_prob))

38849
0.80617261705578


In [None]:
'''
Parameters

# Samples for shapelet extraction
Shapelet size
# Samples for classifier
Hyper-parameters of classifier
Traffic Representation

Opt: concatenated shapelets
'''