In [1]:
%matplotlib inline

import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
from statistics import mean
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the packet traces. Despite the .npy extension the file is read with
# pickle, not np.load.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
# `traces` is indexed by integer class id further down (traces[id_1]), and each
# entry is a collection of individual traces that random.sample/random.choice
# draw from.
with open('../nonzero_traces.npy', 'rb') as f:
    traces = pickle.load(f)

In [3]:
# adjust the lengths of 2 collections of packet traces so they match
# by appending Numpy NaN to the end of each of the shorter traces
def adjust_lengths(long, short):
    """Pad every trace in ``short`` with NaN so both collections have a similar total size.

    Parameters
    ----------
    long : sequence of array-like
        The collection with the larger total length. Returned unchanged.
    short : sequence of array-like
        The collection with the smaller total length.

    Returns
    -------
    (long, padded_short)
        ``long`` is the same object that was passed in. ``padded_short`` is a
        new list of NumPy arrays; the input ``short`` is not modified.

    Notes
    -----
    The missing length is spread evenly over the traces in ``short`` (the ones
    that actually receive the padding). If ``short`` is already at least as
    long as ``long`` in total, no padding is added.
    """
    # nothing to pad (and nothing to divide by)
    if len(short) == 0:
        return long, list(short)

    # compute the total lengths, determine how much to append to each shorter trace
    short_total = sum(len(trace) for trace in short)
    long_total = sum(len(trace) for trace in long)
    extension_size = max(round((long_total - short_total) / len(short)), 0)

    # np.append returns a NEW array, so the padded traces have to be collected;
    # rebinding the loop variable would silently discard them
    padding = np.full(extension_size, np.nan)
    padded_short = [np.append(trace, padding) for trace in short]

    return long, padded_short

def merge_traces(id_1):
    """Build one flattened series for class ``id_1`` and one for the mixed classes.

    100 traces are sampled without replacement from ``traces[id_1]``; the
    comparison series takes one randomly chosen trace from every class index
    in ``range(len(traces))``. The collection whose traces are shorter on
    average is passed to ``adjust_lengths`` as the one to pad, then both
    collections are flattened into 1-D float64 arrays.

    Parameters
    ----------
    id_1 : int
        Class id used to index the module-level ``traces``.

    Returns
    -------
    (trace1_flat, trace2_flat)
        Flattened series for class ``id_1`` and for the mixed sample, in that
        order.
    """
    trace1 = random.sample(traces[id_1], 100)
    # NOTE(review): this range includes id_1 itself, so the "other" series also
    # contains one trace of the target class - confirm whether that is intended.
    trace2 = [random.choice(traces[x]) for x in range(len(traces))]

    # calculate average trace length
    trace1_avg = mean([len(i) for i in trace1])
    trace2_avg = mean([len(i) for i in trace2])

    # pad whichever collection has the shorter traces on average so the two
    # flattened series end up with a similar length
    if trace1_avg > trace2_avg:
        trace1, trace2 = adjust_lengths(trace1, trace2)
    # trace2 contains the longer traces
    else:
        # adjust_lengths returns (long, short): unpack in the same order it was
        # called, otherwise the class series and the "other" series get swapped
        trace2, trace1 = adjust_lengths(trace2, trace1)

    trace1_flat = np.asarray([item for row in trace1 for item in row]).astype('float64')
    trace2_flat = np.asarray([item for row in trace2 for item in row]).astype('float64')

    return trace1_flat, trace2_flat

In [4]:
def generate_profile(trace1, trace2, m):
    """Compute self-join and cross-join profiles for two series.

    For each of the four joins the first column of ``stumpy.stump``'s output
    is taken, cast to float, and every ``+inf`` entry is replaced by NaN.

    Parameters
    ----------
    trace1, trace2 : array-like
        The two series to compare.
    m : int
        Window size handed to ``stumpy.stump``.

    Returns
    -------
    (c1_c1, c2_c2, c1_c2, c2_c1)
        Self-join of ``trace1``, self-join of ``trace2``, join of ``trace1``
        against ``trace2``, and join of ``trace2`` against ``trace1``.
    """

    def _first_column(series, other=None):
        # self-join when no second series is given, otherwise a join against it
        if other is None:
            joined = stumpy.stump(series, m)
        else:
            joined = stumpy.stump(series, m, other, ignore_trivial=False)
        values = joined[:, 0].astype(float)
        values[values == np.inf] = np.nan
        return values

    self_first = _first_column(trace1)
    self_second = _first_column(trace2)
    first_vs_second = _first_column(trace1, trace2)
    second_vs_first = _first_column(trace2, trace1)

    return self_first, self_second, first_vs_second, second_vs_first

def generate_shapelets(diff, trace, m, n, exclusion_radius=None):
    """Pick up to ``n`` subsequences of ``trace`` at the largest values of ``diff``.

    Parameters
    ----------
    diff : array-like of float
        Score per start index; larger means a better candidate. NaN entries
        are never selected. The input is not modified.
    trace : array-like
        Series the shapelets are cut from.
    m : int
        Length of each candidate window (``trace[idx : idx + m]``).
    n : int
        Maximum number of shapelets to return.
    exclusion_radius : int, optional
        After picking index ``idx``, indices in
        ``[idx - exclusion_radius, idx + exclusion_radius)`` are no longer
        eligible. Defaults to ``n``, which is what this function has always
        used.
        NOTE(review): a radius tied to the window length ``m`` looks more
        plausible for keeping shapelets apart - confirm and pass it explicitly.

    Returns
    -------
    list of numpy.ndarray
        The selected windows with NaN values removed. Fewer than ``n`` entries
        are returned when no eligible index is left.
    """
    if exclusion_radius is None:
        exclusion_radius = n

    # work on a private copy so the caller's array survives the masking below
    scores = np.array(diff, dtype=float)
    # np.argmax treats NaN as the maximum, so NaN scores must be made ineligible
    scores[np.isnan(scores)] = -np.inf

    shapelet_list = []
    while len(shapelet_list) < n and scores.size > 0:
        idx = int(np.argmax(scores))
        if scores[idx] == -np.inf:
            # every remaining index is NaN or excluded
            break
        shapelet_list.append(np.asarray(trace[idx : idx + m], dtype=float))

        # create an exclusion zone around the index (so shapelets are spread apart);
        # -inf is used because real differences may be below any finite sentinel
        lower = max(idx - exclusion_radius, 0)
        upper = min(idx + exclusion_radius, len(scores))
        scores[lower:upper] = -np.inf
        scores[idx] = -np.inf

    return [x[~np.isnan(x)] for x in shapelet_list]

In [25]:
def distance_to_shapelet(data, shapelet):
    """
    Compute the minimum distance between each data sample and a shapelet of interest.

    For every sample the distance profile returned by ``stumpy.mass`` is
    reduced to its minimum; the result is a column vector with one row per
    sample.
    """
    sample_count = len(data)
    closest = np.fromiter(
        (stumpy.mass(shapelet, data[k]).min() for k in range(sample_count)),
        dtype=np.float64,
        count=sample_count,
    )
    return closest.reshape(-1, 1)

def run_classifier(id_1, c1_shapes, m, n):
    """Return the candidate shapelet whose distance feature classifies best.

    ``n`` traces of class ``id_1`` are sampled, plus up to ``n`` traces drawn
    from randomly chosen other classes (a draw that hits ``id_1`` is skipped,
    so the second group can be smaller than ``n``). Traces shorter than ``m``
    after NaN removal are dropped. For each shapelet a RandomForestClassifier
    is fitted on the single feature produced by ``distance_to_shapelet`` and
    scored on a 10 % hold-out split.

    Parameters
    ----------
    id_1 : int
        Target class id (index into the module-level ``traces``).
    c1_shapes : iterable of array-like
        Candidate shapelets.
    m : int
        Minimum usable trace length.
    n : int
        Number of target-class samples, and number of draws from other classes.

    Returns
    -------
    The shapelet with the highest hold-out accuracy, or ``[]`` when no
    shapelet scores above 0.

    Notes
    -----
    NOTE(review): labels of the non-target samples are their own class ids, so
    this is a many-class problem evaluated on one feature; if a one-vs-rest
    score was intended, the labels should be collapsed to two values.
    """

    # generate input data: n samples of the target class ...
    X = random.sample(traces[id_1], n)
    y = [id_1] * n

    # ... and up to n samples from the other classes. The class count comes
    # from the data, matching range(len(traces)) used in merge_traces.
    num_classes = len(traces)
    for _ in range(n):
        random_id = random.randrange(num_classes)
        if random_id == id_1:
            continue
        # append the trace itself; wrapping it in a list would create a 2-D
        # array that only works because the NaN mask below flattens it
        X.append(random.choice(traces[random_id]))
        y.append(random_id)

    print(len(X))
    print(len(y))

    X = [np.asarray(trace).astype('float64') for trace in X]
    X = [trace[~np.isnan(trace)] for trace in X]

    # drop traces too short for the window, keeping samples and labels paired
    usable = [(trace, label) for trace, label in zip(X, y) if len(trace) >= m]
    X = [trace for trace, _ in usable]
    y = [label for _, label in usable]

    X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    clf = RandomForestClassifier()
    max_accuracy = 0
    best_shapelet = []

    for shape in c1_shapes:
        # NaN stripping upstream can leave an empty candidate; it has no distance
        if len(shape) == 0:
            continue

        X_train = distance_to_shapelet(X_train_org, shape)
        X_test = distance_to_shapelet(X_test_org, shape)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, y_pred)

        if accuracy > max_accuracy:
            max_accuracy = accuracy
            best_shapelet = shape

    print("Max Accuracy" + str(max_accuracy))

    return best_shapelet

In [26]:
# One result slot per class id; the loop below fills indices 0..99.
shapelet_storage = [0] * 100
# window length passed to generate_profile / generate_shapelets / run_classifier
shapelet_size=400
# number of candidate shapelets requested from generate_shapelets per class
num_shapelets=10

for i in tqdm(range(100)):
    # get one flattened series for class i and one built from randomly drawn traces of every class
    
    trace_i, trace_other = merge_traces(i)

    # compute the self-join and cross-join profiles;
    # only ci_ci and ci_co are used below, co_co and co_ci are computed but unused
    ci_ci, co_co, ci_co, co_ci = generate_profile(trace_i, trace_other, shapelet_size)

    # find largest value gap between self and other
    diff_ci = ci_co - ci_ci

    # get maximum points (i.e. shapelets)
    ci_shapes = generate_shapelets(diff_ci, trace_i, shapelet_size, num_shapelets)
    
    # keep the candidate with the best hold-out accuracy; 500 is the `n` argument
    # of run_classifier (samples of class i, and number of draws from other classes)
    shapelet_storage[i] = run_classifier(i, ci_shapes, shapelet_size, 500)

  0%|          | 0/100 [00:00<?, ?it/s]

997
997
Max Accuracy0.40217391304347827
998
998
Max Accuracy0.3978494623655914
990
990
Max Accuracy0.2631578947368421
990
990
Max Accuracy0.4574468085106383
995
995
Max Accuracy0.39361702127659576
996
996
Max Accuracy0.40860215053763443
996
996
Max Accuracy0.3978494623655914
997
997
Max Accuracy0.35789473684210527
995
995
Max Accuracy0.3763440860215054
994
994
Max Accuracy0.4148936170212766
993
993
Max Accuracy0.44565217391304346
997
997
Max Accuracy0.3548387096774194
994
994
Max Accuracy0.34782608695652173
990
990
Max Accuracy0.3978494623655914
997
997
Max Accuracy0.3763440860215054
994
994
Max Accuracy0.41975308641975306
998
998
Max Accuracy0.3978494623655914
994
994
Max Accuracy0.3655913978494624
995
995
Max Accuracy0.425531914893617
996
996
Max Accuracy0.40425531914893614
996
996
Max Accuracy0.044444444444444446
999
999
Max Accuracy0.425531914893617
995
995
Max Accuracy0.4891304347826087
996
996
Max Accuracy0.4148936170212766
994
994
Max Accuracy0.3978494623655914
995
995
Max Accur

In [27]:
# Persist the per-class best shapelets. pickle.dump writes to the file and
# returns None, so its result is not bound to a name.
with open('../shapelets.pkl', 'wb') as f:
    pickle.dump(shapelet_storage, f)