In [1]:
%matplotlib inline

import pandas as pd
import stumpy
import numpy as np
import datetime as dt
import random
import math
import pickle
from statistics import mean
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
with open('../nonzero_traces.npy', 'rb') as f:
    traces = pickle.load(f)

In [None]:
# Goal: generate shapelets for every combination of traces (no reverse)

In [None]:
shapelet_storage = 

In [3]:
# adjust the lengths of 2 packet traces so they match
# by appending Numpy NaN to the end of the shorter traces
def adjust_lengths(long, short):
    
    # compute the total length, determine how much to append to shorter traces
    short_total = sum([len(i) for i in short])
    long_total = sum([len(i) for i in long])
    extension_size = round((long_total - short_total) / len(long))
    
    # append NaN entries to the end of each trace in the shorter list
    # this way they end up with a similar size
    for trace in short:
        trace = np.append(trace, [np.nan] * extension_size)
    
    return long, short

def merge_traces(id_1, id_2, num_traces=50):
    trace1 = random.sample(traces[id_1], num_traces)
    trace2 = random.sample(traces[id_2], num_traces)
    
    # calculate average packet length
    trace1_avg = mean([len(i) for i in trace1])
    trace2_avg = mean([len(i) for i in trace2])
    
    
    # if the average length of the traces differes by over 15 % of their average length
    # we will append NaN to the ends of packet traces to make them align better
    if trace1_avg > trace2_avg:
        trace1, trace2 = adjust_lengths(trace1, trace2)
    # traces 2 contains longer
    else:
        trace1, trace2 = adjust_lengths(trace2, trace1)
    
    trace1_flat = np.asarray([item for row in trace1 for item in row]).astype('float64')
    trace2_flat = np.asarray([item for row in trace2 for item in row]).astype('float64')
    
    
    
    return trace1_flat, trace2_flat

In [4]:
def generate_profile(trace1, trace2, m):
    
    c1_c1 = stumpy.stump(trace1, m)[:, 0].astype(float)
    c2_c2 = stumpy.stump(trace2, m)[:, 0].astype(float)
    
    c1_c1[c1_c1 == np.inf] = np.nan
    c2_c2[c2_c2 == np.inf] = np.nan
    
    
    #print("self-profiles generated...")
    
    c1_c2 = stumpy.stump(trace1, m, trace2, ignore_trivial=False)[:, 0].astype(float)
    c2_c1 = stumpy.stump(trace2, m, trace1, ignore_trivial=False)[:, 0].astype(float)
    
    c1_c2[c1_c2 == np.inf] = np.nan
    c2_c1[c2_c1 == np.inf] = np.nan
    #print("Comparison profiles generated...")
    
    return c1_c1, c2_c2, c1_c2, c2_c1

def generate_shapelets(diff, trace, m, n):
    
    shapelet_list = []
    # find the maximum difference, append to shapelet list
    while(len(shapelet_list) < n):
        idx = np.argmax(diff)
        shapelet_list.append(trace[idx : idx + m])
        
        # create an exclusion zone around the index (so shapelets are spread apart)
        np.put(diff, list(range( max(idx-n, 0), min(idx + n, len(diff)) )), -1)
    
    shapelet_list = [x[~np.isnan(x)] for x in shapelet_list]
    
    return shapelet_list

In [5]:
def distance_to_shapelet(data, shapelets):
    """
    Compute the minimum distance beteen each data sample and a shapelet of interest
    """
    #data = np.asarray(data)
    #print(len(data))
    
    # processed output data
    data_out = np.zeros((len(data),len(shapelets)))
    
    # loop over each sample in the dataset
    for i,sample in enumerate(data):
        shapelet_score = np.empty(len(shapelets))
        # for each shapelet, calculate distance and assign a score
        for j,shapelet in enumerate(shapelets):
            dist = stumpy.mass(shapelet, sample)
            shapelet_score[j] = dist.min()
        data_out[i] = shapelet_score
    
    return data_out

def run_classifier(id_1, id_2, c1_shapes, c2_shapes, m, n):
    
    # generate input data, split into training/testing
    X = random.sample(traces[id_1],n)
    X.extend(random.sample(traces[id_2],n))
    y = [id_1] * n + [id_2] * n
    X = [np.asarray(trace).astype('float64') for trace in X]
    X = [trace[~np.isnan(trace)] for trace in X]

    
    removals = [i for i,x in enumerate(X) if len(x) < m]
    for idx in removals:
        X[idx] = None
        y[idx] = None
    X = [trace for trace in X if trace is not None]
    y = [value for value in y if value is not None]
    
    
    X_c1 = distance_to_shapelet(X, c1_shapes)
    X_c2 = distance_to_shapelet(X, c2_shapes)
    
    X = np.hstack((X_c1, X_c2))
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    
    #print("Training and testing samples collected ...")
    
    clf = GradientBoostingClassifier()
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, y_pred)
    
    return score

In [6]:
def distance_to_shapelet(data, shapelet):
    """
    Compute the minimum distance beteen each data sample and a shapelet of interest
    """
    #data = np.asarray(data)
    #print(len(data))
    
    X = np.empty(len(data))
    for i in range(len(data)):
        D = stumpy.mass(shapelet, data[i])
        X[i] = D.min()

    return X.reshape(-1, 1)

def run_classifier(id_1, id_2, c1_shapes, c2_shapes, m, n):
    sample_size = 250
    
    # generate input data, split into training/testing
    X = random.sample(traces[id_1],n)
    X.extend(random.sample(traces[id_2],n))
    y = [id_1] * n + [id_2] * n
    X = [np.asarray(trace).astype('float64') for trace in X]
    X = [trace[~np.isnan(trace)] for trace in X]

    
    removals = [i for i,x in enumerate(X) if len(x) < m]
    for idx in removals:
        X[idx] = None
        y[idx] = None
    X = [trace for trace in X if trace is not None]
    y = [value for value in y if value is not None]

    X_train_org, X_test_org, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    
    #print("Training and testing samples collected ...")
    
    clf = RandomForestClassifier()
    max_accuracy = 0
    
    for i, (c1_shape, c2_shape) in enumerate(zip(c1_shapes, c2_shapes)):
        X_c1 = distance_to_shapelet(X_train_org, c1_shape)
        X_c2 = distance_to_shapelet(X_train_org, c2_shape)
        X_train = np.concatenate((X_c1, X_c2), axis=1)

        X_c1 = distance_to_shapelet(X_test_org, c1_shape)
        X_c2 = distance_to_shapelet(X_test_org, c2_shape)
        X_test = np.concatenate((X_c1, X_c2), axis=1)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        max_accuracy = max(max_accuracy, metrics.accuracy_score(y_test, y_pred))
        
    return max_accuracy
    

In [7]:
def score_shapelets(id_1, id_2, num_traces=20, shapelet_size=500, num_shapelets = 10, sample_size=100):
    
    # get packet traces and merge
    trace1, trace2 = merge_traces(id_1, id_2, num_traces)
    
    #print("Traces Merged...")
    
    # compute differences
    c1_c1, c2_c2, c1_c2, c2_c1 = generate_profile(trace1, trace2, shapelet_size)
    
    # find largest value gap between self and other
    
    diff_c1 = c1_c2 - c1_c1
    diff_c2 = c2_c1 - c2_c2
    
    #print("Differences Computed...")
    
    # get maximum points (i.e. shapelets)
    
    c1_shapes = generate_shapelets(diff_c1, trace1, shapelet_size, num_shapelets)
    c2_shapes = generate_shapelets(diff_c2, trace2, shapelet_size, num_shapelets)
    
    #print("Shapelets Generated...")
    
    final_score = run_classifier(id_1, id_2, c1_shapes, c2_shapes, shapelet_size, sample_size)
    
    return final_score

In [8]:
score_shapelets(22,88)

1.0

In [9]:
'''
Results



# which classifier is used to predict outcome
Classifier: [XGBoost, RandomForest]

# pass data for all shapelets into classifier, or just single shapelet and report best score
Shapelet Choice: [All, Best]

# whether incoming (positive), outgoing (negative), or both directions are used
Packet Direction: [Positive, Negative, Both]

# consider shapelet distance from both packet traces or just one
Traces: [Both, One]

Accuracy using parameters:
(Classifier, shapelet choice, packet direction, traces)

(XGBoost, All, Positive, One): 0.86
(XGBoost, All, Positive, Both): 0.88
(XGBoost, All, Both, Both): 0.88
(XGBoost, All, Negative, Both): 0.87

(XGBoost, Best, Negative, Both): 0.93
(XGBoost, Best, Both, Both): 0.93

(RandomForest, Best, Both, Both): 0.92

'''



# work with her
# how much can she help
# her own research topics

'\nResults\n\n\n\n# which classifier is used to predict outcome\nClassifier: [XGBoost, RandomForest]\n\n# pass data for all shapelets into classifier, or just single shapelet and report best score\nShapelet Choice: [All, Best]\n\n# whether incoming (positive), outgoing (negative), or both directions are used\nPacket Direction: [Positive, Negative, Both]\n\n# consider shapelet distance from both packet traces or just one\nTraces: [Both, One]\n\nAccuracy using parameters:\n(Classifier, shapelet choice, packet direction, traces)\n\n(XGBoost, All, Positive, One): 0.86\n(XGBoost, All, Positive, Both): 0.88\n(XGBoost, All, Both, Both): 0.88\n(XGBoost, All, Negative, Both): 0.87\n\n(XGBoost, Best, Negative, Both): 0.93\n(XGBoost, Best, Both, Both): 0.93\n\n(RandomForest, Best, Both, Both): \n\n'

In [10]:
scores = []

for i in tqdm(range(50)):

    a = random.randrange(100)
    b = random.randrange(100)
    
    try:
        score = score_shapelets(a, b)
    except ValueError:
        continue
        
    scores.append(score)
    
    if(score < 0.7):
        print("Bad (score=" + str(score) + "): " + "ID1=" + str(a) + " ID2=" +  str(b))
    
    print("Running Average: " + str(mean(scores)))

  0%|          | 0/50 [00:00<?, ?it/s]

Running Average: 1.0
Running Average: 0.925
Running Average: 0.9166666666666666
Running Average: 0.8625
Running Average: 0.89
Running Average: 0.8916666666666666
Running Average: 0.8941558441558441
Running Average: 0.8886363636363637
Running Average: 0.901010101010101
Running Average: 0.9009090909090909
Running Average: 0.9099173553719008
Running Average: 0.9049242424242424
Running Average: 0.9036907536907537
Running Average: 0.8998556998556998
Running Average: 0.9065319865319865
Running Average: 0.9092487373737373
Running Average: 0.9057635175282234
Running Average: 0.9109988776655443
Running Average: 0.9051568314726209
Running Average: 0.9098989898989899
Running Average: 0.9098605098605098
Running Average: 0.9139577594123048
Running Average: 0.9176987263943785
Running Average: 0.9148779461279462
Running Average: 0.9182828282828283
Running Average: 0.9214257964257964
Running Average: 0.9243359521137299
Running Average: 0.9198953823953824
Running Average: 0.9192093347265761
Running Ave