## Step 1: Feature Extraction

For every training example, extract the following features:
* **pkg_size**: total packet size of the incoming and outgoing traffic
* **ratio**: ratio of the total outgoing packet size to the total incoming packet size
* **RMS**: root mean square of the incoming and outgoing traffic
* **PAR**: peak-to-average ratio of the incoming and outgoing packets
* **MAD**: median absolute deviation of the incoming and outgoing traffic
* **MEAN_AD**: mean absolute deviation of the incoming and outgoing traffic


After extraction, each URL's packet trace is represented in the following vector form:


`[pkg_size, ratio, rms, ...]`


In [15]:
import re
# Root directory of the training traces; load_feature() reads the "day<N>"
# sub-folders found inside it.
data_dir = "./Training example/"
from os import listdir
from os.path import join
import numpy as np
from datetime import datetime
from numpy import sqrt, mean, absolute, median


def extract_feature(lines, dim=10):
    """Turn one packet trace into a fixed-length feature vector.

    Every non-blank line must hold exactly three whitespace-separated
    fields: ``timestamp size direction``.  A packet counts as incoming when
    its direction field contains ``"in"``; every other packet counts as
    outgoing.

    The incoming and outgoing size series each have one slot per packet,
    with zeros in the slots that belong to the other direction, so all
    statistics are taken over the full trace length.

    Args:
        lines: Sequence of trace lines (e.g. the result of ``readlines()``).
        dim: Length of the all-zero vector returned for a trace without any
            packets.  A trace with packets always yields 10 features.

    Returns:
        ``np.zeros(dim)`` when the trace has no packets; otherwise the list
        ``[in_size + 1, out_size, ratio, rms_in, par_in, mad_in, mean_ad_in,
        rms_out, par_out, mean_ad_out]`` with non-finite values replaced
        by 0.  Note that no MAD is computed for the outgoing series.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields
            or its size field is not an integer.
    """
    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # packet and would otherwise break the three-field unpacking below.
    packets = [line for line in lines if line.strip()]
    if not packets:
        return np.zeros(dim)

    pkg_in = np.zeros(len(packets))
    pkg_out = np.zeros(len(packets))

    for i, line in enumerate(packets):
        # The timestamp is not used by any feature, so it is not parsed.
        _timestamp, size, direction = line.split()
        if "in" in direction:
            pkg_in[i] = int(size)
        else:
            pkg_out[i] = int(size)

    # The +1 avoids a division by zero when there is no incoming traffic.
    pkg_in_size = pkg_in.sum() + 1
    pkg_out_size = pkg_out.sum()
    ratio = pkg_out_size / pkg_in_size

    # Compute each RMS once; it is both a feature and the PAR denominator.
    rms_in = rms(pkg_in)
    rms_out = rms(pkg_out)

    vec = [pkg_in_size, pkg_out_size, ratio,
           rms_in, par(pkg_in, rms_in), mad(pkg_in), mean_ad(pkg_in),
           rms_out, par(pkg_out, rms_out), mean_ad(pkg_out)]

    # Replace every non-finite entry (NaN as well as +/-inf) with 0 so the
    # classifier never receives an unusable value.
    return [v if np.isfinite(v) else 0 for v in vec]

def rms(x, axis=None):
    """Return the root mean square of array ``x`` along ``axis``.

    With ``axis=None`` the mean is taken over all elements.
    """
    mean_square = mean(x ** 2, axis=axis)
    return sqrt(mean_square)

def par(x, X_rms):
    """Return the peak-to-RMS ratio of ``x``.

    ``X_rms`` is the precomputed RMS of ``x``.  When it is zero the ratio is
    undefined and 0 is returned instead.
    """
    if X_rms != 0:
        peak = max(absolute(x))
        return peak / X_rms
    return 0

def zcr(x):
    """Return the zero-crossing rate of the sequence ``x``.

    The rate is the fraction of adjacent pairs ``(x[i], x[i + 1])`` whose
    product is negative, i.e. whose signs strictly differ.

    Args:
        x: Sequence of numbers supporting ``len()`` and slicing.

    Returns:
        A float between 0 and 1.  ``0.0`` is returned when ``x`` has fewer
        than two elements, because there is no adjacent pair to compare.
    """
    pairs = len(x) - 1
    if pairs < 1:
        # Zero or one sample: no crossings are possible, and dividing by
        # ``pairs`` would be a division by zero (or by -1).
        return 0.0
    crossings = sum(1 for a, b in zip(x, x[1:]) if a * b < 0)
    return crossings / pairs

def mad(x):
    """Return the median absolute deviation of ``x`` about its median."""
    deviations = x - median(x)
    return median(abs(deviations))

def mean_ad(x):
    """Return the mean absolute deviation of ``x`` about its mean."""
    centre = mean(x)
    spread = absolute(x - centre)
    return mean(spread)

def load_feature(days=range(2, 12)):
    """Load every training trace and return its features and labels.

    For each day number in ``days`` the folder ``day<N>`` under the
    module-level ``data_dir`` is scanned.  A file is used when its name
    contains both ``".txt"`` and ``"analysis"`` but not ``"traffic"``.  The
    label of a trace is the first run of digits in its file name.

    The arrays are sized from the files actually found, so a missing trace
    does not leave an all-zero row labelled 0 in the training set and an
    extra trace does not overflow a pre-allocated buffer.

    Args:
        days: Iterable of day numbers to load; defaults to days 2 to 11.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float array of shape
        ``(n_traces, 10)`` and ``y`` is a float array of shape
        ``(n_traces,)`` holding the labels.

    Raises:
        IndexError: If a selected file name contains no digits.
    """
    dim = 10
    rows = []
    labels = []

    for day in days:
        folder = join(data_dir, "day%d" % day)
        # listdir() returns names in arbitrary order; sorting makes the row
        # order reproducible from one run or machine to the next.
        for name in sorted(listdir(folder)):
            if ".txt" not in name or "analysis" not in name or "traffic" in name:
                continue
            with open(join(folder, name)) as trace:
                lines = trace.readlines()
            rows.append(extract_feature(lines))
            labels.append(int(re.findall(r'\d+', name)[0]))

    # The reshape keeps X two-dimensional even when no trace was found.
    X = np.array(rows, dtype=float).reshape(len(rows), dim)
    y = np.array(labels, dtype=float)
    return X, y






# Build the training feature matrix X and label vector y once; the
# k_fold_train(X, y) and predict(X, y) calls further down reuse them.
X, y = load_feature()





## Step 2: 10-fold cross validation of the training examples

Using Toshal's training examples, there are **350** data points in total.

10-fold cross-validation with a KNN classifier (number of neighbors = 4) is used.

The average precision and recall are **0.723810** and **0.794286**.


In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def k_fold_train(X, y, n_splits=10, n_neighbors=4):
    """Run k-fold cross-validation of a KNN classifier and print the scores.

    For every fold a fresh ``KNeighborsClassifier`` is fitted on the
    training part and evaluated on the held-out part.  The per-fold
    classification report is printed, followed by the macro-averaged
    precision and recall averaged over all folds.

    Args:
        X: Feature array indexable by an array of row indices.
        y: Label array with one entry per row of ``X``.
        n_splits: Number of folds passed to ``KFold``.
        n_neighbors: Number of neighbors used by the KNN classifier.

    Returns:
        None.  All results are printed.
    """
    print("Start training and predict...")
    kf = KFold(n_splits=n_splits)
    precision_total = 0.0
    recall_total = 0.0

    for train, test in kf.split(X):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)

        # train
        model.fit(X[train], y[train])
        # predict
        predicts = model.predict(X[test])

        print(classification_report(y[test], predicts))
        precision_total += precision_score(y[test], predicts, average='macro')
        recall_total += recall_score(y[test], predicts, average='macro')

    # Divide by the actual fold count so the averages stay correct when
    # n_splits is changed.
    print('Average Precision is %f.' % (precision_total / n_splits))
    print('Average Recall is %f.' % (recall_total / n_splits))


# Cross-validate on the training features and labels loaded earlier.
k_fold_train(X, y)


Start training and predict...
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00         1
        1.0       0.00      0.00      0.00         1
        2.0       0.00      0.00      0.00         1
        3.0       0.00      0.00      0.00         1
        4.0       0.00      0.00      0.00         1
        5.0       0.00      0.00      0.00         1
        6.0       0.00      0.00      0.00         1
        7.0       0.00      0.00      0.00         1
        8.0       1.00      1.00      1.00         1
        9.0       0.00      0.00      0.00         1
       10.0       0.00      0.00      0.00         1
       11.0       1.00      1.00      1.00         1
       12.0       0.00      0.00      0.00         1
       13.0       0.50      1.00      0.67         1
       14.0       0.50      1.00      0.67         1
       15.0       0.50      1.00      0.67         1
       16.0       0.50      1.00      0.67         1
       17.0    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Step 3: prediction on the test set

1. Use all training examples to train the classifier
2. Extract the features of the test packet traces using the same pipeline
3. Output the predictions

In [17]:
from sklearn.neighbors import KNeighborsClassifier

def predict(X, y, n_test=35, test_dir="./data/test/", n_neighbors=4):
    """Train a KNN classifier on ``(X, y)`` and print test-set predictions.

    The test traces are read from ``packet_trace_0.txt`` up to
    ``packet_trace_<n_test - 1>.txt`` inside ``test_dir`` and converted with
    ``extract_feature``.  One predicted label is printed per line, in file
    order.

    Args:
        X: Training feature array.
        y: Training labels, one per row of ``X``.
        n_test: Number of test traces to read.
        test_dir: Directory that holds the test trace files.
        n_neighbors: Number of neighbors used by the KNN classifier.

    Returns:
        None.  The predictions are printed.

    Raises:
        OSError: If one of the expected trace files cannot be opened.
    """
    model = KNeighborsClassifier(n_neighbors=n_neighbors)

    # train
    model.fit(X, y)

    # predict
    # Read every one of the n_test traces.  Stopping one file early would
    # leave an all-zero feature row whose prediction is meaningless.
    rows = []
    for i in range(n_test):
        with open(join(test_dir, "packet_trace_%d.txt" % i), 'r') as trace:
            rows.append(extract_feature(trace.readlines()))
    x_test = np.array(rows, dtype=float)

    predicts = model.predict(x_test)
    print("\n".join("%d" % p for p in predicts))

        
# Train on the full training set and print the predicted labels for the
# test traces.
predict(X, y)



15
16
11
27
26
10
20
23
18
34
20
30
26
3
21
28
3
0
14
24
9
3
21
5
11
23
24
16
13
27
2
21
8
7
8
