# Adversarial Input to Tor Web Resource Classifiers

## Setup

In [1]:
# Seed handed to the `random` generator constructor and to the MLP
# classifier's `random_state` further down.
SEED = 42

In [2]:
import numpy as np
from os import listdir
import os
import sys
import time

import random
# Rebind `random` to a seeded generator instance so later cells can call
# random.randint / random.choice on it.  random.SystemRandom draws from OS
# entropy and ignores its seed argument, so it can never reproduce a run;
# random.Random honours SEED.
random = random.Random(SEED)

from copy import deepcopy
from os.path import join
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from pycm import *
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pandas as pd

  from numpy.core.umath_tests import inner1d


## Helper Functions

In [3]:
def create_matrix(all_resources, accessed_resources, total_size):
    """Build the boolean sample/resource matrix and its integer label vector.

    Args:
        all_resources: sequence of every known resource name; position j in
            this sequence becomes column j of the matrix.
        accessed_resources: mapping of label -> list of samples, where each
            sample is the list of resource names requested during one load.
        total_size: total number of samples across all labels; it fixes the
            number of rows allocated for the matrix.

    Returns:
        (datapoints, label_nums): ``datapoints`` is an int array of shape
        ``(total_size, len(all_resources))`` holding 1 wherever the sample
        requested that resource.  ``label_nums`` holds, for each sample, the
        zero-based position of its label in the iteration order of
        ``accessed_resources``.
    """
    resources_arr = np.array(all_resources)
    datapoints = np.zeros(shape=(total_size, len(all_resources)), dtype=int)

    # Collected in a plain list and converted once at the end; growing an
    # array with np.append copies the whole array on every call.
    label_nums = []

    row = 0
    for label_num, resources_groups in enumerate(accessed_resources.values()):
        for requested in resources_groups:
            # One row per sample: mark the columns whose resource was requested.
            datapoints[row] = np.isin(resources_arr, requested).astype(int)
            row += 1
        label_nums.extend([label_num] * len(resources_groups))

    print('Matrix is created...')
    return datapoints, np.array(label_nums, dtype=int)

In [4]:
def read_from_dir(path, mode, clf=None):
    """Collect the resources requested per site from a directory of captures.

    ``path`` is listed, every entry in it is listed in turn, and each
    non-empty file found there is read as one sample.  The file name minus
    its last four characters (the ``.txt`` suffix) and minus a leading
    ``www.`` is the site key.  Each line is split on ``,`` and the first
    field is taken as the requested host name.

    Args:
        path: directory whose entries are themselves directories of files.
        mode: ``"tld"`` keeps only lines containing the first dot-separated
            label of the site key; ``"nontld"`` keeps only lines that do not
            contain it; any other value keeps every line.
        clf: unused; kept so existing callers keep working.

    Returns:
        (all_resources, accessed_resources, total_size): the sorted list of
        every host name kept, a dict mapping site key -> list of samples
        (each a list of host names), and the number of samples stored.
        Directory entries and the resource list are sorted so that row,
        column and label order are the same on every run.
    """
    accessed_resources = {}
    all_resources = set()
    total_size = 0

    for subdir in sorted(listdir(path)):
        subdir_path = join(path, subdir)

        for filename in sorted(listdir(subdir_path)):
            filepath = join(subdir_path, filename)
            if os.stat(filepath).st_size == 0:
                continue

            # Strip the 4-character extension, then a literal 'www.' prefix.
            # Testing for 'www.' (not just 'www') avoids chopping names such
            # as 'wwwfoo.com' or 'www2.foo.com'.
            key = filename[:-4]
            if key.startswith('www.'):
                key = key[4:]
            samples = accessed_resources.setdefault(key, [])

            # First label of the site name, e.g. 'example' for 'example.com';
            # used to tell the site's own hosts from third-party ones.
            tld = key.split('.')[0]

            r_list = []
            with open(filepath) as handle:
                for raw_line in handle:
                    line = raw_line.rstrip('\n')
                    if (mode == "nontld" and tld in line) or (mode == "tld" and tld not in line):
                        continue

                    # Each line is '<hostname>,<rest>'; only the hostname is kept.
                    requested_resource = line.split(',')[0]
                    if requested_resource.startswith('www.'):
                        requested_resource = requested_resource[4:]
                    if not requested_resource:
                        # Blank line or bare 'www.': nothing to record.
                        continue

                    all_resources.add(requested_resource)
                    r_list.append(requested_resource)

            # Only store a sample that has at least one resource, so the
            # matrix built from it never contains an all-zero row.
            if r_list:
                samples.append(r_list)
                total_size += 1

    print('Files are processed...')
    return sorted(all_resources), accessed_resources, total_size

## Adversarial Function

In [5]:
def add_adv_noise(accessed_resources, adv_mode, adv_min, adv_max, resource_pool=None):
    """Return a copy of the samples with extra, randomly chosen resources.

    A single count is drawn with ``random.randint(adv_min, adv_max)`` and
    that many new resources are appended to every sample.

    Args:
        accessed_resources: dict mapping site -> list of samples, each a
            list of resource names.  It is not modified.
        adv_mode: ``None`` returns ``accessed_resources`` unchanged;
            ``"random_internal"`` adds resources not already in the sample;
            ``"disjoint_domain"`` additionally avoids every resource seen in
            any sample of the same site.
        adv_min, adv_max: inclusive bounds for the number of resources added.
        resource_pool: sequence to draw new resources from.  Defaults to the
            module-level ``all_resources``, which is what this function has
            always used.

    Returns:
        A new dict with the same sites and the same number of samples per
        site; each sample is the original list followed by the additions.

    Raises:
        ValueError: if ``adv_mode`` is not a supported mode, or if the pool
            holds fewer eligible resources than must be added to a sample
            (the draw loop below would otherwise never finish).
    """
    if adv_mode is None:
        return accessed_resources
    if adv_mode not in ("random_internal", "disjoint_domain"):
        raise ValueError("Not implemented yet!")

    pool = list(all_resources if resource_pool is None else resource_pool)
    pool_set = set(pool)

    num_to_add = random.randint(adv_min, adv_max)
    print("Number of domains to add: {}".format(num_to_add))

    new_accessed_resources = {}
    for site, samples in accessed_resources.items():
        if adv_mode == "disjoint_domain":
            # Everything this site ever loaded is off limits as noise.
            site_resources = {res for sample in samples for res in sample}
        else:
            site_resources = set()

        noisy_samples = []
        for sample in samples:
            new_resources = list(sample)
            excluded = site_resources.union(sample)

            # Check up front that enough candidates exist.
            eligible = len(pool_set) - sum(1 for res in excluded if res in pool_set)
            if eligible < num_to_add:
                raise ValueError(
                    "Cannot add {} resources to a sample of {!r}: only {} "
                    "eligible resources in the pool".format(num_to_add, site, eligible))

            # Draw until enough distinct, allowed resources were found.
            added = 0
            while added < num_to_add:
                candidate = random.choice(pool)
                if candidate not in excluded:
                    new_resources.append(candidate)
                    excluded.add(candidate)
                    added += 1
            noisy_samples.append(new_resources)

        new_accessed_resources[site] = noisy_samples

    return new_accessed_resources

## Configuring Adversarial Input

In [6]:
# Noise strategy: None (no noise), "random_internal" or "disjoint_domain".
# NOTE: "disjoint_exit" is not implemented; add_adv_noise raises ValueError for it.
adv_mode = "disjoint_domain"

# Range of resources to add to each sample.
# One count is drawn from [adv_min, adv_max] (both inclusive) per call to
# add_adv_noise and that same count is then used for every sample.
adv_min = 50
adv_max = 100

## Parsing the Data

In [7]:
# "tld" mode keeps only the capture lines that contain the first
# dot-separated label of the site's own name (see read_from_dir).
all_resources, accessed_resources, total_size = read_from_dir("../opt/tor_alexa_resolutions", "tld")

Files are processed...


## Construct the New Dataset with or without Adversarial Noise

In [8]:
# Builds a new dict of noised samples; the input dict itself is returned
# only when adv_mode is None.
new_accessed_resources = add_adv_noise(accessed_resources, adv_mode, adv_min, adv_max)

Number of domains to add: 82


In [9]:
# total_size from parsing is still valid here: add_adv_noise appends
# resources to each sample but never adds or drops a sample.
X, Y = create_matrix(list(all_resources), new_accessed_resources, total_size)

Matrix is created...


## Analysis

In [10]:
# Classifier selector read by the classifier-construction cell below.
method = "mlp"

# The matrix is built with an int dtype; convert it to float before training.
X = X.astype(float)

# Hold out 35% of the samples for testing.  The split uses its own fixed
# random_state (56) rather than SEED.
train_f, test_f, train_l, test_l = train_test_split(
    X, Y, test_size=0.35, random_state=56)

print("Finished splitting data!")

Finished splitting data!


In [11]:
# Construct the classifier
clf = LinearSVC()

if method == 'mlp':
    clf = MLPClassifier(batch_size=1000, random_state=SEED)
elif method == 'forest':
    clf = RandomForestClassifier()
elif method != 'svc':
    print('Specify another method: "svm", "mlp" or "forrest"')
    raise ValueError("Invalid choice!")

In [12]:
# Train the classifier on the training split and report how long it took,
# in seconds of wall-clock time.
start = time.time()
clf.fit(train_f, train_l)
end = time.time()
elapsed_fit_time = end - start
print("Fit time: {}".format(elapsed_fit_time))
print('Fitting is done!')

Fit time: 1464.1755030155182
Fitting is done!


In [13]:
# Predict labels for the held-out split and report the wall-clock duration
# in seconds.  y_true / y_pred feed the confusion matrix below.
start = time.time()
y_true, y_pred = test_l, clf.predict(test_f)
end = time.time()
elapsed_predict_time = end - start
print("Test time: {}".format(elapsed_predict_time))
print('Testing is done!')

Test time: 2.2373197078704834
Testing is done!


## Results

Using the PyCM library: https://github.com/sepandhaghighi/pycm

Docs on it here: http://www.shaghighi.ir/pycm/doc/

In [14]:
# Build the PyCM confusion matrix from the held-out labels and the predictions.
cm = ConfusionMatrix(actual_vector=y_true, predict_vector=y_pred)

In [15]:
# Load PyCM's overall statistics into a DataFrame; orient="index" turns
# each statistic name into a row label.
d = cm.overall_stat
df = pd.DataFrame.from_dict(d, orient="index")

In [16]:
# Display the overall-statistics table as the cell output.
df

Unnamed: 0,0
Overall_ACC,0.209654
Kappa,0.208809
Overall_RACC,0.00106825
Strength_Of_Agreement(Landis and Koch),Fair
Strength_Of_Agreement(Fleiss),Poor
Strength_Of_Agreement(Altman),Fair
Strength_Of_Agreement(Cicchetti),Poor
TPR_Macro,
PPV_Macro,
TPR_Micro,0.209654


In [17]:
def plot_confusion_matrix(cm,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a PyCM ConfusionMatrix object as a heat map with a number in each cell.

    Rows are actual classes and columns are predicted classes, both in the
    order of `cm.classes`.

    Parameters:
        cm: object exposing `classes` and a nested `table[actual][predicted]`
            mapping of counts.
        normalize: if True, divide every row by its total so each cell shows
            the fraction of that actual class. A class with no actual
            samples keeps a row of zeros.
        title: title drawn above the plot.
        cmap: matplotlib colormap used for the cells.

    Draws on the current matplotlib figure; returns nothing.

    Code Reference :
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    classes = cm.classes
    plt_cm = np.array([[cm.table[actual][predicted] for predicted in classes]
                       for actual in classes])

    if normalize:
        row_totals = plt_cm.sum(axis=1)[:, np.newaxis]
        # An all-zero row would divide by zero and fill the row (and the
        # colour threshold below) with NaN; dividing by 1 keeps it at zero.
        row_totals[row_totals == 0] = 1
        plt_cm = plt_cm.astype('float') / row_totals

    plt.imshow(plt_cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable.
    thresh = plt_cm.max() / 2.
    for i, j in itertools.product(range(plt_cm.shape[0]), range(plt_cm.shape[1])):
        plt.text(j, i, format(plt_cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if plt_cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Actual')
    plt.xlabel('Predict')

In [None]:
# Plot raw counts (normalize defaults to False); one text label is drawn
# per cell, i.e. len(cm.classes) squared labels.
plot_confusion_matrix(cm)