# <center>Decision Tree</center>

In [1]:
from sklearnex import patch_sklearn
patch_sklearn()
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn as sk
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
import pickle
from sklearn.tree import _tree
import tree_func 
import he_operations 
from tqdm import tqdm
import tenseal as ts
import warnings
warnings.filterwarnings("ignore")

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Importing the dataset and undersampling it to avoid biases
df = pd.read_csv(r"C:\Users\manig\Downloads\Mitacs\top_features.csv")
grouped = df.groupby('Label')
df = grouped.apply(lambda x: x.sample(grouped.size().min(), random_state=73).reset_index(drop=True))
x = df.drop(["Label"], axis = 1)
y = df["Label"]

# Scaling in the range [1/2, 3/2]
scaler = preprocessing.MinMaxScaler(feature_range=(0.5, 1.5))
x = scaler.fit_transform(x)

#Splitting the dataset into training and testing set
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.00007, random_state=42)
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_temp.shape}")
print(f"y_test shape: {y_temp.shape}")

x_train shape: (851422, 30)
y_train shape: (851422,)
x_test shape: (60, 30)
y_test shape: (60,)


In [3]:
del grouped, x_train, y_train

In [3]:
# JSON object which contains all the parameters that RandomizedSearchCV will be looping through
# Limited the depth of the tree to reduce the computations with encrypted data
model_params = {
    'Decision Tree' : {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [16, 17, 18]
        }
    }
}

In [19]:
# Using RandomizedSearchCV to find the best parameters
scores1 = []

for model_name, mp in model_params.items():
    clf =  RandomizedSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False)
    clf.fit(x_train, y_train)
    
    scores1.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'accuracy' : clf.score(x_temp, y_temp)
    })
    
acc1 = pd.DataFrame(scores1,columns=['model','best_score','best_params', 'accuracy'])

acc1

Unnamed: 0,model,best_score,best_params,accuracy
0,Decision Tree,0.996726,"{'splitter': 'best', 'max_features': 'auto', '...",0.996353


In [20]:
# Best parameters found through RandomizedSearchCV
acc1['best_params'][0]

{'splitter': 'best',
 'max_features': 'auto',
 'max_depth': 18,
 'criterion': 'entropy'}

In [21]:
# Fitting the model with best parameters through K-fold cross validation
model = clf.best_estimator_

In [22]:
# Testing - Calculating Accuracy, Recall, Precision and F1- Score on unencrypted data
y_pred = model.predict(x_temp)
accuracy = accuracy_score(y_temp, y_pred)
recall = recall_score(y_temp, y_pred, average='binary') 
precision = precision_score(y_temp, y_pred, average='binary')
f1 = f1_score(y_temp, y_temp, average='binary')  

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

print(classification_report(y_temp, y_pred))

Accuracy: 0.9963534295965284
Recall: 0.996641655217764
Precision: 0.9960685365567422
F1 Score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85136
           1       1.00      1.00      1.00     85161

    accuracy                           1.00    170297
   macro avg       1.00      1.00      1.00    170297
weighted avg       1.00      1.00      1.00    170297



In [23]:
# Saving the model
# filename = 'decision_tree.pkl'
# pickle.dump(model, open(filename, 'wb'))

In [4]:
# Loading the model
model = pickle.load(open(r"C:\Users\manig\Downloads\Mitacs\Anomaly-Detection-On-Encrypted-Traffic\Code\models\decision_tree.pkl", 'rb'))

In [5]:
# Function to encode the decison node of the tree as function with if-else statement
# The threshold for each node is extracted to perform batch comparision 
def tree_to_code(tree, feature_names):
    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree_eval(x):")
    
    threshold_list = []
    feature_name_he = []
    count = [0]  
    def recurse(node, depth):
        indent = "    " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            threshold_list.append(threshold)
            feature_name_he.append(name)
            print("{}if x[{}] == True:".format(indent, count[0], threshold))
            count[0] +=1  
            recurse(tree_.children_left[node], depth + 1)
            print("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            single_spaced_string = ' '.join(str(tree_.value[node]).split())
            comma_separated_string = single_spaced_string.replace(' ', ',')
            s_processed = comma_separated_string.replace("[[,", "[[")
            print("{}return {}".format(indent, s_processed))

    recurse(0, 1)
    
    return threshold_list, feature_name_he

In [6]:
rules, name = tree_to_code(model, list(df.columns)[:-1])

def tree_eval(x):
    if x[0] == True:
        if x[1] == True:
            if x[2] == True:
                if x[3] == True:
                    if x[4] == True:
                        if x[5] == True:
                            if x[6] == True:
                                if x[7] == True:
                                    if x[8] == True:
                                        if x[9] == True:
                                            return [[5.,0.]]
                                        else:  # if Init_Win_bytes_forward > 0.5078277587890625
                                            if x[10] == True:
                                                if x[11] == True:
                                                    if x[12] == True:
                                                        return [[0.,13299.]]
                                                    else:  # if Flow Packets/s > 0.9069570302963257
                                                        retu

                                                                            return [[2.,1.]]
                                                                    else:  # if Total Fwd Packets > 0.5000204741954803
                                                                        return [[1638.,0.]]
                                                                else:  # if Fwd PSH Flags > 1.0
                                                                    if x[793] == True:
                                                                        return [[34.,0.]]
                                                                    else:  # if Max Packet Length > 0.53795325756073
                                                                        if x[794] == True:
                                                                            return [[8.,2.]]
                                                                        else:  # if Total Backward Packets > 0.50012162327

In [7]:
df_x_test = pd.DataFrame(x_temp, columns=list(df.columns)[:-1])
df_x_test.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Flow Bytes/s,Flow Packets/s,...,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Init_Win_bytes_forward,Init_Win_bytes_backward,min_seg_size_forward,Active Mean,Active Std
0,0.500001,0.5,0.500003,0.500001,0.5,0.501009,0.500337,0.5,0.505817,0.905882,...,0.5,1.5,0.5,0.5,0.53125,0.51564,0.500015,1.5,0.5,0.5
1,0.500001,0.5,0.500003,0.500001,0.5,0.501009,0.500337,0.5,0.505809,0.905,...,0.5,1.5,0.5,0.5,0.53125,0.51564,0.500015,1.5,0.5,0.5
2,0.500208,0.500005,0.500007,0.500024,0.5,0.517146,0.505723,0.5,0.505766,0.900032,...,0.5,0.5,0.5,0.5,0.53125,0.5,0.5,1.5,0.5,0.5
3,1.052698,0.500027,0.500021,0.500125,0.500018,0.5,0.508609,0.519195,0.505761,0.9,...,0.5,0.5,1.5,0.5,0.5,0.504196,0.503601,1.5,0.500009,0.5
4,1.210595,0.500018,0.500017,0.500231,0.500018,0.5,0.522286,0.525332,0.505761,0.9,...,0.5,0.5,0.5,0.5,0.53125,0.500015,0.503601,1.5,0.5,0.5


In [9]:
del df

In [20]:
# Defining TenSEAL Context
poly_mod_degree = 32768
bits_scale = 31
integer_scale = 40
coeff_mod_bit_sizes = [integer_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale,  
                       bits_scale,  
                       bits_scale,  
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale, 
                       bits_scale,
                       integer_scale]
ctx_eval = ts.context(ts.SCHEME_TYPE.CKKS, poly_mod_degree, -1, coeff_mod_bit_sizes)
ctx_eval.global_scale = 2 ** bits_scale

In [29]:
%%time
# Evaluation on encrypted data 
rules_float32 = [np.float32(num) for num in rules]
enc_num_b = ts.ckks_vector(ctx_eval, rules_float32)
result_final = []
for j in range(len(df_x_test)):
    result = []
    org_list = []
    for i in name:
#         print(df_x_test[i][j])
        org_list.append(df_x_test[i][j])
#     print(f"Sample {j+1}")
    org_list_float32 = [np.float32(num) for num in org_list]
    enc_num_a = ts.ckks_vector(ctx_eval, org_list_float32)
    result = he_operations.comparision(ctx_eval, enc_num_a, enc_num_b, 2,2,3,2)
    comp_res = result.decrypt()
    for i in range(len(comp_res)):
        if comp_res[i] <= 0.5:
            comp_res[i] = True
        else:
            comp_res[i] = False
    interim_res = tree_func.tree_eval(comp_res)
    final_res = np.argmax(interim_res[0])
    result_final.append(final_res)
    
    
#     result = [a_val <= b_val for a_val, b_val in zip(org_list, rules)]

In [28]:
# Testing - Calculating Accuracy, Recall, Precision and F1- Score on encrypted data
accuracy = accuracy_score(y_temp, result_final)
recall = recall_score(y_temp, result_final, average='binary') 
precision = precision_score(y_temp, result_final, average='binary')
f1 = f1_score(y_temp, result_final, average='binary')  

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

Accuracy: 81.568794028175
Recall: 83.4819382745601
Precision: 80.7973625481
F1 Score: 82.11771536901183
