In [1]:
import time
import ast
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from statistics import mean
sns.set_theme()

import warnings
warnings.filterwarnings('ignore')

In [20]:
stateless_df = pd.read_csv("Data/Prepared Data/stateless.csv")
stateless_df = stateless_df.fillna("")

In [21]:
#stateless_df["Predictions"] = 0
skf = StratifiedKFold(n_splits=5)
X = stateless_df[[col for col in stateless_df.columns if col not in ["Predictions","timestamp", "attack", "origin", "data_type", "original_index"]]]
y = stateless_df["attack"]

In [22]:
X

Unnamed: 0,FQDN_count,subdomain_length,upper,lower,numeric,entropy,special,labels,labels_max,labels_average,longest_word,sld,len,subdomain
0,26,9,0,10,10,2.742338,6,6,7,3.500000,2,192,13,1
1,26,9,0,10,10,2.742338,6,6,7,3.500000,2,192,13,1
2,27,10,0,10,11,2.767195,6,6,7,3.666667,2,192,14,1
3,27,10,0,10,11,2.767195,6,6,7,3.666667,2,192,14,1
4,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757206,24,7,0,10,8,2.054029,6,6,7,3.166667,4,224,11,1
757207,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
757208,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1
757209,27,10,0,10,11,2.570417,6,6,7,3.666667,2,192,14,1


(Time benchmarks in the following code cells do not include transformation time to generate the features, as these should be consistent in the ensemble and non-ensemble versions.)

# LR

In [79]:
optimized_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'special',
       'labels', 'longest_word_islower',
       'longest_word_isnumeric', 'sld_islower', 'sld_isnumeric']
    
stateless_df = pd.read_csv("Data/Prepared Data/stateless.csv")
stateless_df = stateless_df.fillna("")

stateless_df["longest_word_islower"] = stateless_df["longest_word"].apply(lambda x: str(x).islower())
stateless_df["longest_word_isnumeric"] = stateless_df["longest_word"].apply(lambda x: str(x).isnumeric())
stateless_df["sld_islower"] = stateless_df["sld"].apply(lambda x: str(x).islower())
stateless_df["sld_isnumeric"] = stateless_df["sld"].apply(lambda x: str(x).isnumeric())


skf = StratifiedKFold(n_splits=5)
#X = stateless_df[[col for col in stateless_df.columns if col not in ["timestamp", "attack", "origin", "original_index"]]]
X = stateless_df[optimized_features + ["data_type"]]
y = stateless_df["attack"]
trained_pipes_lr = []
categorical_transformer_pipes=[]
accuracies = []
avg_train = []
avg_test = []

total_prediction_time_stateless = 0

data_types = set(stateless_df.data_type.unique()) - set(["benign"])
for data_type in data_types:
    stateless_df["Predictions_"+data_type+"_model"] = 0

    
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    X_test.drop(columns=['data_type'], inplace=True)
    for data_type in data_types:
        
        
        X_train_data_type = X_train[X_train.data_type == data_type]
        X_train_benign = X_train[X_train.data_type == "benign"].sample(X_train_data_type.shape[0])
        X_train_smaller_subset = pd.concat([
            X_train_data_type,
            X_train_benign
        ])
        X_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        X_train_smaller_subset.drop(columns=['data_type'], inplace=True)
        
        y_train_data_type =  pd.DataFrame([1] * X_train_data_type.shape[0])
        y_train_benign = pd.DataFrame([0] * X_train_benign.shape[0])
        y_train_smaller_subset = pd.concat([
            y_train_data_type,
            y_train_benign
        ])
        #print(X_train_smaller_subset.shape, y_train_smaller_subset.shape)
        y_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        
        category_pipeline = ColumnTransformer(
        [
        #('ordinal_encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["longest_word", "sld"]),
        # ('onehot_encoder', OneHotEncoder(handle_unknown="ignore"), ["longest_word", "sld"]),
        ('scaler', StandardScaler(), optimized_features)
        ]
        )
        X_train_smaller_subset = category_pipeline.fit_transform(X_train_smaller_subset)
        pipe = Pipeline([
            ('lr', LogisticRegression(random_state=0, max_iter=300))
        ])    
        pipe.fit(X_train_smaller_subset, y_train_smaller_subset)

    # prediction_time_start = time.time()
       # X_test = category_pipeline.transform(X_test)
        print(f"Test Accuracy for fold {i} subset {data_type}:", pipe.score(category_pipeline.transform(X_test), y_test))
    #print(X_test)
    # print("Avg. Train Accuracy:", pipe.score(X_train, y_train))
    # print("Avg. Test Accuracy:", pipe.score(X_test, y_test))
#     avg_train.append(pipe.score(X_train, y_train))
#     avg_test.append(pipe.score(X_test, y_test))
        prediction_start_time = time.time()
        stateless_df.loc[test_index, "Predictions_"+data_type+"_model"] = pipe.predict(category_pipeline.transform(X_test))
        prediction_end_time = time.time()
        total_prediction_time_stateless += (prediction_end_time-prediction_start_time)
    # prediction_time_end = time.time()
    # print(f"Total time elapsed for prediction: {(prediction_time_end - prediction_time_start):.2f}")
    # total_prediction_time_stateless += (prediction_time_end - prediction_time_start)
    trained_pipes_lr.append(pipe)
    categorical_transformer_pipes.append(category_pipeline)

# lr_disp = RocCurveDisplay.from_estimator(pipe, X_test, y_test)
# plt.show()    
print("Total Prediction Time: ", total_prediction_time_stateless)

Test Accuracy for fold 0 subset video: 0.7517415793400818
Test Accuracy for fold 0 subset image: 0.7504275536010248
Test Accuracy for fold 0 subset exe: 0.7520453239832808
Test Accuracy for fold 0 subset text: 0.750856757988154
Test Accuracy for fold 0 subset audio: 0.7504275536010248
Test Accuracy for fold 0 subset compressed: 0.7516161195961517
Test Accuracy for fold 1 subset video: 0.7357602250366477
Test Accuracy for fold 1 subset image: 0.735773431412686
Test Accuracy for fold 1 subset exe: 0.735885685609012
Test Accuracy for fold 1 subset text: 0.735846066480897
Test Accuracy for fold 1 subset audio: 0.7358724792329737
Test Accuracy for fold 1 subset compressed: 0.7357866377887244
Test Accuracy for fold 2 subset video: 0.761974881472775
Test Accuracy for fold 2 subset image: 0.7619418655326792
Test Accuracy for fold 2 subset exe: 0.7620541197290052
Test Accuracy for fold 2 subset text: 0.7620343101649476
Test Accuracy for fold 2 subset audio: 0.7619682782847559
Test Accuracy for 

In [73]:
prediction_columns = []
for data_type in data_types:
    prediction_columns.append("Predictions_"+data_type+"_model")
#stateless_df['majority'] = stateless_df[prediction_columns].mode(axis=1)[0]
stateless_df["Prediction_sum"] = stateless_df[prediction_columns].sum(axis=1)
pd.DataFrame(classification_report(stateless_df["attack"], stateless_df["Prediction_sum"] >= 3, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.996042,0.596401,0.746074,462858.0
1,0.610867,0.996273,0.757358,294353.0
accuracy,0.751845,0.751845,0.751845,0.751845
macro avg,0.803455,0.796337,0.751716,757211.0
weighted avg,0.846312,0.751845,0.750461,757211.0


Unnamed: 0,precision,recall,f1-score,support
0,0.995916,0.596332,0.745985,462858.0
1,0.610798,0.996154,0.757271,294353.0
accuracy,0.751756,0.751756,0.751756,0.751756
macro avg,0.803357,0.796243,0.751628,757211.0
weighted avg,0.846208,0.751756,0.750372,757211.0


## RF

In [74]:
optimized_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'special',
       'labels', 'longest_word_islower',
       'longest_word_isnumeric', 'sld_islower', 'sld_isnumeric']
stateless_df = pd.read_csv("Data/Prepared Data/stateless.csv")
stateless_df = stateless_df.fillna("")

stateless_df["longest_word_islower"] = stateless_df["longest_word"].apply(lambda x: str(x).islower())
stateless_df["longest_word_isnumeric"] = stateless_df["longest_word"].apply(lambda x: str(x).isnumeric())
stateless_df["sld_islower"] = stateless_df["sld"].apply(lambda x: str(x).islower())
stateless_df["sld_isnumeric"] = stateless_df["sld"].apply(lambda x: str(x).isnumeric())

    
    
skf = StratifiedKFold(n_splits=5)
#X = stateless_df[[col for col in stateless_df.columns if col not in ["timestamp", "attack", "origin", "original_index"]]]
X = stateless_df[optimized_features + ["data_type"]]
y = stateless_df["attack"]
trained_pipes_lr = []
categorical_transformer_pipes=[]
accuracies = []
avg_train = []
avg_test = []

total_prediction_time_stateless = 0

data_types = set(stateless_df.data_type.unique()) - set(["benign"])
for data_type in data_types:
    stateless_df["Predictions_"+data_type+"_model"] = 0

    
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    X_test.drop(columns=['data_type'], inplace=True)
    for data_type in data_types:
        
        X_train_data_type = X_train[X_train.data_type == data_type]
        X_train_benign = X_train[X_train.data_type == "benign"].sample(X_train_data_type.shape[0])
        X_train_smaller_subset = pd.concat([
            X_train_data_type,
            X_train_benign
        ])
        X_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        X_train_smaller_subset.drop(columns=['data_type'], inplace=True)
        
        y_train_data_type =  pd.DataFrame([1] * X_train_data_type.shape[0])
        y_train_benign = pd.DataFrame([0] * X_train_benign.shape[0])
        y_train_smaller_subset = pd.concat([
            y_train_data_type,
            y_train_benign
        ])
        #print(X_train_smaller_subset.shape, y_train_smaller_subset.shape)
        y_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        
        category_pipeline = ColumnTransformer(
        [
        #('ordinal_encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["longest_word", "sld"]),
        # ('onehot_encoder', OneHotEncoder(handle_unknown="ignore"), ["longest_word", "sld"]),
        ('scaler', StandardScaler(), optimized_features)
        ]
        )
        X_train_smaller_subset = category_pipeline.fit_transform(X_train_smaller_subset)
        pipe = Pipeline([
            ('rf', RandomForestClassifier(random_state=0))
        ])    
        pipe.fit(X_train_smaller_subset, y_train_smaller_subset)

    # prediction_time_start = time.time()
       # X_test = category_pipeline.transform(X_test)
        print(f"Test Accuracy for fold {i} subset {data_type}:", pipe.score(category_pipeline.transform(X_test), y_test))
    #print(X_test)
    # print("Avg. Train Accuracy:", pipe.score(X_train, y_train))
    # print("Avg. Test Accuracy:", pipe.score(X_test, y_test))
#     avg_train.append(pipe.score(X_train, y_train))
#     avg_test.append(pipe.score(X_test, y_test))
        prediction_start_time = time.time()
        stateless_df.loc[test_index, "Predictions_"+data_type+"_model"] = pipe.predict(category_pipeline.transform(X_test))
        prediction_end_time = time.time()
        total_prediction_time_stateless += (prediction_end_time-prediction_start_time)
        
    # prediction_time_end = time.time()
    # print(f"Total time elapsed for prediction: {(prediction_time_end - prediction_time_start):.2f}")
    # total_prediction_time_stateless += (prediction_time_end - prediction_time_start)
    trained_pipes_lr.append(pipe)
    categorical_transformer_pipes.append(category_pipeline)

# lr_disp = RocCurveDisplay.from_estimator(pipe, X_test, y_test)
# plt.show()    
print("Total Prediction Time: ", total_prediction_time_stateless)

Test Accuracy for fold 0 subset video: 0.7523754812041494
Test Accuracy for fold 0 subset image: 0.7525735755366706
Test Accuracy for fold 0 subset exe: 0.7524283063594884
Test Accuracy for fold 0 subset text: 0.7524547189371579
Test Accuracy for fold 0 subset audio: 0.7524547189371579
Test Accuracy for fold 0 subset compressed: 0.7523886874929842
Test Accuracy for fold 1 subset video: 0.7360309557454339
Test Accuracy for fold 1 subset image: 0.736446956590642
Test Accuracy for fold 1 subset exe: 0.7360639716855297
Test Accuracy for fold 1 subset text: 0.7362356545740283
Test Accuracy for fold 1 subset audio: 0.7364205438385653
Test Accuracy for fold 1 subset compressed: 0.7360771780615681
Test Accuracy for fold 2 subset video: 0.7621201516091969
Test Accuracy for fold 2 subset image: 0.7620277069769285
Test Accuracy for fold 2 subset exe: 0.7623644695659064
Test Accuracy for fold 2 subset text: 0.7621267547972161
Test Accuracy for fold 2 subset audio: 0.7621135484211777
Test Accuracy 

In [75]:
prediction_columns = []
for data_type in data_types:
    prediction_columns.append("Predictions_"+data_type+"_model")
#stateless_df['majority'] = stateless_df[prediction_columns].mode(axis=1)[0]
stateless_df["Prediction_sum"] = stateless_df[prediction_columns].sum(axis=1)
pd.DataFrame(classification_report(stateless_df["attack"], stateless_df["Prediction_sum"] >= 3, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.998787,0.59587,0.746426,462858.0
1,0.611171,0.998862,0.758339,294353.0
accuracy,0.752526,0.752526,0.752526,0.752526
macro avg,0.804979,0.797366,0.752383,757211.0
weighted avg,0.848108,0.752526,0.751057,757211.0


# LR + RF

In [76]:
optimized_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'special',
       'labels', 'longest_word_islower',
       'longest_word_isnumeric', 'sld_islower', 'sld_isnumeric']
stateless_df = pd.read_csv("Data/Prepared Data/stateless.csv")
stateless_df = stateless_df.fillna("")

stateless_df["longest_word_islower"] = stateless_df["longest_word"].apply(lambda x: str(x).islower())
stateless_df["longest_word_isnumeric"] = stateless_df["longest_word"].apply(lambda x: str(x).isnumeric())
stateless_df["sld_islower"] = stateless_df["sld"].apply(lambda x: str(x).islower())
stateless_df["sld_isnumeric"] = stateless_df["sld"].apply(lambda x: str(x).isnumeric())

    
    
skf = StratifiedKFold(n_splits=5)
#X = stateless_df[[col for col in stateless_df.columns if col not in ["timestamp", "attack", "origin", "original_index"]]]
X = stateless_df[optimized_features + ["data_type"]]
y = stateless_df["attack"]
trained_pipes_lr = []
trained_pipes_rf = []
categorical_transformer_pipes=[]
accuracies = []
avg_train = []
avg_test = []

total_prediction_time_stateless = 0

data_types = set(stateless_df.data_type.unique()) - set(["benign"])
for data_type in data_types:
    stateless_df["Predictions_"+data_type+"_model"] = 0

    
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    X_test.drop(columns=['data_type'], inplace=True)
    for data_type in data_types:
        
        X_train_data_type = X_train[X_train.data_type == data_type]
        X_train_benign = X_train[X_train.data_type == "benign"].sample(X_train_data_type.shape[0])
        X_train_smaller_subset = pd.concat([
            X_train_data_type,
            X_train_benign
        ])
        X_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        X_train_smaller_subset.drop(columns=['data_type'], inplace=True)
        
        y_train_data_type =  pd.DataFrame([1] * X_train_data_type.shape[0])
        y_train_benign = pd.DataFrame([0] * X_train_benign.shape[0])
        y_train_smaller_subset = pd.concat([
            y_train_data_type,
            y_train_benign
        ])
        #print(X_train_smaller_subset.shape, y_train_smaller_subset.shape)
        y_train_smaller_subset.sample(frac=1, random_state=0) # Shuffle
        
        category_pipeline = ColumnTransformer(
        [
        #('ordinal_encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ["longest_word", "sld"]),
        # ('onehot_encoder', OneHotEncoder(handle_unknown="ignore"), ["longest_word", "sld"]),
        ('scaler', StandardScaler(), optimized_features)
        ]
        )
        X_train_smaller_subset = category_pipeline.fit_transform(X_train_smaller_subset)
        lr_pipe = Pipeline([
            ('lr', LogisticRegression(random_state=0, max_iter=300))
        ])    
        rf_pipe = Pipeline([
            ('rf', RandomForestClassifier(random_state=0))
        ])    
        lr_pipe.fit(X_train_smaller_subset, y_train_smaller_subset)
        rf_pipe.fit(X_train_smaller_subset, y_train_smaller_subset)

        print(f"Test Accuracy for LR fold {i} subset {data_type}:", lr_pipe.score(category_pipeline.transform(X_test), y_test))
        print(f"Test Accuracy for rf fold {i} subset {data_type}:", rf_pipe.score(category_pipeline.transform(X_test), y_test))
        prediction_start_time = time.time()
        stateless_df.loc[test_index, "Predictions_"+data_type+"_lr_model"] = lr_pipe.predict(category_pipeline.transform(X_test))
        stateless_df.loc[test_index, "Predictions_"+data_type+"_rf_model"] = rf_pipe.predict(category_pipeline.transform(X_test))
        prediction_end_time = time.time()
        total_prediction_time_stateless += (prediction_end_time-prediction_start_time)
    trained_pipes_lr.append(lr_pipe)
    trained_pipes_rf.append(rf_pipe)
    categorical_transformer_pipes.append(category_pipeline)

print("Total Prediction Time: ", total_prediction_time_stateless)

Test Accuracy for LR fold 0 subset video: 0.7518802453728466
Test Accuracy for rf fold 0 subset video: 0.7523160529043931
Test Accuracy for LR fold 0 subset image: 0.7505398070561201
Test Accuracy for rf fold 0 subset image: 0.75259998811434
Test Accuracy for LR fold 0 subset exe: 0.7517415793400818
Test Accuracy for rf fold 0 subset exe: 0.7523754812041494
Test Accuracy for LR fold 0 subset text: 0.7511142806204314
Test Accuracy for rf fold 0 subset text: 0.7524943378036621
Test Accuracy for LR fold 0 subset audio: 0.7508369485549018
Test Accuracy for rf fold 0 subset audio: 0.752580178681088
Test Accuracy for LR fold 0 subset compressed: 0.7513123749529526
Test Accuracy for rf fold 0 subset compressed: 0.7525207503813316
Test Accuracy for LR fold 1 subset video: 0.7357404154725902
Test Accuracy for rf fold 1 subset video: 0.7360309557454339
Test Accuracy for LR fold 1 subset image: 0.7358658760449545
Test Accuracy for rf fold 1 subset image: 0.7363479087703543
Test Accuracy for LR fo

In [77]:
prediction_columns = []
for data_type in data_types:
    prediction_columns.append("Predictions_"+data_type+"_lr_model")
    prediction_columns.append("Predictions_"+data_type+"_rf_model")
#stateless_df['majority'] = stateless_df[prediction_columns].mode(axis=1)[0]
stateless_df["Prediction_sum"] = stateless_df[prediction_columns].sum(axis=1)
pd.DataFrame(classification_report(stateless_df["attack"], stateless_df["Prediction_sum"] >= 5, output_dict=True)).T

Unnamed: 0,precision,recall,f1-score,support
0,0.99828,0.595777,0.746212,462858.0
1,0.611003,0.998386,0.758073,294353.0
accuracy,0.752284,0.752284,0.752284,0.752284
macro avg,0.804642,0.797081,0.752142,757211.0
weighted avg,0.847733,0.752284,0.750823,757211.0
