In [4]:
import os
import random
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.tree import export_text
from joblib import dump, load
from tqdm import tqdm
from wandb.sklearn import plot_learning_curve
from sklearn.model_selection import GridSearchCV
import time

MODEL_OUTPUT_PATH = "../src/models/"
SCALER_OUTPUT_PATH = "../src/scaler/"

# Distributed Denial-of-Service Risk Prediction

# Data Generation

In [138]:
columns = ['Industry', 'Region', 'Invested Amount', 'Successful Attacks', 'Failed Attacks', 'Business Value', 'Known Vulnerabilities', 'External Advisor', 'Risk']
features = ddos_columns[:-1]
INDUSTRIES = ['AUTOMOTIVE', 'HEALTHCARE', 'ECOMMERCE', 'TELECOM', 'FINANCIAL SERVICES', 'EDUCATION', 'OTHER']
REGIONS = ['EUROPE', 'NORTH AMERICA', 'SOUTH AMERICA', 'AFRICA', 'ASIA']
INDUSTRY_TO_RISK = {
    'AUTOMOTIVE': 0.2,
    'HEALTHCARE': 0.8,
    'ECOMMERCE': 0.6,
    'TELECOM': 0.75,
    'FINANCIAL SERVICES': 1,
    'EDUCATION': 0.3,
    'OTHER': 0.5
}

REGION_TO_RISK = {
    'EUROPE': 0.8,
    'NORTH AMERICA': 1,
    'SOUTH AMERICA': 1,
    'AFRICA': 0.2,
    'ASIA': 0.5
}

ADVISOR = ['YES', 'NO']
LEVELS = ['LOW', 'MEDIUM', 'HIGH']

In [117]:
def generate_data(nr_entries = 1000, min_nr_attacks = 0, nr_attacks = 50, 
                  avg_business_value = 5000000, std_business_value = 50000, max_invested_perc = 0.3, max_nr_vulnerabilities = 10):
    df = pd.DataFrame(columns=ddos_columns)
    for i in tqdm(range(0, nr_entries)):
        
        region = random.choice(REGIONS)
        industry = random.choice(INDUSTRIES)
        failed_attack = random.randrange(nr_attacks)
        succ_attack = random.randrange(nr_attacks)
        
        business_value = int(numpy.random.normal(loc=avg_business_value, scale=std_business_value))
        invested_perc = random.uniform(0, max_invested_perc)
        invested_amount = int(invested_perc * business_value)
        
        known_vulnerabilities = random.randrange(max_nr_vulnerabilities)
        external_adv = random.choice(ADVISOR)

        # Risk is computed based on the other parameters and is then categorized using 'get_categorized_risk'
        computed_risk = (succ_attack / nr_attacks) + (known_vulnerabilities / max_nr_vulnerabilities) + INDUSTRY_TO_RISK[industry] + REGION_TO_RISK[region] - (invested_perc / max_invested_perc) - ADVISOR.index(external_adv)
        df.loc[i] = [industry, region, invested_amount, succ_attack, failed_attack, business_value, known_vulnerabilities, external_adv, get_categorized_risk(computed_risk)]
    return df

def get_categorized_risk(weighted_risk, low_medium_boundary = 1.0, medium_high_boundary = 2.0):
  if weighted_risk >= medium_high_boundary:
      return "HIGH"
  elif weighted_risk >= low_medium_boundary and weighted_risk < medium_high_boundary:
      return "MEDIUM"
  else:
      return "LOW"

In [125]:
data = generate_data(nr_entries = 10000)

100%|██████████| 10000/10000 [00:21<00:00, 467.52it/s]


In [132]:
data.head()

Unnamed: 0,Industry,Region,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Known Vulnerabilities,External Advisor,Risk
0,FINANCIAL SERVICES,NORTH AMERICA,1335499,36,9,5043962,3,NO,MEDIUM
1,AUTOMOTIVE,SOUTH AMERICA,897682,25,30,5010350,3,YES,MEDIUM
2,OTHER,NORTH AMERICA,705482,8,12,5060170,9,YES,HIGH
3,TELECOM,ASIA,1220247,38,24,4953331,3,YES,MEDIUM
4,OTHER,AFRICA,1216607,49,38,4948951,2,YES,MEDIUM


In [127]:
data.describe()

Unnamed: 0,Industry,Region,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Known Vulnerabilities,External Advisor,Risk
count,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,7,5,9967,50,50,9720,10,2,3
top,EDUCATION,SOUTH AMERICA,634831,25,30,4947654,9,NO,MEDIUM
freq,1459,2033,2,242,234,3,1050,5075,4222


### Data Export as .csv

In [128]:
data.to_csv('ddos_data.csv')

## Data Processing

### Categorization Step

In [134]:
start = time.time()

levels_mapping = { 'LOW': 0, 'MEDIUM': 1, 'HIGH': 2 }
advisor_mapping = { 'NO': 0, 'YES': 1 }
industry_mapping = {
    'AUTOMOTIVE': 0,
    'HEALTHCARE': 1,
    'ECOMMERCE': 2,
    'TELECOM': 3,
    'FINANCIAL SERVICES': 4,
    'EDUCATION': 5,
    'OTHER': 6
}

region_mapping = {
    'EUROPE': 0,
    'NORTH AMERICA': 1,
    'SOUTH AMERICA': 2,
    'AFRICA': 3,
    'ASIA': 4
}


data = data.replace({'Employee Training': levels_mapping, 'Risk': levels_mapping, 'External Advisor': advisor_mapping, 'Industry': industry_mapping, 'Region': region_mapping})

stop = time.time()
print(f"Categorization time: {stop - start}s")

Categorization time: 0.006540775299072266s


In [137]:
data.head()

Unnamed: 0,Industry,Region,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Known Vulnerabilities,External Advisor,Risk
0,4,1,1335499,36,9,5043962,3,0,1
1,0,2,897682,25,30,5010350,3,1,1
2,6,1,705482,8,12,5060170,9,1,2
3,3,4,1220247,38,24,4953331,3,1,1
4,6,3,1216607,49,38,4948951,2,1,1


### Normalization Step - using scaler

In [139]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Used for the scikit models
#data = pd.read_csv("ddos_data.csv")
data.head()
X_train, X_test, y_train, y_test = train_test_split(data[features].values, data["Risk"].values, random_state=0)

scaler = MinMaxScaler()
scaler.fit(X_train)

# Normalized Data
X = data[features]
Y = data["Risk"]
X_normalized = scaler.transform(data[features])


# Normalized Splitted Data
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

In [140]:
dump(scaler, SCALER_OUTPUT_PATH + 'MinMaxScaler_ddos.joblib') 

['../src/scaler/MinMaxScaler_ddos.joblib']

In [142]:
scaler.scale_

array([1.66666667e-01, 2.50000000e-01, 6.47020117e-07, 2.04081633e-02,
       2.04081633e-02, 2.55117660e-06, 1.11111111e-01, 1.00000000e+00])

# Multilayer Perceptron (MLP) using Backpropagation

In [154]:
from sklearn.neural_network import MLPClassifier

In [169]:
mlp = MLPClassifier(max_iter=100000)

In [170]:
mlp_params={
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(5, 2)],
'solver': ["lbfgs", "sgd", "adam"],
'activation': ["logistic", "relu", "tanh"]
}

# Instanciate the Grid for parameter tuning
mlp_grid = GridSearchCV(mlp, mlp_params, cv=10, scoring='accuracy', n_jobs=-1)

mlp = mlp_grid.fit(X_normalized, Y)

In [171]:
print(mlp.best_estimator_)
print(mlp.best_params_)
print('Best Accuracy: {0:.2%}'.format(mlp.best_score_))

MLPClassifier(activation='logistic', hidden_layer_sizes=(5, 2), max_iter=100000,
              solver='lbfgs')
{'activation': 'logistic', 'hidden_layer_sizes': (5, 2), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Best Accuracy: 90.03%


## Prediction

In [175]:
prediction_result_mapping = {0: 'LOW', 1: 'MEDIUM', 2: 'HIGH'}

predicted_risk = mlp.predict(scaler.transform([[4, 0, 57879, 1, 6, 4947796, 0, 0]]))[0]
print("Predicted RISK is: " + prediction_result_mapping[predicted_risk])

Predicted RISK is: LOW


## Export

In [176]:
dump(mlp, MODEL_OUTPUT_PATH + 'MLP_ddos_classifier.joblib') 

['../src/models/MLP_ddos_classifier.joblib']