In [113]:
import os
import random
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.tree import export_text
from joblib import dump, load

# Data Generation

In [114]:
columns = ['Invested Amount', 'Successful Attacks', 'Failed Attacks', 'Business Value', 'Number of Employees', 'Employee Training', 'Known Vulnerabilities', 'External Advisor', 'Risk']
features = columns[:-1]
LEVELS = ["LOW", "MEDIUM", "HIGH"]
prediction_result_mapping = {0: "LOW", 1: "MEDIUM", 2: "HIGH"}

In [115]:
def generate_data(nr_entries = 1000, min_empl = 30, max_empl = 90000, min_nr_attacks = 0, nr_attacks = 50, 
                  avg_business_value = 5000000, std_business_value = 50000, max_invested_perc = 0.3, max_nr_vulnerabilities = 10):
    df = pd.DataFrame(columns=columns)
    for i in range(0, nr_entries):
        nr_employees = random.randint(min_empl, max_empl)
        employees_training = random.choice(LEVELS)
        failed_attack = random.randrange(nr_attacks)
        succ_attack = random.randrange(nr_attacks)
        business_value = int(numpy.random.normal(loc=avg_business_value, scale=std_business_value))
        invested_perc = random.uniform(0, max_invested_perc)
        invested_amount = int(invested_perc * business_value)
        known_vulnerabilities = random.randrange(max_nr_vulnerabilities)
        external_adv = "NO" if (i % 2) == 0 else "YES"

        # Risk is computed based on the other parameters and is then categorized using 'get_categorized_risk'
        computed_risk = invested_perc - (succ_attack / nr_attacks) + (nr_employees / max_empl) * LEVELS.index(employees_training) - (known_vulnerabilities / max_nr_vulnerabilities) + (i % 2) / 3
        
        df.loc[i] = [invested_amount, succ_attack, failed_attack, business_value, nr_employees, employees_training, known_vulnerabilities, external_adv, get_categorized_risk(computed_risk)]
    return df

def get_categorized_risk(weighted_risk, low_medium_boundary = 1.0, medium_high_boundary = 0.0):
  if weighted_risk >= low_medium_boundary:
      return "LOW"
  elif weighted_risk >= medium_high_boundary and weighted_risk < low_medium_boundary:
      return "MEDIUM"
  else:
      return "HIGH"

In [116]:
data = generate_data(nr_entries = 10000)

In [117]:
data.head()

Unnamed: 0,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Number of Employees,Employee Training,Known Vulnerabilities,External Advisor,Risk
0,734240,29,0,5006577,58502,HIGH,9,NO,HIGH
1,1298563,30,40,5010682,22547,MEDIUM,4,YES,HIGH
2,384418,6,24,5004666,12540,MEDIUM,6,NO,HIGH
3,133058,13,31,4978776,40092,HIGH,5,YES,MEDIUM
4,579157,39,32,4992176,48295,HIGH,3,NO,MEDIUM


In [118]:
data.describe()

Unnamed: 0,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Number of Employees,Employee Training,Known Vulnerabilities,External Advisor,Risk
count,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,9968,50,50,9718,9455,3,10,2,3
top,102823,28,15,4991849,7636,MEDIUM,4,NO,HIGH
freq,2,231,230,3,4,3393,1044,5000,6041


## Data Export as .csv

In [119]:
data.to_csv('data.csv')

# Data Processing

## Categorization Step

In [120]:
levels_mapping = { 'LOW': 0, 'MEDIUM': 1, 'HIGH': 2 }
advisor_mapping = { 'NO': 0, 'YES': 1 }

data = data.replace({'Employee Training': levels_mapping, 'Risk': levels_mapping, 'External Advisor': advisor_mapping})

In [121]:
data.head()

Unnamed: 0,Invested Amount,Successful Attacks,Failed Attacks,Business Value,Number of Employees,Employee Training,Known Vulnerabilities,External Advisor,Risk
0,734240,29,0,5006577,58502,2,9,0,2
1,1298563,30,40,5010682,22547,1,4,1,2
2,384418,6,24,5004666,12540,1,6,0,2
3,133058,13,31,4978776,40092,2,5,1,1
4,579157,39,32,4992176,48295,2,3,0,1


## Normalization Step - using scaler

In [126]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

normalized_data = data[features].apply(lambda x: ( (x - x.min()) / (x.max() - x.min())))
normalized_data.head()

X_train, X_test, y_train, y_test = train_test_split(data[features].values, data["Risk"].values, random_state=0)

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

dump(scaler, 'MinMaxScaler.joblib') 

['MinMaxScaler.joblib']

# Deep Neural Network Classifier

In [127]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [128]:
# Define the training inputs
def get_train_inputs():
    x = { feature_name: tf.constant(X_train, dtype=tf.float64)}
    y = tf.constant(y_train, dtype=tf.int32)
    return x, y

# Define the test inputs
def get_test_inputs():
    x = { feature_name: tf.constant(X_test, dtype=tf.float64)}
    y = tf.constant(y_test, dtype=tf.int32)
    return x, y

In [129]:
X_train, X_test, y_train, y_test = train_test_split(normalized_data[features].values, data["Risk"].values, test_size=0.35)

In [130]:
# Specify that all features have real-value data
feature_name = "risk_features"
feature_columns = [tf.feature_column.numeric_column(feature_name, 
                                                    shape=len(features))]

In [131]:
# Build 5 layer DNN with 512, 256, 128, 64, 32 units respectively.
classifier = tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[512, 256, 128, 64],
                                            n_classes=3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpecea5k00', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## Train

In [None]:
classifier.train(input_fn=get_train_inputs, steps=5000)

Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpecea5k00/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 1.1058244, step = 0
INFO:tensorflow:global_step/sec: 7.23013
INFO:tensorflow:loss = 0.9889312, step = 100 (13.836 sec)
INFO:tensorflow:global_step/sec: 7.27287
INFO:tensorflow:loss = 0.91795915, step = 200 (13.749 sec)
INFO:tensorflow:global_step/

## Evaluate

In [None]:
accuracy_score = classifier.evaluate(input_fn=get_test_inputs, steps=100)["accuracy"]

print('Accuracy: {0:.2%}'.format(accuracy_score))

## Export

In [None]:
feature_spec = {'risk_features': tf.io.FixedLenFeature(shape=len(features), dtype=tf.float32)}

serving_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

classifier.export_saved_model(export_dir_base='./model/export', serving_input_receiver_fn=serving_fn)

## Prediction

### Data Preparation

In [None]:
def get_data_for_prediction():
  return { feature_name: tf.constant(normalized_prediction_df, dtype=tf.float64)}

# Create DataFrame from data to be predicted
prediction_df = pd.DataFrame([[1077113, 8, 29, 4947796, 57879, 1, 6, 0]], columns=features)

# Normalize using the initial dataframe -> bring the data for the prediction to the same scale
normalized_prediction_df = prediction_df[features]
for feature in features:
    normalized_prediction_df[feature] = normalized_prediction_df[feature].apply(lambda x: ( (x - data[feature].min()) / (data[feature].max() - data[feature].min())))

normalized_prediction_df

### Result

In [None]:
for pred in classifier.predict(input_fn=get_data_for_prediction):
  print("\n")
  print("Predicted RISK is: " + prediction_result_mapping[pred['class_ids'][0]])
  break

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

In [None]:
# Data here is not normalized
X_train, X_test, y_train, y_test = train_test_split(data[features].values, data["Risk"].values, random_state=0)

## Train

In [None]:
dtree_model = DecisionTreeClassifier().fit(X_train, y_train) 

## Evaluate

In [None]:
accuracy = dtree_model.score(X_test, y_test) 

print('Accuracy: {0:.2%}'.format(accuracy))

In [None]:
plot_confusion_matrix(dtree_model, X_test, y_test)  
plt.show()

## Export

In [None]:
#r = export_text(dtree_model, feature_names=features)
#print(r)

dump(dtree_model, 'Tree_classifier.joblib') 

## Prediction

In [None]:
# Create DataFrame from data to be predicted
prediction_df = pd.DataFrame([[1077113, 8, 29, 4947796, 57879, 1, 6, 0]], columns=features)
prediction_df

In [None]:
predicted_risk = dtree_model.predict(prediction_df)[0]
print("Predicted RISK is: " + prediction_result_mapping[predicted_risk])

# Support Vector Machine (SVM) classifier

In [None]:
from sklearn.svm import LinearSVC, SVC 

## Train

In [None]:
# Linear
#svm_model = LinearSVC().fit(X_train, y_train)

svm_model = SVC(kernel='poly').fit(X_train, y_train)

## Evaluate

In [None]:
accuracy = svm_model.score(X_test, y_test) 

print('Accuracy: {0:.2%}'.format(accuracy))

In [None]:
plot_confusion_matrix(svm_model, X_test, y_test)  
plt.show()

## Export

In [None]:
dump(svm_model, 'SVM_classifier.joblib') 

## Prediction

In [None]:
predicted_risk = svm_model.predict(scaler.transform([[1077113, 8, 29, 4947796, 57879, 1, 6, 0]]))[0]
print("Predicted RISK is: " + prediction_result_mapping[predicted_risk])

# K-nearest Neighbours Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

## Train

In [None]:
knn = KNeighborsClassifier(n_neighbors = 17).fit(X_train, y_train) 

## Evaluate

In [None]:
accuracy = knn.score(X_test, y_test) 
print('Accuracy: {0:.2%}'.format(accuracy))

In [None]:
plot_confusion_matrix(knn, X_test, y_test)  
plt.show()

## Export

In [None]:
dump(knn, 'KNN_classifier.joblib') 

## Prediction

In [None]:
predicted_risk = knn.predict(scaler.transform([[1077113, 8, 29, 4947796, 57879, 1, 6, 0]]))[0]
print("Predicted RISK is: " + prediction_result_mapping[predicted_risk])

# Multilayer Perceptron (MLP) using Backpropagation

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
clf = MLPClassifier(activation='tanh', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter=10000)

## Train

In [None]:
clf.fit(X_train, y_train)

## Evaluate

In [None]:
accuracy = clf.score(X_test, y_test) 

print('Accuracy: {0:.2%}'.format(accuracy))

# lbfgs
# identity -> 90.72%
# logistic -> 98.88%
# tanh -> 99.68%
# relu -> 62.08%


# sgd
# identity -> 90.48%
# logistic -> 62.08%
# tanh -> 98.24%
# relu -> 62.08%

# adam
# identity -> 90.88%
# logistic -> 98.48%
# tanh -> 99.04%
# relu -> 98.88%

In [None]:
plot_confusion_matrix(clf, X_test, y_test)  
plt.show()

## Export

In [None]:
dump(clf, 'MLP_back.joblib') 

## Prediction

In [None]:
predicted_risk = clf.predict(scaler.transform([[1077113, 8, 29, 4947796, 57879, 1, 6, 0]]))[0]
print("Predicted RISK is: " + prediction_result_mapping[predicted_risk])