In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import time
import subprocess

# Using the module ....

import os
import platform

from sklearn import preprocessing
import numpy as np

from causalnex.structure.notears import from_pandas
from causalnex.structure import StructureModel
from causalnex.structure.notears import from_pandas

from IPython.display import Image
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

import pygraphviz
import networkx as nx

from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import metrics

from sklearn.model_selection import train_test_split

from causalnex.network import BayesianNetwork
from causalnex.evaluation import classification_report

import dowhy
from dowhy import CausalModel
import dowhy.datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import dowhy.datasets, dowhy.plotter


# Avoid unnecessary log messages and warnings
import logging
logging.getLogger("dowhy").setLevel(logging.WARNING)
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Load some sample data

Read the dataset

In [None]:
# Location of the input CSV; replace the placeholder with the real dataset path.
DATASET_PATH = "replace this with the dataset path"
df = pd.read_csv(DATASET_PATH)

Unique values of the target attribute (`is_attack`)

In [None]:
# Distinct values present in the target column `is_attack`.
df['is_attack'].unique()

# Data Preprocessing

In [None]:
# Remove rows that repeat the header (a cell equal to its own column name) and
# collect the columns in which more than 90 % of the values are NaN.
NAN_FRACTION_THRESHOLD = 0.9

# Flag header rows across all columns first and drop them in a single step, so
# that every column's NaN share is measured against the same, final row count
# (dropping inside the per-column loop made the result depend on column order).
header_row_mask = np.zeros(len(df), dtype=bool)
for column_name in df.columns:
    header_row_mask |= (df[column_name] == column_name).fillna(False).to_numpy(dtype=bool)
df.drop(df.index[header_row_mask], inplace=True)

# Columns whose NaN count exceeds the threshold share of the remaining rows.
nan_counts = df.isna().sum()
nanAttr = nan_counts[nan_counts > len(df) * NAN_FRACTION_THRESHOLD].index.tolist()



In [None]:
# Drop the columns collected in nanAttr (more than 90 % of their values are NaN).
df.drop(columns=nanAttr, inplace=True)

In [None]:
# Replace the remaining NaN values with the per-column median.
# numeric_only=True restricts the median to numeric columns: without it,
# pandas >= 2.0 raises TypeError as soon as the frame contains a non-numeric
# column. NaNs in non-numeric columns are left untouched here.
df.fillna(df.median(numeric_only=True), inplace=True)

In [None]:
# Eliminate extraneous variables: every column except the last one that holds a
# single distinct value is printed and removed from df.
for column_name in list(df.columns[:-1]):
    if df[column_name].nunique(dropna=False) == 1:
        print(column_name)
        del df[column_name]

In [None]:
# Summary of the cleaned frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Column names of the cleaned frame, as a plain list.
col = list(df.columns)

In [None]:
# Work on an independent copy so that the encoding applied to df_feat below never
# writes back into df. (df.iloc[:, :] is not guaranteed to be a copy and makes the
# later column assignments raise SettingWithCopyWarning.)
df_feat = df.copy()

In [None]:
# Names of the columns whose dtype is not numeric; these need label encoding.
non_numeric_columns = df_feat.select_dtypes(exclude=[np.number]).columns.tolist()
print(non_numeric_columns)

In [None]:
# Transform the non-numeric columns into integer labels.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# The loop variable is deliberately not called `col`: that name already holds the
# list of column names and must not be overwritten with a single column name.
for column_name in non_numeric_columns:
    # Cast to str first so that mixed-type or NaN entries are encoded uniformly.
    df_feat[column_name] = le.fit_transform(df_feat[column_name].astype(str))

df_feat.head(16)

In [None]:
# Summary of the encoded frame; the non-numeric columns were label-encoded above.
df_feat.info()

In [None]:
# Structure learning: learn the causal structure from the encoded feature frame
# with CausalNex's NOTEARS `from_pandas`, called with default settings only.

sm = from_pandas(df_feat)

In [None]:
# Size of the learned structure (presumably the edge count, as with networkx graphs).
sm.size()

In [None]:
# All learned edges together with their attribute dictionaries.
sm.edges.data()

In [None]:
# Optional: persist the unfiltered learned structure in Graphviz DOT format.
raw_graph_path = 'graph.dot'
nx.drawing.nx_pydot.write_dot(sm, raw_graph_path)

In [None]:
# Prune edges below the threshold passed to remove_edges_below_threshold, then
# render the remaining structure to a PNG and display it.
EDGE_WEIGHT_THRESHOLD = 0.8
sm.remove_edges_below_threshold(EDGE_WEIGHT_THRESHOLD)

filename = "final_model_crude.png"
viz = plot_structure(
    sm,
    graph_attributes={"scale": "1"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.draw(filename, prog="circo")
Image(filename)

In [None]:
# Flip every edge that leaves the target so that it points into the target instead.
# The neighbour list is materialised first because the graph is modified afterwards.
neighbors = list(sm.neighbors('is_attack'))
for neighbor in neighbors:
    print(neighbor)

for neighbor in neighbors:
    sm.remove_edge("is_attack", neighbor)
    sm.add_edge(neighbor, "is_attack")

In [None]:
# Manually add an edge from each listed attribute into the target node.
for parent in (
    "ttl",
    "src_ip",
    "dst_ip",
    "timestamp",
    "ip_len",
    "protocol",
    "tcp_flag_push",
    "tcp_flag_reset",
    "src_port",
    "dst_port",
    "tcp_flag_fin",
    "ip_flag_df",
    "tcp_flag_syn",
):
    sm.add_edge(parent, "is_attack")

In [None]:
# Keep only the largest subgraph of the structure, then render and display it.
sm = sm.get_largest_subgraph()

filename = "final_model.png"
viz = plot_structure(
    sm,
    graph_attributes={"scale": "2"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
viz.draw(filename, prog="circo")
Image(filename)

In [None]:
# Persist the largest subgraph in DOT format; the DoWhy section loads this file.
largest_subgraph_path = 'graphLargestSubgraph.dot'
nx.drawing.nx_pydot.write_dot(sm, largest_subgraph_path)

In [None]:
# Edges of the final structure with their "weight" attribute. Edges added via
# add_edge(...) without a weight argument presumably show None here — verify.
sm.edges(data="weight")

In [None]:
# Render the final, fully styled causal graph.

# Graph-level Graphviz attributes (layout, title, canvas).
graph_attributes = dict(
    splines="spline",
    ordering="out",
    ratio="auto",
    size="16,9!",
    label="Causal inference for UDP port attack",
    fontcolor="black",
    fontname="Helvetica",
    fontsize=150,
    labeljust="l",
    labelloc="t",
    pad="1,1",
    dpi=200,
    nodesep=0.8,
    ranksep=".5 equally",
    bgcolor="white",
    scale=0.5,
)

# Every node starts from the same style; each node gets its own dict so that
# individual fill colours can be overridden below.
base_node_style = {
    "shape": "octagon",
    "width": 10,
    "height": 10,
    "fillcolor": "#BDFF52",
    "penwidth": "10",
    "color": "black",
    "fontsize": 100,
    "labelloc": "b",
    "fontcolor": "black",
}
node_attributes = {node: dict(base_node_style) for node in sm.nodes}

# Highlight the target node and its direct parents.
node_attributes["is_attack"]["fillcolor"] = "red"
for parent in sm.predecessors("is_attack"):
    node_attributes[parent]["fillcolor"] = "#FF370A"

# All edges share one style; the learned edge weight is not used for drawing.
base_edge_style = {
    "penwidth": 2,
    "weight": 5,
    "arrowsize": 10,
    "arrowtail": "dot",
    "color": "black",
}
edge_attributes = {(u, v): dict(base_edge_style) for u, v in sm.edges}

viz = plot_structure(
    sm,
    prog="circo",
    graph_attributes=graph_attributes,
    node_attributes=node_attributes,
    edge_attributes=edge_attributes,
)
f = "final_causal_model_ScanA.jpg"
viz.draw(f)
Image(f)

In [None]:
# Bayesian networks in CausalNex support only discrete distributions, so the
# continuous columns are discretised next. Start from the current column list.

col = list(df_feat.columns)
col

In [None]:
# Discretise every column except the last one into two uniform buckets.
from causalnex.discretiser import Discretiser

# Independent copy: df_feat.loc[:, :] may share data with df_feat, and the DoWhy
# section later uses df_feat as the undiscretised data, so it must stay untouched.
df_c = df_feat.copy()

# The last entry of `col` is skipped (presumably the target column — confirm).
for c in col[:-1]:
    column_values = df_c[c].values
    df_c[c] = (
        Discretiser(method="uniform", num_buckets=2)
        .fit(column_values)
        .transform(column_values)
    )

In [None]:
# Summary of the discretised frame.
df_c.info()

# Learning the Bayesian causal model

In [None]:
# Split the discretised data 80/20 into train and test sets (fixed seed for reproducibility).

train, test = train_test_split(
    df_c,
    test_size=0.2,
    train_size=0.8,
    random_state=10,
)

In [None]:
# Wrap the learned structure in a BayesianNetwork. Nothing is fitted in this cell;
# node states and conditional probability distributions are fitted in the next cells.
bn = BayesianNetwork(sm)

In [None]:
# Step 1: specify all of the states that each node can take. The full discretised
# frame (df_c) is passed here, not only the training split.
bn = bn.fit_node_states(df_c)

In [None]:
# Step 2: fit the conditional probability distributions on the training split,
# using the "BayesianEstimator" method with a "K2" prior.
bn = bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

In [None]:
# Predict the target attribute for the test split; the result is read later
# through its 'is_attack_prediction' column.
predictions = bn.predict(test, "is_attack")

In [None]:
# Ground-truth target values of the test split, for comparison with the predictions.
test["is_attack"].values

In [None]:
# Per-class report for the Bayesian network on the test split.
# NOTE: this is causalnex.evaluation.classification_report — it is imported after
# sklearn.metrics.classification_report in the import cell and therefore shadows it.
classification_report(bn, test, "is_attack")

# Building the ROC and AUC curves

In [None]:
# ROC points and AUC of the Bayesian network on the test split, computed by CausalNex.
# `roc` is consumed below as a sequence of pairs (x = false positive rate, y = true positive rate).
from causalnex.evaluation import roc_auc
roc, auc = roc_auc(bn, test, "is_attack")
print(roc)
print(auc)

In [None]:
# Ground-truth target column of the test split.
test['is_attack']

In [None]:
# Predicted target values for the test split. The trailing comma was removed: it
# wrapped the Series in a 1-tuple, so the cell displayed a tuple instead of the Series.
predictions['is_attack_prediction']

In [None]:
# Cross-check with scikit-learn: ROC curve computed from the predicted values in
# 'is_attack_prediction' against the true labels, with class 1 as the positive label.
from sklearn.metrics import roc_curve
fpr1, tpr1, thresh1 = roc_curve(test['is_attack'], predictions['is_attack_prediction'], pos_label=1)
print(fpr1, tpr1, thresh1)

In [None]:
from sklearn.metrics import roc_auc_score

# AUC score computed by scikit-learn from the same true labels and predicted values.
auc_score1 = roc_auc_score(test['is_attack'],predictions['is_attack_prediction'])
print(auc_score1 )

In [None]:
# Split the ROC points into separate x (first element) and y (second element) lists.
roc_points = list(roc)
x = [point[0] for point in roc_points]
y = [point[1] for point in roc_points]

In [None]:
# Plot the Bayesian network's ROC curve and save it as 'roc1' before showing it.
roc_axes = plt.gca()
roc_axes.plot(x, y)
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_xlabel('False Positive Rate')
plt.gcf().savefig(
    'roc1',
    transparent=True,
    facecolor="w",
    bbox_inches="tight",
)
plt.show()

In [None]:
# Save the ROC curve as 'roc'. The figure drawn in the previous cell has already
# been shown and closed, so a bare plt.savefig here would write an empty canvas;
# the curve is therefore redrawn on a fresh figure before saving.
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(x, y)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_fig.savefig('roc')
plt.close(roc_fig)

# Assessing the robustness of the model using Microsoft DoWhy

In [None]:
# Treatment variables: the direct predecessors (parents) of the target node in the
# constructed structure model.
treatment =  list(sm.predecessors("is_attack"))

In [None]:
# Outcome variable for the DoWhy analysis: the target column.
outcome ='is_attack'

In [None]:
# Build the DoWhy causal model from the encoded data and the DOT graph saved
# earlier, then render it. The rendered image is displayed from "causal_model.png".
causal_graph_path = "./graphLargestSubgraph.dot"
model = dowhy.CausalModel(
    data=df_feat,
    graph=causal_graph_path,
    treatment=treatment,
    outcome=outcome,
)
model.view_model()

display(Image(filename="causal_model.png"))

In [None]:
# Identify the causal effect (estimand) from the model's graph.
# proceed_when_unidentifiable=True is passed so that identification continues
# instead of stopping when the effect is flagged as unidentifiable — see the DoWhy docs.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

In [None]:
# Estimate the effect for the identified estimand with the
# "backdoor.linear_regression" estimation method.
estimate = model.estimate_effect(identified_estimand,method_name="backdoor.linear_regression")
print(estimate)

In [None]:
# Robustness check 1: run DoWhy's "placebo_treatment_refuter" against the estimate
# and print its result for inspection.
refute2_results=model.refute_estimate(identified_estimand, estimate,
        method_name="placebo_treatment_refuter")
print(refute2_results)

In [None]:
# Robustness check 2: run DoWhy's "data_subset_refuter" against the estimate
# and print its result for inspection.
refute3_results=model.refute_estimate(identified_estimand, estimate,
        method_name="data_subset_refuter")
print(refute3_results)

# Calculating the results of other machine learning algorithms using the filtered attributes

In [None]:
def getMeasurements(y_pred, y_test):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predictions come first, ground truth second. The
    scikit-learn scorers are nevertheless called as ``scorer(y_test, y_pred)``.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels.

    Returns:
        dict with the keys 'Accuracy', 'Precison', 'Recall' and 'F1score'
        (key spelling kept as-is because the keys are part of the output).
    """
    scorers = (
        ('Accuracy', metrics.accuracy_score),
        ('Precison', metrics.precision_score),
        ('Recall', metrics.recall_score),
        ('F1score', metrics.f1_score),
    )
    return {label: scorer(y_test, y_pred) for label, scorer in scorers}

In [None]:
def MLAlgorithms(df_feat, x, y, roc_output_path="ROCfinal_model.pdf"):
    """Train five baseline classifiers and report their test metrics and ROC curves.

    The data is split 80/20 (random_state=42). For each of SVM, KNN, decision
    tree, random forest and Gaussian naive Bayes the function prints the model
    name, the ROC AUC and the dictionary returned by ``getMeasurements``, and adds
    the model's ROC curve to one shared figure, which is saved and then shown.

    Changes compared with the previous version:
      * The output file no longer depends on the notebook-global ``filename``;
        the default path is the file name that global produced in a top-to-bottom run.
      * The random forest is a ``RandomForestClassifier`` instead of a
        ``RandomForestRegressor`` whose output was rounded, and it is now
        included in the AUC output and the ROC figure like the other models.
      * A ``roc_auc_score`` call whose result was discarded has been removed.

    NOTE: the ROC curves are built from predicted class labels, not from scores
    or probabilities, so each curve has a single interior point.

    Args:
        df_feat: Unused; kept so that existing calls keep working.
        x: Feature matrix.
        y: Target labels; class 1 is treated as the positive class.
        roc_output_path: File the combined ROC figure is written to.

    Returns:
        None. Results are printed and plotted.
    """
    # Imported locally because the notebook's import cell only provides
    # RandomForestRegressor from sklearn.ensemble.
    from sklearn.ensemble import RandomForestClassifier

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    # (printed heading, legend label, estimator) — headings and labels are kept
    # exactly as they were printed/plotted before.
    candidates = [
        ("SVM", "SVM", svm.SVC()),
        ("KNN", "KNN", KNeighborsClassifier()),
        ("Decision tree", "Decision Tree", tree.DecisionTreeClassifier()),
        (
            "Random Forest",
            "Random Forest",
            RandomForestClassifier(n_estimators=20, random_state=0),
        ),
        ("Naive Bayesian", "Naive Bayesian", GaussianNB()),
    ]

    fig, ax = plt.subplots()
    for printed_name, legend_label, estimator in candidates:
        print(printed_name)
        estimator.fit(X_train, y_train)
        # Predict on data the model has not seen during training.
        y_pred = estimator.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=1)
        print(metrics.auc(fpr, tpr))
        ax.plot(fpr, tpr, label=legend_label)
        print(getMeasurements(y_pred, y_test))

    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc='best')
    fig.savefig(roc_output_path, dpi=300)
    plt.show()


In [None]:
# Feature subset for the baseline models: the direct parents of the target.
# predecessors() returns a one-shot iterator, so it is materialised as a list so
# that columnsNames can be inspected or reused after this cell.
columnsNames = list(sm.predecessors("is_attack"))
# Direct column selection fails loudly (KeyError) if a graph node has no matching
# column; pd.DataFrame(df_c, columns=...) would silently create an all-NaN column.
dfNew = df_c[columnsNames].copy()


In [None]:
y = df_c["is_attack"]
# Features must not contain the target. Previously `x = dfNew` was only an alias,
# so adding the 'is_attack' column to dfNew below also added it to x and leaked
# the label into every classifier. drop() returns a new frame; errors="ignore"
# keeps the cell re-runnable once dfNew already holds the target column.
x = dfNew.drop(columns=["is_attack"], errors="ignore")
dfNew["is_attack"] = y
MLAlgorithms(dfNew, x, y)