<h2> <div align="center">Development of a Dynamic Machine Learning Model Based on a Static Model to Detect Network Intrusion on a Real-Time Data Stream Using Kafka Integration</div> </h2>

---
---

In [None]:
!pip install tensorflow-io
!pip install kafka-python



In [None]:
# Necessary modules
import random
random.seed(22)
import numpy as np
import pandas as pd
import os
from datetime import datetime
import time
import threading
import json
from kafka import KafkaProducer, KafkaConsumer
from kafka.errors import KafkaError
import pickle 

# Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('notebook')
matplotlib.rcParams['figure.figsize'] = (12,8)
import warnings
warnings.filterwarnings("ignore")


from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import mutual_info_classif, VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


In [None]:
# Kafka consumer subscribed to the 'ml-raw-dns' topic on the local broker.
# Offset reset policy is 'earliest' and automatic offset commits are disabled.
consumer_options = {
    'bootstrap_servers': "localhost:9092",
    'auto_offset_reset': 'earliest',
    'enable_auto_commit': False,
}
consumer = KafkaConsumer('ml-raw-dns', **consumer_options)

In [None]:
# Window-by-window comparison of the static (pre-trained, pickled) model against a
# dynamic model that is re-trained on every window of records read from Kafka.
static_win_acc = []    # static-model score for each window
dynamic_win_acc = []   # dynamic-model score for each window
static_cm = []         # static-model test-set predictions for each window
dynamic_cm = []        # dynamic-model test-set predictions for each window

# Field order of one comma-separated record on the stream.
stream_columns = ['timestamp','FQDN_count','subdomain_length','upper','lower',
                  'numeric','entropy','special','labels','labels_max','labels_average',
                  'longest_word','sld','len','subdomain','Target Attack']

# Features fed to both models.
model_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'entropy', 'special',
                  'labels', 'labels_max', 'labels_average', 'len', 'subdomain',
                  'longest_word_hash2', 'longest_word_hash5', 'sld2']

# The static model never changes, so load it once (and close the file) instead of
# re-opening the pickle in every window.
# NOTE: unpickling can execute arbitrary code - only load a trusted XGB_Model.pkl.
with open('XGB_Model.pkl', 'rb') as model_file:
    statModel = pickle.load(model_file)

# FeatureHasher is stateless, and GridSearchCV clones its estimator on fit, so these
# objects can be shared by all windows.
hasher = FeatureHasher(n_features=5, input_type='string')
pipe_xgbc = make_pipeline(XGBClassifier(verbosity=1,random_state=22,n_jobs =-1))
param_grid_xgbc = {'xgbclassifier__n_estimators': [1000],'xgbclassifier__reg_alpha': [0.01],'xgbclassifier__gamma': [0.001],'xgbclassifier__learning_rate': [0.1], 'xgbclassifier__scale_pos_weight': [0.9],'xgbclassifier__subsample': [0.9], 'xgbclassifier__min_child_weight':[1]}

for i in range(1,16):
    # Collect the 1000 records whose per-window counter lies in
    # (i*1000 + 50, (i+1)*1000 + 50]; everything before that is skipped.
    # NOTE(review): the same consumer object is iterated again for every window while
    # the counter restarts at 0 - confirm that the resulting windows are the intended ones.
    dataStream = []
    for ranges, m in enumerate(consumer):
        if (i*1000)+50 < ranges <= ((i+1)*1000)+50:
            dataStream.append(m)
        if ranges == ((i+1)*1000)+50:
            break

    # Decode each payload, strip the 2 leading / 3 trailing wrapper characters and split
    # it into fields. The payload is read by attribute name rather than tuple position
    # so it does not depend on the field order of the record type.
    realtimeData = [m.value.decode("utf-8")[2:-3].split(",") for m in dataStream]

    # Build the frame in one step; growing it row by row with DataFrame.append is
    # quadratic and the method no longer exists in pandas 2.x.
    df = pd.DataFrame(realtimeData, columns=stream_columns)
    df = df.drop(columns=['timestamp']).replace(np.nan, 0)

    # Replace the two free-text columns by 5 hashed features each.
    longest_word = pd.DataFrame(
        hasher.fit_transform(df[['longest_word']].astype(str).values).toarray(),
        columns=['longest_word_hash'+str(k) for k in range(1,6)])
    sld = pd.DataFrame(
        hasher.fit_transform(df[['sld']].astype(str).values).toarray(),
        columns=['sld'+str(k) for k in range(1,6)])
    df = pd.concat([df.drop(columns=['longest_word', 'sld']), longest_word, sld], axis=1)

    X = df.drop(['Target Attack'], axis = 1)
    y = df['Target Attack'].astype(float)
    X_new = X[model_features].astype(float)

    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, random_state=22)

    # Static model: predict only, no re-training.
    # NOTE(review): which metric statModel.score() returns depends on the pickled object;
    # confirm it is the same metric as the dynamic model's score below.
    statPred = statModel.predict(X_test)
    statAcc = statModel.score(X_test, y_test)
    static_win_acc.append(statAcc)
    static_cm.append((statPred))

    # Dynamic model: re-fit on the training part of the current window.
    # GridSearchCV.score() uses the configured `scoring` ("f1"), so the values stored in
    # dynamic_win_acc are F1 scores even though they are labelled as accuracies.
    dynaModel = GridSearchCV(pipe_xgbc, param_grid_xgbc, cv=5, n_jobs=-1, scoring="f1" )
    dynaModel.fit(X_train, y_train)
    dynaPred = dynaModel.predict(X_test)
    dynaModel_result=dynaModel.score(X_test,y_test)
    dynamic_win_acc.append(dynaModel_result)
    dynamic_cm.append((dynaPred))

print('Static Model Accuracies',static_win_acc)
print('Dynamic Model Accuracies',dynamic_win_acc)

Static Model Accuracies [0.8502994011976048, 0.8317460317460318, 0.8753993610223643, 0.8698224852071006, 0.8417910447761193, 0.8721311475409836, 0.9054441260744985, 0.8498402555910544, 0.8145695364238411, 0.8613569321533924, 0.8463949843260188, 0.8433734939759037, 0.8690095846645367, 0.8387096774193548, 0.8708708708708708]
Dynamic Model Accuracies [0.8328267477203647, 0.8242811501597446, 0.8645161290322582, 0.8656716417910447, 0.8468468468468469, 0.8675496688741721, 0.8959537572254335, 0.8414239482200647, 0.8039867109634551, 0.8511904761904762, 0.8444444444444444, 0.844984802431611, 0.8717948717948718, 0.814569536423841, 0.8606060606060607]


In [None]:
# Label the windows win_1 .. win_15 and pair each label with the static model's score.
staticAccuracies_max = []
windows = ['win_{}'.format(n) for n in range(1, 16)]
staticAccuracies = dict(zip(windows, static_win_acc))
print(staticAccuracies)

# Report the best-scoring window and remember its score.
best_static_window = max(staticAccuracies, key=staticAccuracies.get)
print('Max Acc Window:', best_static_window)
print('Max Acc: ', staticAccuracies[best_static_window])
staticAccuracies_max.append(staticAccuracies[best_static_window])

{'win_1': 0.8502994011976048, 'win_2': 0.8317460317460318, 'win_3': 0.8753993610223643, 'win_4': 0.8698224852071006, 'win_5': 0.8417910447761193, 'win_6': 0.8721311475409836, 'win_7': 0.9054441260744985, 'win_8': 0.8498402555910544, 'win_9': 0.8145695364238411, 'win_10': 0.8613569321533924, 'win_11': 0.8463949843260188, 'win_12': 0.8433734939759037, 'win_13': 0.8690095846645367, 'win_14': 0.8387096774193548, 'win_15': 0.8708708708708708}
Max Acc Window: win_7
Max Acc:  0.9054441260744985


In [None]:
# Pair each window label with the dynamic model's score for that window.
dynamicAccuracies_max = []
dynamicAccuracies = dict(zip(windows, dynamic_win_acc))
print(dynamicAccuracies)

# Report the best-scoring window and remember its score.
best_dynamic_window = max(dynamicAccuracies, key=dynamicAccuracies.get)
print('Max Acc Window: ', best_dynamic_window)
print('Max Acc: ', dynamicAccuracies[best_dynamic_window])
dynamicAccuracies_max.append(dynamicAccuracies[best_dynamic_window])

{'win_1': 0.8328267477203647, 'win_2': 0.8242811501597446, 'win_3': 0.8645161290322582, 'win_4': 0.8656716417910447, 'win_5': 0.8468468468468469, 'win_6': 0.8675496688741721, 'win_7': 0.8959537572254335, 'win_8': 0.8414239482200647, 'win_9': 0.8039867109634551, 'win_10': 0.8511904761904762, 'win_11': 0.8444444444444444, 'win_12': 0.844984802431611, 'win_13': 0.8717948717948718, 'win_14': 0.814569536423841, 'win_15': 0.8606060606060607}
Max Acc Window:  win_7
Max Acc:  0.8959537572254335


In [None]:
# Static and Dynamic model Confusion Matrix
# `y_test` only holds the labels of the last processed window, so it must be compared
# with the predictions made on that same window (`statPred` / `dynaPred`).
# The previous lookup through the one-element `*_max` lists always resolved to index 0,
# i.e. it compared the first window's predictions with the last window's labels.
# Reading the prediction variables directly also keeps this cell safe to re-run after
# `static_cm` / `dynamic_cm` have been replaced by the matrices below.
static_cm = confusion_matrix(y_test, statPred)
print('Confusion matrix of Static Model\n\n', static_cm)
dynamic_cm = confusion_matrix(y_test, dynaPred)
print('Confusion matrix of Dynamic Model\n\n', dynamic_cm)


Confusion matrix of Static Model

 [[ 23  82]
 [ 36 109]]
Confusion matrix of Dynamic Model

 [[ 27  78]
 [ 37 108]]


In [None]:
# Static and Dynamic model Classification Report
# `y_test` only holds the labels of the last processed window, so the reports are built
# from the predictions made on that same window (`statPred` / `dynaPred`).
# `static_cm` / `dynamic_cm` cannot be indexed here: the confusion-matrix cell rebinds
# them to 2x2 matrices, so indexing them would yield a matrix row, not predictions.
print('Classification Report of Static Model\n')
print(classification_report(y_test, statPred))
# '\n' (not '\C') so the heading starts on a fresh line instead of printing a backslash.
print('\nClassification Report of Dynamic Model\n')
print(classification_report(y_test, dynaPred))