In [13]:
pip install influxdb-client


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import json
import os
from json import loads
from time import sleep

import influxdb_client
import pandas as pd
from influxdb_client.client.write_api import SYNCHRONOUS
from kafka import KafkaConsumer
from pyspark.sql import SparkSession

In [15]:
# InfluxDB connection settings.
# The API token is a secret and must not be stored in the notebook: it is read
# from the INFLUXDB_TOKEN environment variable (empty string when unset, in
# which case InfluxDB will reject the writes). The token that was previously
# committed here should be considered leaked and be revoked.
# The remaining settings can be overridden through the environment and default
# to the values used by the local development setup.
token = os.environ.get("INFLUXDB_TOKEN", "")
org = os.environ.get("INFLUXDB_ORG", "mema_org")
bucket = os.environ.get("INFLUXDB_BUCKET", "mema_bucket")
url = os.environ.get("INFLUXDB_URL", "http://localhost:8086")

In [16]:
# Definitions of functions
def calculate_delay_of_detection(true_change_indexes, detected_indexes):
    """Return the average delay between true change points and their detections.

    The two sequences are paired positionally (first true change with first
    detection, and so on); surplus entries of the longer sequence are ignored.
    Each delay is ``detected - true``, so a detection that happens after the
    real change yields a positive delay.

    Args:
        true_change_indexes: Stream positions at which a change really occurred.
        detected_indexes: Stream positions at which a change was reported.

    Returns:
        The mean delay as a float, or 0 when no pair can be formed.
    """
    delays = [
        detected_index - true_index
        for true_index, detected_index in zip(true_change_indexes, detected_indexes)
    ]

    if not delays:
        return 0
    return sum(delays) / len(delays)

In [17]:

def calculate_false_detection_rate(true_change_indexes, detected_indexes):
    """Return the number of surplus detections relative to the number of true drifts.

    Only the lengths of the two sequences are used: every detection beyond the
    number of true drifts counts as a false detection.

    Args:
        true_change_indexes: Stream positions at which a change really occurred.
        detected_indexes: Stream positions at which a change was reported.

    Returns:
        ``false_detections / total_drifts`` as a float. The count of false
        detections is never negative, and 0.0 is returned when there are no
        true drifts (the ratio is undefined in that case).
    """
    total_drifts = len(true_change_indexes)
    if total_drifts == 0:
        # Avoid a ZeroDivisionError; the rate is undefined without true drifts.
        return 0.0

    # Fewer detections than drifts means no false detections, not a negative count.
    false_detections = max(len(detected_indexes) - total_drifts, 0)
    return false_detections / total_drifts

In [18]:

def calculate_miss_detection_rate(true_change_indexes, detected_indexes):
    """Return the fraction of true drifts that have no matching detection.

    Only the lengths of the two sequences are used: a drift counts as missed
    when there are fewer detections than true drifts.

    Args:
        true_change_indexes: Stream positions at which a change really occurred.
        detected_indexes: Stream positions at which a change was reported.

    Returns:
        ``missed / total_drifts`` as a float in [0, 1]; 0.0 when there are no
        true drifts (nothing could have been missed).
    """
    total_drifts = len(true_change_indexes)
    if total_drifts == 0:
        # Avoid a ZeroDivisionError; with no drifts nothing can be missed.
        return 0.0

    # More detections than drifts means nothing was missed, not a negative count.
    missed = max(total_drifts - len(detected_indexes), 0)
    return missed / total_drifts


In [19]:

def calculate_rate_of_drift(detected_indexes, total_time):
    """Return the number of detected drifts per unit of observed stream time.

    Args:
        detected_indexes: Stream positions at which a change was reported.
        total_time: Length of the observation window (e.g. number of processed
            messages).

    Returns:
        ``len(detected_indexes) / total_time`` as a float, or 0.0 when
        ``total_time`` is zero or negative, since no meaningful rate exists
        for an empty or invalid window.
    """
    if total_time <= 0:
        # Guards against ZeroDivisionError and against negative "rates".
        return 0.0
    return len(detected_indexes) / total_time

In [20]:
import json
from kafka import KafkaConsumer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# EDDM Class Definition
class EDDM:
    """Early Drift Detection Method (EDDM) for a stream of 0/1 error signals.

    The detector tracks the distance (in number of elements) between
    consecutive errors. With ``mean`` and ``std`` of those distances it
    monitors ``m2s = mean + 2 * std`` against the largest value seen so far
    (``m2s_max``); a shrinking ratio ``p = m2s / m2s_max`` means errors are
    getting closer together.

    Args:
        alpha: Warning threshold. ``p < alpha`` puts the detector in the
            warning zone.
        beta: Drift threshold. ``p < beta`` signals a concept change. It is
            expected to be lower than ``alpha`` so that the warning zone is
            reached before the drift level.
        min_num_instances: Minimum number of elements seen before ``m2s_max``
            is tracked, and the number of errors that must be exceeded before
            a warning or change can be reported.
    """

    def __init__(self, alpha=0.95, beta=0.9, min_num_instances=30):
        self.alpha = alpha
        self.beta = beta
        self.min_num_instances = min_num_instances
        self.reset()

    def reset(self):
        """Clear all running statistics and flags (thresholds are kept)."""
        self.n = 0                  # number of elements seen
        self.in_concept_change = 0  # 1 while the last element signalled a drift
        self.in_warning_zone = 0    # 1 while the detector is in the warning zone
        self.delay = 0
        self.num_errors = 0         # number of elements equal to 1.0
        self.last_d = 0             # position of the previous error
        self.d = 0                  # position of the latest error
        self.mean = 0               # running mean of the distance between errors
        self.std_temp = 0           # running sum of squared deviations (Welford)
        self.estimation = 0
        self.m2s_max = 0            # largest mean + 2 * std observed so far

    def add_element(self, prediction):
        """Feed one element of the error stream to the detector.

        Args:
            prediction: 1 (or 1.0) when the monitored model made an error on
                this element, anything else otherwise.

        After the call, ``detected_change()`` and ``detected_warning_zone()``
        report the state resulting from this element.
        """
        # A drift reported for the previous element starts a new concept:
        # drop the old statistics so they do not keep triggering detections.
        if self.in_concept_change:
            self.reset()

        self.in_concept_change = 0
        self.n += 1

        if not (prediction == 1.0):
            # Only errors carry information for EDDM.
            return

        self.in_warning_zone = 0
        self.delay = 0
        self.num_errors += 1
        self.last_d = self.d
        self.d = self.n - 1
        distance = self.d - self.last_d

        # Incremental update of mean and variance of the distance between errors.
        old_mean = self.mean
        self.mean = self.mean + (float(distance) - self.mean) / self.num_errors
        self.std_temp = self.std_temp + (distance - self.mean) * (distance - old_mean)
        std = np.sqrt(self.std_temp / self.num_errors)
        m2s = self.mean + 2 * std

        if self.n < self.min_num_instances:
            return

        if m2s > self.m2s_max:
            self.m2s_max = m2s
            return

        if self.m2s_max <= 0:
            # No positive reference level yet; the ratio would be 0/0.
            return

        p = m2s / self.m2s_max
        enough_errors = self.num_errors > self.min_num_instances

        # The stricter (lower) drift threshold is checked first; the warning
        # threshold catches the milder degradation above it.
        if enough_errors and p < self.beta:
            self.in_concept_change = 1
        elif enough_errors and p < self.alpha:
            self.in_warning_zone = 1
        else:
            self.in_warning_zone = 0

    def detected_warning_zone(self):
        """Return 1 if the last error put the detector in the warning zone, else 0."""
        return self.in_warning_zone

    def detected_change(self):
        """Return 1 if the last added element signalled a concept change, else 0."""
        return self.in_concept_change

# Loading the Random Forest Model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
model = joblib.load('../albert/model/random_forest_model.joblib')

# Kafka Consumer Settings
consumer = KafkaConsumer(
    'hai-input',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    # Stop iterating after 10 s without new messages; without a timeout the
    # loop below blocks forever and the plotting code is never reached.
    consumer_timeout_ms=10000,
    value_deserializer=lambda x: json.loads(x.decode('utf-8'))
)

# Initialise EDDM detector
eddm_detector = EDDM()

# Initialise the lists used to collect metrics
change_points = []   # message indexes at which EDDM reported a drift
accuracies = []      # running accuracy after each message (same index as the message)
predictions = []
real_labels = []
correct_predictions = 0

# Kafka consumption loop
try:
    # enumerate() supplies the message index; iterating the consumer alone
    # yields single records, which cannot be unpacked into (i, message).
    for i, message in enumerate(consumer):
        message_data = message.value
        df = pd.DataFrame([message_data])
        features = df.drop(['time', 'attack'], axis=1)

        # Predict and update drift detectors
        pred = model.predict(features)
        label = df['attack'].iloc[0]
        predictions.append(pred[0])
        real_labels.append(label)

        error_signal = 1 if pred[0] != label else 0
        eddm_detector.add_element(error_signal)

        if eddm_detector.detected_change():
            change_points.append(i)

        # Running accuracy, updated incrementally. One value is stored per
        # message so that accuracies[i] lines up with the indexes in change_points.
        correct_predictions += 1 - error_signal
        accuracies.append(correct_predictions / (i + 1))
finally:
    consumer.close()

# Generate graphs of accuracy and change points
plt.figure(figsize=(10, 5))
plt.plot(range(len(accuracies)), accuracies, label='Accuracy')
plt.scatter(change_points, [accuracies[j] for j in change_points], color='red', marker='x', label='Change Point')
plt.title('Change Detection using EDDM with RandomForest on Kafka Data')
plt.xlabel('Message Count')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


KeyboardInterrupt: 

In [None]:
class HaiConsumer:
    """Consume records from a Kafka topic, run ADWIN change detection on the
    random-forest predictions and write detection metrics to InfluxDB.

    The class relies on names defined at notebook level: ``model``, ``url``,
    ``token``, ``org``, ``bucket``, the ``calculate_*`` metric helpers and
    ``ADWIN``.

    NOTE(review): a later cell defines another class that is also called
    ``HaiConsumer`` (the EDDM variant); once that cell runs it replaces this one.
    """

    def __init__(self, topic, bootstrap_servers):
        """Create the Kafka consumer for ``topic`` on ``bootstrap_servers``.

        Messages are read from the earliest offset and decoded from UTF-8 JSON.
        """
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        """Process messages one by one and publish ADWIN metrics.

        For every message the model prediction is fed to the detector, the
        detection metrics are recomputed and written to the
        ``Change_Detection_ADWIN`` measurement. The loop pauses 3 seconds
        between messages and stops once the message index exceeds 100.
        """
        # NOTE(review): ADWIN is neither imported nor defined in the visible
        # notebook; it has to be provided before this method is called.
        ADWIN_detector = ADWIN()

        # Detected change points must survive across messages, so the list is
        # created once instead of being re-created for every message.
        detected_indexes_ADWIN = []

        # The context manager closes the InfluxDB client on exit.
        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            for i, message in enumerate(self.consumer):
                df = pd.DataFrame([message.value])
                # Drop the first column before predicting.
                df = df.iloc[:, 1:]

                random_forest_detection = model.predict(df)

                # Update ADWIN with the label (1 for anomaly, 0 for normal)
                ADWIN_detector.set_input(random_forest_detection)
                if ADWIN_detector.get_change():
                    detected_indexes_ADWIN.append(i)

                print("Detected Change Points (ADWIN):", detected_indexes_ADWIN)

                # Evaluate change detection performance using ADWIN.
                # NOTE(review): the prediction array is passed where the helpers
                # expect the true change indexes; no ground truth is available
                # here -- confirm this is the intended reference.
                fdr_ADWIN = calculate_false_detection_rate(random_forest_detection, detected_indexes_ADWIN)
                mdr_ADWIN = calculate_miss_detection_rate(random_forest_detection, detected_indexes_ADWIN)
                # The observation window is the number of messages processed so far.
                rod_ADWIN = calculate_rate_of_drift(detected_indexes_ADWIN, total_time=i + 1)

                metrics = (
                    ('fdr_ADWIN', fdr_ADWIN),
                    ('mdr_ADWIN', mdr_ADWIN),
                    ('rod_ADWIN', rod_ADWIN),
                )
                for field_name, value in metrics:
                    p = influxdb_client.Point("Change_Detection_ADWIN").field(field_name, value)
                    write_api.write(bucket, org, p)

                sleep(3)
                if i > 100:
                    break


In [None]:
import joblib

# Restore the persisted random-forest classifier from disk
model_path = '../albert/model/random_forest_model.joblib'
model = joblib.load(model_path)

In [None]:
class HaiConsumer:
    """Consume records from a Kafka topic, run EDDM change detection on the
    random-forest predictions and write detection metrics to InfluxDB.

    The class relies on names defined at notebook level: ``model``, ``url``,
    ``token``, ``org``, ``bucket``, ``EDDM`` and the ``calculate_*`` metric
    helpers.

    NOTE(review): an earlier cell defines another class that is also called
    ``HaiConsumer`` (the ADWIN variant); this definition replaces it.
    """

    def __init__(self, topic, bootstrap_servers):
        """Create the Kafka consumer for ``topic`` on ``bootstrap_servers``.

        Messages are read from the earliest offset and decoded from UTF-8 JSON.
        """
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        """Process messages one by one and publish EDDM metrics.

        For every message the model prediction is fed to the detector, the
        detection metrics are recomputed and written to the
        ``Change_Detection_EDDM`` measurement. The loop pauses 3 seconds
        between messages and stops once the message index exceeds 100.
        """
        eddm_detector = EDDM()

        # Detected change points must survive across messages, so the list is
        # created once instead of being re-created for every message.
        detected_indexes_eddm = []

        # The context manager closes the InfluxDB client on exit.
        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            for i, message in enumerate(self.consumer):
                df = pd.DataFrame([message.value])
                # Drop the first column before predicting.
                df = df.iloc[:, 1:]

                random_forest_detection = model.predict(df)

                # Update EDDM with the label (1 for anomaly, 0 for normal).
                # A single row is predicted per message, so the scalar
                # prediction is passed rather than the one-element array.
                eddm_detector.add_element(random_forest_detection[0])
                if eddm_detector.detected_change():
                    detected_indexes_eddm.append(i)

                print("Detected Change Points (EDDM):", detected_indexes_eddm)

                # Evaluate change detection performance using EDDM.
                # NOTE(review): the prediction array is passed where the helpers
                # expect the true change indexes; no ground truth is available
                # here -- confirm this is the intended reference.
                fdr_eddm = calculate_false_detection_rate(random_forest_detection, detected_indexes_eddm)
                mdr_eddm = calculate_miss_detection_rate(random_forest_detection, detected_indexes_eddm)
                # The observation window is the number of messages processed so far.
                rod_eddm = calculate_rate_of_drift(detected_indexes_eddm, total_time=i + 1)

                metrics = (
                    ('fdr_eddm', fdr_eddm),
                    ('mdr_eddm', mdr_eddm),
                    ('rod_eddm', rod_eddm),
                )
                for field_name, value in metrics:
                    p = influxdb_client.Point("Change_Detection_EDDM").field(field_name, value)
                    write_api.write(bucket, org, p)

                sleep(3)
                if i > 100:
                    break

In [None]:
# Kafka topic and broker list used by the consumer
topic, bootstrap_servers = 'hai-input', ['localhost:9092']

# Build the consumer and start its processing loop
consumer = HaiConsumer(topic=topic, bootstrap_servers=bootstrap_servers)
consumer.consume()

KeyboardInterrupt: 