In [1]:
pip install influxdb-client

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from kafka import KafkaConsumer
from json import loads
import json
from time import sleep
import pandas as pd
from pyspark.sql import SparkSession
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS

In [2]:
token = "FroEVNbslUfLwbOznZtPjG7mryBVgdiT_O8dIl3mJSHEGRXuZfPSw5jHfL5g2kfxRZlGKhsgJSIWfC_fpH_bbg=="
org = "mema_org"
bucket = "mema_bucket"
url = "http://localhost:8086"

In [3]:
# Definitions of functions
def calculate_delay_of_detection(true_change_indexes, detected_indexes):
    
    delays = [index - detected_index for detected_index, index in zip(detected_indexes, true_change_indexes)]

    if len(delays) > 0:
        average_delay = sum(delays) / len(delays)
        return average_delay
    else:
        return 0  

In [4]:

def calculate_false_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    false_detections = total_detected - total_drifts
    fdr = false_detections / total_drifts
    return fdr

In [5]:
def calculate_miss_detection_rate(true_change_indexes, detected_indexes):
    total_drifts = len(true_change_indexes)
    total_detected = len(detected_indexes)
    mdr = (total_drifts - total_detected) / total_drifts
    return mdr

In [6]:
def calculate_rate_of_drift(detected_indexes, total_time):
    total_detected = len(detected_indexes)
    rod = total_detected / total_time
    return rod

In [7]:
# ADWIN Algorithm Implementation
class ADWIN:
    def __init__(self, delta=0.002):
        self.delta = delta
        self.window = []
        self.total = 0
        self.variance = 0
        self.width = 0

    def add_element(self, value):
        self.window.append(value)
        self.width += 1
        self.total += value
        self.update_variance(value)

        if self.width > 1:
            cutpoint = self.find_cut()
            if cutpoint:
                self.window = self.window[cutpoint:]
                self.width = len(self.window)
                self.total = sum(self.window)
                self.recalculate_variance()
        return cutpoint

    def update_variance(self, value):
        mean = self.total / self.width
        self.variance += (value - mean) * (value - self.total / (self.width - 1))

    def find_cut(self):
        for i in range(1, self.width):
            w0 = i
            w1 = self.width - i
            mean0 = sum(self.window[:i]) / w0
            mean1 = sum(self.window[i:]) / w1
            var0 = sum((x - mean0) ** 2 for x in self.window[:i]) / w0
            var1 = sum((x - mean1) ** 2 for x in self.window[i:]) / w1
            m = 1 / w0 + 1 / w1
            epsilon = ((2 / (m - 1)) * log(4 / self.delta)) ** 0.5
            if abs(mean0 - mean1) > epsilon:
                return i
        return None

    def recalculate_variance(self):
        mean = self.total / self.width
        self.variance = sum((x - mean) ** 2 for x in self.window)

def log(x):
    from math import log
    return log(x)

def abs(x):
    return x if x >= 0 else -x


In [8]:
import joblib

# Load the model from the file
model = joblib.load('../albert/model/random_forest_model.joblib')

In [9]:
def split_df(df):
    label = df.pop('attack')
    return df, label

In [12]:
class HaiConsumer:
    def __init__(self, topic, bootstrap_servers):
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        # Initialize ADWIN detector
        adwin = ADWIN(delta=0.002)

        counter = 0
        # Perform change detection on the test set
        global change_points
        global accuracies
        global whole_df
        global whole_labels
        change_points = []
        accuracies = []
        whole_df = pd.DataFrame()
        whole_labels = pd.Series()
        # write to file
        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            for i, message in enumerate(self.consumer):
                message = message.value
                # selected_columns = {key: value for key, value in message.items() if key in columns_to_scale_and_monitor}
                df = pd.DataFrame([message])
                x_i, y_i = split_df(df)
                x_i = x_i.iloc[0:1]

                whole_df = pd.concat([whole_df, x_i], ignore_index=True)
                whole_labels = pd.concat([whole_labels, y_i], ignore_index=True)

                # Predict using the RandomForest model
                pred = model.predict(x_i)

                # Check for a change point using ADWIN
                if adwin.add_element(pred == int(y_i)):
                    change_points.append(i)

                # Calculate accuracy at each step
                accuracy = model.score(whole_df.iloc[:i + 1], whole_labels.iloc[:i + 1])
                accuracies.append(accuracy)

                print(f'accuracy: {accuracy}   -    drift_detected: {int(y_i)}')

                p = influxdb_client.Point("ChangeDetection_ADWIN").field('accuracy', accuracy)
                write_api.write(bucket, org, p)

                p = influxdb_client.Point("ChangeDetection_ADWIN").field('drift_detected', int(y_i))
                write_api.write(bucket, org, p)
                counter += 1
                if counter > 5000:
                    break

            client.close()


In [11]:
topic = 'hai-preprocessed-mao'
bootstrap_servers = ['localhost:9092']
consumer = HaiConsumer(topic, bootstrap_servers)
consumer.consume()

  whole_labels = pd.Series()


{'P1_B2004': {'0': 0.0983}, 'P1_B2016': {'0': 1.0702}, 'P1_B3004': {'0': 399.2321}, 'P1_B3005': {'0': 1110.3986}, 'P1_B4002': {'0': 32.0}, 'P1_B4005': {'0': 0.0}, 'P1_B400B': {'0': 28.9981}, 'P1_B4022': {'0': 35.7395}, 'P1_FCV01D': {'0': 0.0}, 'P1_FCV01Z': {'0': 0.2838}, 'P1_FCV02D': {'0': 100.0}, 'P1_FCV02Z': {'0': 95.5215}, 'P1_FCV03D': {'0': 53.785}, 'P1_FCV03Z': {'0': 55.0323}, 'P1_FT01': {'0': 132.9803}, 'P1_FT01Z': {'0': 711.2531}, 'P1_FT02': {'0': 5.188}, 'P1_FT02Z': {'0': 29.9914}, 'P1_FT03': {'0': 312.0803}, 'P1_FT03Z': {'0': 1112.1606}, 'P1_LCV01D': {'0': 19.6892}, 'P1_LCV01Z': {'0': 19.2886}, 'P1_LIT01': {'0': 396.6266}, 'P1_PCV01D': {'0': 35.21}, 'P1_PCV01Z': {'0': 36.3403}, 'P1_PCV02D': {'0': 12}, 'P1_PCV02Z': {'0': 12.0102}, 'P1_PIT01': {'0': 1.0843}, 'P1_PIT02': {'0': 0.2084}, 'P1_TIT01': {'0': 36.0779}, 'P1_TIT02': {'0': 37.3596}, 'P2_24Vdc': {'0': 28.0221}, 'P2_Auto': {'0': 1}, 'P2_Emgy': {'0': 0}, 'P2_On': {'0': 1}, 'P2_SD01': {'0': 20}, 'P2_SIT01': {'0': 815.0}, 'P2_

TypeError: float() argument must be a string or a number, not 'dict'

In [32]:

class HaiConsumer:
    def __init__(self, topic, bootstrap_servers):
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers
        self.consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers=self.bootstrap_servers,
            auto_offset_reset='earliest',
            enable_auto_commit=True,
            value_deserializer=lambda x: loads(x.decode('utf-8')))

    def consume(self):
        # Create an instance of the ADWIN class
        ADWIN_detector = ADWIN()

        # Assuming url, token, org, and bucket are defined
        with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
            write_api = client.write_api(write_options=SYNCHRONOUS)

            for i, message in enumerate(self.consumer):
                message = message.value
                df = pd.DataFrame([message])
                df = df.iloc[:, 1:]

                # Assuming model.predict() returns a single value for detection
                random_forest_detection = model.predict(df)
                
                # List to store detected change points using ADWIN
                detected_indexes_ADWIN = []
                stream_size = df.shape[0]

                # Initialize ADWIN for change detection
                adwin = ADWIN(delta=0.002)

                # List detected change points
                detected_indexes_adwin = []

                error = int(random_forest_detection != test_labels.iloc[i])

                # Update ADWIN with the error signal
                adwin.add_element(error)

                # Check for change detection
                if adwin.check_drift():
                    detected_indexes_adwin.append(i)



                # Update ADWIN with the label (1 for anomaly, 0 for normal)
                ADWIN_detector.add_element(random_forest_detection)
                if ADWIN_detector.detected_change():
                    detected_indexes_ADWIN.append(i)

                print("Detected Change Points (ADWIN):", detected_indexes_ADWIN)

                # Evaluate change detection performance using ADWIN
                average_delay_ADWIN = calculate_delay_of_detection(random_forest_detection, detected_indexes_ADWIN)
                fdr_ADWIN = calculate_false_detection_rate(random_forest_detection, detected_indexes_ADWIN)
                mdr_ADWIN = calculate_miss_detection_rate(random_forest_detection, detected_indexes_ADWIN)
                rod_ADWIN = calculate_rate_of_drift(detected_indexes_ADWIN, total_time=stream_size - 500)

                p = influxdb_client.Point("Change_Detection_ADWIN").field('fdr_ADWIN', fdr_ADWIN)
                write_api.write(bucket, org, p)

                p = influxdb_client.Point("Change_Detection_ADWIN").field('mdr_ADWIN', mdr_ADWIN)
                write_api.write(bucket, org, p)

                p = influxdb_client.Point("Change_Detection_ADWIN").field('rod_ADWIN', rod_ADWIN)
                write_api.write(bucket, org, p)

                sleep(3)
                if i > 100:
                    break

            client.close()



In [33]:

topic = 'hai-input'
bootstrap_servers = ['localhost:9092']
consumer = HaiConsumer(topic, bootstrap_servers)
consumer.consume()

AttributeError: 'ADWIN' object has no attribute 'detected_change'