# Evaluating CapyMOA Drift Detectors

This notebook evaluates the impact of a single Concept Drift Detector on a Streaming Machine Learning model on a specified dataset.  
It measures performance in terms of accuracy, F1-score, execution time, memory usage, and CPU utilization, and appends the results to a .csv file for benchmarking.

### Install and import Libraries

In [None]:
%%capture
!pip install capymoa
!pip install memory_profiler

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import time
import psutil
from memory_profiler import memory_usage

### CAPYMOA

from capymoa.classifier import NaiveBayes, HoeffdingTree

from capymoa.datasets import Electricity, Covtype,  Hyper100k, Sensor
from capymoa.datasets import ElectricityTiny # For testing

from capymoa.drift.detectors import ADWIN, STEPD, CUSUM, PageHinkley, DDM, HDDMAverage, HDDMWeighted

from capymoa.drift.eval_detector import EvaluateDetector # Might remove if i don't know true values

from capymoa.evaluation import ClassificationEvaluator, ClassificationWindowedEvaluator
from capymoa.evaluation.results import PrequentialResults
from capymoa.evaluation.visualization import plot_windowed_results



In [None]:
# Colab-only setup: mount Google Drive and make the project folder the working
# directory, so relative paths used later resolve inside that folder.
from google.colab import drive

# Project folder, relative to the root of "MyDrive". Edit this to match your own Drive layout.
PATH = "Uni/Magistrale/Poli/Streaming Data Analytics/project/"

drive.mount('/content/drive')

os.chdir(f'/content/drive/MyDrive/{PATH}')
# Last expression of the cell: displays the working directory as a sanity check.
os.getcwd()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Uni/Magistrale/Poli/Streaming Data Analytics/project'

In [None]:
%%capture
# @title Select Parameters

stream = "Covtype" # @param ["Covtype", "Electricity", "Hyper100k", "Sensor", "ElectricityTiny"]

str_to_stream = {
    "Covtype": Covtype(),
    "Electricity": Electricity(),
    "Hyper100k": Hyper100k(),
    "Sensor": Sensor(),
    "ElectricityTiny": ElectricityTiny()
}

stream = str_to_stream[stream]


classifier = "NaiveBayes" # @param ["NaiveBayes", "HoeffdingTree"]

str_to_classifier = {
    "NaiveBayes": NaiveBayes(schema=stream.get_schema()),
    "HoeffdingTree": HoeffdingTree(schema=stream.get_schema()),
}

classifier = str_to_classifier[classifier]

detector = "ADWIN" # @param ["ADWIN", "STEPD", "CUSUM", "PageHinkley", "DDM", "HDDMAverage", "HDDMWeighted"]

str_to_detector = {
    "ADWIN": ADWIN(),
    "STEPD": STEPD(),
    "CUSUM": CUSUM(),
    "PageHinkley": PageHinkley(),
    "DDM": DDM(),
    "HDDMAverage": HDDMAverage(),
    "HDDMWeighted": HDDMWeighted(),
}

detector = str_to_detector[detector]


WINDOW_SIZE = stream._length // 100 # 1% of dataset size



In [None]:
def evaluate_detector(detector, stream, classifier):
    """Run a test-then-train evaluation of ``classifier`` over ``stream``.

    Every instance is first used to test the learner (the prediction is fed
    to both evaluators) and then to train it. If a detector is given, it is
    updated after each instance, and whenever it reports a change the learner
    is replaced by a fresh, untrained model of the same class.

    Parameters
    ----------
    detector : object or None
        Drift detector exposing ``add_element`` and ``detected_change``.
        Pass ``None`` to evaluate the learner without drift handling.
    stream : object
        Stream exposing ``get_schema``, ``has_more_instances`` and
        ``next_instance``. It is consumed from its current position.
    classifier : object
        Learner exposing ``predict`` and ``train``.

    Returns
    -------
    PrequentialResults
        Wraps the cumulative and the windowed evaluator. The windowed
        evaluator uses the module-level ``WINDOW_SIZE``.
    """
    schema = stream.get_schema()
    cumulative_evaluator = ClassificationEvaluator(schema=schema)
    windowed_evaluator = ClassificationWindowedEvaluator(schema=schema, window_size=WINDOW_SIZE)

    # Remember the learner's class so that a reset after a detected drift
    # rebuilds the *same kind* of model instead of always falling back to
    # NaiveBayes. The fresh model is created with default hyper-parameters.
    learner_cls = type(classifier)

    while stream.has_more_instances():
        instance = stream.next_instance()

        y = instance.y_index
        y_pred = classifier.predict(instance)

        # Test first ...
        cumulative_evaluator.update(y, y_pred)
        windowed_evaluator.update(y, y_pred)

        # ... then train.
        classifier.train(instance)

        if detector is not None:
            # NOTE(review): the detector is fed the true label index, so it
            # monitors the label sequence rather than the learner's error.
            # Confirm this is intended; an error signal such as
            # `int(y != y_pred)` is the more common input.
            detector.add_element(y)
            if detector.detected_change():
                classifier = learner_cls(schema=schema)

    results = PrequentialResults(learner=str(classifier),
                                 stream=stream,
                                 cumulative_evaluator=cumulative_evaluator,
                                 windowed_evaluator=windowed_evaluator)
    return results


In [None]:
def benchmark_detector(detector, stream, classifier, filename="results.csv"):
    """Benchmark one detector/learner/dataset combination and log it to a CSV.

    The stream is restarted, ``evaluate_detector`` is run under
    ``memory_profiler.memory_usage``, and accuracy, F1-score, wall-clock time,
    CPU utilisation and peak memory are collected into a one-row DataFrame.
    That row is appended to ``filename``.

    Parameters
    ----------
    detector : object or None
        Drift detector to benchmark, or ``None`` for a run without detector.
    stream : object
        Data stream; restarted before the run.
    classifier : object
        Learner handed to ``evaluate_detector``.
    filename : str, default "results.csv"
        CSV file the row is appended to. The header is written only when the
        file does not exist yet or is empty.

    Returns
    -------
    pandas.DataFrame
        The one-row frame that was appended to the CSV.
    """
    stream.restart()

    process = psutil.Process(os.getpid())
    has_detector = detector is not None

    # Prime the CPU counter: with interval=None the first call only sets the
    # reference point (its return value is meaningless), and the next call
    # reports utilisation over the time in between, i.e. over the run itself.
    # Sampling *after* the run with a blocking interval would instead measure
    # an idle process and report ~0%.
    process.cpu_percent(interval=None)

    start_time = time.perf_counter()
    mem_usage, prequential = memory_usage(
        (evaluate_detector, (detector, stream, classifier)), retval=True
    )
    execution_time = time.perf_counter() - start_time

    # May exceed 100 when the process keeps several cores busy.
    cpu_usage = process.cpu_percent(interval=None)
    memory_usage_max = max(mem_usage)

    # Fetch each metric container once instead of once per column.
    cumulative_metrics = prequential.cumulative.metrics_dict()
    windowed_metrics = prequential.windowed.metrics_per_window()

    results = pd.DataFrame([{
        "Dataset": stream.__class__.__name__,
        "Classifier": classifier.__class__.__name__,
        "Detector": detector.__class__.__name__ if has_detector else "None",
        "Cumulative Accuracy": cumulative_metrics["accuracy"],
        "Cumulative F1-Score": cumulative_metrics["f1_score"],
        "Windowed Accuracy": windowed_metrics["accuracy"].tolist(),
        "Windowed F1-Score": windowed_metrics["f1_score"].tolist(),
        "Execution Time (s)": execution_time,
        "CPU Usage (%)": cpu_usage,
        "Memory Usage (MB)": memory_usage_max,
        "Changes": detector.detection_index if has_detector else "",
        "Warnings": detector.warning_index if has_detector else ""
    }])

    # Write the header only for a new (or empty) file so appended rows line up.
    write_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0
    results.to_csv(filename, mode="a", header=write_header, index=False)
    print(f"Results saved to {filename}")

    return results

# Run the benchmark with the detector, stream and classifier chosen in the
# parameter cell. As the last expression of the cell, the returned one-row
# DataFrame is displayed below.
benchmark_detector(detector, stream, classifier)

Results saved to results.csv


Unnamed: 0,Dataset,Classifier,Detector,Cumulative Accuracy,Cumulative F1-Score,Windowed Accuracy,Windowed F1-Score,Execution Time (s),CPU Usage (%),Memory Usage (MB),Changes,Warnings
0,Covtype,NaiveBayes,ADWIN,84.172616,79.061521,"[73.16695352839932, 69.91394148020655, 73.2702...","[71.68308822177409, 69.1227936570825, 70.57333...",88.016449,0.0,839.199219,"[416, 640, 896, 928, 1184, 1216, 1696, 1728, 1...",[]
