# Imports

In [13]:
# CapyMoa
from capymoa.datasets import Electricity
from capymoa.evaluation import ClassificationEvaluator, prequential_evaluation
from capymoa.classifier import NaiveBayes

# River
from river.evaluate import progressive_val_score
from river.metrics import Accuracy

from river.naive_bayes import GaussianNB

# Miscellaneous for tracking
import time, tracemalloc, psutil


# Utils

In [3]:
# Generator to yield (x_dict, y) in river format
def capymoa_to_river_stream(stream):
    feature_names = stream.get_schema().get_numeric_attributes()
    for instance in stream:
        x_dict = dict(zip(feature_names, instance.x))
        y = instance.y_index  # or instance.y if using class label
        yield x_dict, y

# Start measurements
def start_measurements():
    tracemalloc.start()
    start_time = time.time()
    cpu_start = psutil.cpu_percent(interval=None)
    return start_time, cpu_start

def end_measurements():
    end_time = time.time()
    cpu_end = psutil.cpu_percent(interval=None)
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return end_time, cpu_end, current, peak

# CapyMOA

In [9]:
elec_stream = Electricity()
ob_learner = NaiveBayes(schema=elec_stream.get_schema())
ob_evaluator = ClassificationEvaluator(schema=elec_stream.get_schema())

predict_time = 0 
train_time = 0
for instance in elec_stream:
    # Measure predict time 
    start_time, cpu_start = start_measurements()
    prediction = ob_learner.predict(instance)
    end_time, cpu_end, current, peak = end_measurements()

    predict_time = predict_time + (end_time - start_time)

    # Measure train time
    start_time, cpu_start = start_measurements()
    ob_learner.train(instance)
    end_time, cpu_end, current, peak = end_measurements()
    
    train_time = train_time + (end_time - start_time)

    # Update evaluator
    ob_evaluator.update(instance.y_index, prediction)


print(f"\n--- Time Summary ---")
print(f"Accuracy: {ob_evaluator.accuracy():.4f}")
print(f"Total Train Time: {train_time:.2f} seconds")
print(f"Total Prediction Time: {predict_time:.2f} seconds")


--- Time Summary ---
Accuracy: 73.3625
Total Train Time: 1.51 seconds
Total Prediction Time: 2.03 seconds


In [None]:
elec_stream = Electricity()
start_time, cpu_start = start_measurements()

# Obtain the results from the high-level function.
# The results from a high-level evaluation function are represented as a PrequentialResults object
results_NB = prequential_evaluation(stream=elec_stream, learner=ob_learner, window_size=4500)
print(f"Cumulative accuracy = {results_NB.cumulative.accuracy()}, wall-clock time: {results_NB.wallclock()}")

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"Accuracy: {ob_evaluator.accuracy():.4f}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")

# The windowed results are conveniently stored in a pandas DataFrame.
display(results_NB.windowed.metrics_per_window())

Cumulative accuracy = 72.46645480225989, wall-clock time: 0.1660599708557129

--- Performance Summary ---
Accuracy: 73.3625
Total Time: 0.17 seconds
CPU Usage: 33.6%
Peak Memory Usage: 0.02 MB


Unnamed: 0,instances,accuracy,kappa,kappa_t,kappa_m,f1_score,f1_score_0,f1_score_1,precision,precision_0,precision_1,recall,recall_0,recall_1
0,4500.0,81.4,59.825028,-15.289256,52.604757,80.235907,74.158691,85.471272,81.491807,81.756297,81.227318,79.018129,67.853107,90.18315
1,9000.0,75.8,49.559981,-44.238411,47.340426,77.625123,65.894143,81.246771,81.703704,93.511111,69.896296,73.934381,50.870406,96.998355
2,13500.0,74.333333,47.671005,-74.73525,46.37883,76.498142,65.635228,79.517645,79.844331,91.687448,68.001213,73.421143,51.112141,95.730145
3,18000.0,71.888889,36.594445,-95.216049,32.962374,73.540432,50.527962,80.366289,82.007911,96.41791,67.597911,66.657875,34.234234,99.081515
4,22500.0,72.8,35.001717,-70.473538,30.572887,73.470567,47.195858,81.682131,83.867365,98.558559,69.176172,65.367184,31.026659,99.707709
5,27000.0,64.266667,16.161115,-181.118881,14.010695,66.324986,25.348189,76.511832,79.123627,96.126761,62.120493,57.09034,14.59893,99.581749
6,31500.0,63.222222,7.456192,-202.007299,5.590416,60.988722,13.48667,76.647383,71.602823,80.625,62.580645,53.115155,7.358813,98.871496
7,36000.0,63.777778,11.398127,-173.94958,9.394108,64.08296,18.581419,76.707631,77.043899,91.625616,62.462183,54.85484,10.339077,99.370603
8,40500.0,78.444444,55.395162,-45.427286,50.281907,77.912597,73.30765,81.92322,78.585327,79.144385,78.026269,77.251287,68.272681,86.229894
9,45000.0,78.355556,54.996636,-39.142857,56.205036,79.564583,70.662651,82.852113,82.706111,92.654028,72.758194,76.652978,57.108082,96.197874


# River

In [11]:
model = GaussianNB()
metric = Accuracy()
elec_stream = Electricity()

predict_time = 0 
train_time = 0
for instance in elec_stream:
    # Create dictionary with keys as instance.schema.get_numeric_attributes() and values as instance.x
    x_dict = dict(zip(instance.schema.get_numeric_attributes(), instance.x))

    # Predict class
    start_time, cpu_start = start_measurements()
    y_p = model.predict_one(x_dict) 
    end_time, cpu_end, current, peak = end_measurements()

    predict_time = predict_time + (end_time - start_time)

    if y_p is not None:
        metric.update(y_true=instance.y_index, y_pred=y_p)

    # Train the model
    start_time, cpu_start = start_measurements()
    model.learn_one(x_dict, instance.y_index)    
    end_time, cpu_end, current, peak = end_measurements()

    train_time = train_time + (end_time - start_time)

print(f"\n--- Time Summary ---")
print(f"{metric}")
print(f"Total Train Time: {train_time:.2f} seconds")
print(f"Total Prediction Time: {predict_time:.2f} seconds")


--- Time Summary ---
Accuracy: 72.45%
Total Train Time: 5.97 seconds
Total Prediction Time: 27.08 seconds


In [12]:
# Setup stream and estimators 
# There is iter_sklearn_dataset also 
elec_stream = Electricity()
nb = GaussianNB()
metric = Accuracy()
river_stream = capymoa_to_river_stream(elec_stream)

start_time, cpu_start = start_measurements()

# Setup evaluator
progressive_val_score(dataset=river_stream, model=nb, metric=metric)

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"{metric}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")


--- Performance Summary ---
Accuracy: 57.53%
Total Time: 57.53 seconds
CPU Usage: 19.9%
Peak Memory Usage: 0.26 MB
