# Imports

In [79]:
# CapyMoa
from capymoa.datasets import Electricity, Covtype, Fried
from capymoa.evaluation import ClassificationEvaluator, ClassificationWindowedEvaluator, prequential_evaluation
from capymoa.classifier import OnlineBagging, HoeffdingTree, HoeffdingAdaptiveTree, AdaptiveRandomForestClassifier
from capymoa.classifier import NaiveBayes, EFDT, KNN, OnlineAdwinBagging, LeveragingBagging
from capymoa.regressor import KNNRegressor, AdaptiveRandomForestRegressor

# River
from river.evaluate import progressive_val_score, iter_progressive_val_score
from river.stream import iter_sklearn_dataset, iter_pandas, iter_csv
from river.metrics import Accuracy

from river.naive_bayes import GaussianNB
from river.tree import HoeffdingTreeClassifier, HoeffdingAdaptiveTreeRegressor, HoeffdingTreeRegressor, HoeffdingAdaptiveTreeClassifier
from river.tree import ExtremelyFastDecisionTreeClassifier
from river.forest import ARFClassifier, ARFRegressor
from river.ensemble import ADWINBaggingClassifier, BaggingClassifier, LeveragingBaggingClassifier
from river.neighbors import KNNClassifier, KNNRegressor

# Scikit learn
from sklearn import preprocessing, model_selection, datasets, metrics

# Miscellaneous for tracking
import time, tracemalloc, psutil


# Utils

In [80]:
# Generator to yield (x_dict, y) in river format
def capymoa_to_river_stream(stream):
    feature_names = stream.get_schema().get_numeric_attributes()
    for instance in stream:
        x_dict = dict(zip(feature_names, instance.x))
        y = instance.y_index  # or instance.y if using class label
        yield x_dict, y

# Start measurements
def start_measurements():
    tracemalloc.start()
    start_time = time.time()
    cpu_start = psutil.cpu_percent(interval=None)
    return start_time, cpu_start

def end_measurements():
    end_time = time.time()
    cpu_end = psutil.cpu_percent(interval=None)
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return end_time, cpu_end, current, peak

# CapyMOA

In [83]:
elec_stream = Electricity()
ob_learner = NaiveBayes(schema=elec_stream.get_schema())
ob_evaluator = ClassificationEvaluator(schema=elec_stream.get_schema())

start_time, cpu_start = start_measurements()

for instance in elec_stream:
    prediction = ob_learner.predict(instance)
    ob_learner.train(instance)
    ob_evaluator.update(instance.y_index, prediction)

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"Accuracy: {ob_evaluator.accuracy():.4f}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")


--- Performance Summary ---
Accuracy: 73.3625
Total Time: 2.49 seconds
CPU Usage: 22.7%
Peak Memory Usage: 0.02 MB


In [81]:
# Obtain the results from the high-level function.
# Note that we need to specify a window_size as we obtain both windowed and cumulative results.
# The results from a high-level evaluation function are represented as a PrequentialResults object
results_NB = prequential_evaluation(stream=elec_stream, learner=ob_learner, window_size=4500)

start_time, cpu_start = start_measurements()

print(f"Cumulative accuracy = {results_NB.cumulative.accuracy()}, wall-clock time: {results_NB.wallclock()}")

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"Accuracy: {ob_evaluator.accuracy():.4f}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")

# The windowed results are conveniently stored in a pandas DataFrame.
display(results_NB.windowed.metrics_per_window())

Cumulative accuracy = 72.90342514124293, wall-clock time: 0.17723798751831055

--- Performance Summary ---
Accuracy: 73.3625
Total Time: 0.01 seconds
CPU Usage: 25.0%
Peak Memory Usage: 0.01 MB


Unnamed: 0,instances,accuracy,kappa,kappa_t,kappa_m,f1_score,f1_score_0,f1_score_1,precision,precision_0,precision_1,recall,recall_0,recall_1
0,4500.0,81.866667,61.099882,-12.396694,53.793884,80.745063,75.317604,85.669125,81.671068,81.054688,82.287449,79.839821,70.338983,89.340659
1,9000.0,76.644444,51.458274,-39.205298,49.17795,78.134591,67.849495,81.661141,81.635151,92.339717,70.930585,74.921899,53.626692,96.217105
2,13500.0,75.377778,49.871915,-67.624811,48.560817,77.213344,67.77196,80.079108,80.088558,91.015625,69.161491,74.537419,53.985171,95.089667
3,18000.0,72.733333,38.756645,-89.351852,34.976153,74.186886,52.970487,80.801127,82.024705,95.706371,68.343039,67.716298,36.618972,98.813624
4,22500.0,72.888889,35.21413,-69.916435,30.799773,73.60832,47.368421,81.741993,84.072894,98.918919,69.226869,65.460442,31.140102,99.780782
5,27000.0,64.2,15.982505,-181.643357,13.850267,66.25582,25.104603,76.478318,79.080865,96.085409,62.076321,57.010126,14.438503,99.581749
6,31500.0,63.688889,8.889297,-198.175182,6.788363,61.788989,15.599174,76.86863,72.702279,82.513661,62.890896,53.724449,8.613805,98.835093
7,36000.0,64.066667,12.26727,-171.764706,10.116732,64.330863,19.910847,76.837129,77.013594,91.363636,62.663551,55.234715,11.172874,99.296557
8,40500.0,78.444444,55.558108,-45.427286,50.281907,77.907224,73.741202,81.718809,78.388667,78.141136,78.636199,77.431658,69.810354,85.052962
9,45000.0,78.777778,55.922426,-36.428571,57.059353,79.898577,71.466985,83.106315,82.872045,92.498067,73.246024,77.131095,58.227848,96.034342


# River

In [84]:
model = GaussianNB()
metric = Accuracy()
elec_stream = Electricity()

start_time, cpu_start = start_measurements()

for instance in elec_stream:
    # Create dictionary with keys as instance.schema.get_numeric_attributes() and values as instance.x
    x_dict = dict(zip(instance.schema.get_numeric_attributes(), instance.x))

    # Predict class
    y_p = model.predict_one(x_dict) 
    if y_p is not None:
        metric.update(y_true=instance.y_index, y_pred=y_p)
    # Train the model
    model.learn_one(x_dict, instance.y_index)    

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"{metric}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")


--- Performance Summary ---
Accuracy: 72.45%
Total Time: 42.82 seconds
CPU Usage: 20.3%
Peak Memory Usage: 0.07 MB


In [85]:
# Setup stream and estimators 
# There is iter_sklearn_dataset also 
elec_stream = Electricity()
nb = GaussianNB()
metric = Accuracy()
river_stream = capymoa_to_river_stream(elec_stream)

start_time, cpu_start = start_measurements()

# Setup evaluator
progressive_val_score(dataset=river_stream, model=nb, metric=metric)

end_time, cpu_end, current, peak = end_measurements()

print(f"\n--- Performance Summary ---")
print(f"{metric}")
print(f"Total Time: {end_time - start_time:.2f} seconds")
print(f"CPU Usage: {cpu_end}%")
print(f"Peak Memory Usage: {peak / 10**6:.2f} MB")

#steps = iter_progressive_val_score(dataset=elec_stream, model=model, metric=metrics, step=1)


--- Performance Summary ---
Accuracy: 57.53%
Total Time: 57.77 seconds
CPU Usage: 17.5%
Peak Memory Usage: 0.22 MB


## River with scikit-learn

In [24]:
from river import compat, compose, linear_model, preprocessing

# Load data
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

# Define a determistic cross-validation procedure
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

scorer = metrics.make_scorer(metrics.roc_auc_score)

# We define a Pipeline 
model = compose.Pipeline(
    ('scale', preprocessing.StandardScaler()),
    ('log_reg', linear_model.LogisticRegression())
)

# We make the Pipeline compatible with sklearn
model = compat.convert_river_to_sklearn(model)

# We compute the CV scores using the same CV scheme and the same scoring
scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)

# Display the average score and its standard deviation
print(f'ROC AUC: {scores.mean():.3f} (± {scores.std():.3f})')



ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/joblib/memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/river/compat/river_to_sklearn.py", line 484, in transform
    utils.validation.check_is_fitted(self, attributes="instance_")
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1751, in check_is_fitted
    tags = get_tags(estimator)
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/utils/_tags.py", line 430, in get_tags
    sklearn_tags_provider[klass] = klass.__sklearn_tags__(estimator)  # type: ignore[attr-defined]
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sebastianballesteros/Documents/Polimi/LastSemester/Streaming Data Analytics/2024-2025_SML-Notebooks-in-CapyMOA/sml-env/lib/python3.11/site-packages/sklearn/base.py", line 859, in __sklearn_tags__
    tags = super().__sklearn_tags__()
           ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'super' object has no attribute '__sklearn_tags__'
