# Description 

In this notebook we demonstrate how someone can directly use scikit-learn learners in CapyMOA and River. In addition, we show how to use Pytorch with CapyMOA.

# Imports

In [4]:
# CapyMoa
from capymoa.datasets import Electricity
from capymoa.evaluation import ClassificationEvaluator, prequential_evaluation
from capymoa.base import SKClassifier

# River
from river.evaluate import progressive_val_score, iter_progressive_val_score
from river.stream import iter_sklearn_dataset, iter_pandas, iter_csv
from river.metrics import Accuracy
from river import compat, linear_model

# Scikit learn
from sklearn import model_selection, datasets, metrics
from sklearn import linear_model as sk_linear_model

# Torch
import torch
from torch import nn

# Miscellaneous for tracking
import time, tracemalloc, psutil
import numpy as np

# Scikit Learn

## CapyMOA

This code evaluates the performance of a standard SGDClassifier from scikit-learn using the CapyMOA prequential evaluation framework on the Electricity dataset. The classifier is wrapped using SKClassifier to make it compatible with CapyMOA’s streaming API. The model is evaluated using a sliding window approach (size 4500), and cumulative accuracy is computed to assess its online learning performance.

In [None]:
from capymoa.evaluation import prequential_evaluation

# Load the Electricity stream dataset
elec_stream = Electricity()

# Wrap scikit-learn's SGDClassifier for compatibility with CapyMOA
sklearn_SGD = SKClassifier(
    schema=elec_stream.get_schema(), sklearner=sk_linear_model.SGDClassifier()
)

# Perform prequential evaluation using a sliding window of 4500 instances
results_sklearn_SGD = prequential_evaluation(
    stream=elec_stream, learner=sklearn_SGD, window_size=4500
)

results_sklearn_SGD.cumulative.accuracy()

84.18079096045197

## River

Now we show the same functionality in River. This script performs cross-validation on the sklearn breast cancer dataset. It uses a logistic regression model for binary classification and evaluates its performance using a 5-fold deterministic K-Fold cross-validation. The logistic regression model is converted from a River-compatible format to ensure compatibility with scikit-learn's cross_val_score function. 


In [3]:
# Load data
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

# Define a determistic cross-validation procedure
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

scorer = metrics.make_scorer(metrics.roc_auc_score)

# We define a model - Pipeline didn't work
model = linear_model.LogisticRegression()

# We make the model compatible with sklearn
model = compat.convert_river_to_sklearn(model)

# We compute the CV scores using the same CV scheme and the same scoring
scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)

# Display the average score and its standard deviation
print(f'ROC AUC: {scores.mean():.3f} (± {scores.std():.3f})')

ROC AUC: 0.816 (± 0.158)


# Pytorch

## CapyMOA

Before using Pytorch we have to define a simple NeuralNetwork to test the library. A simple architecture with two hidden layers of 512 neurons. In our case, since we are performing classification, we have two neurons for our output layer. 

In [42]:
device = "cpu"

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self, input_size=0, number_of_classes=0):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, number_of_classes),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

Now we can show how to use Pytorch to train our previously defined neural network in a streaming way using has_more_instances(). For each instance we define X, y and we perform prediction/training using the usual torch way of doing it. 

In [54]:
# Creating the evaluator
evaluator = ClassificationEvaluator(schema=elec_stream.get_schema())

model = None
optimizer = None
loss_fn = nn.CrossEntropyLoss()

i = 0
while elec_stream.has_more_instances():
    i += 1
    instance = elec_stream.next_instance()
    if model is None:
        moa_instance = instance.java_instance.getData()
        # initialize the model and send it to the device
        model = NeuralNetwork(
            input_size=elec_stream.get_schema().get_num_attributes(),
            number_of_classes=elec_stream.get_schema().get_num_classes(),
        ).to(device)
        # set the optimizer
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
        print(model)

    X = torch.tensor(instance.x, dtype=torch.float32)
    y = torch.tensor(instance.y_index, dtype=torch.long)
    # set the device and add a dimension to the tensor
    X, y = torch.unsqueeze(X.to(device), 0), torch.unsqueeze(y.to(device), 0)

    # turn off gradient collection for test
    with torch.no_grad():
        pred = model(X)
        prediction = torch.argmax(pred)

    # update evaluator with predicted class
    evaluator.update(instance.y_index, prediction.item())

    # Compute prediction error
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 500 == 0:
        print(f"Accuracy at {i} : {evaluator.accuracy()}")

print(f"Accuracy at {i} : {evaluator.accuracy()}")

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=8, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=2, bias=True)
  )
)
Accuracy at 500 : 56.8
Accuracy at 1000 : 59.0
Accuracy at 1500 : 62.133333333333326
Accuracy at 2000 : 61.85000000000001
Accuracy at 2500 : 61.919999999999995
Accuracy at 3000 : 62.4
Accuracy at 3500 : 62.02857142857143
Accuracy at 4000 : 63.24999999999999
Accuracy at 4500 : 63.4
Accuracy at 5000 : 63.78
Accuracy at 5500 : 64.03636363636363
Accuracy at 6000 : 64.26666666666667
Accuracy at 6500 : 64.61538461538461
Accuracy at 7000 : 64.85714285714286
Accuracy at 7500 : 64.49333333333334
Accuracy at 8000 : 64.2875
Accuracy at 8500 : 64.31764705882354
Accuracy at 9000 : 64.33333333333333
Accuracy at 9500 : 64.13684210526316
Accuracy at 10000 : 64.25999999999999
Accuracy at 1050