In [None]:
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.dummy_monitoring_strategy import DummyMonitoringStrategy
from batchstream.retraining_strategy.dummy_retraining_strategy import DummyRetrainingStrategy 
from batchstream.model_comparers.batch_comparer import BatchModelComparer
from batchstream.model_comparers.shadow_comparer import ShadowOnlineComparer
from batchstream.pipelines.batch.batch_pipeline import BatchPipeline
from batchstream.estimators.sklearn_estimator import SklearnEstimator
from batchstream.detectors.base.detector import DriftDetector
from batchstream.experiment.experiment import StreamExperiment
from river.metrics import Accuracy, ROCAUC
from batchstream.evaluation.model_evaluation_pipeline import ModelEvaluationPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



# Shared objects for this run: a HistoryManager and a logger factory tagged 'test-2220'.
history = HistoryManager()
logger_factory = LoggerFactory('test-2220')


### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[DataDriftTestPreset()])
d1 = DummyMonitoringStrategy(n_curr=120, n_ref=120)
ev1 = EvidentlyMonitoringStep(
    data_drift_test_suite,
    d1,
    logger_factory,
    min_instances=240,
    clock=120,
    name='data_drift_eval',
)

# Detector 1.2 - Target Drift
target_drift = TestSuite(tests=[TestColumnDrift(column_name='target')])
d2 = DummyMonitoringStrategy(n_curr=120, n_ref=120, type='target')
ev2 = EvidentlyMonitoringStep(
    target_drift,
    d2,
    logger_factory,
    min_instances=240,
    clock=120,
    name='target_drift_eval',
)

# Both input checks go into one monitoring pipeline; each step is paired with its `_name`.
input_monitoring = ModelMonitoringPipeline([
    (ev1._name, ev1),
    (ev2._name, ev2),
])
input_drift_retraining_strategy = DummyRetrainingStrategy(n_last_retrain=120, n_last_test=0)
input_detector = DriftDetector(input_monitoring, input_drift_retraining_strategy)
###




### OUTPUT (PERFORMANCE) DRIFT DETECTION
# Detector 2.1 - Performance Drift

# Evidently classification-quality tests evaluated by the monitoring step below.
performance_drift = TestSuite(
    tests=[
        TestPrecisionScore(),
        TestRecallScore(),
        TestF1Score(),
        TestAccuracyScore(),
    ]
)
d3 = DummyMonitoringStrategy(n_curr=120, n_ref=120, type='prediction')
ev3 = EvidentlyMonitoringStep(
    performance_drift,
    d3,
    logger_factory,
    min_instances=360,
    clock=120,
    name='performance_drift_eval',
)

# Single-step monitoring pipeline wrapped, with its retraining strategy, in a drift detector.
output_monitoring = ModelMonitoringPipeline([(ev3._name, ev3)])
output_drift_retraining_strategy = DummyRetrainingStrategy(n_last_retrain=120, n_last_test=0)
output_detector = DriftDetector(output_monitoring, output_drift_retraining_strategy)
###

### Models comparison (after retraining)
#model_comparer = BatchModelComparer()
model_comparer = ShadowOnlineComparer(n_online=20)
###


### Model's Performance Evaluation
# Two river metrics registered under the names 'accuracy' and 'roc_auc'.
acc = Accuracy()
roc_auc = ROCAUC()
eval_pipe = ModelEvaluationPipeline(
    metric_steps=[('accuracy', acc), ('roc_auc', roc_auc)],
)
###


### Model composition
# Random forest inside an sklearn Pipeline, wrapped by the project's SklearnEstimator.
rf_steps = [('rf', RandomForestClassifier())]
sklearn_batch_classifier = SklearnEstimator(Pipeline(rf_steps))
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_detector=input_detector,
    output_drift_detector=output_detector,
    history=history,
    logger_factory=logger_factory,
    model_comparer=model_comparer,
    min_samples_retrain=120,
    min_samples_first_fit=240,
)
###

### Experiment
# Ties the batch pipeline to the evaluation pipeline and the logger factory.
experiment = StreamExperiment(batch_pipeline, eval_pipe, logger_factory)


In [None]:
# Breast-cancer features as a DataFrame with the labels appended as a 'target' column.
X, Y = load_breast_cancer(return_X_y=True)
df = pd.DataFrame(X).assign(target=Y)


In [None]:
# Feed the DataFrame built in the previous cell through the experiment.
experiment.run(df)

In [None]:
from river.metrics import Accuracy
from river import utils

# Toy labels/predictions used to illustrate a rolling (windowed) accuracy.
Y_true =    [1, 0, 1, 0, 1, 0, 0]
Y_predict = [1, 1, 0, 0, 1, 1, 1]


# Accuracy computed over the 3 most recent (truth, prediction) pairs.
roll_acc = utils.Rolling(Accuracy(), window_size=3)
for y_t, y_p in zip(Y_true, Y_predict):
    # Update first, then read: chaining `.get()` onto `update(...)` only works on
    # river versions where `update` returns the metric; newer releases return None.
    roll_acc.update(y_t, y_p)
    print(roll_acc.get())

In [None]:

from evidently.test_preset import DataDriftTestPreset



# Fresh suite containing only the data-drift preset, used for the introspection cells below.
data_drift_test_suite = TestSuite(tests=[DataDriftTestPreset()])

In [None]:
# Inspect the tests held by the suite (reaches into the private `_inner_suite` attribute).
data_drift_test_suite._inner_suite.context.tests

In [None]:
from evidently.tests.base_test import Test

# Take the first test from the suite's inner context for closer inspection.
t: Test = data_drift_test_suite._inner_suite.context.tests[0] 

In [None]:
# Show the instance attributes of the selected test.
t.__dict__

In [None]:
from batchstream.pipelines.base.stream_pipeline import StreamPipeline
from batchstream.utils.logging.base.logger_factory import LoggerFactory


from typing import List  # needed by the `members` annotation; it was not imported in this cell


class CombinationPipeline(StreamPipeline):
    """Stream pipeline that delegates every sample to a combiner over its members.

    Parameters
    ----------
    members:
        The pipelines handed to the combiner on every call.
    combiner:
        Object exposing ``combine(x, y, members)``; its result is returned as-is.
    """

    def __init__(self, members: List[StreamPipeline], combiner: object):
        self._members = members
        self._combiner = combiner

    def handle(self, x, y) -> int:
        """Return ``combiner.combine(x, y, members)`` for one sample.

        The ``int`` annotation is carried over from the original signature; the
        actual type is whatever the combiner returns.
        """
        return self._combiner.combine(x, y, self._members)



In [None]:
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.simple_monitoring_strategy import SimpleMonitoringStrategy
from batchstream.retraining_strategy.simple_retraining_strategy import SimpleRetrainingStrategy 
from batchstream.model_comparers.batch_comparer import BatchModelComparer
from batchstream.model_comparers.shadow_comparer import ShadowOnlineComparer
from batchstream.pipelines.batch.batch_pipeline import BatchPipeline
from batchstream.estimators.sklearn_estimator import SklearnEstimator
from batchstream.detectors.base.drift_handler import DriftHandler
from batchstream.experiment.experiment import StreamExperiment
from river.metrics import Accuracy, ROCAUC
from batchstream.evaluation.river_evaluation_pipeline import RiverEvaluationPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



# Logger factory for this run and one retraining strategy reused by the drift handlers.
logger_factory = LoggerFactory('test-2218')
retraining_strategy = SimpleRetrainingStrategy(n_last_retrain=500, n_last_test=0)


### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[DataDriftTestPreset()])
d1 = SimpleMonitoringStrategy(n_curr=120, n_ref=120)
ev1 = EvidentlyMonitoringStep(
    data_drift_test_suite,
    d1,
    logger_factory,
    min_instances=240,
    clock=120,
    name='data_drift_eval',
)

# Detector 1.2 - Target Drift
target_drift = TestSuite(tests=[TestColumnDrift(column_name='target')])
d2 = SimpleMonitoringStrategy(n_curr=120, n_ref=120, type='target')
ev2 = EvidentlyMonitoringStep(
    target_drift,
    d2,
    logger_factory,
    min_instances=240,
    clock=120,
    name='target_drift_eval',
)

# Both input checks share one monitoring pipeline; each step is paired with its `_name`.
input_monitoring = ModelMonitoringPipeline([
    (ev1._name, ev1),
    (ev2._name, ev2),
])
input_detector = DriftHandler(input_monitoring, retraining_strategy)
###




### OUTPUT (PERFORMANCE) DRIFT DETECTION
# Detector 2.1 - Performance Drift

# Evidently classification-quality tests evaluated by the monitoring step below.
performance_drift = TestSuite(
    tests=[
        TestPrecisionScore(),
        TestRecallScore(),
        TestF1Score(),
        TestAccuracyScore(),
    ]
)
d3 = SimpleMonitoringStrategy(n_curr=500, n_ref=500, type='prediction')
ev3 = EvidentlyMonitoringStep(
    performance_drift,
    d3,
    logger_factory,
    min_instances=1000,
    clock=500,
    name='performance_drift_eval',
)

# Output Drift Handler (Performance Drift + Retraining Strategy)
output_monitoring = ModelMonitoringPipeline([(ev3._name, ev3)])

output_drift_handlers = [DriftHandler(output_monitoring, retraining_strategy)]
###

### Models comparison (after retraining)
#model_comparer = BatchModelComparer()
model_comparer = ShadowOnlineComparer(n_online=100)
###


### Model's Performance Evaluation
# Two river metrics registered under the names 'accuracy' and 'roc_auc'.
acc = Accuracy()
roc_auc = ROCAUC()
eval_pipe = RiverEvaluationPipeline(
    metric_steps=[('accuracy', acc), ('roc_auc', roc_auc)],
)
###


### Model composition
logger_factory = LoggerFactory(experiment_id='rf_exp')
# Keep a handle on the sklearn pipeline so the estimator actually wraps it
# (the pipeline used to be built and thrown away, and SklearnEstimator got no model).
rf_model = Pipeline([('rf', RandomForestClassifier(max_depth=10))])
sklearn_batch_classifier = SklearnEstimator(rf_model)
# This cell only builds `input_detector`; wrap it in a list so the input side is
# passed in the same shape as `output_drift_handlers`.
input_drift_handlers = [input_detector]
# NOTE(review): `history` is not created in this cell; it must already exist in the
# kernel (an earlier cell assigns `history = HistoryManager()`) — confirm.
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=500,
    min_samples_first_fit=1000
)

In [None]:
from river.metrics import Accuracy, MacroF1, MicroF1
from river.utils import Rolling



# Prequential evaluation: each metric is wrapped in a rolling window of `window_size` samples.
window_size = 1000
eval_pipe = RiverEvaluationPipeline(metric_steps=[
    (f'{label}_preq_{window_size}', Rolling(metric_cls(), window_size))
    for label, metric_cls in (
        ('acc', Accuracy),
        ('micro_f1', MicroF1),
        ('macro_f1', MacroF1),
    )
])

In [None]:
from batchstream.pipelines.stream.model_river_pipeline import RiverPipeline


In [None]:
from river.forest import ARFClassifier
from river import ensemble
from river import evaluate
from river import metrics
from river.datasets import synth
from river import tree
# ADWIN is exposed by river's `drift` subpackage, not by the top-level package.
from river.drift import ADWIN
from river import naive_bayes


In [None]:
from river.ensemble import SRPClassifier

In [None]:
from river.forest import ARFClassifier


# Adaptive random forest experiment: rebinds `logger_factory` to the 'arf_exp' id,
# wraps the river model in a RiverPipeline and pairs it with the current `eval_pipe`.
logger_factory = LoggerFactory(experiment_id='arf_exp')
arf_model = ARFClassifier(seed=42, leaf_prediction="mc")
arf_pipe = RiverPipeline(arf_model)
arf_experiment = StreamExperiment(arf_pipe, eval_pipe, logger_factory)

In [None]:
# SRP ensemble of three Hoeffding trees, wrapped in a RiverPipeline.
# NOTE(review): the nominal attribute names ('age', 'car', 'zipcode') are dataset-specific —
# confirm they match the stream this model is run on.
base_model = tree.HoeffdingTreeClassifier(grace_period=50, delta=0.01, nominal_attributes=['age', 'car', 'zipcode'])
srp_model = ensemble.SRPClassifier(model=base_model, n_models=3, seed=42)
srp_pipe = RiverPipeline(srp_model)

In [None]:
from river.tree import HoeffdingAdaptiveTreeClassifier



# Hoeffding Adaptive Tree with a fixed seed, wrapped in a RiverPipeline.
hat_model = HoeffdingAdaptiveTreeClassifier(
  grace_period=100,
  delta=1e-5,
  leaf_prediction='nb',
  nb_threshold=10,
  seed=42
)
hat_pipe = RiverPipeline(hat_model)

In [None]:
from river import naive_bayes



# Online Gaussian naive Bayes experiment: rebinds `logger_factory` to the 'nb_exp' id
# and pairs the wrapped river model with the current `eval_pipe`.
logger_factory = LoggerFactory(experiment_id='nb_exp')
nb_model =  naive_bayes.GaussianNB()
nb_pipe = RiverPipeline(nb_model)
nb_experiment = StreamExperiment(nb_pipe, eval_pipe, logger_factory)

In [None]:
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *



### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[DataDriftTestPreset()])
d1 = DummyMonitoringStrategy(n_curr=500, n_ref=500, type='data')
ev1 = EvidentlyMonitoringStep(
    data_drift_test_suite,
    d1,
    logger_factory,
    min_instances=1000,
    clock=500,
    name='data_drift_eval',
)

# Detector 1.2 - Target Drift
target_drift = TestSuite(tests=[TestColumnDrift(column_name='target')])
d2 = DummyMonitoringStrategy(n_curr=500, n_ref=500, type='target')
ev2 = EvidentlyMonitoringStep(
    target_drift,
    d2,
    logger_factory,
    min_instances=1000,
    clock=500,
    name='target_drift_eval',
)

# Input Drift Handler (Data + Target Drift + Retraining Strategy)
input_monitoring = ModelMonitoringPipeline([
    (ev1._name, ev1),
    (ev2._name, ev2),
])
input_drift_retraining_strategy = DummyRetrainingStrategy(n_last_retrain=500, n_last_test=0)
input_drift_handlers = [DriftHandler(input_monitoring, input_drift_retraining_strategy)]


In [None]:
# NOTE(review): other cells in this notebook call StreamExperiment(pipeline, eval_pipe, logger_factory);
# this bare call passes no arguments and presumably fails — looks like scratch, confirm or remove.
StreamExperiment()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



# Random-forest batch pipeline with both input and output drift handlers attached.
# Relies on names created in other cells: input_drift_handlers, output_drift_handlers,
# history and model_comparer. Rebinds logger_factory, sklearn_batch_classifier and batch_pipeline.
logger_factory = LoggerFactory(experiment_id='rf_exp_all_drifts')
rf_model = Pipeline([('rf', RandomForestClassifier(max_depth=10))])
sklearn_batch_classifier = SklearnEstimator(rf_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline



# Logistic-regression batch pipeline with both input and output drift handlers attached.
# Relies on names created in other cells: input_drift_handlers, output_drift_handlers,
# history and model_comparer. Rebinds logger_factory, sklearn_batch_classifier and batch_pipeline.
logger_factory = LoggerFactory(experiment_id='lr_exp_all_drifts')
lr_model = Pipeline([('lr', LogisticRegression())])
sklearn_batch_classifier = SklearnEstimator(lr_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline



# Gaussian naive Bayes (sklearn) batch pipeline with both drift handlers attached.
# Relies on names created in other cells: input_drift_handlers, output_drift_handlers,
# history and model_comparer.
# NOTE: rebinds `nb_model`, which an earlier cell bound to the river GaussianNB model.
logger_factory = LoggerFactory(experiment_id='nb_exp_all_drifts')
nb_model = Pipeline([('nb', GaussianNB())])
sklearn_batch_classifier = SklearnEstimator(nb_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline



# XGBoost batch pipeline with both input and output drift handlers attached.
# Relies on names created in other cells: input_drift_handlers, output_drift_handlers,
# history and model_comparer. Rebinds logger_factory, sklearn_batch_classifier and batch_pipeline.
logger_factory = LoggerFactory(experiment_id='xgb_exp_all_drifts')
xgb_model = Pipeline([('xgb', XGBClassifier())])
sklearn_batch_classifier = SklearnEstimator(xgb_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
# Use the %pip magic so the package is installed into the running kernel's environment
# rather than whichever `pip` the shell resolves first.
%pip install xgboost

In [None]:
import pandas as pd

In [None]:
# Scratch frame with a single column named '1' (rebinds `df`).
df = pd.DataFrame({'1': [1, 2, 3]})

In [None]:
# Display the scratch frame.
df

In [None]:
# Add a constant 'dataset' column (the scalar is assigned to every row).
df['dataset'] = 'x'

In [None]:
# Show the column added in the previous cell; no variable named `dataset` exists in this notebook.
df['dataset']

In [None]:
# Display the frame after adding the 'dataset' column.
df

In [None]:
# Replace the column labels with positional integers 0..n-1 (mutates `df` in place).
df.columns = range(len(df.columns))

In [None]:
# Display the frame after relabelling its columns.
df

In [None]:
import pandas as pd
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
from scipy.io.arff import loadarff
from os import path
from typing import Tuple



def get_covtype_dataset(data_path='./data') -> pd.DataFrame:
    """Load the zipped ``covtypeNorm.arff`` file into a single DataFrame.

    Parameters
    ----------
    data_path:
        Directory containing ``COVTYPE/covtypeNorm.arff.zip``.

    Returns
    -------
    pd.DataFrame
        The ARFF attributes, with the ``class`` attribute moved to a trailing
        ``target`` column, every column from ``Wilderness_Area1`` onwards
        (``target`` included) converted to integers, and a constant ``dataset``
        column set to ``'covtypeNorm'``.
    """
    zip_path = path.join(data_path, 'COVTYPE/covtypeNorm.arff.zip')
    with ZipFile(zip_path, 'r') as zfile:
        raw_arff = zfile.read('covtypeNorm.arff')

    # loadarff returns (records, metadata); only the records are needed here.
    records, _meta = loadarff(TextIOWrapper(BytesIO(raw_arff), encoding='ascii'))
    df = pd.DataFrame(records)

    # Nominal ARFF attributes arrive as bytes; decode them column by column
    # (this also works when there are no such columns at all).
    for col in df.select_dtypes([object]).columns:
        df[col] = df[col].str.decode('utf-8')

    # NOTE(review): the 'noad'/'ad' mapping looks like a leftover from another
    # dataset loader; it is kept so existing behaviour is unchanged — confirm.
    class_col = df.pop('class').replace(['noad', 'ad'], [0, 1])
    df['target'] = class_col.copy()

    # Assign whole columns instead of `.loc[:, ...] =` so the integer dtype
    # replaces the object dtype rather than being written into it in place.
    int_cols = df.loc[:, "Wilderness_Area1":].columns
    df[int_cols] = df[int_cols].astype(str).astype(int)

    df['dataset'] = 'covtypeNorm'
    return df


In [None]:
# Load the covtype data from the default './data' directory (rebinds `df`).
df = get_covtype_dataset()

In [None]:
# Check the column dtypes produced by the loader.
df.dtypes

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals



# Plot accuracy and drift history for every run directory of the COVTYPE experiment.
# NOTE(review): the output location is a hard-coded absolute local path.
out_dir = Path(r'D:\Studia\praca\out\20230505_1848')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    print(str(d))
    # Skip runs without metric values, as the rbf0.66 loop in this notebook does.
    if res is None:
        continue
    visualize_results(res, drift_hist, dataset_name='COVTYPE', metrics=['acc'])

    

In [None]:
# Feature drift does not occur
# Performance drift does not occur either (?)

***
# Elec

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals



# Plot accuracy and drift history for every run directory of the elec experiment.
# NOTE(review): the output location is a hard-coded absolute local path.
out_dir = Path(r'D:\Studia\praca\out\20230506_1423')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    print(str(d))
    # Skip runs without metric values, as the rbf0.66 loop in this notebook does.
    if res is None:
        continue
    visualize_results(res, drift_hist, dataset_name='elec', metrics=['acc'])

# Rbf 0.66

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals



# Plot F1 and drift history for every run directory of the rbf0.66 experiment;
# runs for which no metric values are returned are skipped.
out_dir = Path(r'D:\Studia\praca\out\20230508_1035')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    print(str(d))
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    if res is not None:
        visualize_results(res, drift_hist, dataset_name='rbf0.66', metrics=['f1'])