In [None]:
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.dummy_monitoring_strategy import DummyMonitoringStrategy
from batchstream.retraining_strategy.dummy_retraining_strategy import DummyRetrainingStrategy 
from batchstream.model_comparers.batch_comparer import BatchModelComparer
from batchstream.model_comparers.shadow_comparer import ShadowOnlineComparer
from batchstream.pipelines.batch.batch_pipeline import BatchPipeline
from batchstream.estimators.sklearn_estimator import SklearnEstimator
from batchstream.detectors.base.detector import DriftDetector
from batchstream.experiment.experiment import StreamExperiment
from river.metrics import Accuracy, ROCAUC
from batchstream.evaluation.model_evaluation_pipeline import ModelEvaluationPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



history = HistoryManager()
logger_factory = LoggerFactory('test-2220')


### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[
   DataDriftTestPreset(),
])
d1 = DummyMonitoringStrategy(n_curr=120, n_ref=120)
ev1 = EvidentlyMonitoringStep(data_drift_test_suite, d1, logger_factory, min_instances=240, clock=120, name='data_drift_eval')

# Detector 1.2 - Target Drift
target_drift = TestSuite(tests=[
    TestColumnDrift(column_name='target'),
])
d2 = DummyMonitoringStrategy(n_curr=120, n_ref=120, type='target')
ev2 = EvidentlyMonitoringStep(target_drift, d2, logger_factory, min_instances=240, clock=120, name='target_drift_eval')

input_monitoring = ModelMonitoringPipeline([(ev1._name, ev1), (ev2._name, ev2)])
input_drift_retraining_strategy = DummyRetrainingStrategy(n_last_retrain=120, n_last_test=0)
input_detector = DriftDetector(input_monitoring, input_drift_retraining_strategy)
###




### OUTPUT (PERFORMANCE) DRIFT DETECTION
# Detector 2.1 - Performance Drift

performance_drift = TestSuite(tests=[
    TestPrecisionScore(),
    TestRecallScore(),
    TestF1Score(),
    TestAccuracyScore()
])
d3 = DummyMonitoringStrategy(n_curr=120, n_ref=120, type='prediction')
ev3 = EvidentlyMonitoringStep(performance_drift, d3, logger_factory, min_instances=360, clock=120, name='performance_drift_eval')

output_monitoring = ModelMonitoringPipeline([(ev3._name, ev3)])
output_drift_retraining_strategy = DummyRetrainingStrategy(n_last_retrain=120, n_last_test=0)
output_detector = DriftDetector(output_monitoring, output_drift_retraining_strategy)
###

### Models comparison (after retraining)
#model_comparer = BatchModelComparer()
model_comparer = ShadowOnlineComparer(n_online=20)
###


### Model's Performance Evaluation
acc = Accuracy()
roc_auc = ROCAUC()
eval_pipe = ModelEvaluationPipeline(metric_steps=[
    ('accuracy', acc),
    ('roc_auc', roc_auc)
])
###


### Model composition
sklearn_batch_classifier = SklearnEstimator(Pipeline([('rf', RandomForestClassifier())]))
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_detector=input_detector,
    output_drift_detector=output_detector,
    history=history,
    logger_factory=logger_factory,
    model_comparer=model_comparer,
    min_samples_retrain=120,
    min_samples_first_fit=240
)
###

### Experiment
experiment = StreamExperiment(batch_pipeline, eval_pipe, logger_factory)


In [None]:
X, Y = load_breast_cancer(return_X_y=True)
df = pd.DataFrame(X)
df['target'] = Y


In [None]:
experiment.run(df)

In [None]:
from river.metrics import Accuracy
from river import utils

Y_true =    [1, 0, 1, 0, 1, 0, 0]
Y_predict = [1, 1, 0, 0, 1, 1, 1]


roll_acc = utils.Rolling(Accuracy(), window_size=3)
for y_t, y_p in zip(Y_true, Y_predict):
    print(roll_acc.update(y_t, y_p).get())

In [None]:

from evidently.test_preset import DataDriftTestPreset



data_drift_test_suite = TestSuite(tests=[
    DataDriftTestPreset(),
])

In [None]:
data_drift_test_suite._inner_suite.context.tests

In [None]:
from evidently.tests.base_test import Test

t: Test = data_drift_test_suite._inner_suite.context.tests[0] 

In [None]:
t.__dict__

In [None]:
from batchstream.pipelines.base.stream_pipeline import StreamPipeline
from batchstream.utils.logging.base.logger_factory import LoggerFactory


from typing import List


class CombinationPipeline(StreamPipeline):
    """Stream pipeline that delegates every sample to a combiner over member pipelines.

    The class itself holds no model: it stores the member pipelines and a
    combiner object and forwards each ``handle`` call to ``combiner.combine``.
    """

    def __init__(self, members: List[StreamPipeline], combiner: object):
        """Store the member pipelines and the combiner.

        Args:
            members: pipelines whose outputs are to be combined.
            combiner: object exposing ``combine(x, y, members)``.
        """
        # NOTE(review): the base-class initialiser is not called here; confirm
        # that StreamPipeline does not require it.
        self._members = members
        self._combiner = combiner

    def handle(self, x, y) -> int:
        """Return whatever the combiner produces for sample ``x`` / label ``y``."""
        return self._combiner.combine(x, y, self._members)



In [None]:
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.simple_monitoring_strategy import SimpleMonitoringStrategy
from batchstream.retraining_strategy.simple_retraining_strategy import SimpleRetrainingStrategy 
from batchstream.model_comparers.batch_comparer import BatchModelComparer
from batchstream.model_comparers.shadow_comparer import ShadowOnlineComparer
from batchstream.pipelines.batch.batch_pipeline import BatchPipeline
from batchstream.estimators.sklearn_estimator import SklearnEstimator
from batchstream.detectors.base.drift_handler import DriftHandler
from batchstream.experiment.experiment import StreamExperiment
from river.metrics import Accuracy, ROCAUC
from batchstream.evaluation.river_evaluation_pipeline import RiverEvaluationPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



logger_factory = LoggerFactory('test-2218')
retraining_strategy = SimpleRetrainingStrategy(
  n_last_retrain=500, n_last_test=0)


### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[
DataDriftTestPreset(),
])
d1 = SimpleMonitoringStrategy(n_curr=120, n_ref=120)
ev1 = EvidentlyMonitoringStep(data_drift_test_suite, d1, logger_factory, min_instances=240, clock=120, name='data_drift_eval')

# Detector 1.2 - Target Drift
target_drift = TestSuite(tests=[
    TestColumnDrift(column_name='target'),
])
d2 = SimpleMonitoringStrategy(n_curr=120, n_ref=120, type='target')
ev2 = EvidentlyMonitoringStep(target_drift, d2, logger_factory, min_instances=240, clock=120, name='target_drift_eval')

input_monitoring = ModelMonitoringPipeline([(ev1._name, ev1), (ev2._name, ev2)])
input_detector = DriftHandler(input_monitoring, retraining_strategy)
###




### OUTPUT (PERFORMANCE) DRIFT DETECTION
# Detector 2.1 - Performance Drift

performance_drift = TestSuite(tests=[
    TestPrecisionScore(),
    TestRecallScore(),
    TestF1Score(),
    TestAccuracyScore()
])
d3 = SimpleMonitoringStrategy(n_curr=500, n_ref=500, type='prediction')
ev3 = EvidentlyMonitoringStep(performance_drift, d3, logger_factory,
  min_instances=1000, clock=500, name='performance_drift_eval')

# Output Drift Handler (Performance Drift + Retraining Strategy)
output_monitoring = ModelMonitoringPipeline([(ev3._name, ev3)])

output_drift_handlers = [
  DriftHandler(output_monitoring, retraining_strategy)
]
###

### Models comparison (after retraining)
#model_comparer = BatchModelComparer()
model_comparer = ShadowOnlineComparer(n_online=100)
###


### Model's Performance Evaluation
acc = Accuracy()
roc_auc = ROCAUC()
eval_pipe = RiverEvaluationPipeline(metric_steps=[
    ('accuracy', acc),
    ('roc_auc', roc_auc)
])
###


### Model composition
logger_factory = LoggerFactory(experiment_id='rf_exp')
# Keep a handle on the sklearn pipeline and actually pass it to the estimator
# wrapper; previously the Pipeline was built and discarded and the estimator
# was created without any model.
rf_model = Pipeline([('rf', RandomForestClassifier(max_depth=10))])
sklearn_batch_classifier = SklearnEstimator(rf_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    # `input_detector` is the input DriftHandler built earlier in this cell;
    # it is wrapped in a list to mirror `output_drift_handlers`.
    [input_detector],
    output_drift_handlers,
    # NOTE(review): `history` is not created in this cell; it is the
    # HistoryManager instance from the first cell of the notebook.
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=500,
    min_samples_first_fit=1000
)

In [None]:
from river.metrics import Accuracy, MacroF1, MicroF1
from river.utils import Rolling



window_size = 1000
eval_pipe = RiverEvaluationPipeline(metric_steps=[
  (f'acc_preq_{window_size}', Rolling(Accuracy(), window_size)),
  (f'micro_f1_preq_{window_size}', Rolling(MicroF1(), window_size)),
  (f'macro_f1_preq_{window_size}', Rolling(MacroF1(), window_size))
])

In [None]:
from batchstream.pipelines.stream.model_river_pipeline import RiverPipeline


In [None]:
from river.forest import ARFClassifier
from river import ensemble
from river import evaluate
from river import metrics
from river.datasets import synth
from river import tree
# ADWIN is exposed by the `river.drift` subpackage, not by the top-level package.
from river.drift import ADWIN
from river import naive_bayes


In [None]:
from river.ensemble import SRPClassifier

In [None]:
from river.forest import ARFClassifier


logger_factory = LoggerFactory(experiment_id='arf_exp')
arf_model = ARFClassifier(seed=42, leaf_prediction="mc")
arf_pipe = RiverPipeline(arf_model)
arf_experiment = StreamExperiment(arf_pipe, eval_pipe, logger_factory)

In [None]:
base_model = tree.HoeffdingTreeClassifier(grace_period=50, delta=0.01, nominal_attributes=['age', 'car', 'zipcode'])
srp_model = ensemble.SRPClassifier(model=base_model, n_models=3, seed=42)
srp_pipe = RiverPipeline(srp_model)

In [None]:
from river.tree import HoeffdingAdaptiveTreeClassifier



hat_model = HoeffdingAdaptiveTreeClassifier(
  grace_period=100,
  delta=1e-5,
  leaf_prediction='nb',
  nb_threshold=10,
  seed=42
)
hat_pipe = RiverPipeline(hat_model)

In [None]:
from river import naive_bayes



logger_factory = LoggerFactory(experiment_id='nb_exp')
nb_model =  naive_bayes.GaussianNB()
nb_pipe = RiverPipeline(nb_model)
nb_experiment = StreamExperiment(nb_pipe, eval_pipe, logger_factory)

In [None]:
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import *



### INPUT DRIFT DETECTION
# Detector 1.1 - Data Drift
data_drift_test_suite = TestSuite(tests=[
  DataDriftTestPreset(),
])
d1 = DummyMonitoringStrategy(n_curr=500, n_ref=500, type='data')
ev1 = EvidentlyMonitoringStep(data_drift_test_suite, d1, logger_factory,
  min_instances=1000, clock=500, name='data_drift_eval'
)

# Detector 1.2 - Target Drift 
target_drift = TestSuite(tests=[
    TestColumnDrift(column_name='target'),
])
d2 = DummyMonitoringStrategy(n_curr=500, n_ref=500, type='target')
ev2 = EvidentlyMonitoringStep(target_drift, d2, logger_factory,
  min_instances=1000, clock=500, name='target_drift_eval'
)

# Input Drift Handler (Data + Target Drift + Retraining Strategy)
input_monitoring = ModelMonitoringPipeline([(ev1._name, ev1), (ev2._name, ev2)])
input_drift_retraining_strategy = DummyRetrainingStrategy(
  n_last_retrain=500, n_last_test=0
)
input_drift_handlers = [
  DriftHandler(input_monitoring, input_drift_retraining_strategy)
]


In [None]:
StreamExperiment()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline



logger_factory = LoggerFactory(experiment_id='rf_exp_all_drifts')
rf_model = Pipeline([('rf', RandomForestClassifier(max_depth=10))])
sklearn_batch_classifier = SklearnEstimator(rf_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline



logger_factory = LoggerFactory(experiment_id='lr_exp_all_drifts')
lr_model = Pipeline([('lr', LogisticRegression())])
sklearn_batch_classifier = SklearnEstimator(lr_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline



logger_factory = LoggerFactory(experiment_id='nb_exp_all_drifts')
nb_model = Pipeline([('nb', GaussianNB())])
sklearn_batch_classifier = SklearnEstimator(nb_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline



logger_factory = LoggerFactory(experiment_id='xgb_exp_all_drifts')
xgb_model = Pipeline([('xgb', XGBClassifier())])
sklearn_batch_classifier = SklearnEstimator(xgb_model)
batch_pipeline = BatchPipeline(
    sklearn_batch_classifier,
    input_drift_handlers,
    output_drift_handlers,
    history,
    logger_factory,
    model_comparer,
    min_samples_retrain=1500,
    min_samples_first_fit=1000
)

In [None]:
!pip install xgboost

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame({'1': [1, 2, 3]})

In [None]:
df

In [None]:
df['dataset'] = 'x'

In [None]:
dataset

In [None]:
df

In [None]:
df.columns = range(len(df.columns))

In [None]:
df

In [None]:
import pandas as pd
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
from scipy.io.arff import loadarff
from os import path
from typing import Tuple



def get_covtype_dataset(data_path='./data') -> pd.DataFrame:
    """Load the normalised Covertype ARFF archive into one DataFrame.

    Reads ``<data_path>/COVTYPE/covtypeNorm.arff.zip``, decodes the byte-string
    (nominal) columns to text, moves the ``class`` column to a new ``target``
    column, casts every column from ``Wilderness_Area1`` onwards (which includes
    ``target``) to int, and appends a constant ``dataset`` column.

    Args:
        data_path: directory that contains the ``COVTYPE`` folder.

    Returns:
        A single DataFrame holding features, ``target`` and ``dataset``.
    """
    # Local import: the notebook later rebinds the global name `path` to a
    # string, which would break a module-level `from os import path`.
    import os

    zip_path = os.path.join(data_path, 'COVTYPE/covtypeNorm.arff.zip')
    with ZipFile(zip_path, 'r') as zfile:
        in_mem_fo = TextIOWrapper(BytesIO(zfile.read('covtypeNorm.arff')), encoding='ascii')
        data = loadarff(in_mem_fo)

    df = pd.DataFrame(data[0])

    # Nominal ARFF attributes arrive as bytes in object columns; decode them.
    to_convert_df = df.select_dtypes([object])
    to_convert_col_names = to_convert_df.columns
    df[to_convert_col_names] = to_convert_df.stack().str.decode('utf-8').unstack()

    # NOTE(review): the 'noad'/'ad' mapping looks like it was carried over from
    # another dataset loader - confirm it is needed for Covertype.
    class_col = df.pop('class').replace(['noad', 'ad'], [0, 1])
    df['target'] = class_col.copy()

    # Replace the columns instead of assigning through `.loc[:, ...] = ...`:
    # an in-place `.loc` assignment keeps the existing dtype where it can
    # (and may reject incompatible values), so the int cast was not reliable.
    int_cols = df.loc[:, "Wilderness_Area1":].columns
    df[int_cols] = df[int_cols].astype(str).astype(int)

    df['dataset'] = 'covtypeNorm'
    return df


In [None]:
df = get_covtype_dataset()

In [None]:
df.dtypes

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals



out_dir = Path(r'D:\Studia\praca\out\20230505_1848')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    print(str(d))
    visualize_results(res, drift_hist, dataset_name='COVTYPE', metrics=['acc'])

    

In [None]:
# Feature drift does not occur
# Performance drift does not occur either (?)

***
# Elec

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals



out_dir = Path(r'D:\Studia\praca\out\20230506_1423')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    print(str(d))
    visualize_results(res, drift_hist, dataset_name='elec', metrics=['acc'])

# Rbf 0.66

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals
from utils.parse_metadata.parse_log_metadata import print_info



out_dir = Path(r'D:\Studia\praca\out\20230509_1222_rbf66')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    print(str(d))
    print_info(str(d))
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    if res is None: continue
    visualize_results(res, drift_hist, dataset_name='rbf0.66', metrics=['f1'])

# ELEC

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals
from utils.parse_metadata.parse_log_metadata import print_info




out_dir = Path(r'D:\Studia\praca\out\20230505_2021_elec')
dir_list = [f for f in out_dir.resolve().glob('*') if not f.is_file()]
for d in dir_list:
    print(str(d))
    print_info(d)
    replacement_hist = get_replacement_hist(str(d))
    drift_hist = load_drift_history(str(d))
    res = get_metrics_vals(str(d))
    if res is None: continue
    visualize_results(res, drift_hist, dataset_name='elec', replacement_hist=replacement_hist, metrics=['f1'])

In [None]:
from experiments.first.rf_all_evidently import get_rf_all_evidently_exp


In [None]:
# Import here so the cell runs on a fresh kernel; `get_dataset` was otherwise
# only imported by a later cell.
from utils.read_data.get_dataset import get_dataset

df = get_dataset('elec')
exp = get_rf_all_evidently_exp(suffix='0000300000')

exp.run(df)

In [None]:
from experiments.first.online.online import get_arf_exp
from utils.read_data.get_dataset import get_dataset

df = get_dataset('rbf66')
exp = get_arf_exp(suffix='example')

exp.run(df)

In [None]:
from river import sketch

cms = sketch.Counter(epsilon=0.005, seed=0)

In [None]:
for x in [0, 1, 1, 0, 0, 1, 0, 1, 2, 0, 2, 1]:
    cms.update(x)

In [None]:
from collections import Counter
counter = Counter()

In [None]:
for x in [0, 1, 1, 0, 0, 1, 0, 1, 2, 0, 2, 1]:
    counter[x] += 1

In [None]:
counter

In [None]:
counter.most_common(n=1)[0][0]

In [None]:
counter.total()

In [None]:
Counter().total()

In [None]:
df = get_dataset('covtype')

In [None]:
df.copy(deep=True)

In [None]:
from batchstream.utils.visualization import visualize_results
from pathlib import Path
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals

In [None]:
out_dir = Path(r'C:\Users\golik\Desktop\mgr\praca\first_draft\out\f3c094ec_arf_example_20230510_191733')
# Map the experiment's "out" directory to its sibling "log" directory.
# Backslashes are escaped explicitly: "\o" and "\l" are invalid escape
# sequences that newer Python versions warn about.
log_dir = Path(str(out_dir).replace("\\out\\", "\\log\\"))

In [None]:
import os

dir_name = log_dir.name
path = os.path.join(log_dir, f"{log_dir.name}_BatchPipeline.log")

In [None]:
import re
import os

def get_replacement_hist(out_dir):
    """Collect the iteration numbers at which the batch model was replaced.

    The log file is looked up in the directory obtained by swapping the
    ``out`` path component of ``out_dir`` for ``log``; its name is
    ``<dir name>_BatchPipeline.log``. Every line containing ``replacing`` is
    searched for ``Iter=<number>``.

    Args:
        out_dir: experiment output directory (str or Path).

    Returns:
        List of ints in file order; empty if the log file does not exist.
    """
    # Local import: the defining cell only imports `re` and `os`.
    from pathlib import Path

    # Handle both Windows and POSIX separators; the backslashes are escaped
    # explicitly because "\o" and "\l" are invalid escape sequences.
    log_dir = Path(
        str(out_dir).replace("\\out\\", "\\log\\").replace("/out/", "/log/")
    )
    log_path = os.path.join(log_dir, f"{log_dir.name}_BatchPipeline.log")
    if not os.path.exists(log_path):
        return []

    iter_pattern = re.compile(r'Iter=(\d+)')
    replacement_hist = []
    # Read-only access is enough; "r+" would fail on write-protected logs.
    with open(log_path, mode="r") as f:
        for line in f:
            if 'replacing' not in line:
                continue
            match = iter_pattern.search(line)
            if match:
                replacement_hist.append(int(match.group(1)))

    return replacement_hist

In [None]:
d = out_dir
print(str(d))
drift_hist = load_drift_history(str(d))
res = get_metrics_vals(str(d))
visualize_results(res, drift_hist, 'rbf0.66', replacement_hist, metrics=['f1'], out_dir=out_dir)

In [None]:
import json

In [None]:
file_list = [f for f in out_dir.resolve().glob('*') if f.is_file() and 'metadata' in f.name]
with open(file_list[0], 'r') as f:
    d = json.load(f)

In [None]:
drift_detectors_i = stream_pipeline['input_drift_detector']

In [None]:
def get_drift_detectors_metadata(drift_detectors):
    """Extract a compact summary dict for each drift detector description.

    Args:
        drift_detectors: iterable of detector metadata dicts, or ``None``.

    Returns:
        One dict per detector. For an ``EvidentlyMonitoringStep`` monitor the
        dict holds ``min_instances``, ``name``, ``clock`` and ``type``;
        otherwise it is empty. ``None`` input yields an empty list.
    """
    if drift_detectors is None:
        return []
    drift_handlers_metadata = []
    for d_i in drift_detectors:
        d_i_metadata = {}
        # NOTE(review): only the first test step is inspected and it must be
        # keyed 'target_drift_eval'; other step names raise KeyError here.
        d_i_monitor = d_i['monitor']['test_steps'][0]['target_drift_eval']
        if d_i_monitor['type'] == 'EvidentlyMonitoringStep':
            # The duplicated 'min_instances' key of the original literal is removed.
            d_i_metadata.update({
                'min_instances': d_i_monitor['min_instances'],
                'name': d_i_monitor['name'],
                'clock': d_i_monitor['clock'],
                'type': 'Evidently',
            })
        drift_handlers_metadata.append(d_i_metadata)
    return drift_handlers_metadata
        

In [None]:
def get_batch_model_metadata(batch_model):
    """Return the short algorithm name of a batch model description.

    Args:
        batch_model: metadata dict with a ``type`` key; for
            ``'SklearnEstimator'`` it also carries
            ``['sklearn_estimator']['steps']``, a sequence of indexable steps
            whose first element is the step name.

    Returns:
        ``'rf'`` when an sklearn pipeline step with that name is present;
        ``None`` for any other model type or when no known step is found
        (previously these cases raised UnboundLocalError / IndexError).
    """
    if batch_model['type'] != 'SklearnEstimator':
        return None
    steps = batch_model['sklearn_estimator']['steps']
    return next((step[0] for step in steps if step[0] in ['rf']), None)


In [None]:
batch_alg = [x[0] for x in batch_model['sklearn_estimator']['steps'] if x[0] in ['rf']][0]

In [None]:
# `d` is the experiment metadata dict loaded from the *metadata* JSON file.
stream_pipeline = d['stream_pipeline']
if stream_pipeline['type'] == 'BatchPipeline':
    first_fit = stream_pipeline['min_samples_first_fit']
    n_retrain = stream_pipeline['min_samples_retrain']
    batch_model = stream_pipeline['batch_model']
    batch_alg = get_batch_model_metadata(batch_model)
    # The two names were swapped: input handlers come from the input detector
    # entry and output handlers from the output detector entry.
    i_handlers = get_drift_detectors_metadata(stream_pipeline['input_drift_detector'])
    o_handlers = get_drift_detectors_metadata(stream_pipeline['output_drift_detector'])

    

In [None]:
o_handlers

In [None]:
info = f'BatchPipeline ({batch_alg}) - n_min_retrain: {n_retrain}, n_first_fit: {first_fit}\n'

In [None]:
def get_handlers_info_text(handlers, h_type):
    """Render a multi-line, human-readable description of drift handlers.

    Args:
        handlers: sequence of dicts with ``type``, ``name``, ``clock`` and
            ``min_instances`` keys.
        h_type: label placed in the header line (e.g. input / output).

    Returns:
        A header line followed by one line per handler, each newline
        terminated; an empty string when ``handlers`` is empty.
    """
    if len(handlers) < 1:
        return ''
    header = f'{h_type} handlers:\n'
    rows = [
        f"{h['type']} {h['name']} (clock: {h['clock']}, n_min: {h['min_instances']})\n"
        for h in handlers
    ]
    return header + ''.join(rows)

In [None]:
path = r"D:\Studia\praca\log\20230509_1222_rbf66\3f2a4d50_test_rf_data_evidently_ba6a_rbf66_20230509_004613\3f2a4d50_test_rf_data_evidently_ba6a_rbf66_20230509_004613_BatchPipeline.log"

In [None]:




def get_combine_exp(suffix, n_curr=100, n_ref=100, n_online=10, window_size=100, n_first_fit=100,
    data_stattest_threshold=0.05, target_stattest_threshold=0.05):
    """Build a StreamExperiment around a majority-vote combination of pipelines.

    Inside the function a random-forest BatchPipeline with Evidently
    data/target/performance drift monitoring (``batch_pipeline2``) and an
    ARFClassifier RiverPipeline (``arf_pipe``) are created. They are combined
    with ``batch_pipeline`` through ``MajorityVoteCombiner`` and
    ``CombinationPipeline``, and the result is wrapped in a StreamExperiment.

    NOTE(review): ``batch_pipeline``, ``logger_factory`` and ``eval_pipe`` are
    not defined in this function; they are read from the enclosing (notebook)
    scope. ``suffix`` and ``window_size`` are never used. ``history``,
    ``model_comparer`` and ``sklearn_batch_classifier`` are created but never
    passed to anything - presumably a first batch pipeline was meant to be
    built from them; confirm.

    Args:
        suffix: unused.
        n_curr: current-window size; also used for ``clock``, for
            ``min_instances`` (as ``2*n_curr``), for ``n_last_retrain`` and
            for ``min_samples_retrain``.
        n_ref: reference-window size for the monitoring strategies.
        n_online: passed to ``ShadowOnlineComparer``.
        window_size: unused.
        n_first_fit: passed as ``min_samples_first_fit``.
        data_stattest_threshold: passed to ``DataDriftTestPreset``.
        target_stattest_threshold: passed to ``TestColumnDrift`` on ``target``.

    Returns:
        The constructed StreamExperiment.
    """


    # NOTE(review): unused below.
    history = HistoryManager()




    ### Models comparison (after retraining)
    #model_comparer = BatchModelComparer()
    # NOTE(review): unused below.
    model_comparer = ShadowOnlineComparer(n_online=n_online)

    ### Model composition
    # NOTE(review): unused below.
    sklearn_batch_classifier = SklearnEstimator(Pipeline([('rf', RandomForestClassifier())]))

    ###


    history2 = HistoryManager()
    ### INPUT DRIFT DETECTION
    # Detector 1.1 - Data Drift
    data_drift_test_suite2 = {'tests': [
        DataDriftTestPreset(stattest_threshold=data_stattest_threshold),
    ]}
    d12 = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref)
    ev12 = EvidentlyMonitoringStep(data_drift_test_suite2, d12, logger_factory, min_instances=2*n_curr, clock=n_curr, name='data_drift_eval')

    # Detector 1.2 - Target Drift
    target_drift2 = {'tests': [
        TestColumnDrift(column_name='target', stattest_threshold=target_stattest_threshold),
    ]}
    d22 = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref, type='target')
    ev22 = EvidentlyMonitoringStep(target_drift2, d22, logger_factory, min_instances=2*n_curr, clock=n_curr, name='target_drift_eval')

    # Both input monitors share one pipeline and one retraining strategy.
    input_monitoring2 = DriftMonitoringPipeline([(ev12.name, ev12), (ev22.name, ev22)])
    input_drift_retraining_strategy2 = SimpleRetrainingStrategy(n_last_retrain=n_curr, n_last_test=0)
    input_detector2 = DriftHandler(input_monitoring2, input_drift_retraining_strategy2)
    ###


    ### OUTPUT (PERFORMANCE) DRIFT DETECTION
    # Detector 2.1 - Performance Drift

    performance_drift2 = {'tests': [
        TestPrecisionScore(),
        TestRecallScore(),
        TestF1Score(),
        TestAccuracyScore()
    ]}
    d32 = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref, type='prediction')
    ev32 = EvidentlyMonitoringStep(performance_drift2, d32, logger_factory, min_instances=2*n_curr, clock=n_curr, name='performance_drift_eval')

    output_monitoring2 = DriftMonitoringPipeline([(ev32.name, ev32)])
    output_drift_retraining_strategy2 = SimpleRetrainingStrategy(n_last_retrain=n_curr, n_last_test=0)
    output_detector2 = DriftHandler(output_monitoring2, output_drift_retraining_strategy2)
    ###

    ### Models comparison (after retraining)
    #model_comparer = BatchModelComparer()
    model_comparer2 = ShadowOnlineComparer(n_online=n_online)

    ### Model composition
    sklearn_batch_classifier2 = SklearnEstimator(Pipeline([('rf', RandomForestClassifier())]))
    # NOTE(review): single DriftHandler objects are passed here, while other
    # cells of this notebook pass lists of handlers - confirm which one
    # BatchPipeline expects.
    batch_pipeline2 = BatchPipeline(
        sklearn_batch_classifier2,
        input_drift_handlers=input_detector2,
        output_drift_handlers=output_detector2,
        history=history2,
        logger_factory=logger_factory,
        model_comparer=model_comparer2,
        min_samples_retrain=n_curr,
        min_samples_first_fit=n_first_fit
    )
    ###




    ###

    # Online member of the ensemble.
    arf_model = ARFClassifier(seed=42, leaf_prediction="mc")
    arf_pipe = RiverPipeline(arf_model)

    ###
    # NOTE(review): `batch_pipeline` is not created in this function (see docstring).
    members = [batch_pipeline, batch_pipeline2, arf_pipe]
    combiner = MajorityVoteCombiner()
    comb_pipeline = CombinationPipeline(members=members, combiner=combiner)

    ### Experiment args
    experiment = StreamExperiment(comb_pipeline, eval_pipe, logger_factory)

    return experiment


In [None]:
exp_name = f'_{datetime.today().strftime("%Y%m%d_%H%M%S")}'
    
logger_factory = LoggerFactory(exp_name)

In [None]:
exp_name

In [None]:
m2 = Accuracy()

In [None]:
# Display the metric object; the dangling `m2.` (an unfinished attribute
# access) was a SyntaxError.
m2

In [None]:
import uuid

str(uuid.uuid4())[:4]

In [None]:
f"0c6b3787_rf_all_evidently_{str(uuid.uuid4())[:4]}"

In [None]:
from river.datasets import synth


dataset = synth.FriedmanDrift(
    drift_type='gra',
    position=(25_000, 50_000),
    seed=42)

In [None]:
dataset

In [None]:
from utils.sync_data.create_dataset import generate_friedman_drift

In [None]:
generate_friedman_drift()

In [None]:
from utils.read_data.get_dataset import get_dataset

In [None]:
df = get_dataset('insects_abrupt')

In [None]:
def get_evidently_input_handlers(n_curr, n_ref, data_stattest_threshold, target_stattest_threshold, logger_factory, data_drift=True, target_drift=True):
    """Build the input-drift DriftHandler from the requested Evidently monitors.

    Args:
        n_curr: current-window size; also used as the step ``clock``, as
            ``min_instances`` (``2*n_curr``) and as ``n_last_retrain``.
        n_ref: reference-window size for the monitoring strategies.
        data_stattest_threshold: threshold passed to ``DataDriftTestPreset``.
        target_stattest_threshold: threshold passed to ``TestColumnDrift``
            on the ``target`` column.
        logger_factory: passed to every ``EvidentlyMonitoringStep``.
        data_drift: include the data-drift step.
        target_drift: include the target-drift step.

    Returns:
        A ``DriftHandler`` over the selected steps, or ``None`` when both
        flags are false.
    """
    if not data_drift and not target_drift:
        return None

    monitoring_steps = []

    ### INPUT DRIFT DETECTION
    # Detector 1.1 - Data Drift
    if data_drift:
        data_drift_test_suite = {'tests': [
            DataDriftTestPreset(stattest_threshold=data_stattest_threshold),
        ]}
        d1 = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref)
        ev1 = EvidentlyMonitoringStep(data_drift_test_suite, d1, logger_factory, min_instances=2*n_curr, clock=n_curr, name='data_drift_eval')
        monitoring_steps.append((ev1.name, ev1))

    # Detector 1.2 - Target Drift
    # The test-suite dict previously reused the name `target_drift`, shadowing
    # the boolean flag with an always-truthy dict, so this step was added even
    # when target_drift=False. The step is now only built when requested.
    if target_drift:
        target_drift_test_suite = {'tests': [
            TestColumnDrift(column_name='target', stattest_threshold=target_stattest_threshold),
        ]}
        d2 = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref, type='target')
        ev2 = EvidentlyMonitoringStep(target_drift_test_suite, d2, logger_factory, min_instances=2*n_curr, clock=n_curr, name='target_drift_eval')
        monitoring_steps.append((ev2.name, ev2))

    input_monitoring = DriftMonitoringPipeline(monitoring_steps)

    input_drift_retraining_strategy = SimpleRetrainingStrategy(n_last_retrain=n_curr, n_last_test=0)
    input_detector = DriftHandler(input_monitoring, input_drift_retraining_strategy)

    return input_detector
    


In [None]:
def get_evidently_output_handlers(n_curr, n_ref, logger_factory):
    """Build the output (performance) drift DriftHandler.

    A single ``EvidentlyMonitoringStep`` named ``performance_drift_eval`` runs
    the precision, recall, F1 and accuracy tests on the ``prediction``
    monitoring strategy; it is wrapped in a ``DriftMonitoringPipeline`` and
    paired with a ``SimpleRetrainingStrategy``.

    Args:
        n_curr: current-window size; also used as ``clock``, as
            ``min_instances`` (``2*n_curr``) and as ``n_last_retrain``.
        n_ref: reference-window size.
        logger_factory: passed to the monitoring step.

    Returns:
        The constructed ``DriftHandler``.
    """
    performance_tests = {'tests': [
        TestPrecisionScore(),
        TestRecallScore(),
        TestF1Score(),
        TestAccuracyScore(),
    ]}
    strategy = SimpleMonitoringStrategy(n_curr=n_curr, n_ref=n_ref, type='prediction')
    step = EvidentlyMonitoringStep(
        performance_tests,
        strategy,
        logger_factory,
        min_instances=2 * n_curr,
        clock=n_curr,
        name='performance_drift_eval',
    )
    monitoring = DriftMonitoringPipeline([(step.name, step)])
    retraining = SimpleRetrainingStrategy(n_last_retrain=n_curr, n_last_test=0)
    return DriftHandler(monitoring, retraining)

In [None]:
def get_eval_pipeline(window_size):
    """Create the evaluation pipeline with rolling and cumulative metrics.

    Args:
        window_size: window length for the rolling (``*_preq_*``) metrics.

    Returns:
        A ``RiverEvaluationPipeline`` with accuracy, macro-F1 and Cohen's kappa,
        each once wrapped in ``Rolling`` and once as a plain metric.
    """
    # Local import: CohenKappa is not imported by any other cell of the notebook.
    from river.metrics import CohenKappa

    eval_pipe = RiverEvaluationPipeline(metric_steps=[
        (f'acc_preq_{window_size}', Rolling(Accuracy(), window_size)),
        (f'macro_f1_preq_{window_size}', Rolling(MacroF1(), window_size)),
        (f'kappa_preq_{window_size}', Rolling(CohenKappa(), window_size)),
        ('acc', Accuracy()),
        ('f1_macro', MacroF1()),
        ('kappa', CohenKappa())
    ])
    return eval_pipe


In [None]:
def get_batch_pipeline(n_curr, n_first_fit, sklearn_pipe, input_handlers, output_handlers, model_comparer, logger_factory):
    """Assemble a BatchPipeline with its own, freshly created HistoryManager.

    Args:
        n_curr: passed as ``min_samples_retrain``.
        n_first_fit: passed as ``min_samples_first_fit``.
        sklearn_pipe: estimator handed to ``BatchPipeline`` as first argument.
        input_handlers: passed as ``input_drift_handlers``.
        output_handlers: passed as ``output_drift_handlers``.
        model_comparer: passed as ``model_comparer``.
        logger_factory: passed as ``logger_factory``.

    Returns:
        The constructed ``BatchPipeline``.
    """
    pipeline_kwargs = dict(
        input_drift_handlers=input_handlers,
        output_drift_handlers=output_handlers,
        history=HistoryManager(),
        logger_factory=logger_factory,
        model_comparer=model_comparer,
        min_samples_retrain=n_curr,
        min_samples_first_fit=n_first_fit,
    )
    return BatchPipeline(sklearn_pipe, **pipeline_kwargs)


In [None]:
def get_experiment(suffix, n_curr=300, n_online=50, n_first_fit=300, window_size=100):
    """Create a random-forest batch experiment with all Evidently drift monitors.

    The experiment name is ``<8 hex chars>_rf_all_evidently_<suffix>_<timestamp>``
    and is used to create the ``LoggerFactory`` shared by all components.

    Args:
        suffix: free-text tag embedded in the experiment name.
        n_curr: window size used for both the current and reference windows
            and for the retraining sample count.
        n_online: passed to ``ShadowOnlineComparer``.
        n_first_fit: minimum number of samples before the first fit.
        window_size: window length of the rolling evaluation metrics.

    Returns:
        The constructed ``StreamExperiment``.
    """
    # Local imports: `datetime` is not imported anywhere else in the notebook
    # and `uuid` only in an unrelated scratch cell.
    import uuid
    from datetime import datetime

    prefix = str(uuid.uuid4())[:8]
    name = f'{prefix}_rf_all_evidently_{suffix}'
    exp_name = f'{name}_{datetime.today().strftime("%Y%m%d_%H%M%S")}'

    logger_factory = LoggerFactory(exp_name)
    input_handlers = get_evidently_input_handlers(n_curr=n_curr, n_ref=n_curr, data_stattest_threshold=0.05, target_stattest_threshold=0.05, logger_factory=logger_factory)
    output_handlers = get_evidently_output_handlers(n_curr, n_ref=n_curr, logger_factory=logger_factory)
    sklearn_pipeline = SklearnEstimator(Pipeline([('rf', RandomForestClassifier())]))
    model_comparer = ShadowOnlineComparer(n_online=n_online)
    batch_pipeline = get_batch_pipeline(n_curr, n_first_fit, sklearn_pipeline, input_handlers, output_handlers, model_comparer, logger_factory)
    eval_pipe = get_eval_pipeline(window_size)
    experiment = StreamExperiment(batch_pipeline, eval_pipe, logger_factory)
    return experiment

In [None]:
from utils.read_data.get_dataset import get_dataset

exp = get_experiment('stagger2034')
df = get_dataset('stagger_1k')
exp.run(df)

In [None]:
from river.datasets import synth




In [None]:
from utils.read_data.get_dataset import get_dataset
df = get_dataset('stagger_1K')

In [None]:
df

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals
from utils.parse_metadata.parse_log_metadata import print_info
import os 


d = Path(r'C:\Users\golik\Desktop\mgr\praca\first_draft\out\3abb3f56_rf_all_evidently_stagger1844_20230521_185417')

print_info(d)
replacement_hist = list(pd.read_csv(os.path.join(str(d), 'model_replacement_history.csv'), header=None)[0].values)
drift_hist = load_drift_history(str(d))
res = get_metrics_vals(str(d))
if res is not None: 
    visualize_results(res, drift_hist, dataset_name='elec', replacement_hist=replacement_hist, metrics=['f1'])

In [None]:
from pathlib import Path
from batchstream.utils.visualization import visualize_results
from batchstream.utils.reading_logs import load_drift_history, get_metrics_vals
from utils.parse_metadata.parse_log_metadata import print_info
import os 


d = Path(r'C:\Users\golik\Desktop\mgr\praca\first_draft\out\c644433b_rf_all_evidently_stagger4_20230521_164748')

print_info(d)
replacement_hist = [] #list(pd.read_csv(os.path.join(str(d), 'model_replacement_history.csv'), header=None)[0].values)
drift_hist = load_drift_history(str(d))
res = get_metrics_vals(str(d))

visualize_results(res, drift_hist, dataset_name='elec', replacement_hist=replacement_hist, metrics=['f1'])

In [None]:
c644433b_rf_all_evidently_stagger4_20230521_164748_data_drift_eval_ae5f

In [None]:
df

In [None]:
df.loc[:, ['size', 'color', 'shape']]

In [None]:
df

In [None]:
def generate_stagger_dataset(seed=42, balance_classes=True, drift_step: int=25_000, data_dir='./data'):
    """Generate a STAGGER stream with three consecutive concepts as a DataFrame.

    ``drift_step`` samples are drawn with classification function 0, then 1,
    then 2. The features are one-hot encoded into nine columns and the labels
    are stored in ``target``.

    Args:
        seed: seed passed to ``synth.STAGGER``.
        balance_classes: passed to ``synth.STAGGER``.
        drift_step: number of samples taken per classification function.
        data_dir: only referenced by the disabled CSV export below.

    Returns:
        DataFrame with columns ``size_0..2``, ``color_0..2``, ``shape_0..2``
        and ``target``.
    """
    feature_rows = []
    labels = []
    stream = synth.STAGGER(classification_function=0, seed=seed, balance_classes=balance_classes)

    # One segment per concept; the generator's function is switched in place.
    for concept in (0, 1, 2):
        stream.classification_function = concept
        for x, y in stream.take(drift_step):
            feature_rows.append(x)
            labels.append(y)

    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(pd.DataFrame(feature_rows)).toarray()

    df = pd.DataFrame(encoded)
    df.columns = [
        f'{feature}_{level}'
        for feature in ('size', 'color', 'shape')
        for level in range(3)
    ]
    df['target'] = labels

    #df.to_csv(f'{data_dir}/STAGGER/stagger_{drift_step // 1000}K.csv')
    return df

In [None]:
import csv
from os import path
import json
import os
from river.datasets import synth
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
df = generate_stagger_dataset()

In [None]:
df.to_csv()

In [None]:
from utils.sync_data.create_dataset import generate_stagger_dataset

In [None]:
generate_stagger_dataset()

In [None]:
generate_stagger_dataset(drift_step=1_000)

In [None]:
from utils.sync_data.create_dataset import generate_LEDdrift

In [None]:
generate_LEDdrift()

In [1]:
from experiments.second import get_adwin_experiment, get_evidently_experiment

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


rf = Pipeline([('rf', RandomForestClassifier())])

In [3]:
exp1 = get_evidently_experiment('xD', rf)

In [4]:
exp2 = get_adwin_experiment('xxD', rf)

TypeError: 'NoneType' object is not iterable