In [None]:
# Function executed as a pandas_udf




In [None]:
from abc import ABC, abstractmethod
from typing import List



class RetrainingStrategy(ABC):
    """Interface for strategies that pick the data a model is retrained on."""

    @abstractmethod
    def get_retraining_data(self, x_history: List, y_history: List[int], prediction_history: List[int], drift_history: List[int]):
        """Select retraining data from the recorded histories.

        The method is an instance method: without ``self`` a bound call such as
        ``strategy.get_retraining_data(x, y, preds, drifts)`` would pass five
        arguments to a four-parameter function and raise ``TypeError``.

        Args:
            x_history: Recorded feature observations.
            y_history: Recorded labels.
            prediction_history: Recorded model predictions.
            drift_history: Recorded drift indicators.

        Returns:
            Implementation-defined. ``ModelSklearnPipeline.adjust_model`` unpacks
            the result as ``(x_train, y_train)``.
        """
        pass

In [None]:
from abc import ABC, abstractmethod
from typing import List


class EvaluationStrategyManager(ABC):
    """Contract for objects that derive the data handed to a monitoring step."""

    @abstractmethod
    def get_curr_ref_data(
        self,
        x_history: List,
        y_history: List[int],
        prediction_history: List[int],
        drift_history: List[int],
    ):
        """Build the data a monitoring step compares.

        Args:
            x_history: Recorded feature observations.
            y_history: Recorded labels.
            prediction_history: Recorded model predictions.
            drift_history: Recorded drift indicators.

        Returns:
            Implementation-defined; ``EvidentlyMonitoringStep.monitor`` unpacks
            the result as ``(curr, ref)``. The abstract body returns ``None``.
        """
        return None

In [None]:
from abc import ABC, abstractmethod
from typing import List


class ModelEstimatorPipeline(ABC):
    """Common interface of the estimator pipelines defined in this notebook."""

    @abstractmethod
    def handle(self, x, y):
        """Process one input ``x`` with its label ``y``; abstract body returns ``None``."""
        return None

    @abstractmethod
    def adjust_model(
        self,
        x_history: List,
        y_history: List[int],
        prediction_history: List[int],
        drift_history: List[int],
    ):
        """Update the underlying model from the recorded histories.

        Args:
            x_history: Recorded feature observations.
            y_history: Recorded labels.
            prediction_history: Recorded model predictions.
            drift_history: Recorded drift indicators.
        """
        return None

    @abstractmethod
    def get_name(self):
        """Return the pipeline's name; the abstract body returns ``None``."""
        return None

In [None]:
from sklearn.pipeline import Pipeline
from typing import Dict, List



class ModelSklearnPipeline(ModelEstimatorPipeline):
    """Estimator pipeline backed by a scikit-learn ``Pipeline``."""

    # `RetrainingStrategy` is the strategy interface defined in this notebook;
    # the previous annotation named a class that does not exist and raised
    # NameError as soon as the class body was executed.
    def __init__(self, sklearn_pipeline: Pipeline, hyperparameter_space: Dict, retraining_strategy: RetrainingStrategy):
        """Store the estimator, its hyperparameter space and the retraining strategy.

        Args:
            sklearn_pipeline: Pipeline used for prediction and (re)fitting.
            hyperparameter_space: Stored as-is; not used by the methods below.
            retraining_strategy: Supplies the training data in ``adjust_model``.
        """
        self.estimator = sklearn_pipeline
        self.hyperparameter_space = hyperparameter_space
        self.retraining_strategy = retraining_strategy

    def handle(self, x, y):
        """Return the estimator's prediction for ``x``; ``y`` is not used."""
        return self.estimator.predict(x)

    def adjust_model(self, x_history: List, y_history: List[int], prediction_history: List[int], drift_history: List[int]):
        """Refit the estimator on the data chosen by the retraining strategy."""
        x_train, y_train = self.retraining_strategy.get_retraining_data(x_history, y_history, prediction_history, drift_history)
        self.estimator.fit(x_train, y_train)

    def get_name(self):
        """Delegate to the base class, whose abstract body returns ``None``."""
        return super().get_name()  # TODO: return a real pipeline name

In [None]:
from river import compose
from typing import List


class ModelRiverPipeline(ModelEstimatorPipeline):
    """Estimator pipeline backed by a river ``compose.Pipeline``."""

    def __init__(self, river_pipeline: compose.Pipeline):
        """Keep a reference to the river pipeline."""
        self.estimator = river_pipeline

    def handle(self, x, y):
        """Predict for ``x`` first, then learn from ``(x, y)``; return the prediction."""
        model = self.estimator
        y_hat = model.predict_one(x)
        # Learning happens after predicting, so the returned value never saw `y`.
        model.learn_one(x, y)
        return y_hat

    def adjust_model(
        self,
        x_history: List,
        y_history: List[int],
        prediction_history: List[int],
        drift_history: List[int],
    ):
        """Do nothing; the model is already updated inside ``handle``."""
        return None

    def get_name(self):
        """Delegate to the base class, whose abstract body returns ``None``."""
        return super().get_name()  # TO DO

In [None]:
class ModelEvaluationPipeline:
    """Runs a sequence of named metrics against a true/predicted label pair."""

    def __init__(self, metric_steps):
        """Store the metrics.

        Args:
            metric_steps: Iterable of ``(metric_name, metric)`` pairs; each
                metric must expose ``update(y_true, y_predict)``.
        """
        self.metric_steps = metric_steps

    def handle(self, y_true, y_predict):
        """Update every metric and collect what each ``update`` call returns.

        Args:
            y_true: Ground-truth label.
            y_predict: Predicted label.

        Returns:
            Dict mapping each metric name to the return value of its
            ``update(y_true, y_predict)`` call.
        """
        # Iterate the attribute set in __init__; `self.steps` never existed and
        # raised AttributeError on the first call.
        return {
            metric_name: metric.update(y_true, y_predict)
            for metric_name, metric in self.metric_steps
        }

In [None]:
from abc import ABC, abstractmethod
from typing import List



class MonitoringStep(ABC):
    """Interface of a single monitoring step that reports a boolean verdict."""

    @abstractmethod
    def monitor(
        self,
        x_history: List,
        y_history: List[int],
        prediction_history: List[int],
        drift_history: List[int],
    ) -> bool:
        """Inspect the recorded histories and return a boolean result.

        Args:
            x_history: Recorded feature observations.
            y_history: Recorded labels.
            prediction_history: Recorded model predictions.
            drift_history: Recorded drift indicators.

        Returns:
            Annotated as ``bool``; the abstract body itself returns ``None``.
        """
        return None

In [None]:
from evidently.test_suite import TestSuite



class EvidentlyMonitoringStep(MonitoringStep):
    """Monitoring step that runs an evidently ``TestSuite`` on current vs. reference data."""

    def __init__(self, evidently_test_suite: TestSuite, evaluation_strategy: EvaluationStrategyManager):
        """Store the test suite and the strategy that supplies its input data."""
        self.detector = evidently_test_suite
        self.eval_strategy = evaluation_strategy

    def monitor(
        self,
        x_history: List,
        y_history: List[int],
        prediction_history: List[int],
        drift_history: List[int],
    ) -> bool:
        """Run the test suite on the data chosen by the evaluation strategy.

        Returns:
            Always ``True`` for now; the report is fetched but not yet evaluated.
        """
        current_data, reference_data = self.eval_strategy.get_curr_ref_data(
            x_history, y_history, prediction_history, drift_history
        )
        self.detector.run(reference_data=reference_data, current_data=current_data)
        # The dict is retrieved but unused until the verdict logic is written.
        suite_report = self.detector.as_dict()
        return True  # to do based on report
        

In [None]:

# Experiment -> run on a given partition: defines the experiment pipeline
# NOTE(review): `datastream`, `Logger` and `StreamClassificationPipeline` are
# not defined in the visible notebook; this cell is a sketch of the intended loop.

datastream_name = datastream['name'][0]
# Assumes `datastream` is a pandas DataFrame (confirm): DataFrame.drop takes
# `columns=`, and the previous `col=` keyword is rejected.
datastream = datastream.drop(columns=['name'])
# The dataset name was extracted into `datastream_name`; `name` was never defined.
logger = Logger(dataset_name=datastream_name)

logger.start()
# Build the pipeline once so whatever it learns persists across the stream,
# instead of starting from a fresh pipeline on every observation.
pipe = StreamClassificationPipeline()
# NOTE(review): assumes iterating `datastream` yields (x, y) pairs - confirm.
for x, y in datastream:
    logger.iter()
    pipe.handle(x, y)
    logger.iter_end()

logger.end()

        


In [None]:
# Scratch check: take the first value of the first dict in the list (evaluates to 4).
list([{'a': 4}][0].values())[0]

In [None]:
from river.drift import ADWIN

# Instantiate river's ADWIN drift detector with its default arguments.
a = ADWIN()

In [None]:
# Show the concrete class of the detector instance.
type(a)

In [None]:
# Inspect the detector's `clock` attribute.
a.clock

In [None]:
# Sanity check of the private-name test used below (evaluates to True).
'_helper'.startswith('_')

In [None]:
# List every instance attribute of the detector as (name, value) pairs.
vars(a).items()

In [None]:
# Set of (name, value) pairs for the detector's attributes that do not start with '_'.
{(attr, value) for attr, value in vars(a).items() if not attr.startswith('_')}

In [None]:
import pandas as pd
import numpy as np

from sklearn import datasets, ensemble, model_selection

from evidently import ColumnMapping


from evidently.test_preset import NoTargetPerformanceTestPreset
from evidently.test_preset import DataQualityTestPreset
from evidently.test_preset import DataStabilityTestPreset

from evidently.test_preset import RegressionTestPreset
from evidently.test_preset import MulticlassClassificationTestPreset
from evidently.test_preset import BinaryClassificationTopKTestPreset
from evidently.test_preset import BinaryClassificationTestPreset

from evidently.tests import TestNumberOfEmptyRows, TestNumberOfEmptyColumns, TestNumberOfDuplicatedRows, TestNumberOfDuplicatedColumns, TestNumberOfDriftedColumns, TestShareOfDriftedColumns

In [None]:
# Dataset for Data Quality and Integrity
adult_data = datasets.fetch_openml(name='adult', version=2, as_frame='auto')
adult = adult_data.frame

# Reference = rows whose education is NOT one of the three listed levels;
# current = rows whose education IS one of them.
adult_ref = adult[~adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]
# Copy the selection so the NaN injection below writes to an independent frame
# instead of a slice of `adult` (avoids pandas' SettingWithCopy ambiguity).
adult_cur = adult[adult.education.isin(['Some-college', 'HS-grad', 'Bachelors'])].copy()

# Blank out the columns at positions 3 and 4 for the first 2000 current rows.
adult_cur.iloc[:2000, 3:5] = np.nan

In [None]:
# Dataset for Regression
# `as_frame` is a boolean for fetch_california_housing; only fetch_openml
# accepts 'auto', and recent scikit-learn versions reject other values.
housing_data = datasets.fetch_california_housing(as_frame=True)
# Rename on a new frame rather than in place, so the loader's Bunch is not
# mutated and re-running the cell starts from the same input.
housing = housing_data.frame.rename(columns={'MedHouseVal': 'target'})

# Synthetic prediction: the true target plus Gaussian noise (mean 0, std 3).
housing['prediction'] = housing_data['target'].values + np.random.normal(0, 3, housing.shape[0])

# Two independent 5000-row samples without replacement (unseeded).
housing_ref = housing.sample(n=5000, replace=False)
housing_cur = housing.sample(n=5000, replace=False)

In [None]:
# Dataset for Binary Probabilistic Classification
# `as_frame` is a boolean for load_breast_cancer; 'auto' is only valid for
# fetch_openml and is rejected by recent scikit-learn versions.
bcancer_data = datasets.load_breast_cancer(as_frame=True)
bcancer = bcancer_data.frame
# Feature column names, computed once instead of on every selection below.
bcancer_features = bcancer_data.feature_names.tolist()

bcancer_ref = bcancer.sample(n=300, replace=False)
bcancer_cur = bcancer.sample(n=200, replace=False)

# Deep copies taken before any prediction column is added; they receive
# hard labels while the originals receive probabilities.
bcancer_label_ref = bcancer_ref.copy(deep=True)
bcancer_label_cur = bcancer_cur.copy(deep=True)

model = ensemble.RandomForestClassifier(random_state=1, n_estimators=10)
model.fit(bcancer_ref[bcancer_features], bcancer_ref.target)

# Probability of the class in column 1 of predict_proba.
bcancer_ref['prediction'] = model.predict_proba(bcancer_ref[bcancer_features])[:, 1]
bcancer_cur['prediction'] = model.predict_proba(bcancer_cur[bcancer_features])[:, 1]

# Hard class labels for the label-based variants.
bcancer_label_ref['prediction'] = model.predict(bcancer_label_ref[bcancer_features])
bcancer_label_cur['prediction'] = model.predict(bcancer_label_cur[bcancer_features])

In [None]:
# Dataset for Multiclass Classification
# `as_frame` is a boolean for load_iris; 'auto' is only valid for fetch_openml
# and is rejected by recent scikit-learn versions.
iris_data = datasets.load_iris(as_frame=True)
iris = iris_data.frame

# Two independent 75-row samples without replacement (unseeded).
iris_ref = iris.sample(n=75, replace=False)
iris_cur = iris.sample(n=75, replace=False)

# Rebinds `model`, which the breast-cancer cell also assigns.
model = ensemble.RandomForestClassifier(random_state=1, n_estimators=3)
model.fit(iris_ref[iris_data.feature_names], iris_ref.target)

iris_ref['prediction'] = model.predict(iris_ref[iris_data.feature_names])
iris_cur['prediction'] = model.predict(iris_cur[iris_data.feature_names])

In [None]:
# Assemble a test suite from six dataset-level checks (each with default
# settings) and run it on the adult reference/current split.
data_drift_dataset_tests = TestSuite(
    tests=[
        test_cls()
        for test_cls in (
            TestNumberOfEmptyRows,
            TestNumberOfEmptyColumns,
            TestNumberOfDuplicatedRows,
            TestNumberOfDuplicatedColumns,
            TestNumberOfDriftedColumns,
            TestShareOfDriftedColumns,
        )
    ]
)

data_drift_dataset_tests.run(current_data=adult_cur, reference_data=adult_ref)

In [None]:
# Export the executed test suite's results as a plain Python dict.
res = data_drift_dataset_tests.as_dict()

In [None]:
# Show the 'tests' entry of the exported results.
res['tests']

In [None]:
# Scratch values mimicking a detector outcome: no drift flag, index 3.
is_drift_detected, detection_idx = False, 3



In [None]:
# NOTE(review): `desc` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - leftover from a removed cell? Confirm.
desc

In [None]:
from collections import Counter


# Status tally with three successes and three failures.
x = Counter(SUCCESS=3, FAIL=3)

In [None]:
# Iterator that repeats each key as many times as its count.
x.elements()

In [None]:
def foo():
    """Return two positional values and an empty keyword-argument dict."""
    positional = (1, 2)
    keyword_args = {}
    return (*positional, keyword_args)

In [None]:
# Unpack foo()'s 3-tuple. This rebinds `a`, which an earlier cell bound to the
# ADWIN instance.
a, b, kwargs = foo()

In [None]:
class Foo:
    """Toy class used to try out positional plus ``**kwargs`` construction."""

    def __init__(self, a: int, b: int, weight: int=4, height: int=3):
        """Store the two required values and the two optional ones."""
        self.a, self.b = a, b
        self.weight, self.height = weight, height


In [None]:
# `kwargs` is the empty dict returned by foo(), so weight and height keep their defaults.
f= Foo(a, b, **kwargs)

In [None]:
# First positional argument as stored on the instance.
f.a

In [None]:
# Default value, since no `weight` keyword was supplied.
f.weight

In [None]:
# Foo.__init__ never sets `c`, so this raises AttributeError.
# NOTE(review): presumably an intentional check - confirm, or remove the cell.
f.c

In [None]:
from collections import Counter

In [None]:
# Hand-written mock of a test-suite result dict for a river-based detector step.
# The string values stand in for expressions to be evaluated in the real class.
d = {
    'tests': [{
        'name': 'self.step_name',
        'description': f'Drift detected at idx.: {3}',
        'status': 'FAIL',
        'group': 'river-detector',
        'parameters': {
            'detected_at_idx': 3,
            'detector_type': 'type(self.detector)',
            # Merge the extra entries by unpacking: dict.update() returns None,
            # so chaining `.update(...)` on the literal stored None here.
            **{'item for item in vars(self.detector).items()': "if not item[0].startswith('_')"},
        },
    }],
    'summary': {
        'all_passed': 0,
        'total_tests': 1,
        'success_tests': 0,
        'failed_tests': 1,
        'by_status': Counter({'SUCCESS': 0, 'FAIL': 1}),
    },
}

In [None]:
# Two candidate verdicts derived from the summary counts:
# at least one test failed vs. every test failed.
any_failed = d['summary']['failed_tests'] > 0
all_failed = d['summary']['failed_tests'] == d['summary']['total_tests']

In [None]:
# Display the mock result dict.
d

In [None]:
# Display the "at least one failure" verdict.
any_failed

In [None]:
# Display the "every test failed" verdict.
all_failed

In [None]:
import numpy as np
import pandas as pd

In [None]:
# 10 x 30 frame of random floats for trying out row slicing; unseeded, so the
# values differ on every run.
df = pd.DataFrame(np.random.random(size=(10, 30)))

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Window sizes for the slicing experiments below: 5 current rows, 3 reference rows.
n_curr, n_ref = 5, 3

In [None]:
# Current window: the last `n_curr` rows, all columns.
df.iloc[-n_curr:,:]

In [None]:
# Reference window: the `n_ref` rows immediately before the current window.
df.iloc[-(n_curr + n_ref):-n_curr,:]

In [None]:
# The integers 1 through 9, used to try out list slicing.
s = list(range(1, 10))

In [None]:
# Everything from index 3 onward: [4, 5, 6, 7, 8, 9].
s[3:]

In [None]:
# Relative path to a zipped ARFF file.
# NOTE(review): no later visible cell reads `data_path` - confirm it is still needed.
data_path = './data/ADS/internet_ads.arff.zip'

In [None]:
from utils.read_data.internet_ads import get_internet_ads_df

In [None]:
# Rebinds `df` (previously the random frame) to the frame returned by the loader.
# NOTE(review): the loader is called without `data_path`; presumably it has its
# own default location - confirm.
df = get_internet_ads_df()

In [None]:
from batchstream.pipelines.stream.model_river_pipeline import ModelRiverPipeline
from batchstream.evaluation.model_evaluation_pipeline import ModelEvaluationPipeline
from river.metrics import Accuracy, ROCAUC

In [None]:
from river import linear_model
from river import preprocessing
from batchstream.utils.logging.performance_logger import PerformanceEvalLogger

# Online model: a standard scaler composed with logistic regression via river's `|`.
scaler = preprocessing.StandardScaler()
log_reg = linear_model.LogisticRegression()
river_model = scaler | log_reg
# NOTE(review): the string argument looks like an experiment/run identifier -
# confirm against PerformanceEvalLogger.
perf_logger = PerformanceEvalLogger('test-xgrhtueifj')

In [None]:
# Wrap the river model in the project's stream pipeline.
stream_pipeline = ModelRiverPipeline(river_model)

# Only ROC AUC is registered as a metric step below.
# NOTE(review): `acc` is created but never passed on - confirm whether
# ('accuracy', acc) should be part of `metric_steps`.
acc = Accuracy()
roc_auc = ROCAUC()
pipeline_evaluation = ModelEvaluationPipeline(metric_steps=[('roc_auc', roc_auc)])



In [None]:
from batchstream.experiment.experiment import StreamExperiment


In [None]:
# Combine the stream pipeline, the evaluation pipeline and the performance logger.
exp1 = StreamExperiment(stream_pipeline, pipeline_evaluation, perf_logger)

In [None]:
# Run the experiment on the current `df` (the loader's frame if cells ran in order).
exp1.run(df)

In [None]:
# First hand-written batch of evaluation rows, one dict per step,
# given as (accuracy, roc_auc) pairs.
report_batch = [
    {'accuracy': accuracy, 'roc_auc': auc}
    for accuracy, auc in (
        (0.0, -0.0),
        (0.5, 0.5),
        (0.33, 0.5),
        (0.5, 0.5),
        (0.6, 0.67),
        (0.67, 0.67),
        (0.71, 0.75),
        (0.75, 0.75),
    )
]

In [None]:
# Hand the first batch of metric rows to the performance logger.
perf_logger.log_eval_report(report_batch)

In [None]:
# Second hand-written batch of evaluation rows, one dict per step,
# given as (accuracy, roc_auc) pairs.
report_batch_2 = [
    {'accuracy': accuracy, 'roc_auc': auc}
    for accuracy, auc in (
        (0.75, -0.0),
        (0.71, 0.5),
        (0.67, 0.5),
        (0.6, 0.5),
        (0.5, 0.67),
        (0.4, 0.67),
        (0.33, 0.75),
        (0.2, 0.75),
    )
]

In [None]:
# Hand the second batch of metric rows to the performance logger.
perf_logger.log_eval_report(report_batch_2)

In [None]:
# Info-level marker for the start of the evaluation.
perf_logger.log_info("Start evaluation")

In [None]:
# Info-level marker written for the first batch.
perf_logger.log_info("First batch logged.")

In [None]:
# Info-level marker written for the second batch.
perf_logger.log_info("Second batch logged.")

In [None]:
# Info-level marker for the end of the evaluation.
perf_logger.log_info("End evaluation")

In [None]:
import pandas as pd


In [None]:
# Load the CSV evaluation report of run "1111" from the output directory.
# NOTE(review): presumably written by the performance logger - confirm.
report_artifact = pd.read_csv('./out/1111/1111_performance_eval_report.csv')

In [None]:
# Count the lines of the performance-evaluation log and display the result.
# `return` is only valid inside a function, so the total is kept in a variable
# and shown as the cell's last expression. Summing over the file handle also
# yields 0 for an empty file instead of leaving the counter unbound.
with open("./log/1111/1111_performance_eval.log", 'r') as fp:
    line_count = sum(1 for _ in fp)
line_count

In [None]:
import shutil

# Remove the log and output directories of the "test_experiment" run.
for stale_dir in ('./log/test_experiment', './out/test_experiment'):
    shutil.rmtree(stale_dir)

In [None]:
import logging

In [None]:
# Flush and close all logging handlers.
# NOTE(review): presumably so the log files can be deleted afterwards - confirm.
logging.shutdown()

In [None]:

from batchstream.utils.logging.base.logger_factory import LoggerFactory
    

In [None]:
from batchstream.retraining_strategy.base.retraining_strategy import RetrainingStrategy

In [1]:
from river.stream import iter_pandas
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.dummy_monitoring_strategy import DummyMonitoringStrategy
from batchstream.retraining_strategy.dummy_retraining_strategy import DummyRetrainingStrategy 




# Fresh history container plus the breast-cancer features and labels.
history = HistoryManager()
X, Y = load_breast_cancer(return_X_y=True)
# Disabled sketch of the streaming loop: record x, run monitoring, fetch
# retraining data when drift is reported, then record y.
# NOTE(review): `monitoring` and `dr` are not defined in this cell; they are
# assigned in a later cell.
# for x, y in iter_pandas(pd.DataFrame(X), pd.Series(Y)):
#    history.update_history_x(x)
#    is_drift = monitoring.monitor(history)

#    if is_drift:
#       X_retrain, y_retrain = dr.get_retraining_data(history)
#       print(X_retrain, y_retrain)   

#    history.update_history_y(y)  
   



In [28]:
from batchstream.detectors.base.detector import DriftDetector

In [None]:
# NOTE(review): a later cell constructs DriftDetector(monitoring, dr), so this
# argument-less call presumably fails - looks like a scratch check; confirm.
DriftDetector()

In [33]:
from batchstream.pipelines.batch.batch_pipeline import BatchPipeline
from batchstream.estimators.sklearn_estimator import SklearnEstimator
from batchstream.detectors.base.detector import DriftDetector

In [None]:
import pandas as pd
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from batchstream.history.base.history_manager import HistoryManager
from batchstream.monitoring.pipeline.model_monitoring_pipeline import ModelMonitoringPipeline
from batchstream.monitoring.pipeline.steps.batch.evidently_monitoring_step import EvidentlyMonitoringStep
from sklearn.datasets import load_breast_cancer
from batchstream.utils.logging.base.logger_factory import LoggerFactory
from batchstream.batch_monitoring_strategy.dummy_monitoring_strategy import DummyMonitoringStrategy
from batchstream.retraining_strategy.dummy_retraining_strategy import DummyRetrainingStrategy 


# Evidently suite containing only the data-drift preset.
data_drift_test_suite = TestSuite(tests=[
   DataDriftTestPreset(),
])
# Rebinds `f` (an earlier cell bound it to a Foo instance) to a logger factory.
f = LoggerFactory('test-2121')
# NOTE(review): n_curr / n_ref presumably are the current and reference window
# sizes in rows - confirm against DummyMonitoringStrategy.
d1 = DummyMonitoringStrategy(n_curr=120, n_ref=120)
# min_instances equals n_curr + n_ref (240); clock equals n_curr (120).
ev1 = EvidentlyMonitoringStep(data_drift_test_suite, d1, f, min_instances=240, clock=120, name='ev1')
# The pipeline takes (name, step) pairs; the name is read from the step itself.
monitoring = ModelMonitoringPipeline([(ev1._name, ev1)])
dr = DummyRetrainingStrategy(n_last_retrain=100, n_last_test=20)

In [None]:
# Drift detector built from the monitoring pipeline and the retraining strategy.
data_detector = DriftDetector(monitoring, dr)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Wrap a single-step random-forest sklearn pipeline in the project's estimator.
sk = SklearnEstimator(Pipeline([('rf', RandomForestClassifier())]))

# Positional arguments may not follow a keyword argument (SyntaxError), so the
# detector list is passed positionally in the slot where it was written.
# `f` is the LoggerFactory created in an earlier cell; `logger_factory` was
# never defined.
# NOTE(review): the positional order is taken from the original call - confirm
# it against BatchPipeline's signature.
BatchPipeline(sk, [data_detector], None, history, f, None, min_samples_first_fit=120, min_samples_retrain=240)

In [20]:
# Last 20 feature rows.
df = pd.DataFrame(X).iloc[-20:, :]
# Labels for the same tail minus the final one (19 values).
# NOTE(review): presumably mimics a history where the newest label has not
# arrived yet - confirm this is intentional.
y = pd.Series(Y).iloc[-20:-1]

In [21]:
# Display the 20-row feature frame.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
549,10.82,24.21,68.89,361.6,0.08192,0.06602,0.01548,0.00816,0.1976,0.06328,...,13.03,31.45,83.9,505.6,0.1204,0.1633,0.06194,0.03264,0.3059,0.07626
550,10.86,21.48,68.51,360.5,0.07431,0.04227,0.0,0.0,0.1661,0.05948,...,11.66,24.77,74.08,412.3,0.1001,0.07348,0.0,0.0,0.2458,0.06592
551,11.13,22.44,71.49,378.4,0.09566,0.08194,0.04824,0.02257,0.203,0.06552,...,12.02,28.26,77.8,436.6,0.1087,0.1782,0.1564,0.06413,0.3169,0.08032
552,12.77,29.43,81.35,507.9,0.08276,0.04234,0.01997,0.01499,0.1539,0.05637,...,13.87,36.0,88.1,594.7,0.1234,0.1064,0.08653,0.06498,0.2407,0.06484
553,9.333,21.94,59.01,264.0,0.0924,0.05605,0.03996,0.01282,0.1692,0.06576,...,9.845,25.05,62.86,295.8,0.1103,0.08298,0.07993,0.02564,0.2435,0.07393
554,12.88,28.92,82.5,514.3,0.08123,0.05824,0.06195,0.02343,0.1566,0.05708,...,13.89,35.74,88.84,595.7,0.1227,0.162,0.2439,0.06493,0.2372,0.07242
555,10.29,27.61,65.67,321.4,0.0903,0.07658,0.05999,0.02738,0.1593,0.06127,...,10.84,34.91,69.57,357.6,0.1384,0.171,0.2,0.09127,0.2226,0.08283
556,10.16,19.59,64.73,311.7,0.1003,0.07504,0.005025,0.01116,0.1791,0.06331,...,10.65,22.88,67.88,347.3,0.1265,0.12,0.01005,0.02232,0.2262,0.06742
557,9.423,27.88,59.26,271.3,0.08123,0.04971,0.0,0.0,0.1742,0.06059,...,10.49,34.24,66.5,330.6,0.1073,0.07158,0.0,0.0,0.2475,0.06969
558,14.59,22.68,96.39,657.1,0.08473,0.133,0.1029,0.03736,0.1454,0.06147,...,15.48,27.27,105.9,733.5,0.1026,0.3171,0.3662,0.1105,0.2258,0.08004


In [22]:
class MockHistory:
    """Bare stand-in for a history object exposing ``x_history`` and ``y_history``."""

    def __init__(self, df, y):
        """Store the features as ``x_history`` and the labels as ``y_history``."""
        self.x_history, self.y_history = df, y

In [23]:
# Rebinds `dr` with smaller window sizes for the 20-row mock history.
dr = DummyRetrainingStrategy(n_last_retrain=10, n_last_test=2)
# Each call gets its own MockHistory built from the same frame and labels.
xr, yr = dr.get_retraining_data(MockHistory(df, y))
xt, yt = dr.get_retest_data(MockHistory(df, y))