# Chapter 11
## Section: Avoiding data drift

In [1]:
import numpy as np

class DataDriftMonitor:
    """Detect drift in per-feature means relative to a baseline dataset.

    The monitor stores the mean of every feature (column) of the baseline
    data and later compares it with the per-feature means of new data.
    Input is expected to be 2-D (samples x features); 1-D input is treated
    as samples of a single feature.
    """

    def __init__(self, baseline_data: np.ndarray, threshold_mean: float = 0.1):
        """Store the baseline statistics and the drift threshold.

        Args:
            baseline_data: Reference data; means are taken along axis 0.
            threshold_mean: Maximum absolute difference between a baseline
                mean and a current mean before a feature counts as drifted.
        """
        self.baseline = self.calculate_statistics(baseline_data)
        self.threshold_mean = threshold_mean

    def calculate_statistics(self, data: np.ndarray):
        """Return the mean of ``data`` along axis 0 (one value per feature)."""
        return np.mean(data, axis=0)

    def assess_drift(self, current_data: np.ndarray) -> bool:
        """Check whether any feature mean moved by more than the threshold.

        Features are checked in order; the first drifting feature is
        printed (its index, baseline mean and current mean) and the scan
        stops there.

        Args:
            current_data: New data with the same number of features as the
                baseline.

        Returns:
            True if a drifting feature was found, False otherwise.

        Raises:
            ValueError: If the number of features differs from the baseline.
        """
        # atleast_1d lets single-feature (1-D) data through: its axis-0 mean
        # is a scalar, which has no length and cannot be indexed.
        baseline_stats = np.atleast_1d(self.baseline)
        current_stats = np.atleast_1d(self.calculate_statistics(current_data))

        # Comparing mismatched feature sets would either fail with an opaque
        # IndexError or silently ignore some features, so reject it up front.
        if current_stats.shape != baseline_stats.shape:
            raise ValueError(
                'Feature mismatch: baseline statistics have shape {} but '
                'current statistics have shape {}'.format(
                    baseline_stats.shape, current_stats.shape
                )
            )

        for feature, (baseline_stat, current_stat) in enumerate(
            zip(baseline_stats, current_stats)
        ):
            if np.abs(current_stat - baseline_stat) > self.threshold_mean:
                print('Feature id with drift: {}'.format(feature))
                print('Mean of original distribution: {}'.format(baseline_stat))
                print('Mean of new distribution: {}'.format(current_stat))
                return True

        return False

In [2]:
# Fix the global NumPy seed so the synthetic samples below are reproducible.
np.random.seed(23)

# Both synthetic datasets share this shape: 100 datapoints with 5 features each.
sample_shape = (100, 5)

# Reference ("original") data: normal distribution centered around 0 with std of 1.
baseline_data = np.random.normal(0, 1, sample_shape)

# Create the monitor from the reference data, with a mean-shift threshold of 0.1.
monitor = DataDriftMonitor(baseline_data, threshold_mean=0.1)

# New data to check: normal distribution centered around 0.15 with std of 1.
current_data = np.random.normal(0.15, 1, sample_shape)

# Assess data drift and report the verdict.
drift_detected = monitor.assess_drift(current_data)
print("Data drift detected." if drift_detected else "No data drift detected.")

Feature id with drift: 1
Mean of original distribution: -0.09990597519469419
Mean of new distribution: 0.09662442557421645
Data drift detected.
