In [22]:
from tsl.datasets import MetrLA, PemsBay, AirQuality
from scipy.stats import skew, kurtosis
import nannyml as nml
import pandas as pd
import numpy as np

dataset1 = MetrLA()
dataset2 = PemsBay()
dataset3 = AirQuality(impute_nans=True, small=True)
df1 = dataset1.dataframe()
df2 = dataset2.dataframe()
df3 = dataset3.dataframe()

In [23]:
# drop level channels from columns
df1.columns = df1.columns.droplevel('channels')
df2.columns = df2.columns.droplevel('channels')
df3.columns = df3.columns.droplevel('channels')

In [24]:
def get_stats(df):
    """
    Get statistics of dataframe.
    """
    df_described = df.describe()
    stats = pd.DataFrame({'skewness': [], 'kurtosis': []})
    for column in df.columns:
        stats = pd.concat([stats, pd.DataFrame({
            'skewness': [skew(df[column])],
            'kurtosis': [kurtosis(df[column])]
        })], ignore_index=True)
    stats = stats.T
    stats.columns = df_described.columns
    df_described = pd.concat([df_described, stats], axis=0)
    return df_described.mean(axis=1)

# Description of values of MetrLA Dataset

In [25]:
df1_described = get_stats(df1)

# Description of values of PemsBay Dataset

In [26]:
df2_described = get_stats(df2)

# Description of values of AirQuality Dataset

In [27]:
df3_described = get_stats(df3)

In [28]:
df_stats = pd.concat([df1_described, df2_described, df3_described], axis=1)
df_stats.columns = ['MetrLA', 'PemsBay', 'AirQuality(36)']
df_stats = df_stats.T
df_stats.insert(0, 'Dataset', df_stats.index)
df_stats.reset_index(drop=True, inplace=True)
df_stats['count'] = df_stats['count'].astype('int') 
df_stats.drop(columns=['min', '25%', '50%','75%', 'max'], inplace=True)
df_stats.to_csv('datasets_statistics.csv', index=False)

In [29]:
df_stats

Unnamed: 0,Dataset,count,mean,std,skewness,kurtosis
0,MetrLA,34272,58.367832,10.512709,-2.534072,13.184498
1,PemsBay,52128,62.620531,8.556311,-2.992395,15.073551
2,AirQuality(36),8759,84.460824,75.650833,1.610577,3.157856


# Check distribution drift for both datasets

In [30]:

def detect_drift(data, threshold=0.05):
  """
  Code for detecting data drift

  Args:
      data: sample NumPy array of data points.
      threshold: Proportion of data points allowed to fall outside the expected range (default: 0.05).

  Returns:
      A boolean indicating if data drift is detected.
  """
  mean = np.mean(data)
  std = np.std(data)
  expected_upper_bound = mean + 2 * std
  expected_lower_bound = mean - 2 * std

  # Calculate the cumulative sum of deviations from the mean
  deviations = np.cumsum(data - mean)

  # Identify data points exceeding the expected bounds
  outliers = np.sum(deviations > expected_upper_bound) + np.sum(deviations < expected_lower_bound)

  # Check if the proportion of outliers exceeds the threshold
  return outliers / len(data) > threshold


# Calculate drift for both datasets
drift_detected_1 = detect_drift(df1)
drift_detected_2 = detect_drift(df2)
drift_detected_3 = detect_drift(df3)

if np.mean(drift_detected_1):
  print("Data drift detected in MetrLA data")
else:
  print("No data drift detected in MetrLA data.")
  
if np.mean(drift_detected_2):
  print("Data drift detected in PemsBay dataset")
else:
  print("No data drift detected in PemsBay data.")
  
if np.mean(drift_detected_3):
  print("Data drift detected in AirQuality dataset")
else:
  print("No data drift detected in PemsBay data.")



In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'


In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'



Data drift detected in MetrLA data
Data drift detected in PemsBay dataset
Data drift detected in AirQuality dataset



In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'



# Check drift distribution after half dataset fit with nannyml 

## MetrLa

In [31]:
METRLA_middle_index = len(df1) // 2

METRLA_first_half_df = df1.iloc[:METRLA_middle_index] # reference dataset 
METRLA_second_half_df = df1.iloc[METRLA_middle_index:]
METRLA_feature_names = df1.columns
METRLA_reference_set = METRLA_first_half_df.copy()  
METRLA_analysis_set = METRLA_second_half_df.copy() 

In [32]:
from nannyml.plots.blueprints.metrics import plot_metric
METRLA_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=METRLA_feature_names,
    chunk_size=500
).fit(reference_data=METRLA_reference_set)

# Calculate drift results
METRLA_drift_results = METRLA_drift_detector.calculate(data=METRLA_analysis_set)

# Visualize drift
METRLA_drift_results_plot = plot_metric(METRLA_drift_results, title='', metric_display_name='Reconstruction Error',
                metric_column_name='reconstruction_error')
METRLA_drift_results_plot.write_image('metrla_drift_results.png', width=1000, height=500)
METRLA_drift_results_plot.show()

## PemsBay

In [33]:
PemsBay_middle_index = len(df2) // 2

PemsBay_first_half_df = df2.iloc[:PemsBay_middle_index] # reference dataset 
PemsBay_second_half_df = df2.iloc[PemsBay_middle_index:]

PemsBay_feature_names = df2.columns
PemsBay_reference_set = PemsBay_first_half_df.copy()  
PemsBay_analysis_set = PemsBay_second_half_df.copy() 

In [34]:
PemsBay_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=PemsBay_feature_names,
    chunk_size=400
).fit(reference_data=PemsBay_reference_set)

# Calculate drift results
PemsBay_drift_results = PemsBay_drift_detector.calculate(data=PemsBay_analysis_set)

# Visualize drift
PemsBay_drift_results_plot = plot_metric(PemsBay_drift_results, title='', metric_display_name='Reconstruction Error',
                metric_column_name='reconstruction_error')
# PemsBay_drift_results_plot = PemsBay_drift_results.plot()
PemsBay_drift_results_plot.write_image('pemsbay_drift_results.png', width=1000, height=500)
PemsBay_drift_results_plot.show()

## AirQuality

In [35]:
AirQuality_middle_index = len(df3) // 2

AirQuality_first_half_df = df3.iloc[:AirQuality_middle_index] # reference dataset 
AirQuality_second_half_df = df3.iloc[AirQuality_middle_index:]

AirQuality_feature_names = df3.columns
AirQuality_reference_set = AirQuality_first_half_df.copy()  
AirQuality_analysis_set = AirQuality_second_half_df.copy() 

In [36]:
AirQuality_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=AirQuality_feature_names,
    chunk_size=100
).fit(reference_data=AirQuality_reference_set)

# Calculate drift results
AirQuality_drift_results = AirQuality_drift_detector.calculate(data=AirQuality_analysis_set)

# Visualize drift
# AirQuality_drift_results_plot = AirQuality_drift_results.plot()
AirQuality_drift_results_plot = plot_metric(AirQuality_drift_results, title='', metric_display_name='Reconstruction Error',
                metric_column_name='reconstruction_error')
# PemsBay_drift_results_plot = PemsBay_drift_results.plot()
AirQuality_drift_results_plot.write_image('airquality_drift_results.png', width=1000, height=500)
AirQuality_drift_results_plot.show()

In [37]:
# AirQuality_reference_set['datetime'] = AirQuality_reference_set.index
# AirQuality_analysis_set['datetime'] = AirQuality_analysis_set.index

# AirQuality_reference_set.reset_index(drop=True, inplace=True)
# AirQuality_analysis_set.reset_index(drop=True, inplace=True)

In [38]:
# import nannyml as nml
# from IPython.display import display

# calc = nml.DomainClassifierCalculator(
#     feature_column_names=AirQuality_feature_names,
#     timestamp_column_name='datetime',
#     chunk_size=50
# )
# calc.fit(AirQuality_reference_set)
# results = calc.calculate(AirQuality_analysis_set)

# figure = results.plot()
# figure.show()