In [2]:
from tsl.datasets import AirQuality
from tsl.datasets.mts_benchmarks import ExchangeBenchmark
from scipy.stats import skew, kurtosis
import nannyml as nml
import pandas as pd
import numpy as np
from GT import get_dataset

def _flatten_windows(windows):
    """Merge the first two axes of a 3-D array, keeping the last axis as columns."""
    # presumably the axes are (window, step, feature) -- TODO confirm against GT.get_dataset
    n_outer, n_inner, n_columns = windows.shape[0], windows.shape[1], windows.shape[2]
    return windows.reshape(n_outer * n_inner, n_columns)


# Only the first element returned by get_dataset is used for every dataset.
dataset1 = _flatten_windows(get_dataset('Synth')[0])

# Exchange rates: first 7560 rows, passed to get_dataset together with 216.
exchange_values = ExchangeBenchmark().dataframe().values[:7560]
dataset2 = _flatten_windows(get_dataset('ExchangeBenchmark', exchange_values, 216)[0])

# Air quality (small variant, NaNs imputed): first 8736 rows, passed with 168.
air_quality_values = AirQuality(impute_nans=True, small=True).dataframe().values[:8736]
dataset3 = _flatten_windows(get_dataset('AirQuality', air_quality_values, 168)[0])

# One 2-D frame per dataset: rows are observations, columns are variables.
df1, df2, df3 = (pd.DataFrame(flat) for flat in (dataset1, dataset2, dataset3))

Synth DATA
Original Dataset: 	1494
Train Split: 		498 	(70%)
Validation Split: 	498 	(20%)
Test Split: 		498 	(10%)
ExchangeBenchmark DATA
Original Dataset: 	105
Train Split: 		35 	(70%)
Validation Split: 	35 	(20%)
Test Split: 		35 	(10%)
AirQuality DATA
Original Dataset: 	156
Train Split: 		52 	(70%)
Validation Split: 	52 	(20%)
Test Split: 		52 	(10%)


In [3]:
def get_stats(df):
    """
    Summarise a dataframe as one value per statistic, averaged over columns.

    The ``describe()`` statistics (count, mean, std, min, quartiles, max) are
    extended with the skewness and kurtosis computed by ``scipy.stats`` for
    each described column; every statistic is then averaged across columns.

    Args:
        df: pandas DataFrame. Only the columns reported by ``df.describe()``
            are used, so non-numeric columns of a mixed frame are ignored.

    Returns:
        pandas Series indexed by statistic name (the ``describe()`` rows
        followed by ``'skewness'`` and ``'kurtosis'``), holding the mean of
        that statistic over the described columns.
    """
    df_described = df.describe()
    columns = df_described.columns

    # Build both extra rows at once instead of concatenating onto an empty
    # frame inside a loop (quadratic, and deprecated for empty operands).
    # Iterating over the described columns keeps the labels aligned even when
    # describe() drops some of the input columns.
    shape_stats = pd.DataFrame(
        [
            [skew(df[column]) for column in columns],
            [kurtosis(df[column]) for column in columns],
        ],
        index=['skewness', 'kurtosis'],
        columns=columns,
    )

    return pd.concat([df_described, shape_stats], axis=0).mean(axis=1)

# Description of values of the Synthetic dataset

In [4]:
# df1 is the flattened 'Synth' dataset; get_stats averages each statistic over its columns.
df1_described = get_stats(df1)

# Description of values of the Exchange dataset

In [5]:
# df2 is the flattened 'ExchangeBenchmark' dataset; get_stats averages each statistic over its columns.
df2_described = get_stats(df2)

# Description of values of AirQuality Dataset

In [6]:
# df3 is the flattened 'AirQuality' dataset; get_stats averages each statistic over its columns.
df3_described = get_stats(df3)

In [7]:
# One row per dataset: put the three summaries side by side, label them,
# transpose, and expose the label as a regular 'Dataset' column.
dataset_labels = ['Synthetic Sin', 'Exchange', 'AirQuality(36)']
df_stats = (
    pd.concat([df1_described, df2_described, df3_described], axis=1)
    .set_axis(dataset_labels, axis=1)
    .T
    .rename_axis('Dataset')
    .reset_index()
    .astype({'count': 'int'})
    # Order statistics are not reported in the exported table.
    .drop(columns=['min', '25%', '50%', '75%', 'max'])
)
df_stats.to_csv('datasets_statistics.csv', index=False)

In [8]:
df_stats

Unnamed: 0,Dataset,count,mean,std,skewness,kurtosis
0,Synthetic Sin,31374,0.49839,0.336971,0.006864,-1.28683
1,Exchange,7560,0.431364,0.219219,0.353939,-0.536501
2,AirQuality(36),8736,0.16887,0.157309,1.613606,3.162993


# Check distribution drift for all three datasets

In [9]:

def detect_drift(data, threshold=0.05):
  """
  Flag drift from the cumulative sum of deviations from the mean.

  Each column (or the single series for 1-D input) is handled independently:
  its running sum of ``value - mean`` is compared against ``mean +/- 2 * std``
  and drift is reported when the fraction of positions outside that band
  exceeds ``threshold``. All reductions use an explicit axis, so the result
  does not depend on how pandas interprets ``axis=None``.

  NOTE(review): the band is centred on the data mean while the running sum of
  deviations is centred on zero; confirm this is the intended statistic.

  Args:
      data: 1-D array-like of data points, or a 2-D array / DataFrame whose
          columns are separate series. NaNs are skipped when computing the
          mean, the standard deviation and the running sum.
      threshold: Proportion of positions allowed outside the band (default: 0.05).

  Returns:
      A boolean for 1-D input; one boolean per column for 2-D input (a pandas
      Series indexed by column name when ``data`` is a DataFrame, otherwise a
      NumPy array). Empty input reports no drift.
  """
  values = np.atleast_1d(np.asarray(data, dtype=float))
  n_points = values.shape[0]

  if n_points == 0:
    # Nothing to compare: report "no drift" instead of dividing by zero.
    drifted = np.zeros(values.shape[1:], dtype=bool)
  else:
    mean = np.nanmean(values, axis=0)
    std = np.nanstd(values, axis=0)
    expected_upper_bound = mean + 2 * std
    expected_lower_bound = mean - 2 * std

    # Running sum of deviations down the rows; positions that were NaN stay
    # NaN so they are never counted as outside the band.
    centered = values - mean
    deviations = np.nancumsum(centered, axis=0)
    deviations[np.isnan(centered)] = np.nan

    outliers = (
        np.sum(deviations > expected_upper_bound, axis=0)
        + np.sum(deviations < expected_lower_bound, axis=0)
    )
    drifted = outliers / n_points > threshold

  if np.ndim(drifted) == 0:
    return bool(drifted)
  if isinstance(data, pd.DataFrame):
    return pd.Series(drifted, index=data.columns)
  return drifted


# Run the drift check on each of the three datasets
drift_detected_1 = detect_drift(df1)
drift_detected_2 = detect_drift(df2)
drift_detected_3 = detect_drift(df3)

# (flags, message when drift is found, message otherwise) for every dataset
drift_reports = (
  (drift_detected_1,
   "Data drift detected in Synthetic data",
   "No data drift detected in Synthetic data."),
  (drift_detected_2,
   "Data drift detected in Exchange dataset",
   "No data drift detected in Exchange data."),
  (drift_detected_3,
   "Data drift detected in AirQuality dataset",
   "No data drift detected in AirQuality data."),
)

for drift_flags, drift_message, no_drift_message in drift_reports:
  # A non-zero mean of the flags means at least one of them is set.
  print(drift_message if np.mean(drift_flags) else no_drift_message)


Data drift detected in Synthetic data
Data drift detected in Exchange dataset
Data drift detected in AirQuality dataset


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


# Check distribution drift with NannyML, fitting on the first half of each dataset

## Synthetic (variables keep the `METRLA_` prefix)

In [10]:
# NOTE: despite the METRLA_ prefix, df1 is the flattened 'Synth' dataset.
METRLA_feature_names = df1.columns
METRLA_middle_index = len(df1) // 2

# Leading half of the rows: reference dataset
METRLA_first_half_df = df1.head(METRLA_middle_index)
METRLA_reference_set = METRLA_first_half_df.copy()

# Remaining rows: the data checked against the reference
METRLA_second_half_df = df1.iloc[METRLA_middle_index:]
METRLA_analysis_set = METRLA_second_half_df.copy()

In [20]:
from nannyml.plots.blueprints.metrics import plot_metric

# Fit the reconstruction-based drift calculator on the reference half
METRLA_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=METRLA_feature_names,
    chunk_size=15,
).fit(reference_data=METRLA_reference_set)

# Score the analysis half chunk by chunk
METRLA_drift_results = METRLA_drift_detector.calculate(data=METRLA_analysis_set)

# Plot the reconstruction error, save it as a PNG and display it
METRLA_plot_options = dict(
    title='',
    metric_display_name='Reconstruction Error',
    metric_column_name='reconstruction_error',
)
METRLA_drift_results_plot = plot_metric(METRLA_drift_results, **METRLA_plot_options)
METRLA_drift_results_plot.write_image('metrla_drift_results.png', width=1000, height=500)
METRLA_drift_results_plot.show()


is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead.



## Exchange (variables keep the `PemsBay_` prefix)

In [12]:
# NOTE: despite the PemsBay_ prefix, df2 is the flattened 'ExchangeBenchmark' dataset.
PemsBay_feature_names = df2.columns
PemsBay_middle_index = len(df2) // 2

# Leading half of the rows: reference dataset
PemsBay_first_half_df = df2.head(PemsBay_middle_index)
PemsBay_reference_set = PemsBay_first_half_df.copy()

# Remaining rows: the data checked against the reference
PemsBay_second_half_df = df2.iloc[PemsBay_middle_index:]
PemsBay_analysis_set = PemsBay_second_half_df.copy()

In [18]:
# Fit the reconstruction-based drift calculator on the reference half.
# chunk_size equals the 216 passed to get_dataset for this dataset.
PemsBay_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=PemsBay_feature_names,
    chunk_size=216,
).fit(reference_data=PemsBay_reference_set)

# Score the analysis half chunk by chunk
PemsBay_drift_results = PemsBay_drift_detector.calculate(data=PemsBay_analysis_set)

# Plot the reconstruction error, save it as a PNG and display it
PemsBay_plot_options = dict(
    title='',
    metric_display_name='Reconstruction Error',
    metric_column_name='reconstruction_error',
)
PemsBay_drift_results_plot = plot_metric(PemsBay_drift_results, **PemsBay_plot_options)
PemsBay_drift_results_plot.write_image('pemsbay_drift_results.png', width=1000, height=500)
PemsBay_drift_results_plot.show()


is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead.



## AirQuality

In [14]:
# df3 is the flattened 'AirQuality' dataset.
AirQuality_feature_names = df3.columns
AirQuality_middle_index = len(df3) // 2

# Leading half of the rows: reference dataset
AirQuality_first_half_df = df3.head(AirQuality_middle_index)
AirQuality_reference_set = AirQuality_first_half_df.copy()

# Remaining rows: the data checked against the reference
AirQuality_second_half_df = df3.iloc[AirQuality_middle_index:]
AirQuality_analysis_set = AirQuality_second_half_df.copy()

In [21]:
# Fit the reconstruction-based drift calculator on the reference half.
# chunk_size equals the 168 passed to get_dataset for this dataset.
AirQuality_drift_detector = nml.DataReconstructionDriftCalculator(
    column_names=AirQuality_feature_names,
    chunk_size=168,
).fit(reference_data=AirQuality_reference_set)

# Score the analysis half chunk by chunk
AirQuality_drift_results = AirQuality_drift_detector.calculate(data=AirQuality_analysis_set)

# Plot the reconstruction error, save it as a PNG and display it
AirQuality_plot_options = dict(
    title='',
    metric_display_name='Reconstruction Error',
    metric_column_name='reconstruction_error',
)
AirQuality_drift_results_plot = plot_metric(AirQuality_drift_results, **AirQuality_plot_options)
AirQuality_drift_results_plot.write_image('airquality_drift_results.png', width=1000, height=500)
AirQuality_drift_results_plot.show()


is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead.



In [16]:
# AirQuality_reference_set['datetime'] = AirQuality_reference_set.index
# AirQuality_analysis_set['datetime'] = AirQuality_analysis_set.index

# AirQuality_reference_set.reset_index(drop=True, inplace=True)
# AirQuality_analysis_set.reset_index(drop=True, inplace=True)

In [17]:
# import nannyml as nml
# from IPython.display import display

# calc = nml.DomainClassifierCalculator(
#     feature_column_names=AirQuality_feature_names,
#     timestamp_column_name='datetime',
#     chunk_size=50
# )
# calc.fit(AirQuality_reference_set)
# results = calc.calculate(AirQuality_analysis_set)

# figure = results.plot()
# figure.show()