In [1]:
import logging

import pandas as pd
import numpy as np
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from torch.utils.data import DataLoader

from src.models.dataset import MultiTSDataset

In [36]:
# Root logger setup: emit INFO and above, with each record showing the
# timestamp, logger name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
)

In [37]:
# Load the path-mapping table for the temporal drift analysis. As the preview
# below shows, each row carries a NO2 path, a features path, a date and a
# `ds_type` label that load_datasets() later filters on.
df = pd.read_csv('data/processed/gee_ds_20231003_temporal_drift_analysis/mapped_paths.csv')
df.head()

Unnamed: 0,no2_path,features_path,date,ds_type
0,../../data/processed/gee_ds_20231003_temporal_...,../../data/processed/gee_ds_20231003_temporal_...,2021-11-14,others
1,../../data/processed/gee_ds_20231003_temporal_...,../../data/processed/gee_ds_20231003_temporal_...,2019-01-28,others
2,../../data/processed/gee_ds_20231003_temporal_...,../../data/processed/gee_ds_20231003_temporal_...,2023-04-13,current
3,../../data/processed/gee_ds_20231003_temporal_...,../../data/processed/gee_ds_20231003_temporal_...,2019-02-01,others
4,../../data/processed/gee_ds_20231003_temporal_...,../../data/processed/gee_ds_20231003_temporal_...,2019-02-05,others


In [38]:
def load_datasets(df: pd.DataFrame, current_type: str = 'current', reference_type: str = 'val'):
    """Build feature/target frames for the current and reference splits.

    Rows of ``df`` are selected by their ``ds_type`` value, wrapped in a
    ``MultiTSDataset`` and iterated one sample at a time. Every sample's
    features are stacked row-wise into one frame and every target value is
    collected into a single ``target`` column.

    Args:
        df: Mapping table with a ``ds_type`` column; the remaining columns are
            whatever ``MultiTSDataset`` expects (not visible from this cell).
        current_type: ``ds_type`` label of the split treated as current data.
        reference_type: ``ds_type`` label of the split treated as reference
            (old) data.

    Returns:
        ``((current_features, current_target), (reference_features,
        reference_target))`` where each element is a ``pd.DataFrame``.

    Raises:
        ValueError: If a split yields no samples, or if a sample's features
            cannot be arranged into rows of ``len(columns)`` values.
    """
    logging.info('Starting...')

    df_curr = df[df['ds_type'] == current_type]
    df_old = df[df['ds_type'] == reference_type]

    # Names given to the feature columns, in the order the dataset emits them
    # (assumed to be the last axis of each sample -- TODO confirm against
    # MultiTSDataset).
    columns = ['sm_surface', 'precipitationCal', 'volumetric_soil_water_layer_1', 'Optical_Depth_047',
               'evaporation_from_bare_soil_sum',
               'surface_latent_heat_flux_sum', 'Column_WV', 'temperature_2m']
    n_features = len(columns)

    def _extract_feat_n_targets(base_df, split_name: str):
        """Return (features_df, target_df) for one split of the mapping table."""
        logging.info(f'Extracting {split_name} dataset.')
        ds = MultiTSDataset(base_df, use_rel=True)

        loader = DataLoader(ds, batch_size=1)

        feature_chunks = []
        target_values = []
        for x, y in loader:
            # Reshape explicitly instead of squeeze(): squeeze() also drops a
            # length-1 row axis, turning a single-row sample into a 1-D vector
            # that cannot be concatenated with 2-D samples.
            feature_chunks.append(np.asarray(x).reshape(-1, n_features))
            # Keep one flat list so samples with different numbers of targets
            # do not produce a ragged (object / invalid) array.
            target_values.extend(y.flatten().tolist())

        if not feature_chunks:
            # np.concatenate([]) would raise an opaque ValueError; name the split.
            raise ValueError(
                f"No samples found for the '{split_name}' dataset; "
                "check the 'ds_type' values of the input frame."
            )

        features_df = pd.DataFrame(np.concatenate(feature_chunks), columns=columns)
        target_df = pd.DataFrame({'target': np.array(target_values)})

        return features_df, target_df

    current_ds = _extract_feat_n_targets(df_curr, current_type)
    old_ds = _extract_feat_n_targets(df_old, reference_type)

    logging.info('All datasets loaded correctly.')

    return current_ds, old_ds

In [39]:
# Build the (features, target) frame pairs: the first pair comes from the rows
# labelled 'current', the second from the rows labelled 'val'.
(curr_feats, curr_target), (old_feats, old_target) = load_datasets(df)

20-Nov-23 18:58:36 - root - INFO - Starting...
20-Nov-23 18:58:36 - root - INFO - Extracting current dataset.
20-Nov-23 18:58:36 - root - INFO - Extracting val dataset.
20-Nov-23 18:58:38 - root - INFO - All datasets loaded correctly.


In [40]:
# Sanity check: preview the first rows of the current-split feature frame.
curr_feats.head()

Unnamed: 0,sm_surface,precipitationCal,volumetric_soil_water_layer_1,Optical_Depth_047,evaporation_from_bare_soil_sum,surface_latent_heat_flux_sum,Column_WV,temperature_2m
0,1.245891,0.991005,0.971815,-0.35263,-0.233975,-0.896474,0.704385,-0.417923
1,1.366299,1.440902,0.997121,-0.100061,1.31717,0.006891,-2.505527,-0.871619
2,1.360401,-0.341436,1.023707,-0.100061,0.766852,-0.227956,0.00207,-0.807393
3,1.382988,0.017002,0.975043,-0.100061,1.30129,0.183847,-1.367084,-1.131056
4,1.36527,1.087744,1.076559,-0.100061,0.731486,0.376179,0.00207,-1.1797


In [41]:
# Sanity check: preview the first rows of the current-split target frame.
curr_target.head()

Unnamed: 0,target
0,4.221332
1,-1.539109
2,9.283604
3,5.637645
4,13.495926


In [42]:
# Target drift: compare the 'target' column of the reference (old) frame with
# the current one, using the 'ks' stat test at a 0.05 threshold.
drift_report = Report(
    metrics=[
        TargetDriftPreset(stattest='ks', stattest_threshold=0.05),
    ],
)
drift_report.run(current_data=curr_target, reference_data=old_target)

In [43]:
# Persist the target drift report next to the notebook.
# NOTE(review): include_render=True presumably also stores the data needed to
# re-render the report -- confirm against the Evidently docs.
drift_report.save_json("target_temporal_drift.json", include_render=True)

In [44]:
# Show the target drift results as a plain dict (drift score, threshold and
# binned distributions appear in the output below).
drift_report.as_dict()

{'metrics': [{'metric': 'ColumnDriftMetric',
   'result': {'column_name': 'target',
    'column_type': 'num',
    'stattest_name': 'K-S p_value',
    'stattest_threshold': 0.05,
    'drift_score': 0.08997219319479635,
    'drift_detected': False,
    'current': {'small_distribution': {'x': [-14.52061939239502,
       -10.854759693145752,
       -7.188899993896484,
       -3.523040294647217,
       0.14281940460205078,
       3.8086791038513184,
       7.474538803100586,
       11.140398502349854,
       14.806258201599121,
       18.47211790084839,
       22.137977600097656],
      'y': [0.0003007577820395969,
       0.0015037889101979845,
       0.003308335602435566,
       0.005714397858752341,
       0.04150457392146437,
       0.10376143480366093,
       0.0781970233302952,
       0.02827123151172211,
       0.00781970233302952,
       0.002406062256316775]}},
    'reference': {'small_distribution': {'x': [-18.131059646606445,
       -12.714060401916504,
       -7.297061157226562,


In [45]:
# Feature drift: compare every feature column of the reference (old) frame
# with the current one, using the 'ks' stat test at a 0.05 threshold.
# Note that this rebinds `drift_report`, replacing the target drift report.
drift_report = Report(
    metrics=[
        DataDriftPreset(stattest='ks', stattest_threshold=0.05),
    ],
)
drift_report.run(current_data=curr_feats, reference_data=old_feats)

In [46]:
# Persist the feature drift report next to the notebook.
# NOTE(review): include_render=True presumably also stores the data needed to
# re-render the report -- confirm against the Evidently docs.
drift_report.save_json("feats_temporal_drift.json", include_render=True)

In [47]:
# Show the feature drift results as a plain dict (dataset-level summary plus
# per-column drift scores appear in the output below).
drift_report.as_dict()

{'metrics': [{'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 8,
    'number_of_drifted_columns': 8,
    'share_of_drifted_columns': 1.0,
    'dataset_drift': True}},
  {'metric': 'DataDriftTable',
   'result': {'number_of_columns': 8,
    'number_of_drifted_columns': 8,
    'share_of_drifted_columns': 1.0,
    'dataset_drift': True,
    'drift_by_columns': {'Column_WV': {'column_name': 'Column_WV',
      'column_type': 'num',
      'stattest_name': 'K-S p_value',
      'stattest_threshold': 0.05,
      'drift_score': 5.392563058318869e-134,
      'drift_detected': True,
      'current': {'small_distribution': {'x': [-2.721226215362549,
         -2.0933783054351807,
         -1.4655303955078125,
         -0.8376824855804443,
         -0.20983457565307617,
         0.418013334274292,
         1.0458612442016602,
         1.6737091541290283,
         2.3015570640563965,
         2.9294049739837646,
         3.557252883911133],
        'y': [0.04