In [None]:
"""
Meal Identification using EAgglo Model

This notebook focuses on implementing the EAgglo (Hierarchical Agglomerative Estimation of Multiple Change Points) 
model from the sktime library to detect significant meals in CGM time-series data. 

The goal is to identify key change points in the glucose time series data that correspond to significant meals. 

EAgglo is a non-parametric clustering approach that preserves the time ordering of observations. It merges 
neighboring segments sequentially to maximize a goodness-of-fit statistic, simultaneously identifying the number 
and location of change points without assuming any specific data distribution. The parameters of the model allow 
flexibility to control clustering behavior and penalization to avoid overfitting.

References:
- sktime Documentation: https://www.sktime.net/en/v0.28.0/api_reference/auto_generated/sktime.annotation.eagglo.EAgglo.html#sktime.annotation.eagglo.EAgglo
"""

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sktime.annotation.eagglo import EAgglo
import os

In [123]:
import pandas as pd
import os

data_path = "../data/raw"
relevant_cols = ['date', 'bgl', 'trend']
csv_files = [f for f in os.listdir(data_path) if f.endswith('.csv')]

dataframes = {
    file: pd.read_csv(
        os.path.join(data_path, file),
        usecols=relevant_cols,
        parse_dates=['date'],
    )
    for file in csv_files
}

df = dataframes['679372_2024-07-01_2024-09-30.csv']

df['date'] = pd.to_datetime(df['date'], format='ISO8601')

df['date'] = df['date'].dt.strftime('%Y-%m-%d %H:%M:%S')

df['date'] = pd.to_datetime(df['date'])

df = df.dropna(subset=['bgl', 'date'])
df = df.drop_duplicates()

print(df.head())
print(df.dtypes)


                 date    bgl trend
0 2024-07-01 00:02:32  115.0  FLAT
1 2024-07-01 00:05:33  112.0  FLAT
2 2024-07-01 00:08:33  116.0  FLAT
3 2024-07-01 00:10:34  121.0  FLAT
4 2024-07-01 00:13:36  122.0  FLAT
date     datetime64[ns]
bgl             float64
trend            object
dtype: object


In [132]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['trend_encoded'] = le.fit_transform(df['trend'])

print(df[['trend', 'trend_encoded']].head())

  trend  trend_encoded
0  FLAT              0
1  FLAT              0
2  FLAT              0
3  FLAT              0
4  FLAT              0


In [None]:
import pandas as pd
from sktime.annotation.eagglo import EAgglo
from sklearn.preprocessing import LabelEncoder

df['date'] = pd.to_datetime(df['date'])
le = LabelEncoder()
df['trend_encoded'] = le.fit_transform(df['trend'])

df_day = df[df['date'].dt.date == pd.to_datetime('2024-07-05').date()]
df_day = df_day.set_index('date')

X = df_day[['bgl', 'trend_encoded']]

model = EAgglo(alpha=1.0, penalty="mean_diff_penalty")
clusters = model.fit_transform(X)

df_day['cluster'] = clusters

print(df_day)


                       bgl           trend  trend_encoded  cluster
date                                                              
2024-07-05 00:01:53  126.0            FLAT              0      NaN
2024-07-05 00:04:54  132.0            FLAT              0      NaN
2024-07-05 00:08:55  132.0            FLAT              0      NaN
2024-07-05 00:11:54  135.0            FLAT              0      NaN
2024-07-05 00:14:55  133.0            FLAT              0      NaN
...                    ...             ...            ...      ...
2024-07-05 23:46:46  165.0  FORTYFIVE_DOWN              1      NaN
2024-07-05 23:49:45  156.0  FORTYFIVE_DOWN              1      NaN
2024-07-05 23:53:47  155.0  FORTYFIVE_DOWN              1      NaN
2024-07-05 23:56:46  145.0  FORTYFIVE_DOWN              1      NaN
2024-07-05 23:59:48  127.0     SINGLE_DOWN              4      NaN

[474 rows x 4 columns]


In [None]:
print(type(X))
print(X.shape)

<class 'pandas.core.frame.DataFrame'>
(474, 2)


In [119]:
from sktime.annotation.datagen import piecewise_normal_multivariate
X_example = piecewise_normal_multivariate(means=[[1, 3], [4, 5]], lengths=[3, 4], random_state=10)
model = EAgglo()
print(X_example)
clusters_example = model.fit_transform(X_example)
print(clusters_example)


[[ 2.3315865   3.71527897]
 [-0.54540029  2.99161615]
 [ 1.62133597  2.27991444]
 [ 4.26551159  5.10854853]
 [ 4.00429143  4.82539979]
 [ 4.43302619  6.20303737]
 [ 3.03493433  6.02827408]]
[0 0 0 1 1 1 1]
