Steps for clustering algorithm (what has been done manually and now needs to be automated)

1. Seasonality : 4 hour window periods
2. Standard deviation for 4 hour periods
3. Timestamps matched for carbohydrate intake (i.e. all 35g CHO within 1 hour of eachother grouped together)
4. Rate of glucose appearance and disappearance

    a. Glucose excursion (nadir to peak / peak to nadir) - +ve and -ve trends

    b. iAUC (nadir to peak / peak to nadir) - +ve and -ve trends
    
5. Total change in BG (nadir to peak / peak to nadir) - +ve and -ve 

6. Incude the Carbs consumed in the 4 hour period

7. Collate the results of the above points into a data set with columns names

- Day Name (Monday / Tuesday / Wednesday / Thursday / Friday / Saturday / Sunday)
- Day number (1 / 2 / 3 / 4 / 5 / 6 / 7)
- Week of experiment (1 / 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9)
- Time Period (Breakfast / Lunch / Dinner / Overnight / Early Morning)
- Time period Number (1 / 2 / 3 / 4 / 5)
- iAUC (+ve)
- iAUC (-ve)
- Glucose Excursion (+ve)
- Glucose Excursion (-ve)
- Total change in BG (+ve)
- Total change in BG (-ve)
- Peak value
- Carbs

8. Run Clustering Algorithm

9. Goodness of fit assessment :

- Want the algorithm to be adaptive.

    Have the left overs recycled through the 'rules' missing the glucose excursion but maintaining the iAUC and total change in BG. 
    
    Then further iterations miss out the iAUC and review total change in BG etc

First things first:
Import/load necessary libraries 

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn import datasets
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from datetime import datetime, date
import calendar
import time

%matplotlib inline

Load and preprocess necessary data sets

In [59]:
# Import data
unfiltered = pd.read_csv('/Users/user/Desktop/Clustering Project/CSV Files/544_data.csv')
unfiltered['glucose_level_ts'] = pd.to_datetime(unfiltered['glucose_level_ts'])

In [60]:
unfiltered.drop('glucose_level_mmol/L', axis=1)

Unnamed: 0,glucose_level_ts,glucose_level_mg/dL,carbs_g,meal_ts,meal_type,bolus_dose,bolus_start_ts,bolus_end_ts,bolus_type
0,2020-11-05 00:02:00,129,135.0,11/05/2020 11:25,Lunch,7.1,11/05/2020 08:24,11/05/2020 08:24,normal
1,2020-11-05 00:07:00,128,100.0,11/05/2020 17:16,Dinner,16.8,11/05/2020 11:24,11/05/2020 11:24,normal
2,2020-11-05 00:12:00,129,42.0,11/05/2020 21:26,Snack,14.2,11/05/2020 17:19,11/05/2020 17:19,normal
3,2020-11-05 00:17:00,131,37.0,12/05/2020 08:02,Breakfast,7.5,11/05/2020 21:23,11/05/2020 21:23,normal
4,2020-11-05 00:22:00,133,78.0,12/05/2020 12:07,Lunch,6.9,12/05/2020 08:00,12/05/2020 08:00,normal
...,...,...,...,...,...,...,...,...,...
13334,2020-04-07 20:57:00,296,,,,,,,
13335,2020-04-07 21:02:00,292,,,,,,,
13336,2020-04-07 21:07:00,284,,,,,,,
13337,2020-04-07 21:12:00,273,,,,,,,


In [61]:
# Here we filter the unfiltered for glucose level and glucose_ts to use in isolating the glucose events.
# We will pick up the unfiltered datset again later whenlooking at CHO and Insulin Levels

features = ['glucose_level_ts', 'glucose_level_mg/dL']
data = unfiltered[features].copy()

# Convert timestamp to datetime format
data['glucose_level_ts'] = pd.to_datetime(data['glucose_level_ts'])

# Remove the first row
data = data.iloc[1:].reset_index(drop=True)

# Map day of week names to day of week numbers
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 7: 'Sunday'}
data['day_of_week'] = data['glucose_level_ts'].dt.dayofweek.map(day_names)

# Calculate rate of change
data['BG Rate of Change'] = data['glucose_level_mg/dL'].diff()/5
data    

Unnamed: 0,glucose_level_ts,glucose_level_mg/dL,day_of_week,BG Rate of Change
0,2020-11-05 00:07:00,128,Thursday,
1,2020-11-05 00:12:00,129,Thursday,0.2
2,2020-11-05 00:17:00,131,Thursday,0.4
3,2020-11-05 00:22:00,133,Thursday,0.4
4,2020-11-05 00:27:00,132,Thursday,-0.2
...,...,...,...,...
13333,2020-04-07 20:57:00,296,Tuesday,0.0
13334,2020-04-07 21:02:00,292,Tuesday,-0.8
13335,2020-04-07 21:07:00,284,Tuesday,-1.6
13336,2020-04-07 21:12:00,273,Tuesday,-2.2


Step 2:

Isolate the peak and nadir (low) points in the set windows

In [65]:
# Convert data to numpy arrays
glucose_level_ts = np.array(data['glucose_level_ts'])
glucose_level = np.array(data['glucose_level_mg/dL'], dtype=float)
glucose_change = np.array(data['BG Rate of Change'], dtype=float)

# Set threshold values for glycemic events
glucose_threshold_high = 100  # mg/dL (5.5 mmol/L)
glucose_threshold_low = 100 # mg/dL (5.5 mmol/L)
rate_of_change_threshold = 3.6  # mg/dL/min (0.3 mmol/L)

# Initialize lists to store event data
event_nadir = []
event_peak = []
event_secondary_nadir = []
event_time_nadir = []
event_time_peak = []
event_time_secondary_nadir = []

# Loop through the data to find events
# Find peak of the event
# Check for initial nadir event
for i in range(1, len(glucose_level) - 1):
    if glucose_level[i] < glucose_threshold_low and (glucose_level[i - 1] - glucose_level[i]) > rate_of_change_threshold and (glucose_level[i + 1] - glucose_level[i]) > rate_of_change_threshold:
        # Check that the event occurred within the time threshold
        #if len(event_time_peak) > 0 and glucose_level_ts[i] - event_time_peak[-1] < time_threshold:
        # Found a nadir event
        if len(event_nadir) == 0 or (glucose_level[i], glucose_level_ts[i]) != (event_nadir[-1], event_time_nadir[-1]):
            event_nadir.append(glucose_level[i])
            event_time_nadir.append(glucose_level_ts[i])

for j in range(len(glucose_level) - 1):
    if glucose_level[j] > glucose_threshold_high and (glucose_level[j] - glucose_level[j - 1]) > rate_of_change_threshold and (glucose_level[j] - glucose_level[j + 1]) > rate_of_change_threshold:
        # Check that the event occurred within the time threshold
        # if len(event_time_nadir) > 0 and glucose_level_ts[j] - event_time_nadir[-1] < time_threshold:
        time_since_nadir = glucose_level_ts[j] - event_time_nadir[-1]
        if time_since_nadir.astype('timedelta64[s]').astype(int) >= 3600:  
            # check if time since last nadir event is >= 1 hour# Found a peak event
            event_peak.append(glucose_level[j])
            event_time_peak.append(glucose_level_ts[j])
        
# Check for a secondary nadir event
for k in range(len(glucose_level) - 1):
    if glucose_level[k] < glucose_level[k - 1] and (glucose_level[k - 1] - glucose_level[k]) > rate_of_change_threshold and (glucose_level[k + 1] - glucose_level[k]) > rate_of_change_threshold and glucose_level_ts[k] > event_time_peak[-1]:
        # Check that the event occurred within the time threshold
        #if len(event_time_secondary_nadir) > 0 and glucose_level_ts[k] - event_time_peak[-1] < time_threshold:
        time_since_peak = glucose_level_ts[k] - event_time_peak[-1]
        if time_since_peak.astype('timedelta64[s]').astype(int) >= 14400:   # check if time since last peak event is >= 4 hours
            # Found a secondary nadir event
            event_secondary_nadir.append(glucose_level[k])
            event_time_secondary_nadir.append(glucose_level_ts[k])

print('Event nadir:', event_nadir , event_time_nadir)
print('Event peak:', event_peak, event_time_peak)
print('Event Secondary nadir:', event_secondary_nadir, event_time_secondary_nadir)

Event nadir: [87.0, 87.0, 96.0, 95.0, 85.0, 98.0, 88.0, 68.0, 68.0, 72.0, 77.0] [numpy.datetime64('2020-05-14T12:19:00.000000000'), numpy.datetime64('2020-05-14T19:29:00.000000000'), numpy.datetime64('2020-05-23T05:15:00.000000000'), numpy.datetime64('2020-05-28T18:55:00.000000000'), numpy.datetime64('2020-02-06T18:37:00.000000000'), numpy.datetime64('2020-03-06T12:52:00.000000000'), numpy.datetime64('2020-09-06T04:09:00.000000000'), numpy.datetime64('2020-09-06T07:09:00.000000000'), numpy.datetime64('2020-11-06T03:49:00.000000000'), numpy.datetime64('2020-06-29T05:49:00.000000000'), numpy.datetime64('2020-04-07T13:37:00.000000000')]
Event peak: [229.0, 233.0, 152.0, 153.0, 172.0, 281.0, 274.0, 173.0, 142.0, 124.0, 216.0, 218.0, 167.0, 103.0, 165.0, 230.0, 324.0, 307.0, 254.0, 240.0, 224.0, 218.0, 185.0, 166.0, 163.0, 157.0, 278.0, 206.0, 200.0, 248.0, 123.0, 122.0, 170.0, 177.0, 107.0, 188.0, 228.0, 131.0, 109.0, 220.0, 279.0, 296.0, 177.0, 137.0, 136.0, 174.0, 305.0, 222.0, 239.0, 31

In [66]:
# Initialize events list
events = []

# Loop through all events
for i in range(len(event_peak)):
    initial_nadir = None
    secondary_nadir = None

    # Find the initial nadir event that occurred before the peak event
    for j in range(len(event_nadir)):
        if event_time_nadir[j] < event_time_peak[i]:
            initial_nadir = (event_nadir[j], event_time_nadir[j])
        
    # Find the secondary nadir event that occurred after the peak event
    for k in range(len(event_secondary_nadir)):
        if event_time_secondary_nadir[k] > event_time_peak[i]:
            secondary_nadir = (event_secondary_nadir[k], event_time_secondary_nadir[k])
    
    # Calculate the positive and negative glucose excursions
    positive_excursion = event_peak[i] - initial_nadir[0] if initial_nadir is not None else None
    negative_excursion = event_peak[i] - secondary_nadir[0] if secondary_nadir is not None else None

    # Add the event to the events list
    initial_nadir_glucose = initial_nadir[0] if initial_nadir is not None else None
    initial_nadir_time = initial_nadir[1] if initial_nadir is not None else None
    peak_glucose = event_peak[i]
    peak_time = event_time_peak[i]
    secondary_nadir_glucose = secondary_nadir[0] if secondary_nadir is not None else None
    secondary_nadir_time = secondary_nadir[1] if secondary_nadir is not None else None
    events.append((initial_nadir_glucose, initial_nadir_time, peak_glucose, peak_time, secondary_nadir_glucose, secondary_nadir_time, positive_excursion, negative_excursion))

# Create a pandas dataframe from the events list
df_events = pd.DataFrame(events, columns=['initial_nadir_glucose', 'initial_nadir_time', 'peak_glucose', 'peak_time', 'secondary_nadir_glucose', 'secondary_nadir_time', 'positive_excursion', 'negative_excursion'])
df_events.head()

Unnamed: 0,initial_nadir_glucose,initial_nadir_time,peak_glucose,peak_time,secondary_nadir_glucose,secondary_nadir_time,positive_excursion,negative_excursion
0,77.0,2020-04-07 13:37:00,229.0,2020-11-05 20:29:00,219.0,2020-12-06 19:49:00,152.0,10.0
1,77.0,2020-04-07 13:37:00,233.0,2020-11-05 21:29:00,219.0,2020-12-06 19:49:00,156.0,14.0
2,77.0,2020-04-07 13:37:00,152.0,2020-12-05 00:59:00,219.0,2020-12-06 19:49:00,75.0,-67.0
3,77.0,2020-04-07 13:37:00,153.0,2020-12-05 02:19:00,219.0,2020-12-06 19:49:00,76.0,-66.0
4,77.0,2020-04-07 13:37:00,172.0,2020-12-05 02:34:00,219.0,2020-12-06 19:49:00,95.0,-47.0


Step 3:

Calculate glucose appreance and disappearance rates

    a. Glucose excursion (mmol/L.min-1)
    
    b. iAUC

Step 4. 

Total change in BG (nadir to peak / peak to nadir) - +ve and -ve 

In [63]:
# Calculate iAUC for positive and negative excursions
positive_iAUC = []
negative_iAUC = []

for i in range(len(df_events)):
    # Get glucose levels for the positive and negative excursions
    initial_nadir_glucose = df_events.loc[i, 'initial_nadir_glucose']
    peak_glucose = df_events.loc[i, 'peak_glucose']
    secondary_nadir_glucose = df_events.loc[i, 'secondary_nadir_glucose']
    
    positive_glucose = np.arange(initial_nadir_glucose, peak_glucose)
    negative_glucose = np.arange(peak_glucose, secondary_nadir_glucose)
    
    # Calculate iAUC for positive excursion
    positive_iAUC.append(np.trapz(positive_glucose))
    
    # Calculate iAUC for negative excursion
    negative_iAUC.append(np.trapz(negative_glucose))

# Add columns to DataFrame
df_events['positive_iAUC'] = positive_iAUC
df_events['negative_iAUC'] = negative_iAUC


df_events.head()

Unnamed: 0,initial_nadir_glucose,initial_nadir_time,peak_glucose,peak_time,secondary_nadir_glucose,secondary_nadir_time,positive_excursion,negative_excursion,positive_iAUC,negative_iAUC
0,77.0,2020-04-07 13:37:00,229.0,2020-11-05 20:29:00,219.0,2020-12-06 19:49:00,152.0,10.0,23027.5,0.0
1,77.0,2020-04-07 13:37:00,233.0,2020-11-05 21:29:00,219.0,2020-12-06 19:49:00,156.0,14.0,23947.5,0.0
2,77.0,2020-04-07 13:37:00,152.0,2020-12-05 00:59:00,219.0,2020-12-06 19:49:00,75.0,-67.0,8436.0,12210.0
3,77.0,2020-04-07 13:37:00,153.0,2020-12-05 02:19:00,219.0,2020-12-06 19:49:00,76.0,-66.0,8587.5,12057.5
4,77.0,2020-04-07 13:37:00,172.0,2020-12-05 02:34:00,219.0,2020-12-06 19:49:00,95.0,-47.0,11656.0,8970.0


Identify the carbohydrate intake and timestamps

In [140]:
#unfiltered.head()

Unnamed: 0,glucose_level_ts,glucose_level_mg/dL,glucose_level_mmol/L,carbs_g,meal_ts,meal_type,bolus_dose,bolus_start_ts,bolus_end_ts,bolus_type,day_name,day_number,week_of_experiment,time_period
0,2020-11-05 00:02:00,129,,135.0,11/05/2020 11:25,Lunch,7.1,11/05/2020 08:24,11/05/2020 08:24,normal,Thursday,4,45.0,Early Morning
1,2020-11-05 00:07:00,128,,100.0,11/05/2020 17:16,Dinner,16.8,11/05/2020 11:24,11/05/2020 11:24,normal,Thursday,4,45.0,Early Morning
2,2020-11-05 00:12:00,129,,42.0,11/05/2020 21:26,Snack,14.2,11/05/2020 17:19,11/05/2020 17:19,normal,Thursday,4,45.0,Early Morning
3,2020-11-05 00:17:00,131,,37.0,12/05/2020 08:02,Breakfast,7.5,11/05/2020 21:23,11/05/2020 21:23,normal,Thursday,4,45.0,Early Morning
4,2020-11-05 00:22:00,133,,78.0,12/05/2020 12:07,Lunch,6.9,12/05/2020 08:00,12/05/2020 08:00,normal,Thursday,4,45.0,Early Morning


Step 6. 

Collate the results of the above points into a data set with columns names

- Day Name (Monday / Tuesday / Wednesday / Thursday / Friday / Saturday / Sunday)
- Day number (1 / 2 / 3 / 4 / 5 / 6 / 7)
- Week of experiment (1 / 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9)
- Time Period (Breakfast / Lunch / Dinner / Overnight / Early Morning)
- Time period Number (1 / 2 / 3 / 4 / 5)
- iAUC (+ve)
- iAUC (-ve)
- Glucose Excursion (+ve)
- Glucose Excursion (-ve)
- Total change in BG (+ve)
- Total change in BG (-ve)
- Peak value
- Carbs

In [64]:
df_events['day_names'] = unfiltered['glucose_level_ts'].dt.day_name()
df_events['time_of_day'] = unfiltered['glucose_level_ts'].apply(lambda x: get_time_period(pd.to_datetime(x)))
df_events['day_number'] = unfiltered['glucose_level_ts'].dt.dayofweek + 1
df_events['week_of_experiment'] = np.ceil(unfiltered['glucose_level_ts'].dt.dayofyear / 7)
df_events['carbs_g'] = unfiltered['carbs_g']
#df_events['meal_ts'] = pd.to_datetime(unfiltered['meal_ts'])
df_events['meal_type'] = unfiltered['meal_type']

df_events.head()

Unnamed: 0,initial_nadir_glucose,initial_nadir_time,peak_glucose,peak_time,secondary_nadir_glucose,secondary_nadir_time,positive_excursion,negative_excursion,positive_iAUC,negative_iAUC,day_names,time_of_day,day_number,week_of_experiment,carbs_g,meal_type
0,77.0,2020-04-07 13:37:00,229.0,2020-11-05 20:29:00,219.0,2020-12-06 19:49:00,152.0,10.0,23027.5,0.0,Thursday,Early Morning,4,45.0,135.0,Lunch
1,77.0,2020-04-07 13:37:00,233.0,2020-11-05 21:29:00,219.0,2020-12-06 19:49:00,156.0,14.0,23947.5,0.0,Thursday,Early Morning,4,45.0,100.0,Dinner
2,77.0,2020-04-07 13:37:00,152.0,2020-12-05 00:59:00,219.0,2020-12-06 19:49:00,75.0,-67.0,8436.0,12210.0,Thursday,Early Morning,4,45.0,42.0,Snack
3,77.0,2020-04-07 13:37:00,153.0,2020-12-05 02:19:00,219.0,2020-12-06 19:49:00,76.0,-66.0,8587.5,12057.5,Thursday,Early Morning,4,45.0,37.0,Breakfast
4,77.0,2020-04-07 13:37:00,172.0,2020-12-05 02:34:00,219.0,2020-12-06 19:49:00,95.0,-47.0,11656.0,8970.0,Thursday,Early Morning,4,45.0,78.0,Lunch


In [None]:
# Save the DataFrame to a CSV file

file_path = '/Users/user/Desktop/Clustering Project/CSV Files/544_clustering_data.csv'

df_events.to_csv(path_or_buf=file_path, index=False)