# Junwon (Paul) Park

https://www.sktime.net/en/stable/api_reference/auto_generated/sktime.annotation.stray.STRAY.html

In [1]:
import pandas as pd

# Importing Data

In [2]:
# running data preprocessing notebook to use its methods
%run "1.02-vr-ggs-data-preprocessing.ipynb"  # Use quotes here

In [3]:
columns_to_load = [
    'date', 'sender_id', 'bgl', 'bgl_date_millis', 'text', 'template', 'msg_type', 
    'affects_fob', 'affects_iob', 'dose_units', 'food_g', 'food_glycemic_index', 
    'dose_automatic', 'fp_bgl', 'message_basal_change', '__typename', 'trend'
]

# Call load_data with specified columns
df_list = load_data(
    data_dir='../data/raw',  # Replace with your actual directory path
    columns_to_load=columns_to_load,
    date_parse_columns=['date']
)


Loaded data from 2 files.


  df = pd.read_csv(os.path.join(data_dir, file), usecols=columns_to_load, parse_dates=date_parse_columns)


In [4]:
df_500030 = df_list[0]
df_679372 = df_list[1]

we will use df_500030 for now.

# Storing Meal times and other message times into df

In [5]:
def filter_meal_related_events(df):
    """
    Filter the DataFrame to include only rows where 'msg_type' matches specific meal-related events.
    
    Parameters:
    - df : DataFrame, the original DataFrame containing a 'msg_type' column.
    
    Returns:
    - filtered_df : DataFrame, filtered to include only meal-related events with relevant dates.
    """
    # List of relevant meal-related events
    relevant_msg_types = ['ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN']
    
    # Filter DataFrame to include only rows with relevant 'msg_type' values
    filtered_df = df[df['msg_type'].isin(relevant_msg_types)].copy()
    
    # Ensure 'date' column is in datetime format if not already converted
    filtered_df['date'] = pd.to_datetime(filtered_df['date'], errors='coerce', utc=True)
    
    # Drop any rows with invalid or missing dates
    filtered_df.dropna(subset=['date'], inplace=True)
    
    # Sort by date and reset index
    filtered_df = filtered_df.sort_values('date').reset_index(drop=True)
    
    return filtered_df


In [6]:
# this data frame will help us validate meal times, we can validate our model using this data by checking if selected date is an actual meal time?
df_actual = filter_meal_related_events(df_679372) 

In [38]:
preprocess_data(df_500030).info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 32521 entries, 2024-07-01 05:02:39+00:00 to 2024-10-01 04:57:37+00:00
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   sender_id                   32521 non-null  float64
 1   bgl                         32521 non-null  float64
 2   bgl_date_millis             32521 non-null  float64
 3   text                        32521 non-null  object 
 4   template                    32521 non-null  object 
 5   affects_fob                 32521 non-null  int64  
 6   affects_iob                 32521 non-null  int64  
 7   dose_units                  32521 non-null  float64
 8   food_g                      32521 non-null  float64
 9   food_glycemic_index         32521 non-null  float64
 10  dose_automatic              32521 non-null  object 
 11  fp_bgl                      32521 non-null  float64
 12  message_basal_change        32521 non-nul

# Cleaned and pre processed data

In [7]:
df_679372_processed = preprocess_data(df_679372)

# Model Traning

## Step1: Identify Relevant Features for STRAY MODEL

### Relevant Columns:

`bgl`: The main variable representing blood glucose levels, which could show spikes or patterns around meals.

`dose_units`: Insulin doses often correlate with meals (especially bolus doses) to manage postprandial glucose levels.

`food_g`: The amount of carbohydrates, where non-zero values likely indicate meals or snacks.

`food_glycemic_index`: Helps distinguish between regular meals and higher-glycemic foods (like snacks). 0.5 for regular meal announcements, or 1 for INTERVENTION_SNACKS

`affects_fob` or `affects_iob`: fob -> food on board, iob -> insuline on board. 

`trend`: Indicates the direction of blood glucose change rate, which could help capture sharp rises or falls associated with meals.

### Improvement:

Potentially do a better feature selection, currently I wanted to testout how the STRAY model works with multiple explanatory variables thus I have selected to work with the columns specified above.

In [8]:
def prepare_data_for_stray(df):
    """
    Prepare the DataFrame by selecting relevant columns for STRAY anomaly detection model.
    
    Parameters:
    - df : DataFrame, the original DataFrame.
    
    Returns:
    - df_for_stray : DataFrame, with selected columns and necessary preprocessing applied.
    """
    # Select columns that are relevant for detecting meal-related anomalies
    df_for_stray = df[['bgl', 'dose_units', 'food_g', 'food_glycemic_index', 
                       'affects_fob', 'affects_iob', 'trend']]
    
    # Convert any categorical variables to numeric if needed
    # For example, encoding 'trend' as numbers (e.g., FLAT=0, SINGLE_UP=1, etc.)
    trend_mapping = {'FLAT': 0, 'SINGLE_UP': 1, 'DOUBLE_UP': 2, 'FORTYFIVE_UP': 3, 
                     'FORTYFIVE_DOWN': -1, 'DOUBLE_DOWN': -2, 'NOT_COMPUTABLE': None}
    df_for_stray['trend'] = df_for_stray['trend'].map(trend_mapping)
    
    # Drop rows with nulls if necessary
    df_for_stray.dropna(inplace=True)
    
    return df_for_stray

In [9]:
# Apply to your DataFrame
df_for_stray = prepare_data_for_stray(df_679372_processed)  # Replace 'original_df' with your DataFrame variable

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_for_stray['trend'] = df_for_stray['trend'].map(trend_mapping)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_for_stray.dropna(inplace=True)


## Step2: Train an Apply STRAY for Meal Detection

STRAY is typically used to detect outliers, which in this case can signify unusual blood glucose changes related to meals.


In [10]:
# setting up STRAY
from sktime.annotation.stray import STRAY

In [11]:
# Initialize STRAY for anomaly detection
stray_model = STRAY()  # k is a parameter defining the number of neighbors, adjust based on data

# Detect anomalies using fit_transform (not fit_predict)
anomalies = stray_model.fit_transform(df_for_stray)

## Step3: Model Accuracy

In [35]:
from sklearn.preprocessing import OneHotEncoder

def preprocess_for_meal_labels(df):
    """
    Preprocess the combined DataFrame.
    ---
    1. Fill NaN values with 0
    2. Replace 0 with 'NULL' in the 'msg_type' column
    3. One hot encode the 'msg_type' column
    4. Drop irrelevant columns for the target variable 
    (in this case, only keeps 'ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN')
    5. Drop rows with invalid dates
    6. Change affects_fob and affects_iob to 1 and 0
    """
    df = df.fillna(0)
    df['msg_type'] = df['msg_type'].replace(0, 'NULL')

    # Convert 'date' column to datetime with a custom format
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S%z', errors='coerce', utc=True)
    
    # Drop rows where the date is null or invalid
    df = df.dropna(subset=['date'])
    
    # Sort by date and handle duplicate timestamps
    df = df.sort_values('date')
    
    # Set 'date' as the index
    df.set_index('date', inplace=True)
    # Change affects_fob and affects_iob to 1 and 0
    df['affects_fob'] = df['affects_fob'].apply(lambda x: 1 if x != 0 else 0)
    df['affects_iob'] = df['affects_iob'].apply(lambda x: 1 if x != 0 else 0)

    RELEVANT_MSG_TYPES = ['ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN']
    
    encoder = OneHotEncoder(categories='auto', sparse_output=False)
    encoded_data = encoder.fit_transform(df[['msg_type']])
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['msg_type']), index=df.index)
    
    df = df.drop(columns=['msg_type'])
    df = pd.concat([df, encoded_df], axis=1)
    

    
    return df


In [36]:
df_actual = preprocess_for_meal_labels(df_679372)

In [None]:
df_actual['msg']

In [22]:
# adding in the model predictied anomalies
df_actual['is_anomaly'] = anomalies

In [34]:
df_679372['msg_type'].unique()

array([nan, 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN', 'TEXT', 'ANNOUNCE_MEAL',
       'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'NEW_PEN', 'NEW_SENSOR',
       'BGL_FP_READING', 'MEDICAL_TEST_RESULT'], dtype=object)

In [33]:
df_actual['msg_type'].unique()

array(['NULL', 'INTERVENTION_SNACK', 'NEW_PEN', 'NEW_SENSOR'],
      dtype=object)

In [None]:
# visual inspection with time series plot

In [None]:
# evaluate windoweddetaction (tolerance for anomaly timing)

In [None]:
# adjust stray model parameters and re-evaluate
# testing again with hyper parameter tuning, gradient boost, and cross validation methods
# random test