In [54]:
import pandas as pd
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.datatypes import convert_to
from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import os

'''
The following code block is taken from Andrew's notebook on 
time series forest classifier :)
'''
# Directory the raw CSV files are read from (kept in one place instead of repeated literals)
RAW_DATA_DIR = '../data/raw'

# List all CSV files in the data/raw directory.
# os.listdir returns entries in arbitrary order, so sort to keep the dictionary order
# (and the printed list below) reproducible between runs and machines.
csv_files = sorted(f for f in os.listdir(RAW_DATA_DIR) if f.endswith('.csv'))

# ,date,sender_id,bgl,bgl_date_millis,text,template,msg_type,affects_fob,affects_iob,dose_units,food_g,food_glycemic_index,dose_automatic,fp_bgl,message_basal_change,__typename,trend
# Define the columns to load
columns_to_load = ['date', 'bgl', 'msg_type', 'affects_fob', 'affects_iob','dose_units','food_g','food_glycemic_index']

# Load each CSV file into a DataFrame and store them in a dictionary keyed by file name
dataframes = {
    file: pd.read_csv(os.path.join(RAW_DATA_DIR, file), usecols=columns_to_load, parse_dates=['date'])
    for file in csv_files
}

# Print the names of the loaded DataFrames
print("Loaded DataFrames:", list(dataframes.keys()))

Loaded DataFrames: ['500030_2024-07-01_2024-09-30.csv', '679372_2024-07-01_2024-09-30.csv']


In [55]:
# Show the column layout of each loaded file so the two can be compared.
for file_name in ('500030_2024-07-01_2024-09-30.csv', '679372_2024-07-01_2024-09-30.csv'):
    print(dataframes[file_name].columns)

# Preview the first rows of the 679372 file whose message type is ANNOUNCE_MEAL.
frame_679372 = dataframes['679372_2024-07-01_2024-09-30.csv']
frame_679372.loc[frame_679372['msg_type'] == 'ANNOUNCE_MEAL'].head()


Index(['date', 'bgl', 'msg_type', 'affects_fob', 'affects_iob', 'dose_units',
       'food_g', 'food_glycemic_index'],
      dtype='object')
Index(['date', 'bgl', 'msg_type', 'affects_fob', 'affects_iob', 'dose_units',
       'food_g', 'food_glycemic_index'],
      dtype='object')


Unnamed: 0,date,bgl,msg_type,affects_fob,affects_iob,dose_units,food_g,food_glycemic_index
256,2024-07-01 13:39:01.062000-04:00,135.0,ANNOUNCE_MEAL,True,False,0.0,22.0,0.5
275,2024-07-01 14:32:31.740000-04:00,157.0,ANNOUNCE_MEAL,True,False,0.0,22.0,0.5
308,2024-07-01 15:44:35.026000-04:00,223.0,ANNOUNCE_MEAL,True,False,0.0,22.0,0.5
718,2024-07-02 13:19:08.176000-04:00,102.0,ANNOUNCE_MEAL,True,False,0.0,40.0,0.5
849,2024-07-02 20:12:00.212000-04:00,97.0,ANNOUNCE_MEAL,True,False,,15.0,0.5


In [56]:
df = dataframes['679372_2024-07-01_2024-09-30.csv']
# Label rows that have no message type as 'NULL' directly, then fill every other
# missing value with 0. Both steps return new frames, so the entry stored in
# `dataframes` is left untouched and re-running this cell gives the same result.
df = df.assign(msg_type=df['msg_type'].fillna('NULL')).fillna(0)
# Last expression of the cell: the distinct message types after cleaning.
df['msg_type'].unique()


array(['NULL', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN', 'TEXT',
       'ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE',
       'NEW_PEN', 'NEW_SENSOR', 'BGL_FP_READING', 'MEDICAL_TEST_RESULT'],
      dtype=object)

In [57]:
# Sanity check: after the cleaning step no row should have msg_type equal to 0,
# so this preview is expected to be empty.
df.loc[df['msg_type'].eq(0)].head()


Unnamed: 0,date,bgl,msg_type,affects_fob,affects_iob,dose_units,food_g,food_glycemic_index


In [58]:
'''
At this point df['msg_type'].unique() returns the following:
array(['NULL', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN', 'TEXT',
       'ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE',
       'NEW_PEN', 'NEW_SENSOR', 'BGL_FP_READING', 'MEDICAL_TEST_RESULT'],
      dtype=object)

This block one hot encodes msg_type and keeps only the indicator columns of the
categories relevant for meal identification
(in this case, 'ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN')

The indicator columns of every other category (including 'NULL') are dropped, so a row
with any other message type is all zeros across the kept indicator columns.
'''
RELEVANT_MSG_TYPES = ['ANNOUNCE_MEAL', 'INTERVENTION_SNACK', 'ANNOUNCE_EXERCISE', 'DOSE_INSULIN', 'DOSE_BASAL_INSULIN']
MSG_TYPE_COLUMN = 'msg_type'

encoder = OneHotEncoder(categories='auto', sparse_output=False)
encoded_data = encoder.fit_transform(df[[MSG_TYPE_COLUMN]])
# Reuse df's index: the axis=1 concat below aligns rows by index label, so a fresh
# 0..n-1 index would misalign rows (and introduce NaNs) if df were ever filtered or re-indexed.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out([MSG_TYPE_COLUMN]),
    index=df.index,
)

# Match indicator columns by their exact generated name ('msg_type_<category>')
# instead of substring tests, so unrelated or similarly named columns are never touched.
relevant_columns = {f'{MSG_TYPE_COLUMN}_{msg_type}' for msg_type in RELEVANT_MSG_TYPES}
COLUMNS_TO_DROP = [col for col in encoded_df.columns if col not in relevant_columns]

# Swap the raw msg_type column for the relevant indicator columns in a single step,
# without mutating any frame in place.
df = pd.concat(
    [df.drop(columns=[MSG_TYPE_COLUMN]), encoded_df.drop(columns=COLUMNS_TO_DROP)],
    axis=1,
)



In [None]:
'''

'''