# Raw to Processed

## Import Libraries

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import json
import datetime as dt
import pytz
import matplotlib.dates as md
import numpy as np

## Plot data

In [None]:
def plot_line(ax, metric, data):
    '''
    Draws one metric on the given axis.
    
    Parameters
    ----------
    ax: matplotlib Axes
        axis the metric is drawn on
    metric: string
        name of the metric (and of the column in data) to draw
    data: DataFrame
        datetime indexed data containing a column for the metric
    '''

    # plot kind, y axis label and colour used for each supported metric
    styles = {
        'bg': ('line', 'Blood Glucose [mmol/L]', 'tab:red'),
        'insulin': ('line', 'Insulin [U]', 'tab:blue'),
        'carbs': ('scatter', 'Carbohydrate [grams]', 'tab:green'),
        'hr': ('line', 'Heart Rate [bpm]', 'tab:pink'),
        'dist': ('bar', 'Distance [m]', 'tab:gray'),
        'steps': ('bar', 'Steps [count]', 'tab:cyan'),
        'cals': ('line', 'Calories Burned [kcals]', 'tab:olive'),
        'activity': ('scatter', 'Activity', 'tab:purple'),
    }

    # keeps only the rows where the metric has a value
    series = data.loc[data[metric].notna(), [metric]]
    kind, label, colour = styles[metric]

    # lines get a nan row in the middle of every gap longer than 30 minutes so the line is broken there
    if kind == 'line':
        spacing = series.index.to_series().diff()
        follows_gap = spacing > pd.Timedelta(minutes=30)
        if follows_gap.any():
            after_gap = series.index[follows_gap]
            before_gap = series.index[follows_gap.shift(-1, fill_value=False)]
            midpoints = after_gap + (before_gap - after_gap) / 2
            series = pd.concat([series, pd.DataFrame(index=midpoints)]).sort_index()

    # formats the axes
    ax.xaxis.set_major_formatter(md.DateFormatter('%m/%d %H:%M'))
    ax.set_xlabel('Time [mm/dd HH:MM]')
    ax.set_ylabel(label, color=colour)

    # draws the data with the plotting method matching the metric's kind
    if kind == 'bar':
        ax.bar(series.index, series[metric], width=pd.Timedelta(minutes=5), color=colour)
    elif kind == 'scatter':
        ax.scatter(series.index, series[metric], color=colour)
    elif kind == 'line':
        ax.plot(series.index, series[metric], color=colour)

    ax.tick_params(axis='y', labelcolor=colour)
    
    
def plot_axis(group, ax, data, start, end):
    '''
    Draws every metric of a group onto one subplot.
    
    Parameters
    ----------
    group: list of strings
        the metrics that share this subplot
    ax: matplotlib Axes
        the subplot's primary axis
    data: DataFrame
        the data being plotted
    start: string
        lower bound for the date range to be plotted
    end: string
        upper bound for the date range to be plotted
    '''
    supported = ('bg', 'insulin', 'carbs', 'hr', 'dist', 'steps', 'cals', 'activity')

    for position, metric in enumerate(group):
        if metric not in supported:
            print('"{}" is not an available metric to plot.'.format(metric))
            continue

        if not data[metric].notna().any():
            print('No "{}" data in the plotted period.'.format(metric))
            continue

        # the first metric of the group uses the subplot's own y axis
        if position == 0:
            plot_line(ax, metric, data)
            continue

        # later metrics get their own y axis, pushed further right for each extra metric
        twin = ax.twinx()
        plot_line(twin, metric, data)
        if position > 1:
            twin.spines['right'].set_position(('outward', 60 * (position - 1)))

    ax.set_xlim(pd.to_datetime(start), pd.to_datetime(end))
        
        
def plot_data(data, start = '2023/07/01', end = '2023/07/04', layout = None):
    '''
    Plots the data, one subplot per group of metrics.
    
    Parameters
    ----------
    data: DataFrame
        participant data with a datetime index
    start: string
        lower bound (inclusive) for the date range to be plotted
    end: string
        upper bound (exclusive) for the date range to be plotted
    layout: list of lists of strings, optional
        the metrics to plot on each axis; when omitted the T1D metrics
        ('bg', 'insulin', 'carbs') are drawn on the first axis and the
        smartwatch metrics ('hr', 'dist', 'steps', 'cals', 'activity') on the
        second
    '''
    # the default is built per call so a shared mutable list is never used as a default argument
    if layout is None:
        layout = [['bg', 'insulin', 'carbs'], ['hr', 'dist', 'steps', 'cals', 'activity']]

    data = data.loc[(data.index >= pd.to_datetime(start)) & (data.index < pd.to_datetime(end))]

    # squeeze=False always returns a 2D array of axes, so a single-row layout needs no special case
    fig, axes = plt.subplots(len(layout), 1, figsize=(16, 4*len(layout)), squeeze=False)

    for i, group in enumerate(layout):
        plot_axis(group, axes[i, 0], data, start, end)

    plt.show()

## Extract T1D data

### General functions

In [None]:
def split_time(data):
    '''
    Builds the 5 minute grid covering the data's time period.
    
    Parameters
    ----------
    data: DataFrame
        a dataframe containing the basal data in a column titled 'basal' and an 
        index with datetimes from that basal shift
        
    Returns
    -------
    time_range: DatetimeIndex
        5 minute spaced timestamps between the first basal value and the last row
    '''
    first_basal = data['basal'].dropna().index.min()
    last_row = data.index.max()

    # the offsets push the start to the next 5 minute mark and pull the end back to the previous one
    grid_start = (first_basal + pd.Timedelta(minutes=2, seconds=29)).round('5min')
    grid_end = (last_row - pd.Timedelta(minutes=2, seconds=30)).round('5min')

    return pd.date_range(grid_start, grid_end, freq='5min')

def add_device(p_path, data, group):
    '''
    Adds a column for the device the data has come from.
    
    Every row is labelled with the most recent device of the requested group
    whose start date is strictly before the row's timestamp. Rows that are not
    after any device start date are left without a value.
    
    Parameters
    ----------
    p_path: string
        path to the directory containing 'devices.csv' (columns 'date',
        'group' and 'device')
    data: DataFrame
        datetime indexed data to add the column to (modified in place)
    group: string
        the device group that the device is from ('CGM', 'Insulin Pump' or 
        'Smartwatch')
    
    Returns
    -------
    data: DataFrame
        data with the added device column
    '''
    devices = pd.read_csv(os.path.join(p_path, 'devices.csv'))
    devices['date'] = pd.to_datetime(devices['date'])

    # devices are applied oldest first so that a later device always overwrites an earlier one,
    # whatever order the rows have in the file (stable sort keeps file order for equal dates)
    group_devices = devices[devices['group'] == group].sort_values('date', kind='mergesort')

    for start, device in zip(group_devices['date'], group_devices['device']):
        data.loc[data.index > start, 'device'] = device
        
    return data

def mean_duration(path):
    '''
    Finds the mean extended bolus duration across all participants.
    
    Parameters
    ----------
    path: string
        path to search (recursively) for 'extended.csv' duration files in
        
    Returns
    -------
    mean_dur: Timedelta
        the mean duration from those found in the data
        
    Raises
    ------
    ValueError
        if no duration values are found under path
    '''
    durs = []
    
    for root, _dirs, files in os.walk(path):
        if 'extended.csv' not in files:
            continue

        ext = pd.read_csv(os.path.join(root, 'extended.csv'))
        recorded = ext['duration'].dropna()
        if recorded.empty:
            continue

        # durations are stored as 'H:MM' so seconds are appended before parsing
        durs.extend(pd.to_timedelta(recorded + ':00').to_list())

    if not durs:
        raise ValueError('no extended bolus durations found under {}'.format(path))

    mean_dur = sum(durs, dt.timedelta(0)) / len(durs)
    return mean_dur

### CareLink

In [None]:
def extract_carelink_pump(p_path, fso):
    '''
    Extracts insulin and carbohydrate data from a carelink pump file.
    
    Basal rates, temporary basal rates, boluses (including extended boluses)
    and suspend events are combined into the insulin delivered in each 5 minute
    interval, and the carbohydrate entries are kept at minute resolution.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of file to extract the carelink pump data from
    
    Returns
    -------
    data: DataFrame
        extracted insulin and carb values from the carelink pump file, indexed
        by timestamp, with the device column added by add_device
    '''
    # loads the file and keeps only the columns used below, under shorter names
    data = pd.read_csv(os.path.join(p_path, fso), low_memory=False)
    data = data[[
        'DateTime', 
        'Basal Rate (U/h)', 
        'Temp Basal Amount', 
        'Temp Basal Duration (h:mm:ss)', 
        'Bolus Volume Delivered (U)',
        'Bolus Duration (h:mm:ss)',
        'Suspend',
        'BWZ Carb Input (grams)',
        'BWZ Status',
        'Bolus Source'
    ]]
    data = data.rename(columns={
        'DateTime': 'timestamp', 
        'Basal Rate (U/h)': 'basal', 
        'Temp Basal Amount': 'temp_basal', 
        'Temp Basal Duration (h:mm:ss)': 'temp_basal_dur', 
        'Bolus Volume Delivered (U)': 'bolus',
        'Bolus Duration (h:mm:ss)': 'bolus_dur',
        'Suspend': 'suspend',
        'BWZ Carb Input (grams)': 'carbs',
        'BWZ Status': 'status',
        'Bolus Source': 'source'
    })
    
    # indexes by timestamp, drops rows with no values at all and reverses the row order
    # (presumably the export lists the newest rows first - TODO confirm)
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data = data.set_index('timestamp')
    data = data.dropna(how='all')
    data = data.iloc[::-1]

    # adds the basal rate to rows highlighting return to normal pumping so that the basal rate change is repeated as with other basal rate changes
    # the value used is the mean of the non-missing basal values recorded at the same timestamp
    for time, row in data.iterrows():
        if row['suspend'] == 'NORMAL_PUMPING':
            data.at[time, 'basal'] = data[data['basal']==data['basal']]['basal'][time].mean()  

    # when basal rate would change during a temporary basal rate replace the basal rate value with the temporary basal rate value
    # within each temp basal window every second non-missing basal record is overwritten with the value of the record before it;
    # basal_rate is carried over from one temp basal window to the next
    basal_rate = 0
    temp_basals = data[data['temp_basal'] == data['temp_basal']][['temp_basal', 'temp_basal_dur']]

    for time, temp_basal in temp_basals.iterrows():
        count = 0
        basal_events = data[(data.index >= time) & (data.index < time + pd.Timedelta(temp_basal['temp_basal_dur']))]

        for t, event in basal_events[basal_events['basal'] == basal_events['basal']].iterrows():
            if count%2 == 1:
                data.at[t, 'basal'] = basal_rate
            basal_rate = event['basal']
            count += 1

    # puts the rows into chronological order
    data = data.sort_index()
    
    # splits the time period into 5 minute intervals
    time_range = split_time(data)
    
    # calculates the insulin delivered in the last 5 minutes for each time interval
    insulin_values = []
    # the starting basal rate is the last non-missing basal value at or before the first interval boundary
    basal_rate = data[data.index <= time_range[0]][data[data.index <= time_range[0]]['basal'].notna()]['basal'].iloc[-1]
    # the most recent basal value that has not yet been applied and the (rounded) minute it was seen in
    basal_rate_temp = 0
    basal_rate_temp_t = pd.to_datetime('2000/01/01')
    for time in range(len(time_range)-1):
        insulin = 0
        # time from which the current basal rate still has to be accounted for in this interval
        recent_basal_t = time_range[time]
        for index, event in data.loc[(data.index >= time_range[time]) & (data.index < time_range[time+1])].iterrows():
            if event['basal'] == event['basal']:
                # a basal value only replaces the running rate when the previous basal record had the same value
                # in the same rounded minute; otherwise it is just remembered as the candidate
                if basal_rate_temp == event['basal'] and basal_rate_temp_t == (index+pd.Timedelta(seconds=29)).round('min'):
                    # insulin delivered at the old rate up to this change
                    insulin += (index - recent_basal_t).seconds/3600 * basal_rate
                    basal_rate = event['basal']
                    recent_basal_t = index
                else:
                    basal_rate_temp = event['basal']
                    basal_rate_temp_t = (index+pd.Timedelta(seconds=29)).round('min')


            if event['bolus'] == event['bolus']:
                # if the insulin pump is in closed-loop mode each dose the insulin pump gives as part of the basal rate is recorded as a bolus instead so the basal needs to be ignored to avoid double counting
                if event['source'] == 'CLOSED_LOOP_AUTO_BASAL' or event['source'] == 'CLOSED_LOOP_AUTO_BOLUS':
                    basal_rate = 0

                # retrospectively goes back and adds extended boluses
                if event['bolus_dur'] == event['bolus_dur']:
                    bolus_dur = pd.Timedelta(event['bolus_dur'])
                    # NOTE(review): .seconds is only the seconds component of the duration (whole days are ignored)
                    bolus_rate = event['bolus']/pd.Timedelta(event['bolus_dur']).seconds # in units per second

                    # (index - 2m31s) rounded to 5 minutes is used as the start of the current interval, so the
                    # comparison checks whether the whole bolus fits between that start and this event
                    if bolus_dur < index - (index-pd.Timedelta(minutes=2, seconds=31)).round('5min'):
                        insulin += bolus_dur.seconds * bolus_rate
                    else:
                        # the part inside the current interval is added here and the rest is spread backwards:
                        # insulin_values[-i] counts back from the most recently completed interval
                        # NOTE(review): raises IndexError if fewer than i intervals have been completed so far
                        insulin += (index - (index-pd.Timedelta(minutes=2, seconds=31)).round('5min')).seconds * bolus_rate
                        bolus_dur -= index - (index-pd.Timedelta(minutes=2, seconds=31)).round('5min')
                        i=1
                        while bolus_dur >= pd.Timedelta(minutes=5):
                            insulin_values[-i] += 300 * bolus_rate
                            bolus_dur -= pd.Timedelta(minutes=5)
                            i += 1
                        insulin_values[-i] += bolus_dur.seconds * bolus_rate
                else:
                    insulin += event['bolus']
        # basal insulin delivered at the current rate for the remainder of the interval
        insulin += (time_range[time+1] - recent_basal_t).seconds/3600 * basal_rate
        insulin_values.append(insulin)
    
    # formats the insulin values into a dataframe, each value labelled with the end of its interval
    insulin_values = pd.DataFrame({'timestamp': time_range[1:], 'insulin': insulin_values})
    insulin_values = insulin_values.set_index('timestamp')

    # removes carb values from cases where the delivery failed and 0 carbs is recorded and rounds them to the nearest minute
    # carb entries that round to the same minute are summed
    data = data.drop(data[data['status'] == 'Not Delivered'].index)
    carbs = data[['carbs']]
    carbs = carbs.loc[(carbs['carbs'] == carbs['carbs']) & (carbs['carbs'] > 0)]
    carbs.index = carbs.index.round('min')
    carbs = carbs.groupby(carbs.index).sum()

    # combines insulin and carbs data and sorts
    data = insulin_values.join(carbs, how='outer')
    data = data.dropna(how='all')
    data = data.sort_index()

    # adds the device column
    data = add_device(p_path, data, 'Insulin Pump')
    
    return data

In [None]:
def extract_carelink_sensor(p_path, fso):
    '''
    Extracts the sensor glucose readings from a carelink sensor file.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of file to extract the carelink sensor data from
    
    Returns
    -------
    data: DataFrame
        extracted bg values from the carelink sensor file
    '''
    raw = pd.read_csv(os.path.join(p_path, fso), low_memory=False)

    # keeps the timestamp and glucose columns under the shared column names
    readings = raw[['DateTime', 'Sensor Glucose (mmol/L)']]
    readings = readings.rename(columns={
        'DateTime': 'timestamp',
        'Sensor Glucose (mmol/L)': 'bg'
    })
    readings = readings.drop_duplicates()

    # indexes by timestamp rounded to the nearest minute
    readings['timestamp'] = pd.to_datetime(readings['timestamp'])
    readings = readings.set_index('timestamp')
    readings.index = readings.index.round('min')

    # removes rows without a reading and puts the rest in time order
    readings = readings.dropna(how='all').sort_index()

    return add_device(p_path, readings, 'CGM')

### Clarity

In [None]:
def extract_clarity(p_path, fso):
    '''
    Extracts bg readings from a clarity file.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of file to extract the clarity data from
    
    Returns
    -------
    data: DataFrame
        extracted data from the clarity file
    '''
    raw = pd.read_csv(os.path.join(p_path, fso))

    # keeps the timestamp and glucose columns under the shared column names
    readings = raw[['Timestamp', 'Glucose Value (mmol/L)']]
    readings = readings.rename(columns={
        'Timestamp': 'timestamp',
        'Glucose Value (mmol/L)': 'bg'
    })
    readings = readings.drop_duplicates()

    # 'Low' and 'High' warnings are replaced by the threshold values before converting to numbers
    thresholded = readings['bg'].replace('Low', '2.2').replace('High', '22.2')
    readings['bg'] = pd.to_numeric(thresholded)

    # indexes by timestamp rounded to the nearest minute, in time order
    readings['timestamp'] = pd.to_datetime(readings['timestamp']).round('min')
    readings = readings.set_index('timestamp').sort_index()

    return add_device(p_path, readings, 'CGM')

### Glooko

In [None]:
def detect_misalignment(cgm_data):
    '''
    Detects misalignment between cgm readings recorded on the pump and phone and adjusts the cgm readings on the pump based on these misalignments.

    The detection only runs when the data contains both a 'pump' and a 'phone' source; otherwise the data is returned unchanged together with an empty misalignment table.

    Parameters
    ----------
    cgm_data: DataFrame
        glooko cgm data with 'timestamp', 'bg' and 'source' columns

    Returns
    -------
    cgm_data: DataFrame
        cgm data updated for misalignments (reduced to the 'timestamp' and 'bg' columns when both sources are present)
    misalignment_start_end: DataFrame
        the start, end and value of misalignments detected in the cgm readings (empty if none were detected)
    '''
    misalignment_start_end = pd.DataFrame()
    
    if 'pump' in cgm_data['source'].unique() and 'phone' in cgm_data['source'].unique():
        pump_data = cgm_data[cgm_data['source'] == 'pump'].reset_index(drop=True)
        phone_data = cgm_data[cgm_data['source'] == 'phone'].reset_index(drop=True)

        # calculates all possible misalignments
        # for each pump reading these are the offsets (pump time minus phone time) to every phone reading
        # with exactly the same bg value less than 2 hours before or after it
        misalignments = []
        for i, pump_row in pump_data.iterrows():
            pump_time = pump_row['timestamp']
            pump_value = pump_row['bg']
            misalignments.append((pump_time - phone_data[(phone_data['timestamp'] > pump_time - pd.Timedelta(hours=2)) & (phone_data['timestamp'] < pump_time + pd.Timedelta(hours=2)) & (phone_data['bg'] == pump_value)]['timestamp']).to_list())
        pump_data['misalignments'] = misalignments

        # selects frequently occurring misalignments as the true misalignment
        # a candidate is accepted when more than 15 other pump readings in the following 90 minutes, or failing that
        # in the preceding 90 minutes, list the same offset; readings without an accepted candidate get NaT
        final_misalignments = []
        for i, pump_row in pump_data.iterrows():
            final_misalignment = pd.NaT
            for misalignment in pump_row['misalignments']:
                count = 0
                for other_misalignments in pump_data[(pump_data['timestamp'] > pump_row['timestamp']) & (pump_data['timestamp'] < pump_row['timestamp'] + pd.Timedelta(minutes=90))]['misalignments']:
                    if misalignment in other_misalignments:
                        count += 1
                if count > 15:
                    final_misalignment = misalignment
                    break
                count = 0
                for other_misalignments in pump_data[(pump_data['timestamp'] > pump_row['timestamp'] - pd.Timedelta(minutes=90)) & (pump_data['timestamp'] < pump_row['timestamp'])]['misalignments']:
                    if misalignment in other_misalignments:
                        count += 1
                if count > 15:
                    final_misalignment = misalignment
                    break
            final_misalignments.append(final_misalignment)
        pump_data['misalignments'] = final_misalignments

        # comparing the column with itself is False for NaT, so this checks for at least one accepted misalignment
        if len(pump_data[pump_data['misalignments'] == pump_data['misalignments']]['misalignments']) > 0:
            # groups the misalignments into groups with start and end times
            misalignment_data = pump_data[pump_data['misalignments'] == pump_data['misalignments']][['timestamp', 'misalignments']]
            # drops rows whose misalignment equals both neighbours, leaving the first and last row of each run
            mask = (misalignment_data['misalignments'] == misalignment_data['misalignments'].shift(1)) & (misalignment_data['misalignments'] == misalignment_data['misalignments'].shift(-1))
            misalignment_data = misalignment_data[~mask]
            # the group number increases every time the misalignment value changes
            misalignment_data['group'] = (misalignment_data['misalignments'] != misalignment_data['misalignments'].shift()).cumsum()
            misalignment_start_end = (
                misalignment_data.groupby('group')
                .agg(
                    start_time=('timestamp', 'first'),
                    end_time=('timestamp', 'last'),
                    misalignment=('misalignments', 'first')
                )
                .reset_index(drop=True)
            )

            # extends the misalignment time ranges to be touching if there is a small change in misalignment value
            # (a change of at most 3 minutes between consecutive groups)
            for i in range(len(misalignment_start_end) - 1):
                if np.abs(misalignment_start_end.loc[i+1, 'misalignment'] - misalignment_start_end.loc[i, 'misalignment']) <= pd.Timedelta(minutes=3):
                    misalignment_start_end.loc[i, 'end_time'] = misalignment_start_end.loc[i+1, 'start_time']

            # updates pump cgm readings based on the misalignments
            # readings from start_time (inclusive) to end_time (exclusive) have the misalignment subtracted from their timestamp
            for i, misalignment in misalignment_start_end.iterrows():
                pump_data.loc[(pump_data['timestamp'] >= misalignment['start_time']) & (pump_data['timestamp'] < misalignment['end_time']), 'timestamp'] = pump_data['timestamp'] - misalignment['misalignment']
            
            # remove phone readings when there are reliable pump values within 5 minutes
            # a pump value counts as reliable when more than 2 pump readings (itself included) lie within 15 minutes of it
            for i, pump_row in pump_data.iterrows():
                pump_time = pump_row['timestamp']
                if len(pump_data[(pump_data['timestamp'] >= pump_time - pd.Timedelta(minutes=15)) & (pump_data['timestamp'] <= pump_time + pd.Timedelta(minutes=15))]) > 2:
                    phone_data = phone_data.drop(phone_data[(phone_data['timestamp'] > pump_time - pd.Timedelta(minutes=5)) & (phone_data['timestamp'] < pump_time + pd.Timedelta(minutes=5))].index)

            # recombines the adjusted pump readings with the remaining phone readings
            cgm_data = pd.concat([pump_data, phone_data])

        # only the timestamp and reading are kept once both sources have been processed
        cgm_data = cgm_data[['timestamp', 'bg']]

    return cgm_data, misalignment_start_end
    
def correct_insulin_misalignment(insulin, misalignment_start_end):
    '''
    Adjusts insulin timestamps based on misalignment found between pump and phone cgm readings.

    Every row whose timestamp lies between a misalignment's start_time and
    end_time (both inclusive) has that misalignment subtracted from its
    timestamp. Windows are matched against the original timestamps, so a row is
    never shifted twice; where windows touch, the later window is used.

    Parameters
    ----------
    insulin: DataFrame
        basal or bolus data with a datetime index (left unmodified)
    misalignment_start_end: DataFrame
        misalignment times and values with 'start_time', 'end_time' and
        'misalignment' columns

    Returns
    -------
    insulin: DataFrame
        basal or bolus data adjusted by misalignment times, sorted by timestamp
    '''
    if misalignment_start_end.empty:
        return insulin

    original_times = pd.Series(insulin.index)
    corrected_times = original_times.copy()

    for _, misalignment in misalignment_start_end.iterrows():
        in_window = (
            (original_times >= misalignment['start_time']) &
            (original_times <= misalignment['end_time'])
        )
        # the shift is always computed from the original timestamp so corrections do not compound
        corrected_times = corrected_times.mask(in_window, original_times - misalignment['misalignment'])

    corrected = insulin.copy()
    corrected.index = pd.DatetimeIndex(corrected_times, name=insulin.index.name)

    # shifting can reorder rows, so chronological order is restored (stable to keep ties in their original order)
    return corrected.sort_index(kind='mergesort')

def extract_glooko(p_path, fsos, omnipod_5_start = None):
    '''
    Extracts data from glooko files.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fsos: list of string
        names of directory to extract the glooko data from
    omnipod_5_start: datetime-like, optional
        when given, insulin values at or after this time are excluded (used to
        leave out insulin data recorded on an omnipod 5)
    
    Returns
    -------
    data: DataFrame
        extracted data from the glooko files
    '''
    data = pd.DataFrame()
    cgm = pd.DataFrame()
    misalignment_start_end = pd.DataFrame()

    # loads and joins all the cgm data
    for fso in fsos:
        path_glooko = os.path.join(p_path, fso)
        if 'cgm_data.csv' in os.listdir(path_glooko):
            cgm = pd.concat([cgm, pd.read_csv(os.path.join(path_glooko, 'cgm_data.csv'))])

    # extracts the cgm data if there is any
    if not cgm.empty:
        cgm = cgm.drop_duplicates().reset_index(drop=True)
        cgm = cgm.rename(columns = {'Timestamp': 'timestamp', 'CGM Glucose Value (mmol/L)': 'bg', 'Source': 'source'})
        # readings of 0.1 and 111.1 are replaced by the 2.2 and 22.2 threshold values
        cgm['bg'] = cgm['bg'].replace(0.1, 2.2)
        cgm['bg'] = cgm['bg'].replace(111.1, 22.2)
        cgm['timestamp'] = pd.to_datetime(cgm['timestamp'])

        cgm, misalignment_start_end = detect_misalignment(cgm)

        cgm = cgm.set_index('timestamp')
        cgm = cgm[['bg']]
        cgm = add_device(p_path, cgm, 'CGM')
        data = data.join(cgm, how='outer')

    # extracts the insulin data
    for fso in fsos:
        pump = pd.DataFrame()
        path_glooko = os.path.join(p_path, fso)
        glooko_files = os.listdir(path_glooko)
        if 'basal_data.csv' in glooko_files and 'bolus_data.csv' in glooko_files:
            # reads the basal data and formats it
            basal = pd.read_csv(os.path.join(path_glooko, 'basal_data.csv'))
            basal = basal[['Timestamp', 'Duration (minutes)', 'Rate']]
            basal = basal.rename(columns = {'Timestamp': 'timestamp', 'Duration (minutes)': 'duration', 'Rate': 'basal'})
            basal['timestamp'] = pd.to_datetime(basal['timestamp'])
            basal = basal.set_index('timestamp').sort_index()
            
            # reads the bolus data and formats it
            bolus = pd.read_csv(os.path.join(path_glooko, 'bolus_data.csv'))
            bolus = bolus[[
                'Timestamp', 
                'Carbs input (g)', 
                'Insulin delivered (U)', 
                'Initial delivery (U)', 
                'Extended delivery (U)'
            ]]
            bolus = bolus.rename(columns = {
                'Timestamp': 'timestamp', 
                'Carbs input (g)': 'carbs', 
                'Insulin delivered (U)': 'bolus', 
                'Initial delivery (U)': 'instant', 
                'Extended delivery (U)': 'extended'
            })
            bolus['timestamp'] = pd.to_datetime(bolus['timestamp'])
            bolus = bolus.set_index('timestamp').sort_index()

            # adds extended bolus durations in the cases they exist
            # duration stays None when the participant has no extended.csv file
            duration = None
            if 'extended.csv' in os.listdir(p_path):
                duration = pd.read_csv(os.path.join(p_path, 'extended.csv'))
                duration['timestamp'] = pd.to_datetime(duration['timestamp'], format='%d/%m/%Y %H:%M')
                duration['duration'] = pd.to_timedelta(duration['duration']+':00')
                duration = duration.set_index('timestamp')
                
            for index, event in bolus.iterrows():
                if event['extended'] == event['extended'] and event['extended'] > 0:
                    found_duration = False
                    if duration is not None:
                        try:
                            bolus.at[index, 'duration'] = duration['duration'][index]
                            found_duration = True
                        except Exception:
                            # deliberate best effort: any failure to look up or store the recorded duration
                            # (e.g. no entry for this timestamp) falls back to the default below
                            found_duration = False
                    if not found_duration:
                        # if there isn't extended bolus durations collected (P13 and P23) uses the mean extended bolus duration across the other participants instead rounded to the nearest minute (1:39:00)
                        bolus.at[index, 'duration'] = pd.to_timedelta('01:39:00')
                        print('missing duration value so the default 1:39:00 is used')
                else:
                    bolus.at[index, 'instant'] = event['bolus']
                    bolus.at[index, 'extended'] = 0.0

            if len(misalignment_start_end) > 0:
                basal = correct_insulin_misalignment(basal, misalignment_start_end)
                bolus = correct_insulin_misalignment(bolus, misalignment_start_end)

            # splits the time period into 5 minute chunks
            time_range = split_time(basal)
            
            # calculates the insulin delivered in the last 5 minutes for each time chunk
            insulin_values = [0] * (len(time_range)-1)
            # the starting basal rate is the last basal value at or before the first chunk boundary
            basal_rate = basal[basal.index <= time_range[0]]['basal'].iloc[-1]
            for i in range(len(insulin_values)):
                recent_basal_t = time_range[i]
                interval_basals = basal.loc[(basal.index >= time_range[i]) & (basal.index < time_range[i+1])]

                for index, event in interval_basals.iterrows():
                    # when several basal rows share a timestamp only those lasting more than a minute are applied
                    if len(interval_basals.loc[[index]]) == 1 or event['duration'] > 1:
                        insulin_values[i] += (index - recent_basal_t).seconds/3600 * basal_rate
                        basal_rate = event['basal']
                        recent_basal_t = index
                
                for index, event in bolus.loc[(bolus.index >= time_range[i]) & (bolus.index < time_range[i+1])].iterrows():
                    insulin_values[i] += event['instant']
                    if event['extended'] > 0:
                        # the extended part is spread forwards over the following chunks at a constant rate
                        # NOTE(review): insulin_values[i+j] raises IndexError if the bolus runs past the last chunk
                        bolus_dur = event['duration']
                        bolus_rate = event['extended']/event['duration'].seconds # in units per second
                        insulin_values[i] += (index - (index-pd.Timedelta(minutes=2, seconds=31)).round('5min')).seconds * bolus_rate
                        bolus_dur -= (index - (index-pd.Timedelta(minutes=2, seconds=31)).round('5min'))
                        j=1
                        while bolus_dur >= pd.Timedelta(minutes=5):
                            insulin_values[i+j] += 300 * bolus_rate
                            bolus_dur -= pd.Timedelta(minutes=5)
                            j += 1
                        insulin_values[i+j] += bolus_dur.seconds * bolus_rate

                # basal insulin delivered at the current rate for the remainder of the chunk
                insulin_values[i] += (time_range[i+1] - recent_basal_t).seconds/3600 * basal_rate
            
            # formats the insulin values into a dataframe and adds the device
            insulin_values = pd.DataFrame({'timestamp': time_range[1:], 'insulin': insulin_values})
            insulin_values = insulin_values.set_index('timestamp')

            # excludes insulin data recorded on an omnipod 5
            if omnipod_5_start:
                pump = insulin_values[insulin_values.index < omnipod_5_start].copy()
            else:
                pump = insulin_values.copy()

            # assume if no bolus delivered with carbs that it was a failed delivery
            carbs = bolus.loc[(bolus['carbs'] == bolus['carbs']) & (bolus['carbs'] > 0) & (bolus['bolus'] > 0)][['carbs']]
            if pump.empty:
                pump = carbs.copy()
            else:
                pump = pump.join(carbs, how='outer')

            pump = add_device(p_path, pump, 'Insulin Pump')

        data = pd.concat([data, pump])

    # reorder data and remove duplicates
    data = data.reset_index().drop_duplicates().set_index('timestamp')
    data = data.sort_index()
        
    return data
    

### LibreView

In [None]:
def extract_libreview(p_path, fso):
    '''
    Extracts the historic glucose readings from a libreview file.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of file to extract the libreview data from
    
    Returns
    -------
    data: DataFrame
        extracted bg readings from the libreview file
    '''
    raw = pd.read_csv(os.path.join(p_path, fso))

    # only rows with record type 0 are used, reduced to their timestamp and historic glucose value
    is_historic = raw['Record Type'] == 0
    readings = raw[is_historic][['Device Timestamp', 'Historic Glucose mmol/L']]
    readings = readings.rename(columns = {
        'Device Timestamp': 'timestamp',
        'Historic Glucose mmol/L': 'bg'
    })

    readings['timestamp'] = pd.to_datetime(readings['timestamp'])
    readings = readings.set_index('timestamp')

    return add_device(p_path, readings, 'CGM')

## Extract smartwatch data

### Fitbit

In [None]:
def extract_takeout(p_path, fso):
    '''
    Extracts the smartwatch data from a fitbit takeout export.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of folder to extract the takeout data from
    
    Returns
    -------
    data: DataFrame
        data set of the extracted smartwatch data from the takeout files
    '''
    # the takeout export keeps its files in the 'Global Export Data' sub folder
    return extract_fitbit(p_path, os.path.join(p_path, fso, 'Global Export Data'))
    
    
def extract_dashboard(p_path, fso):
    '''
    Extracts the smartwatch data from a fitbit dashboard export.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of folder to extract the fitbit dashboard data from
    
    Returns
    -------
    data: DataFrame
        data set of the extracted smartwatch data from the fitbit dashboard files
    '''
    # the dashboard export keeps its files in the 'Physical Activity' sub folder
    return extract_fitbit(p_path, os.path.join(p_path, fso, 'Physical Activity'))


def extract_timezones(path):
    '''
    Extracts any timezone changes from the heart rate files.
    
    Parameters
    ----------
    path: string
        path to the directory with the heart rate data
        
    Returns
    -------
    time_diffs: DataFrame
        a dataframe of the timezone changes and when they are first detected plus the starting timezone and time change out of BST
        ('date' and 'time_diff' columns); when nothing is detected only the two UK entries are returned
    '''
    reading_format = '%m/%d/%y %H:%M:%S'
    detected = []
    
    # searches through the heart rate files to look for exact hour differences between the first or last reading and the start and end of the day
    for file in os.listdir(path):
        if not file.startswith('heart_rate-'):
            continue

        # the file's date is taken from its name: heart_rate-YYYY-MM-DD...
        file_date = dt.datetime.strptime(file[11:21], '%Y-%m-%d')

        # the context manager closes the file even if it cannot be parsed
        with open(os.path.join(path, file)) as f:
            hr = json.load(f)

        # a file without readings carries no timezone information
        if not hr:
            continue

        first_time = dt.datetime.strptime(hr[0]['dateTime'], reading_format)
        last_time = dt.datetime.strptime(hr[-1]['dateTime'], reading_format)

        if first_time.minute == 0:
            offset = file_date - first_time
        elif last_time.minute == 59:
            offset = file_date + dt.timedelta(days=1) - last_time
        else:
            continue

        time_diff = dt.timedelta(hours=round(offset.total_seconds() / 3600))
        detected.append({'date': file_date, 'time_diff': time_diff})

    # formats time differences and removes anomalous time changes (those not repeated on a neighbouring file)
    # the columns are named explicitly so that an empty result still has them
    time_diffs = pd.DataFrame(detected, columns=['date', 'time_diff'])
    time_diffs = time_diffs.sort_values(by = 'date')
    consecutive_mask = (time_diffs['time_diff'] == time_diffs['time_diff'].shift(1)) | (time_diffs['time_diff'] == time_diffs['time_diff'].shift(-1))
    time_diffs = time_diffs[consecutive_mask]
    
    # adds starting time zones and UK time zone shift
    uk_time = pd.DataFrame({
        'date': [
            dt.datetime.strptime('2023-05-31 00:00:00', '%Y-%m-%d %H:%M:%S'), 
            dt.datetime.strptime('2023-10-29 02:00:00', '%Y-%m-%d %H:%M:%S')
        ], 
        'time_diff': [
            dt.timedelta(hours=1),
            dt.timedelta(hours=0)
        ]
    })

    if time_diffs.empty:
        time_diffs = uk_time
    else:
        time_diffs = pd.concat([time_diffs, uk_time], ignore_index=True)
    time_diffs = time_diffs.sort_values(by = 'date')
    
    # keeps only the rows where the time difference actually changes
    time_diffs = time_diffs[time_diffs['time_diff'] != time_diffs['time_diff'].shift(1)]
    
    return time_diffs


def convert_to_local_time(time_diff, data):
    '''
    Converts the times of data from GMT to local time using timezone information
    
    Parameters
    ----------
    time_diff: DataFrame
        the timezone changes and when they occur, with a 'date' column (when the
        offset starts to apply) and a 'time_diff' column (the offset to add)
        
    data: DataFrame
        the data to have its time updated, with a 'timestamp' column
    
    Returns
    -------
    merged_data: DataFrame
        a new frame, sorted by timestamp, with the timestamps shifted by the offset
        in force at each row; the inputs are left unmodified
    
    Raises
    ------
    ValueError
        if any timestamp is earlier than the first entry of time_diff
    '''
    # format the data on copies so the caller's frames are not changed
    data = data.assign(timestamp=pd.to_datetime(data['timestamp'])).sort_values('timestamp')
    time_diff = time_diff.assign(date=pd.to_datetime(time_diff['date'])).sort_values('date')

    # attach to every row the most recent offset that starts at or before its timestamp
    merged_data = pd.merge_asof(data, time_diff, left_on='timestamp', right_on='date', direction='backward')

    # a row without an offset cannot be converted; fail with a clear message
    if merged_data['time_diff'].isna().any():
        raise ValueError('data contains timestamps earlier than the first timezone entry')

    # adjust by timezone in one vectorised step; flooring to whole seconds keeps the
    # second resolution that the previous string round-trip produced
    merged_data['timestamp'] = (merged_data['timestamp'] + merged_data['time_diff']).dt.floor('s')

    # remove unneeded columns
    return merged_data.drop(columns=['date', 'time_diff'])


def extract_json(path, date_key, value_keys, value_transforms):
    '''
    Read a json file and turn each of its entries into a row of parsed values
    
    Parameters
    ----------
    path: string
        path to the json file
    date_key: string
        key of the timestamp in each entry, formatted as 'mm/dd/yy HH:MM:SS'
    value_keys: list of strings
        keys of the values to read from each entry
    value_transforms: list of callables
        one callable per value key, applied to the raw value stored under that key
        
    Returns
    -------
    processed_data: list of lists
        one row per entry: the parsed timestamp followed by the transformed values
    '''
    timestamp_format = '%m/%d/%y %H:%M:%S'
    fields = list(zip(value_keys, value_transforms))

    with open(path) as f:
        entries = json.load(f)

    processed_data = []
    for entry in entries:
        row = [dt.datetime.strptime(entry[date_key], timestamp_format)]
        for key, transform in fields:
            row.append(transform(entry[key]))
        processed_data.append(row)

    return processed_data


def extract_fitbit_metric(data, time_diff, columns, method):
    '''
    Extract a metric from the extracted json files
    
    Parameters
    ----------
    data: list of lists
        the data extracted from the json files
    time_diff: DataFrame
        the time differences to adjust the values by
    columns: list of strings
        the column names of the data, one of which must be 'timestamp'
    method: string
        aggregation method, either 'mean' or 'sum'
    
    Returns
    -------
    metric: DataFrame
        the metric aggregated into 5-minute intervals, with nan for intervals
        that contain no values
    
    Raises
    ------
    ValueError
        if method is neither 'mean' nor 'sum'
    '''
    # reject unsupported methods up front instead of returning un-aggregated data
    if method not in ('mean', 'sum'):
        raise ValueError("method must be 'mean' or 'sum', got {!r}".format(method))

    # format the data and adjust the timezones
    metric = pd.DataFrame(data, columns=columns)
    metric = convert_to_local_time(time_diff, metric)
    metric = metric.set_index('timestamp').sort_index()
    
    # resample the data to 5 minute intervals, each labelled by its (inclusive) right edge
    bins = metric.resample('5min', closed='right', label='right')
    if method == 'mean':
        # the mean of an empty interval is already nan
        return bins.mean()

    # min_count=1 makes empty intervals nan rather than 0
    return bins.sum(min_count=1)

def extract_fitbit_activities(exe, time_diff):
    '''
    Extracts the activity labels from the exercise data
    
    Parameters
    ----------
    exe: list of lists
        the data from the exercise jsons, as rows of [start time, activity name, duration]
    time_diff: DataFrame
        the time differences to adjust the values by
        
    Returns
    -------
    act: DataFrame
        the extracted activity labels: one 'activity' row per 5-minute timestamp
        covered by an activity, indexed by 'timestamp'
    '''
    # formats the data
    exe = pd.DataFrame(exe, columns=['timestamp', 'activity', 'duration'])
    exe = convert_to_local_time(time_diff, exe)
    
    # the end of each activity
    end = exe['timestamp'] + exe['duration']
    
    # first label: the 5-minute mark after the start rounded to the nearest 5 minutes
    # last label: the end rounded to the nearest 5 minutes
    exe['start'] = (exe['timestamp'] + pd.Timedelta(seconds=450)).dt.floor('5min')
    exe['end'] = (end - pd.Timedelta(seconds=150)).dt.ceil('5min')
    
    # build one small frame per activity and concatenate them once
    frames = []
    for row in exe.itertuples(index=False):
        act_range = pd.date_range(row.start, row.end, freq='5min')
        frames.append(pd.DataFrame({'activity': [row.activity] * len(act_range)}, index=act_range))

    # no activities at all: return an empty frame that still has a datetime index
    if not frames:
        return pd.DataFrame(columns=['activity'], index=pd.DatetimeIndex([], name='timestamp'))

    act = pd.concat(frames)

    # where activities overlap, keep the label of the one appended last
    act = act[~act.index.duplicated(keep='last')]
    act.index.name = 'timestamp'
    
    return act

def extract_fitbit(p_path, d_path):
    '''
    Builds the smartwatch data set from a directory of fitbit json files.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    d_path: string
        path to the participant's general pa data directory
    
    Returns
    -------
    data: DataFrame
        data set of the extracted smartwatch data from the takeout files
    '''
    # file name prefix -> (timestamp key, value keys, value transforms)
    readers = {
        'heart_rate-': ('dateTime', ['value'], [lambda x: x['bpm']]),
        # NOTE(review): the division by 100 presumably converts centimetres to metres - confirm
        'distance-': ('dateTime', ['value'], [lambda x: int(x)/100]),
        'steps-': ('dateTime', ['value'], [int]),
        'calories-': ('dateTime', ['value'], [float]),
        'exercise-': ('startTime', ['activityName', 'duration'], [str, lambda x: dt.timedelta(milliseconds=x)]),
    }
    rows = {prefix: [] for prefix in readers}

    # read every file whose name starts with one of the known prefixes
    for file in os.listdir(d_path):
        for prefix, (date_key, value_keys, transforms) in readers.items():
            if file.startswith(prefix):
                rows[prefix] += extract_json(os.path.join(d_path, file), date_key, value_keys, transforms)
                break

    # work out the time shifts to apply to the readings
    time_diff = extract_timezones(d_path)

    # aggregate each metric and label the activity intervals
    frames = [
        extract_fitbit_metric(rows['heart_rate-'], time_diff, ['timestamp', 'hr'], 'mean'),
        extract_fitbit_metric(rows['distance-'], time_diff, ['timestamp', 'dist'], 'sum'),
        extract_fitbit_metric(rows['steps-'], time_diff, ['timestamp', 'steps'], 'sum'),
        extract_fitbit_metric(rows['calories-'], time_diff, ['timestamp', 'cals'], 'sum'),
        extract_fitbit_activities(rows['exercise-'], time_diff),
    ]

    # put the metrics side by side and hand the result to add_device with the 'Smartwatch' label
    data = pd.concat(frames, axis=1)
    return add_device(p_path, data, 'Smartwatch')

### Apple Watch

In [None]:
def extract_tz(path):
    '''
    Reads tz.csv and reduces it to the moments at which the timezone changes
    
    Parameters
    ----------
    path: string
        path to the directory containing the tz.csv file
        
    Returns
    -------
    tz: DataFrame
        the rows, sorted by 'timestamp', whose 'tz' differs from the row before
    '''
    # load the timezone log and drop the timezone information from its timestamps
    changes = pd.read_csv(os.path.join(path, 'tz.csv'))
    changes['timestamp'] = pd.to_datetime(changes['timestamp']).dt.tz_localize(None)

    # seed the log with Europe/London at 2023-05-31 00:00:00
    initial = pd.DataFrame({
        'timestamp': [dt.datetime(2023, 5, 31, 0, 0, 0)],
        'tz': ['Europe/London'],
    })
    changes = pd.concat([changes, initial], ignore_index=True).sort_values('timestamp')

    # keep a row only when its timezone differs from the previous row's
    previous_tz = changes['tz'].shift(1)
    return changes[changes['tz'] != previous_tz]


def extract_record(path):
    '''
    Reads record.csv and returns the columns used for the activity metrics
    
    Parameters
    ----------
    path: string
        path to the directory containing the record.csv file
        
    Returns
    -------
    record: DataFrame
        the 'type', 'unit', 'endDate', 'value' and 'device' columns, sorted by
        'endDate' with the timezone information removed from it
    '''
    wanted = ['type', 'unit', 'endDate', 'value', 'device']

    # load the file and keep only the columns of interest
    raw = pd.read_csv(os.path.join(path, 'record.csv'), low_memory=False)
    record = raw[wanted]

    # parse the end dates, drop their timezone information and order the rows by them
    naive_end = pd.to_datetime(record['endDate']).dt.tz_localize(None)
    return record.assign(endDate=naive_end).sort_values('endDate')


def extract_workout(path):
    '''
    Reads workout.csv and returns the columns used for the activity labels
    
    Parameters
    ----------
    path: string
        path to the directory containing the workout.csv file
        
    Returns
    -------
    workout: DataFrame
        the 'workoutActivityType', 'startDate', 'endDate' and 'device' columns, with
        the timezone information removed from the dates and the first 21 characters
        removed from the activity labels
    '''
    # length of the 'HKWorkoutActivityType' prefix carried by every activity label
    label_prefix_length = 21

    # load the file and keep only the columns of interest
    workout = pd.read_csv(os.path.join(path, 'workout.csv'))
    workout = workout[['workoutActivityType', 'startDate', 'endDate', 'device']].copy()

    # parse both date columns and drop their timezone information
    for column in ('startDate', 'endDate'):
        workout[column] = pd.to_datetime(workout[column]).dt.tz_localize(None)

    # strip the prefix from the activity labels
    workout['workoutActivityType'] = workout['workoutActivityType'].map(
        lambda label: label[label_prefix_length:]
    )

    return workout

def extract_bpm(path):
    '''
    Reads bpm.csv and returns it ordered by time
    
    Parameters
    ----------
    path: string
        path to the directory containing the bpm.csv file
        
    Returns
    -------
    bpm: DataFrame
        the contents of bpm.csv sorted by 'time', with the timezone information
        removed from that column
    '''
    bpm = pd.read_csv(os.path.join(path, 'bpm.csv'))

    # parse the times, drop their timezone information and order the rows by them
    naive_time = pd.to_datetime(bpm['time']).dt.tz_localize(None)
    return bpm.assign(time=naive_time).sort_values('time')


def adjust_timezone(data, tz, time_columns):
    '''
    Adjusts the timestamps in the data according to the timezones
    
    Parameters
    ----------
    data: DataFrame
        the data to adjust the timestamps of; the time columns hold timezone-naive
        timestamps that are interpreted as UTC
    tz: DataFrame
        the timezone shifts to update the data by, with a 'timestamp' column (when
        the timezone starts to apply, sorted ascending) and a 'tz' column (its name)
    time_columns: list of strings
        the names of the columns of timestamps in the data; the first one decides
        which timezone applies to the whole row
    
    Returns
    -------
    merged_data: DataFrame
        the data, sorted by the first time column, with the timestamps converted to
        timezone-naive local time
    
    Raises
    ------
    ValueError
        if any row is earlier than the first timezone entry
    '''
    # merge_asof needs ascending left keys; a stable sort leaves already-sorted data unchanged
    data = data.sort_values(time_columns[0], kind='mergesort')

    # attach to every row the most recent timezone that starts at or before it
    merged_data = pd.merge_asof(data, tz, left_on=time_columns[0], right_on='timestamp', direction='backward')

    # a row without a timezone cannot be converted; fail with a clear message
    if merged_data['tz'].isna().any():
        raise ValueError('data contains timestamps earlier than the first timezone entry')

    # convert each time column one timezone at a time instead of one row at a time
    rows_by_zone = merged_data.groupby('tz').groups
    for time_column in time_columns:
        utc_times = merged_data[time_column].dt.tz_localize('UTC')
        local_times = merged_data[time_column].copy()
        for zone, rows in rows_by_zone.items():
            local_times.loc[rows] = utc_times.loc[rows].dt.tz_convert(zone).dt.tz_localize(None)
        merged_data[time_column] = local_times
    
    # remove the timezone columns
    return merged_data.drop(columns=['timestamp', 'tz'])


def extract_apple_metric(record, metric_id, metric_name, method, extra_data=None):
    '''
    Extracts a metric from the record.csv file and aggregates it into 5-minute chunks
    
    Parameters
    ----------
    record: DataFrame
        the record.csv data, with 'type', 'unit', 'endDate', 'value' and 'device' columns
    metric_id: string
        the type of the metric being extracted
    metric_name: string
        the name of the metric
    method: callable
        how the values that fall in one 5-minute interval are aggregated
    extra_data: list of DataFrames, optional
        any extra data to be added before aggregation, each with a 'timestamp'
        column and a column named metric_name
        
    Returns
    -------
    metric: DataFrame
        the aggregated metric, with nan for intervals that contain no values
    '''
    if extra_data is None:
        extra_data = []

    # extract chosen metric as recorded by a watch; rows without a device are excluded
    from_watch = record['device'].str.contains('Watch', na=False)
    metric = record.loc[(record['type'] == metric_id) & from_watch].copy()
    metric['value'] = pd.to_numeric(metric['value'])

    # correct units to count/min, km, count, kcal if not already
    unit_conversions = {
         'count/min': 1,
         'km': 1,
         'count': 1,
         'kcal': 1,
         'mi': 1.609344,
         'Cal': 1
    }
    for unit in metric['unit'].unique():
        if unit not in unit_conversions:
            print('Unknown unit:', unit)
            continue
        factor = unit_conversions[unit]
        if factor != 1:
            # scale only the rows in this unit; the others keep their value
            metric['value'] = metric['value'].where(metric['unit'] != unit, metric['value'] * factor)

    # format dataframe
    metric = metric[['endDate', 'value']]
    metric = metric.rename(columns = {'endDate': 'timestamp', 'value': metric_name})
    
    # add any extra data and sort it
    metric = pd.concat([metric, *extra_data])
    metric = metric.set_index('timestamp').sort_index()
    
    # aggregate into 5 minute intervals, each labelled by its (inclusive) right edge
    bins = metric.resample('5min', closed='right', label='right')
    counts = bins.count()
    metric = bins.apply(method)

    # intervals without any value become nan, whatever the method returned for them
    return metric.where(counts > 0)


def extract_apple_activities(workout):
    '''
    Extracts the activity name data from the workout.csv file
    
    Parameters
    ----------
    workout: DataFrame
        the workout data, with 'workoutActivityType', 'startDate', 'endDate' and
        'device' columns
    
    Returns
    -------
    act: DataFrame
        the extracted activity name data: one 'activity' row per 5-minute timestamp
        covered by a workout, indexed by 'timestamp' with no repeated timestamps
    '''
    # filter entries made by the smartwatch; rows without a device are excluded
    from_watch = workout['device'].str.contains('Watch', na=False)
    workout = workout[from_watch].drop(columns=['device'])
    
    # format the data
    workout = workout.rename(columns={
        'workoutActivityType': 'activity',
        'startDate': 'start',
        'endDate': 'end'
    })

    # first label: the 5-minute mark after the start rounded to the nearest 5 minutes
    # last label: the end rounded to the nearest 5 minutes
    workout = workout.assign(
        start=(workout['start'] + pd.Timedelta(seconds=450)).dt.floor('5min'),
        end=(workout['end'] - pd.Timedelta(seconds=150)).dt.ceil('5min'),
    )
    
    # build one small frame per workout and concatenate them once
    frames = []
    for row in workout.itertuples(index=False):
        act_range = pd.date_range(row.start, row.end, freq='5min')
        frames.append(pd.DataFrame({'activity': [row.activity] * len(act_range)}, index=act_range))

    # no workouts at all: return an empty frame that still has a datetime index
    if not frames:
        return pd.DataFrame(columns=['activity'], index=pd.DatetimeIndex([], name='timestamp'))

    act = pd.concat(frames)

    # where workouts overlap, keep the label of the one appended last so the index stays unique
    act = act[~act.index.duplicated(keep='last')]

    return act.rename_axis('timestamp')


def extract_apple(p_path, fso):
    '''
    Builds the smartwatch data set from a folder of apple health files.
    
    Parameters
    ----------
    p_path: string
        path to the participant's directory
    fso: string
        name of folder to extract the apple health data from
    
    Returns
    -------
    data: DataFrame
        data set of the extracted smartwatch data from the apple health files
    '''
    export_dir = os.path.join(p_path, fso)

    # read the timezone log and the three data files
    tz = extract_tz(export_dir)
    record = extract_record(export_dir)
    workout = extract_workout(export_dir)
    bpm = extract_bpm(export_dir)
    
    # move every timestamp column through adjust_timezone
    record = adjust_timezone(record, tz, ['endDate'])
    workout = adjust_timezone(workout, tz, ['startDate', 'endDate'])
    bpm = adjust_timezone(bpm, tz, ['time'])

    # give the bpm data the column names used for the heart rate metric
    bpm = bpm.rename(columns = {'time':'timestamp', 'bpm': 'hr'})
    
    # (record type, metric name, aggregation, extra positional arguments)
    metric_specs = [
        ('HKQuantityTypeIdentifierHeartRate', 'hr', lambda x: x.mean(), ([bpm],)),
        ('HKQuantityTypeIdentifierDistanceWalkingRunning', 'dist', lambda x: x.sum()*1000, ()),
        ('HKQuantityTypeIdentifierStepCount', 'steps', lambda x: x.sum(), ()),
        ('HKQuantityTypeIdentifierActiveEnergyBurned', 'cals', lambda x: x.sum(), ()),
    ]
    frames = [
        extract_apple_metric(record, metric_id, metric_name, method, *extra)
        for metric_id, metric_name, method, extra in metric_specs
    ]

    # activity labels go last, next to the metrics
    frames.append(extract_apple_activities(workout))
    
    # put everything side by side and hand the result to add_device with the 'Smartwatch' label
    data = pd.concat(frames, axis=1)
    return add_device(p_path, data, 'Smartwatch')

## Process raw files

In [None]:
def check_directory(path, known_fsos, dir_name):
    '''
    Reports the entries of a directory whose names match none of the known prefixes.

    Parameters
    ----------
    path: string
        path to the directory
    known_fsos: list of strings
        list of known file system objects in the directory; an entry is known when
        its name starts with any of these
    dir_name: string
        name of the directory being searched, used in the printed message
    '''
    known_prefixes = tuple(known_fsos)
    unknowns = [fso for fso in os.listdir(path) if not fso.startswith(known_prefixes)]

    # only print when there is something to report
    if unknowns:
        print('Unknown file system objects in {} directory: {}'.format(dir_name, unknowns))
        

def process_raw_data(raw_path, save_path):
    '''
    Processes the raw BrisT1D data and saves it in a consistent format.

    Every entry of raw_path is treated as one participant directory; the data from
    all recognised files in it is combined and written to '<entry name>.csv'.

    Parameters
    ----------
    raw_path: string
        path to the raw data
    save_path: string
        path to where the processed data will be saved
    '''
    # file system object name prefix -> function that extracts its data
    data_extracts = {
        'apple_health': extract_apple,
        'carelink_pump_': extract_carelink_pump,
        'carelink_sensor_': extract_carelink_sensor,
        'clarity_': extract_clarity,
        'fitbit': extract_dashboard,
        'glooko': extract_glooko,
        'libreview.csv': extract_libreview,
        'takeout': extract_takeout
    }

    # entries that are expected in a participant directory but not extracted here
    other_fsos = [
        'basal_profile.txt',
        'devices.csv',
        'device_carelink.txt',
        'extended.csv'
    ]

    medtronic_loops = ['P12', 'P16', 'P18']
    omnipod_5_starts = {'P07': pd.to_datetime('2023-07-05 10:00'), 'P17': pd.to_datetime('2023-06-01 00:00')}

    # carelink pump files dated after this are skipped for the medtronic loop participants
    carelink_cutoff = dt.datetime.strptime('2023-12-01', '%Y-%m-%d')

    all_columns = ['bg', 'insulin', 'carbs', 'hr', 'dist', 'steps', 'cals', 'activity', 'device']
    decimals = {'insulin': 4, 'hr': 1, 'dist': 1, 'steps': 0, 'cals': 2}

    for p_num in os.listdir(raw_path):
        print('Participant {}'.format(p_num[1:]))
        p_path = os.path.join(raw_path, p_num)
        check_directory(p_path, list(data_extracts.keys())+other_fsos, p_num)
        p_data = pd.DataFrame()

        for prefix, extract in data_extracts.items():
            matches = [fso for fso in os.listdir(p_path) if fso.startswith(prefix)]
            if not matches:
                continue
            print(matches)

            # glooko files are passed to the extractor together, with the omnipod 5
            # start time as an extra argument for the participants that have one
            if prefix == 'glooko':
                extra_args = (omnipod_5_starts[p_num],) if p_num in omnipod_5_starts else ()
                p_data = pd.concat([p_data, extract(p_path, matches, *extra_args)])
                continue

            # every other source is extracted one file system object at a time
            for fso in matches:
                # remove data after the change to carelink exporting that removed closed-loop insulin adjustments for those that use one
                late_loop_export = (
                    p_num in medtronic_loops
                    and fso.startswith('carelink_pump_')
                    and dt.datetime.strptime(fso[-14:-4], '%Y-%m-%d') > carelink_cutoff
                )
                if late_loop_export:
                    print('carelink file "{}" ignored due to missing closed-loop adjustments from later carelink exports'.format(fso))
                    continue
                p_data = pd.concat([p_data, extract(p_path, fso)])

        # re-sorts the data by index and removes fully duplicated rows
        p_data = p_data.sort_index().reset_index()
        p_data = p_data.drop_duplicates().reset_index()
        p_data = p_data.set_index('timestamp')

        # adds any column that no source provided, then fixes the column selection and order
        for column in all_columns:
            if column not in p_data.columns:
                p_data[column] = np.nan
        p_data = p_data[all_columns]

        # round columns
        for column, digits in decimals.items():
            p_data[column] = p_data[column].round(digits)

        # shows the data and passes it to plot_data
        display(p_data)
        plot_data(p_data)
        print()

        # saves the data as a .csv file
        p_data.to_csv(os.path.join(save_path, p_num + '.csv'))

In [None]:
# directory holding one sub-directory per participant, and the directory the processed files go to
raw_path = '../data/raw_state'
save_path = '../data/processed_state/'

# process every entry of raw_path and write one .csv per entry into save_path
process_raw_data(raw_path, save_path)