# Aggregate Night Scout Statistics
Author: Spencer Weston

This notebook exists to begin evaluating the nightscout/openaps data as a whole. This notebook should help determine which variables we can drop and generate some metadata. We also need to validate some assumptions across all subjects. We'll get setup by accumulating all the file pathes into a single list.

In [126]:
import os
import numpy as np
import pandas as pd 
import re 
from datetime import datetime, timedelta, timezone
import pytz
from collections import namedtuple
from sortedcontainers import SortedDict
from pandas.errors import ParserError
from pandas.errors import OutOfBoundsDatetime

In [9]:
os.getcwd()

'C:\\Users\\spenc\\Documents\\Berkeley\\Capstone\\BGPredict\\Notebooks'

In [10]:
SubjectInfo = namedtuple("SubjectInfo", ['path', 'subject_id', 'devicestatus', 'entries', 'treatments'])

def generate_subject_info(data_dir, subject_number):
    direct_sharing_folder = 'direct-sharing-31'
    path = f"{data_dir}/{subject_number}"
    # Some individuals have multiple direct-sharing folders. Ensure the correct folder exists and add it to path
    # For example, one individual has a folder with menstruation data
    if direct_sharing_folder not in os.listdir(path):
        return None
    else:
        path = f"{path}/{direct_sharing_folder}"
    
    subject_dirs = [folder for folder in os.listdir(path)]
    relevant_dir_names = ["treatments", "devicestatus", "entries"]
    relevant_dirs = {k: [] for k in relevant_dir_names}
    # Identify every file associated with treatments, device status, or entries and store them in a dictionary
    for folder in subject_dirs:
        for dir_name in relevant_dir_names:
            if dir_name in folder and folder.endswith("_csv"):
                dir_path = f"{path}/{folder}"
                files = [f"{dir_path}/{file}" for file in os.listdir(dir_path)]
                relevant_dirs[dir_name].extend(files)
    return SubjectInfo(path=path, subject_id=subject_number, devicestatus=relevant_dirs['devicestatus'],
                      entries=relevant_dirs['entries'], treatments=relevant_dirs['treatments'])
    
# subject = SubjectInfo()
data_dir = "C:\\Users\spenc\Documents\Berkeley\Capstone\n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED"
data_dir = data_dir.replace("\\", "/")
data_dir = data_dir.replace("\n", "/n")
# isnumeric validates that the folder comes from openaps/nightscout
subject_dirs = [generate_subject_info(data_dir, folder) for folder in os.listdir(data_dir) if folder.isnumeric()] 

# None will be returned if an individual has no  
try:
    while subject_dirs.index(None):
        idx = subject_dirs.index(None)
        subject_dirs.pop(idx)
except ValueError:
    pass

In [11]:
s = [i for i in subject_dirs if i.subject_id=="00221634"]
file = s[0].entries[0]
df = pd.read_csv(file, header=None, low_memory=False)
df.columns = ['time', 'bg']
print(df.shape)
df.head()

(45380, 2)


Unnamed: 0,time,bg
0,2018-08-04T23:58:50Z,150
1,2018-08-04T23:53:51Z,153
2,2018-08-04T23:48:51Z,155
3,2018-08-04T23:43:51Z,159
4,2018-08-04T23:38:51Z,164


In [6]:
test = df.head().to_csv(index=False)
print(test)

time,bg
2018-08-04T23:58:50Z, 150
2018-08-04T23:53:51Z, 153
2018-08-04T23:48:51Z, 155
2018-08-04T23:43:51Z, 159
2018-08-04T23:38:51Z, 164



In [47]:
df = pd.read_csv(file, header=None, low_memory=False)
df.columns = ['time', 'bg']
print(df.shape)
df.head()
df = df.loc[df.bg!=" null", :]
print(df.shape)
df.head()

(45264, 2)


Unnamed: 0,time,bg
0,2018-08-04T23:58:50Z,150
1,2018-08-04T23:53:51Z,153
2,2018-08-04T23:48:51Z,155
3,2018-08-04T23:43:51Z,159
4,2018-08-04T23:38:51Z,164


## Summary Statistics
Create some rough summary statistics based on the available files

In [4]:
file_presence = {"treatments": [], "devicestatus": [], "entries": []}
for x in subject_dirs:
    for key in file_presence.keys():
        tuple_idx = x._fields.index(key)
        files = x[tuple_idx]
        file_presence[key].append(len(files))

In [5]:
file_presence.keys()

dict_keys(['treatments', 'devicestatus', 'entries'])

Here, we look at subjects with no device status but treatment data and subjects with no folders associated with their data. If they have no folders, that's an easy exclusion criteria. If they have treatment but not device status data, we'll need to validate rather the treatment data appears equivalently formatted to the device status from other olders. 

In [6]:
# Get count of the number of files for each subject
subjects = [x.subject_id for x in subject_dirs]
file_count = list(zip(subjects, *[file_presence[k] for k in file_presence.keys()]))

# subjects with no device status but treatment data; will need to be evaluated for data integrity
no_device_but_treatment = [] 
# Subjects with no folders associated with their data; exclusion criterion
no_folders = [] 
# No blood glucose data; exclusion criterion 
no_entries = []

for x in file_count:
    # no entries
    if x[1] < 1:
        if x[2] == 0 and x[3] == 0:
            no_folders.append(x[0])
        else: 
            no_entries.append(x[0])
        next 
    # no device status but treatment 
    if x[2] == 0 and x[3] > 0:
        no_device_but_treatment.append(x[0])
        next

print(f"NDT: {no_device_but_treatment} \n \n NF: {no_folders} \n \n NE: {no_entries}")

NDT: ['42052178', '50311906', '61179686', '66773091'] 
 
 NF: ['32635618', '51359431'] 
 
 NE: []


### Evaluate Subjects with No Device Status but Treatment Tables

In [7]:
# Functions from NightScoutJoinAnalysis
def define_column_superset(dataframes: list):
    superset = set()
    for df in dataframes:
        cols = list(df.columns)
        superset = superset.union(cols)
    return superset

def apply_superset(df, superset):
    df_cols = set(list(df.columns))
    set_diff = superset.difference(df_cols)
    n = len(df)
    additional_col_df = pd.DataFrame({k: [None for _ in range(n)] for k in set_diff})
    new_df = pd.concat([df, additional_col_df], axis=1)
    return new_df

def concat_dfs(dataframes: list):
    return pd.concat(dataframes, axis = 0)

In [8]:
ndt_subjects = [x for x in subject_dirs if x.subject_id in no_device_but_treatment]
ndt_dfs = {}
for subj in ndt_subjects:
    print(f"Subject: {subj.subject_id}, Num_files: {len(subj.treatments)}")
    ndt_dfs.update({subj.subject_id: [pd.read_csv(file, dtype=str) for file in subj.treatments]}) 
    for df in ndt_dfs[subj.subject_id]:
        print(df.shape)

Subject: 42052178, Num_files: 6
(15000, 53)
(15000, 56)
(15000, 57)
(15000, 61)
(15000, 54)
(5911, 348)
Subject: 50311906, Num_files: 3
(205, 48)
(15000, 88)
(10835, 81)
Subject: 61179686, Num_files: 5
(15000, 258)
(15000, 16)
(15000, 16)
(15000, 16)
(1272, 18)
Subject: 66773091, Num_files: 6
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(10132, 16)


In [10]:
# df_list = [df for df in ndt_dfs["42052178"]]
# superset = define_column_superset(df_list)
# print('supersetting')
# superset_dfs = [apply_superset(df, superset) for df in df_list]
# print('concatting')
# union_df = concat_dfs(superset_dfs)
# # union_df.describe()

In [12]:
test = [df.reset_index(drop=True) for df in superset_dfs]
start = datetime.now()
union_df = pd.concat(test, axis = 0)
end = datetime.now()
print("Unioned in:", end-start)
union_df.head()

Unioned in: 0:00:01.662886


Unnamed: 0,created_at,absolute,carbs,_id,duration,NSCLIENT_ID,rate,eventType,enteredBy,insulin,...,raw_duration/_type,bolus/appended/0/data/5/age,square/appended/0/data/0/amount,changed/insulin_sensitivies/3/_offset,raw_rate/appended/0/data/1/amount,preBolus,changed/insulin_sensitivies/7/offset,stale/insulin_sensitivies/0/_offset,raw_rate/_body,stale/insulin_sensitivies/7/sensitivity
0,2019-05-10T23:40:47Z,0.0,,5cd69da3131daf5594f593e1,120.0,1557531647994,0.0,Temp Basal,S6MIS6,,...,,,,,,,,,,
1,2019-05-10T23:39:03Z,,,5cd69da3131daf5594f593e3,,1557531661775,,Correction Bolus,,0.4,...,,,,,,,,,,
2,2019-05-10T23:30:53Z,,,5cd69da3131daf5594f593e2,,1557531053916,,Temp Basal,S6MIS6,,...,,,,,,,,,,
3,2019-05-10T23:29:01Z,,,5cd69da3131daf5594f593e4,,1557531068046,,Correction Bolus,,0.2,...,,,,,,,,,,
4,2019-05-10T23:17:56Z,0.0,,5cd69d5a131daf5594f593cc,82.0,1557530276855,0.0,Temp Basal,S6MIS6,,...,,,,,,,,,,


In [None]:
## Prints columns
# cols = []
# for i,col in enumerate(union_df.columns):
#     cols.append(col)
#     if i % 5 == 0:
#         print(cols)
#         cols = []
union_df.describe()

We can see that these treatment tables hold a lot of relevant information. We'd love to keep this info if possible. We'll have to do a lot of work to process the though.

## Evaluate Treatment Uniqueness
In Nightscout join analysis, we saw that a union of treatments with duplicates dropped exactly matched a device status file. Let's see how common that is. 

In [27]:
class Subject:
    
    def __init__(self, subject_id, subject_path, treatment_files, device_status_files, entries_files):
        self.subject_id = subject_id
        self.subject_path = subject_path
        
        self.treatment_files = treatment_files
        self.treatment_shapes = None
        self.treatment_df = None
        
        self.device_status_files = device_status_files
        self.device_status_shapes = None
        self.device_status_df = None
        
        self.entries_files = entries_files
        self.entries_shapes = None
        self.entries_df = None 
        
    
    def get_device_status_shapes(self):
        if self.device_status_shapes is not None:
            return self.device_status_shapes
        else:
            self.device_status_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.device_status_files]
            return self.device_status_shapes
    
    def get_device_status_df(self):
        if self.device_status_df is not None:
            return self.device_status_df
        else:
            device_status_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.device_status_files]
            self.device_status_df = pd.concat(device_status_dfs, axis=0)
            return self.device_status_df 
    
    def get_treatment_shapes(self):
        if self.treatment_shapes is not None:
            return self.treatment_shapes
        else:
            self.treatment_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.treatment_files]
            return self.treatment_shapes
        
    def get_treatment_df(self):
        if self.treatment_df is not None:
            return self.treatment_df
        else:
            treatment_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.treatment_files]
            self.treatment_df = pd.concat(treatment_dfs, axis=0)
            return self.treatment_df
        
    def get_entries_shapes(self):
        if self.entries_shapes is not None:
            return self.entries_shapes
        else:
            self.entries_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.entries_files]
            return self.entries_shapes
    
    def get_entries_df(self):
        if self.entries_df is not None:
            return self.entries_df
        else:
            self.entries_dfs = [pd.read_csv(file, low_memory=False, header=None).reset_index(drop=True) for file in self.entries_files]
            return self.entries_df
            
    def check_equivalence(self, treatment_df=None, check_unique=False):
        """Check rather treatment data is a subset of or equivalent to device status data"""
        if treatment_df is None:
            treatment_df = self.get_treatment_df()
        ds_df = self.get_device_status_df()
        if treatment_df.shape == ds_df.shape:
            treat_array = self.to_string_and_numpy(treatment_df)
            ds_array = self.to_string_and_numpy(ds_df)
            if np.array_equal(treat_array, ds_array):
                print("Equivalent full treatment and device status")
                return 1
            else:
                print("Equivalent full treatment and device status shape but not elements")
                return 0
        elif treatment_df.shape in self.get_device_status_shapes():
            idx = self.get_device_status_shapes().index(treatment_df.shape)
            ds_df = pd.read_csv(self.device_status_files[idx], low_memory=False)
            ds_array = self.to_string_and_numpy(ds_df)
            treat_array = self.to_string_and_numpy(treatment_df)
            if np.array_equal(treat_array, ds_array):
                print("Treatment equivalent to subset of device status")
                return 1 
            else:
                print("Treatment has equivalent shape to subset of device status but different elements")
                return 0 
        elif not check_unique:
            # Make a recursive check on the treatment dataframe with duplicates dropped 
            print('here')
            self.check_equivalence(treatment_df.drop_duplicates(), check_unique=True)
        else:
            print("Treatment data is not a copy of device status data")
    
    @staticmethod
    def to_string_and_numpy(df):
        return df.as_type('str').to_numpy()

**Subject 01352464: No duplicated treatment and device status**
Here, we validate that subject 01352464 does not have any duplicates from the treatment data frame and that the check equivalence function  

In [8]:
subject = subject_dirs[3]
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
# ds_shapes = sub_1.get_device_status_shapes()
# treat_shapes = sub_1.get_treatment_shapes()
treatment_df = sub_1.get_treatment_df()
ds_df = sub_1.get_device_status_df()

In [9]:
print(treatment_df.shape)
print(ds_df.shape)

(140527, 403)
(167799, 1222)


In [10]:
test = []
for t_col in treatment_df.columns:
    if t_col in ds_df.columns:
        test.append((t_col, 1))
    else:
        test.append((t_col, 0))
print(sum([x[1] for x in test]))

2


In [11]:
t = [x for x in test if x[1] ==1]
t

[('created_at', 1), ('_id', 1)]

In [12]:
subject = subject_dirs[3]
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
sub_3.check_equivalence()

here
no unique match


In [13]:
# Clean up variables
del sub_3
del treatment_df
del ds_df

**Subject 00221634: No duplicated treatment and device status**
In NightScoutJoinAnalysis, we saw that subject 00221634 has duplicate treatment and device status data. Evaluate rather `check_equivalence()` identifies the duplicate.

Written after running the below: For reasons, it appears that this data isn't as duplicated as I found in NightScoutJoinAnalysis. So, I'm just going to full join everything. Then, we can compress data across the relavent columns to shrink the dataset horizontally. Then, we'll still have multiple rows for each blood glucose entry. We can then groupby blood glucose entry and perform aggregations on the relevant columns.

In [28]:
subject = [s for s in subject_dirs if s.subject_id=="00221634"][0]
subject
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)

In [24]:
print(subject.treatments)

['C:/Users/spenc/Documents/Berkeley/Capstone/n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED/00221634/direct-sharing-31/00221634_treatments_2018-03-01_to_2018-08-05_csv/00221634_treatments_2018-03-01_to_2018-08-05_aa.csv', 'C:/Users/spenc/Documents/Berkeley/Capstone/n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED/00221634/direct-sharing-31/00221634_treatments_2018-03-01_to_2018-08-05_csv/00221634_treatments_2018-03-01_to_2018-08-05_ab.csv']


In [22]:
sub_3.check_equivalence()

here
Treatment data is not a copy of device status data


In [23]:
treatment_df = sub_3.get_treatment_df()
print(treatment_df.shape)
ds_df = sub_3.get_device_status_df()
print(ds_df.shape)

(26689, 59)
(53877, 654)


In [72]:
pd.to_datetime(treatment_df.created_at)

0       2018-08-04 23:47:05+00:00
1       2018-08-04 23:01:08+00:00
2       2018-08-04 22:10:26+00:00
3       2018-08-04 22:10:26+00:00
4       2018-08-04 21:49:05+00:00
                   ...           
11684   2018-03-01 17:37:00+00:00
11685   2018-03-01 15:01:00+00:00
11686   2018-03-01 14:17:00+00:00
11687   2018-03-01 12:17:51+00:00
11688   2018-03-01 06:37:58+00:00
Name: created_at, Length: 26689, dtype: datetime64[ns, UTC]

In [25]:
len(treatment_df._id.drop_duplicates())

26689

## Verify all entries have the same number of columns

This verifies all entry files have the same shape and can be naively unioned.

In [None]:
subject_objs = []
for x in subject_dirs:
    subject_objs.append(
        Subject(x.subject_id, x.path, x.treatments, x.devicestatus, x.entries)
    )
for x in subject_objs:
    shapes = x.get_entries_shapes()
    for shape in shapes:
        if shape[1] != 2:
            print(f"{x.subject_id} has malconformed shape. {shapes}")
            break 

## Raw join of the data 

In [123]:
class Subject:
    
    def __init__(self, subject_id, subject_path, treatment_files, device_status_files, entries_files):
        self.subject_id = subject_id
        self.subject_path = subject_path
        
        self.treatment_files = treatment_files
        self.treatment_shapes = None
        self.treatment_df = None
        
        self.device_status_files = device_status_files
        self.device_status_shapes = None
        self.device_status_df = None
        
        self.entries_files = entries_files
        self.entries_shapes = None
        self.entries_df = None 
        
        self.join_table = None 
        
    def get_device_status_shapes(self):
        if self.device_status_shapes is not None:
            return self.device_status_shapes
        else:
            self.device_status_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.device_status_files]
            return self.device_status_shapes
    
    def get_device_status_df(self):
        if len(self.device_status_files) == 0:
            return None
        if self.device_status_df is not None:
            return self.device_status_df
        else:
            device_status_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.device_status_files]
            self.device_status_df = pd.concat(device_status_dfs, axis=0)
            self.device_status_df['timestamp'] = pd.to_datetime(self.device_status_df['created_at'])
            self.device_status_df['devicestatusid'] = [i for i in range(len(self.device_status_df))]
            return self.device_status_df 
    
    def get_treatment_shapes(self):
        if self.treatment_shapes is not None:
            return self.treatment_shapes
        else:
            self.treatment_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.treatment_files]
            return self.treatment_shapes
        
    def get_treatment_df(self):
        if self.treatment_df is not None:
            return self.treatment_df
        else:
            treatment_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.treatment_files]
            self.treatment_df = pd.concat(treatment_dfs, axis=0)
            try:
                self.treatment_df['timestamp'] = pd.to_datetime(self.treatment_df["created_at"])
            except ValueError:
                # Somewhat randomly, for unclear reasons, we receive the following error:
                # ValueError: cannot reindex from a duplicate axis
                # Resetting index seems to resolve this.
                self.treatment_df = self.treatment_df.reset_index()
                self.treatment_df['timestamp'] = pd.to_datetime(self.treatment_df["created_at"])
            self.treatment_df['treatmentid'] = [i for i in range(len(self.treatment_df))]
            return self.treatment_df
        
    def get_entries_shapes(self):
        if self.entries_shapes is not None:
            return self.entries_shapes
        else:
            self.entries_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.entries_files]
            return self.entries_shapes
    
    def get_entries_df(self):
        if self.entries_df is not None:
            return self.entries_df
        else:
            entries_dfs = [pd.read_csv(file, low_memory=False, header=None).reset_index(drop=True) for file in self.entries_files]
            self.entries_df = pd.concat(entries_dfs, axis=0)
            self.entries_df.columns = ["time", "bg"]
            try:
                self.entries_df['timestamp'] = pd.to_datetime(self.entries_df['time'])
            except Exception:
                print("HERE IN THE ERROR")
                self.entries_df['timestamp'] = pd.to_datetime(self.entries_df['time'].str.replace("PM", ""))
#                 self.entries_df['timestamp'] = pd.to_datetime(self.entries_df['time'])
            self.entries_df['entryid']  = [i for i in range(len(self.entries_df))]
            return self.entries_df
        
    def get_join_table(self):
        if self.join_table is not None:
            return self.join_table
        else:
            self.join_table = self._temporal_join()
            return self.join_table

    def _temporal_join(self):
        if len(self.entries_files) == 0:
            print(f"No entries for subject {self.subject_id}. Returning None")
            return None
        elif len(self.device_status_files) == 0 and len(self.treatment_files) == 0:
            print(f"{self.subject_id} does not have device status or treatment files. Returning None")
            return None
        
        # Load tables and convert relevant columns to date times
        entry_df = self.get_entries_df()
        treatments_df = self.get_treatment_df()
        device_status_df = self.get_device_status_df()
        
        
        if treatments_df.empty and device_status_df.empty:
            print(f"subject {self.subject_id} does not have treatment or device status tables. Passing.")
        # Creat
        index_dict = self._temporal_join_index_dict(entries=entry_df, device_status=device_status_df, treatments=treatments_df)
        join_df = self._create_temporal_join_df(index_dict)
        
        if device_status_df is not None and treatments_df is not None:
            joined_data = (join_df
                           .merge(entry_df, how='left', left_on ='entryid', right_on='entryid', suffixes=("_x","_ent"))
                           .merge(device_status_df, how='left', left_on="devicestatusid", right_on="devicestatusid", suffixes=("_y","_ds")) 
                           .merge(treatments_df, how='left', left_on="treatmentid", right_on="treatmentid", suffixes=("_z","_tre")) 
                          )
        elif device_status_df is None and treatments_df is not None:
            joined_data = (join_df
               .merge(entry_df, how='left', left_on ='entryid', right_on='entryid', suffixes=("_x","_ent"))
               .merge(treatments_df, how='left', left_on="treatmentid", right_on="treatmentid", suffixes=("_z","_tre")) 
              )
        elif device_status_df is not None and treatments_df is None:
            joined_data = (join_df
               .merge(entry_df, how='left', left_on ='entryid', right_on='entryid', suffixes=("_x","_ent"))
               .merge(device_status_df, how='left', left_on="devicestatusid", right_on="devicestatusid", suffixes=("_y","_ds")) 
              )
        return joined_data
    
    def _temporal_join_index_dict(self, entries, device_status, treatments):
        """Assign device status and treatment rows to the nearest entry that occurs after the device status or treatment row."""
        # Store timestamp and entries in zipped list; get entry timezones
        timestamp_keys = entries['timestamp'].to_list()
        timestamp_keys = [x.replace(tzinfo=pytz.utc) for x in timestamp_keys if isinstance(x, pd.Timestamp) or isinstance(x, datetime)]
        entry_id_list = entries['entryid'].to_list()
        zipped = list(zip(timestamp_keys, entry_id_list))
        
        # Check for offset aware 
        
        # fill in standard python dictionary with entry data; convert to SortedDict sorted on entry timestamps
        index_dict = SortedDict({timestamp: (entry_id, {"device_status": [], "treatment": []}) for timestamp, entry_id in zipped})
        
        # Generate list of tuples for (devicetimestamp, deviceid) 
        if device_status is not None:
            device_tuples = list(zip(device_status['timestamp'], device_status['devicestatusid']))
        else:
            device_tuples = None
        
        # Generate list of tuples for (devicetimestamp, deviceid) 
        if treatments is not None:
            treatments_tuples = list(zip(treatments['timestamp'], treatments['treatmentid']))
        else:
            treatments_tuples = None
        
        # Set constants from index_dict
        index_keys = index_dict.keys()
        max_idx = index_dict.index(index_keys[len(index_keys)-1])
        
        if device_tuples is not None:
            for comparison_timestamp, comparison_id in device_tuples:
                # Left idx is the index of the entry timestamp the comparison timestamp is less than or equal to 
                try:
                    left_idx = index_dict.bisect_left(comparison_timestamp) 
                except TypeError:
                    left_idx = index_dict.bisect_left(comparison_timestamp.replace(tzinfo=pytz.utc))

                # Assign comparison timestamps greater than the last entry to the last entry
                # (Comparisons < min(entry timestamp) will naturally be joined to min(entry timestamp))
                if left_idx >= max_idx:
                    left_idx = max_idx

                # Get the index_dict key associated with the bisect_left operation
                assignment_key = index_keys[left_idx]

                # Assign the comparison_id to the assignment key of the index_dict
                index_dict[assignment_key][1]['device_status'].append(comparison_id)
        
        if treatments_tuples is not None:
            # Equivalent to the above for-loop but for treatments_tuples    
            for comparison_timestamp, comparison_id in treatments_tuples:
                # Left idx is the index of the entry timestamp the comparison timestamp is less than or equal to 
                try:
                    left_idx = index_dict.bisect_left(comparison_timestamp) 
                except TypeError:
                    left_idx = index_dict.bisect_left(comparison_timestamp.replace(tzinfo=pytz.utc))

                # Assign comparison timestamps greater than the last entry to the last entry
                # (Comparisons < min(entry timestamp) will naturally be joined to min(entry timestamp))
                if left_idx >= max_idx:
                    left_idx = max_idx

                # Get the index_dict key associated with the bisect_left operation
                assignment_key = index_keys[left_idx]

                # Assign the comparison_id to the assignment key of the index_dict
                index_dict[assignment_key][1]['treatment'].append(comparison_id)

        return index_dict
    
    def _create_temporal_join_df(self, index_dict):
        join_ids = []
        keys = index_dict.keys()
        for k in keys:
            device_ids = index_dict[k][1]['device_status']
            treatment_ids = index_dict[k][1]['treatment']

            zip_lists = []
            # if device_ids and treatment_ids have different lengths...
            if len(device_ids) != len(treatment_ids):
                # ... fill the shorter list with None values 
                ids = [device_ids, treatment_ids]
                lengths = [len(i) for i in ids]
                zip_length = max(lengths)

                # Fill in the shortest list
                min_len_idx = lengths.index(min(lengths))
                min_len_ids = ids[min_len_idx]
                min_len_ids.extend([None for _ in range(min_len_idx, zip_length)])

                # identify the longest list 
                max_len_idx = lengths.index(max(lengths))
                max_len_ids = ids[max_len_idx]

                # Append all lists to zip_list ordered as (entry_ids, device_ids, treatments_ids) 
                zip_lists.append([index_dict[k][0] for _ in range(zip_length)]) # entry Id's

                # Identify the device id list to facilitate appending lists in the correct order
                if max_len_ids == device_ids:
                    zip_lists.append(max_len_ids) # device_ids
                    zip_lists.append(min_len_ids) # treatment_ids
                else:
                    zip_lists.append(min_len_ids) # device_ids
                    zip_lists.append(max_len_ids) # treatment_ids
            else:
                # Set the zip_length to the length of device_ids
                zip_length = len(device_ids)
                zip_lists.append([index_dict[k][0] for _ in range(zip_length)]) # Entry_ids
                zip_lists.append(device_ids)
                zip_lists.append(treatment_ids)

            # Add the current iteration results 
            join_ids.extend(list(zip(*zip_lists)))

        join_dict = {"entryid": [i[0] for i in join_ids],
                     "devicestatusid": [i[1] for i in join_ids],
                     "treatmentid": [i[2] for i in join_ids]}
        join_df = pd.DataFrame(join_dict)
        return join_df

In [124]:
subject = [s for s in subject_dirs if s.subject_id=="97872409"][0]
# subject = subject_dirs[1]
sub = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
print(sub.subject_id)

97872409


In [136]:
try:
    entries = sub.get_entries_df()
    t = pd.to_datetime(entries["time"])
except OutOfBoundsDatetime:
    print("here")

here


In [169]:
entries = sub.get_entries_df()
entries['time'] = entries['time'].str.replace("PM", "")
try:
    t = pd.to_datetime(entries['time'])
except OutOfBoundsDatetime:
    print('out of bounds datetime')
    entries['timestamp'] = pd.to_datetime(entries['time'], errors='coerce')
    nulls = entries['timestamp'].loc[entries['timestamp'].isna(), ]
    entries = entries.loc[~entries['timestamp'].isin(nulls)]
    print(entries.shape)

out of bounds datetime
(457970, 3)


In [166]:
test = entries['timestamp'].loc[entries['timestamp'].isna(),]

entries.loc[~entries['timestamp'].isin(test), :]

Unnamed: 0,time,bg,timestamp
0,2017-07-21T19:31:00-04:00,96.0,2017-07-21 19:31:00-04:00
1,2017-07-21T19:31:00-04:00,94.0,2017-07-21 19:31:00-04:00
2,2017-07-21T19:31:00-04:00,96.0,2017-07-21 19:31:00-04:00
3,2017-07-21T19:31:00-04:00,94.0,2017-07-21 19:31:00-04:00
4,2017-07-21T19:31:00-04:00,96.0,2017-07-21 19:31:00-04:00
...,...,...,...
275066,2017-04-03T19:00:30.000Z,105.0,2017-04-03 19:00:30+00:00
275067,2017-04-03T18:57:30.000Z,105.0,2017-04-03 18:57:30+00:00
275068,2017-04-03T18:42:30.000Z,94.0,2017-04-03 18:42:30+00:00
275069,2017-04-03T18:34:30.000Z,80.0,2017-04-03 18:34:30+00:00


In [157]:
print(t.shape)
print(t.isna().shape)
print((t.index == t.isna().index).all() )
test =t.dropna()
print(test.shape)
drop_indices = test.index. difference(entries.index)


(457991,)
(457991,)
True
(457970,)
Int64Index([], dtype='int64')


In [125]:
test = sub.get_join_table()

HERE IN THE ERROR


OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 117-07-31 05:01:13

In [122]:
test

Unnamed: 0,entryid,devicestatusid,treatmentid,time,bg_z,timestamp_z,created_at,absolute,carbs,_id,...,square/appended/0/_head,square/appended/0/_body,square/appended/0/_date,square/duration,square/_description,square/programmed,square/amount,square/_head,square/_body,square/_date
0,198303,,80582,Mon Jan 29 08:20:59 CET 2018,146,2018-01-29 08:20:59,2018-01-29T08:45:28+01:00,,,5b099a36b7c2fc0994a1ce4f,...,,,,,,,,,,
1,198303,,80583,Mon Jan 29 08:20:59 CET 2018,146,2018-01-29 08:20:59,2018-01-29T08:45:25+01:00,,,5b00b3af586b983f516ffd5d,...,,,,,,,,,,
2,198303,,80584,Mon Jan 29 08:20:59 CET 2018,146,2018-01-29 08:20:59,2018-01-29T07:33:13+01:00,,,42e4f7a264494bca8d8669af,...,,,,,,,,,,
3,198303,,80585,Mon Jan 29 08:20:59 CET 2018,146,2018-01-29 08:20:59,2018-01-29T07:33:13+01:00,,,2bc7f6cbf1f5431281b9423d,...,,,,,,,,,,
4,198303,,80586,Mon Jan 29 08:20:59 CET 2018,146,2018-01-29 08:20:59,2018-01-29T07:33:13+01:00,,,5b00b3af586b983f516ffd4e,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80906,62,,4,2019-05-10T23:20:19.259+0200,276,2019-05-10 23:20:19.259000+02:00,2019-05-10T23:17:56Z,0.0,,5cd69d5a131daf5594f593cc,...,,,,,,,,,,
80907,58,,3,2019-05-10T23:30:19.259+0200,274,2019-05-10 23:30:19.259000+02:00,2019-05-10T23:29:01Z,,,5cd69da3131daf5594f593e4,...,,,,,,,,,,
80908,56,,2,2019-05-10T23:35:19.259+0200,272,2019-05-10 23:35:19.259000+02:00,2019-05-10T23:30:53Z,,,5cd69da3131daf5594f593e2,...,,,,,,,,,,
80909,54,,1,2019-05-10T23:40:19.259+0200,270,2019-05-10 23:40:19.259000+02:00,2019-05-10T23:39:03Z,,,5cd69da3131daf5594f593e3,...,,,,,,,,,,


In [74]:
test = {'a': 1}
try:
    test['b']
except KeyError as e:
    print(e.args)
#     if "KeyError" in e.args:
#         print('asdf')

('b',)


In [127]:
# full_run_start = datetime.now()
# i =0 
# for x in subject_dirs[14:]:
#     print("iteration", i)
#     iter_start = datetime.now()
#     sub_id = x.subject_id
#     subject = Subject(x.subject_id, x.path, x.treatments, x.devicestatus, x.entries)
#     joined_data = subject.get_join_table()
#     print(joined_data.shape)
#     run_time = datetime.now()-iter_start
#     print(f"Run time for subject {sub_id} = {run_time}")
#     i+=1
# print(f"Full run time: {datetime.now() - full_run_start}")

In [15]:
subject = Subject(x.subject_id, x.path, x.treatments, x.devicestatus, x.entries)
print(subject.get_device_status_shapes())
print(subject.get_treatment_shapes())
# entries = subject.get_entries_df()
# timestamp_keys = entries['timestamp'].to_list()
# [x.replace(tzinfo=pytz.utc) for x in timestamp_keys if type(x)==datetime]
# [type(x) for x in timestamp_keys]

[(15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 455), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 457), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 454), (15000, 455), (15000, 454), (15000, 556), (15000, 445), (15000, 446), (14547, 312), (15000, 443), (15000, 443), (15000, 441), (15000, 371), (15000, 300), (15000, 124), (15000, 42), (13461, 3), (15000, 124), (15000, 123), (15000, 121), (15000, 120), (15000, 121), (15000, 121), (11615, 154)]
[(15000, 438), (15000, 389), (15000, 501), (15000, 424), (15000, 364), (15000, 384), (4446, 365), (15000, 29), (15000, 32), (15000, 29), (15000, 31), (15000, 32), (15000, 460), (8497, 361)]


In [166]:
type(timestamp_keys[0]) == datetime
isinstance(timestamp_keys[0], pd.Timestamp) or isinstance(timestamp_keys[0], datetime)

True

In [107]:
entry_df = subject.get_entries_df()
t = entry_df.timestamp.to_list()
l = [x for x in t if type(x)==datetime]
len(l)

233642

In [None]:
timestamp_keys = subject.get_entries_df()['timestamp'].to_list()

**Below this point, the cells will not run. These cells were used to test different elements of Subject.get_temporal_join()**

In the below cell, it's curious that we see the same treatment and device status ID's given that these are independently created. 

In [41]:
i = 0
for k,v in test.items():
    if len(v[1]["device_status"]) and len(v[1]['treatment']):
        print(v)
        i +=1
    if i > 10:
        break

(113104, {'device_status': [199581, 199582, 199583, 199584], 'treatment': [8889]})
(108504, {'device_status': [190469, 190470, 190471, 190472], 'treatment': [8886, 8887, 8888]})
(108500, {'device_status': [190461, 190462, 190463, 190464], 'treatment': [8884, 8885]})
(108472, {'device_status': [190405, 190406, 190407, 190408], 'treatment': [8883]})
(108462, {'device_status': [190385, 190386, 190387, 190388], 'treatment': [8881, 8882]})
(108454, {'device_status': [190369, 190370, 190371, 190372], 'treatment': [8879, 8880]})
(108431, {'device_status': [190324, 190325, 190326, 190327], 'treatment': [8878]})
(108427, {'device_status': [190314, 190315, 190316, 190317, 190318, 190319], 'treatment': [8877]})
(108407, {'device_status': [190274, 190275, 190276, 190277], 'treatment': [8875, 8876]})
(108403, {'device_status': [190266, 190267, 190268, 190269], 'treatment': [8870, 8871, 8872, 8873, 8874]})
(108389, {'device_status': [190242, 190243, 190244, 190245], 'treatment': [8867, 8868, 8869]})

Based on the cell below, it appears the device status and treatment ID's are never different where there are matched values. It appears that device status and treatment data both align with a portion of the entries data, AND they align with the same portion of the entries data. Therefore, we have some subset of entries data with treatment and device status data most of the time. (I re-ran this process for several subjects). 

Now, let's look into making the join dataframe from the temporal_join output.

In [20]:
# Validate unique ID's 
keys = test.keys()
unique_ds = set()
not_unique_ds = []
unique_treat =set()
not_unique_treat = []
for k,v in test.items():
    ds = v[1]['device_status']
    if ds:
        [unique_ds.add(i) for i in ds if i]
        [not_unique_ds.append(i) for i in ds if i]
    treat = v[1]['treatment']
    if treat:
#         print(treat)
        [unique_treat.add(i) for i in treat if i]
        [not_unique_treat.append(i) for i in treat if i]
print(len(unique_ds))
print(len(not_unique_ds))
print(len(unique_treat))
print(len(not_unique_treat))

218195
218195
8889
8889


In [21]:
# Modify to not assume equal length treatments/device_status 
join_ids = []
for k in keys:
    device_ids = test[k][1]['device_status']
    treatment_ids = test[k][1]['treatment']
    
    zip_lists = []
    # if device_ids and treatment_ids have different lengths...
    if len(device_ids) != len(treatment_ids):
        # ... fill the shorter list with None values 
        ids = [device_ids, treatment_ids]
        lengths = [len(i) for i in ids]
        zip_length = max(lengths)
        
        # Fill in the shortest list
        min_len_idx = lengths.index(min(lengths))
        min_len_ids = ids[min_len_idx]
        min_len_ids.extend([None for _ in range(min_len_idx, zip_length)])
        
        # identify the longest list 
        max_len_idx = lengths.index(max(lengths))
        max_len_ids = ids[max_len_idx]
        
        # Append all lists to zip_list ordered as (entry_ids, device_ids, treatments_ids) 
        zip_lists.append([test[k][0] for _ in range(zip_length)]) # entry Id's
        
        # Identify the device id list to facilitate appending lists in the correct order
        if max_len_ids == device_ids:
            zip_lists.append(max_len_ids) # device_ids
            zip_lists.append(min_len_ids) # treatment_ids
        else:
            zip_lists.append(min_len_ids) # device_ids
            zip_lists.append(max_len_ids) # treatment_ids
    else:
        # Set the zip_length to the length of device_ids
        zip_length = len(device_ids)
        zip_lists.append([test[k][0] for _ in range(zip_length)]) # Entry_ids
        zip_lists.append(device_ids)
        zip_lists.append(treatment_ids)

    # Add the current iteration results 
    join_ids.extend(list(zip(*zip_lists)))
    
join_dict = {"entryid": [i[0] for i in join_ids],
             "devicestatusid": [i[1] for i in join_ids],
             "treatmentid": [i[2] for i in join_ids]}
join_df = pd.DataFrame(join_dict)
join_df

Unnamed: 0,entryid,devicestatusid,treatmentid
0,121768,218190.0,
1,121768,218191.0,
2,121768,218192.0,
3,121764,218186.0,
4,121764,218187.0,
...,...,...,...
163236,1,9.0,
163237,0,0.0,
163238,0,1.0,
163239,0,2.0,


In [22]:
print(len(join_df))
print(len(join_df.drop_duplicates()))

163241
163241


In [23]:
ds_df = sub.get_device_status_df()
treat_df = sub.get_treatment_df()
entry_df = sub.get_entries_df()


In [24]:
# latest per device status
joined = (join_df
          .merge(entry_df, how='left', left_on ='entryid', right_on='entryid', suffixes=("_x","_ent")) # drop duplicates
          .merge(ds_df, how='left', left_on="devicestatusid", right_on="devicestatusid", suffixes=("_y","_ds")) # drop duplicates (sum columns left then groupby)
          .merge(treat_df, how='left', left_on="treatmentid", right_on="treatmentid", suffixes=("_z","_tre")) # drop duplicates 
         )
joined.head()

Unnamed: 0,entryid,devicestatusid,treatmentid,time,bg,timestamp_y,pump/status/status,pump/status/timestamp,pump/clock,pump/reservoir,...,splitNow,enteredinsulin,relative,uuid,timestamp,sysTime,isAnnouncement,CircadianPercentageProfile,percentage,timeshift
0,121768,218190.0,,2019-02-01T23:21:19.140+1000,180,2019-02-01 23:21:19.140000+10:00,normal,2019-02-01T13:16:32Z,2019-02-01T13:16:32Z,50.0,...,,,,,NaT,,,,,
1,121768,218191.0,,2019-02-01T23:21:19.140+1000,180,2019-02-01 23:21:19.140000+10:00,normal,2019-02-01T13:16:31Z,2019-02-01T13:16:31Z,50.0,...,,,,,NaT,,,,,
2,121768,218192.0,,2019-02-01T23:21:19.140+1000,180,2019-02-01 23:21:19.140000+10:00,,,,,...,,,,,NaT,,,,,
3,121764,218186.0,,2019-02-01T13:36:12.000Z,199,2019-02-01 13:36:12+00:00,normal,2019-02-01T13:31:28Z,2019-02-01T13:31:28Z,50.0,...,,,,,NaT,,,,,
4,121764,218187.0,,2019-02-01T13:36:12.000Z,199,2019-02-01 13:36:12+00:00,normal,2019-02-01T13:31:28Z,2019-02-01T13:31:28Z,50.0,...,,,,,NaT,,,,,


In [25]:
joined.shape

(163241, 765)

In [26]:
print(entry_df.shape)
print(ds_df.shape)
print(treat_df.shape)
print(len(joined.entryid.drop_duplicates()))
print(len(joined.drop_duplicates()))

(122782, 4)
(218196, 696)
(8890, 65)
59365
163241


In [27]:
entry_count = len(np.unique(entry_df.entryid))
join_count = len(np.unique(joined.entryid))
print(entry_count)
print(join_count)
unique_entry_ids = set()
for k,v in test.items():
    unique_entry_ids.add(v[0])
print(len(unique_entry_ids))

122782
59365
120415
