# Aggregate Night Scout Statistics
Author: Spencer Weston

This notebook exists to begin evaluating the nightscout/openaps data as a whole. This notebook should help determine which variables we can drop and generate some metadata. We also need to validate some assumptions across all subjects. We'll get setup by accumulating all the file pathes into a single list.

In [1]:
import os
import numpy as np
import pandas as pd 
import re 
from datetime import datetime, timedelta, timezone
import pytz
from collections import namedtuple
from sortedcontainers import SortedDict

In [2]:
os.getcwd()

'C:\\Users\\spenc\\Documents\\Berkeley\\Capstone\\BGPredict\\Notebooks'

In [2]:
SubjectInfo = namedtuple("SubjectInfo", ['path', 'subject_id', 'devicestatus', 'entries', 'treatments'])

def generate_subject_info(data_dir, subject_number):
    direct_sharing_folder = 'direct-sharing-31'
    path = f"{data_dir}/{subject_number}"
    # Some individuals have multiple direct-sharing folders. Ensure the correct folder exists and add it to path
    # For example, one individual has a folder with menstruation data
    if direct_sharing_folder not in os.listdir(path):
        return None
    else:
        path = f"{path}/{direct_sharing_folder}"
    
    subject_dirs = [folder for folder in os.listdir(path)]
    relevant_dir_names = ["treatments", "devicestatus", "entries"]
    relevant_dirs = {k: [] for k in relevant_dir_names}
    # Identify every file associated with treatments, device status, or entries and store them in a dictionary
    for folder in subject_dirs:
        for dir_name in relevant_dir_names:
            if dir_name in folder and folder.endswith("_csv"):
                dir_path = f"{path}/{folder}"
                files = [f"{dir_path}/{file}" for file in os.listdir(dir_path)]
                relevant_dirs[dir_name].extend(files)
    return SubjectInfo(path=path, subject_id=subject_number, devicestatus=relevant_dirs['devicestatus'],
                      entries=relevant_dirs['entries'], treatments=relevant_dirs['treatments'])
    
# subject = SubjectInfo()
data_dir = "C:\\Users\spenc\Documents\Berkeley\Capstone\n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED"
data_dir = data_dir.replace("\\", "/")
data_dir = data_dir.replace("\n", "/n")
# isnumeric validates that the folder comes from openaps/nightscout
subject_dirs = [generate_subject_info(data_dir, folder) for folder in os.listdir(data_dir) if folder.isnumeric()] 

# None will be returned if an individual has no  
try:
    while subject_dirs.index(None):
        idx = subject_dirs.index(None)
        subject_dirs.pop(idx)
except ValueError:
    pass

## Summary Statistics
Create some rough summary statistics based on the available files

In [4]:
file_presence = {"treatments": [], "devicestatus": [], "entries": []}
for x in subject_dirs:
    for key in file_presence.keys():
        tuple_idx = x._fields.index(key)
        files = x[tuple_idx]
        file_presence[key].append(len(files))

In [5]:
file_presence.keys()

dict_keys(['treatments', 'devicestatus', 'entries'])

Here, we look at subjects with no device status but treatment data and subjects with no folders associated with their data. If they have no folders, that's an easy exclusion criteria. If they have treatment but not device status data, we'll need to validate rather the treatment data appears equivalently formatted to the device status from other olders. 

In [6]:
# Get count of the number of files for each subject
subjects = [x.subject_id for x in subject_dirs]
file_count = list(zip(subjects, *[file_presence[k] for k in file_presence.keys()]))

# subjects with no device status but treatment data; will need to be evaluated for data integrity
no_device_but_treatment = [] 
# Subjects with no folders associated with their data; exclusion criterion
no_folders = [] 
# No blood glucose data; exclusion criterion 
no_entries = []

for x in file_count:
    # no entries
    if x[1] < 1:
        if x[2] == 0 and x[3] == 0:
            no_folders.append(x[0])
        else: 
            no_entries.append(x[0])
        next 
    # no device status but treatment 
    if x[2] == 0 and x[3] > 0:
        no_device_but_treatment.append(x[0])
        next

print(f"NDT: {no_device_but_treatment} \n \n NF: {no_folders} \n \n NE: {no_entries}")

NDT: ['42052178', '50311906', '61179686', '66773091'] 
 
 NF: ['32635618', '51359431'] 
 
 NE: []


### Evaluate Subjects with No Device Status but Treatment Tables

In [7]:
# Functions from NightScoutJoinAnalysis
def define_column_superset(dataframes: list):
    superset = set()
    for df in dataframes:
        cols = list(df.columns)
        superset = superset.union(cols)
    return superset

def apply_superset(df, superset):
    df_cols = set(list(df.columns))
    set_diff = superset.difference(df_cols)
    n = len(df)
    additional_col_df = pd.DataFrame({k: [None for _ in range(n)] for k in set_diff})
    new_df = pd.concat([df, additional_col_df], axis=1)
    return new_df

def concat_dfs(dataframes: list):
    return pd.concat(dataframes, axis = 0)

In [8]:
ndt_subjects = [x for x in subject_dirs if x.subject_id in no_device_but_treatment]
ndt_dfs = {}
for subj in ndt_subjects:
    print(f"Subject: {subj.subject_id}, Num_files: {len(subj.treatments)}")
    ndt_dfs.update({subj.subject_id: [pd.read_csv(file, dtype=str) for file in subj.treatments]}) 
    for df in ndt_dfs[subj.subject_id]:
        print(df.shape)

Subject: 42052178, Num_files: 6
(15000, 53)
(15000, 56)
(15000, 57)
(15000, 61)
(15000, 54)
(5911, 348)
Subject: 50311906, Num_files: 3
(205, 48)
(15000, 88)
(10835, 81)
Subject: 61179686, Num_files: 5
(15000, 258)
(15000, 16)
(15000, 16)
(15000, 16)
(1272, 18)
Subject: 66773091, Num_files: 6
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(10132, 16)


In [10]:
# df_list = [df for df in ndt_dfs["42052178"]]
# superset = define_column_superset(df_list)
# print('supersetting')
# superset_dfs = [apply_superset(df, superset) for df in df_list]
# print('concatting')
# union_df = concat_dfs(superset_dfs)
# # union_df.describe()

In [12]:
test = [df.reset_index(drop=True) for df in superset_dfs]
start = datetime.now()
union_df = pd.concat(test, axis = 0)
end = datetime.now()
print("Unioned in:", end-start)
union_df.head()

Unioned in: 0:00:01.662886


Unnamed: 0,created_at,absolute,carbs,_id,duration,NSCLIENT_ID,rate,eventType,enteredBy,insulin,...,raw_duration/_type,bolus/appended/0/data/5/age,square/appended/0/data/0/amount,changed/insulin_sensitivies/3/_offset,raw_rate/appended/0/data/1/amount,preBolus,changed/insulin_sensitivies/7/offset,stale/insulin_sensitivies/0/_offset,raw_rate/_body,stale/insulin_sensitivies/7/sensitivity
0,2019-05-10T23:40:47Z,0.0,,5cd69da3131daf5594f593e1,120.0,1557531647994,0.0,Temp Basal,S6MIS6,,...,,,,,,,,,,
1,2019-05-10T23:39:03Z,,,5cd69da3131daf5594f593e3,,1557531661775,,Correction Bolus,,0.4,...,,,,,,,,,,
2,2019-05-10T23:30:53Z,,,5cd69da3131daf5594f593e2,,1557531053916,,Temp Basal,S6MIS6,,...,,,,,,,,,,
3,2019-05-10T23:29:01Z,,,5cd69da3131daf5594f593e4,,1557531068046,,Correction Bolus,,0.2,...,,,,,,,,,,
4,2019-05-10T23:17:56Z,0.0,,5cd69d5a131daf5594f593cc,82.0,1557530276855,0.0,Temp Basal,S6MIS6,,...,,,,,,,,,,


In [None]:
## Prints columns
# cols = []
# for i,col in enumerate(union_df.columns):
#     cols.append(col)
#     if i % 5 == 0:
#         print(cols)
#         cols = []
union_df.describe()

We can see that these treatment tables hold a lot of relevant information. We'd love to keep this info if possible. We'll have to do a lot of work to process the though.

## Evaluate Treatment Uniqueness
In Nightscout join analysis, we saw that a union of treatments with duplicates dropped exactly matched a device status file. Let's see how common that is. 

In [27]:
class Subject:
    
    def __init__(self, subject_id, subject_path, treatment_files, device_status_files, entries_files):
        self.subject_id = subject_id
        self.subject_path = subject_path
        
        self.treatment_files = treatment_files
        self.treatment_shapes = None
        self.treatment_df = None
        
        self.device_status_files = device_status_files
        self.device_status_shapes = None
        self.device_status_df = None
        
        self.entries_files = entries_files
        self.entries_shapes = None
        self.entries_df = None 
        
    
    def get_device_status_shapes(self):
        if self.device_status_shapes is not None:
            return self.device_status_shapes
        else:
            self.device_status_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.device_status_files]
            return self.device_status_shapes
    
    def get_device_status_df(self):
        if self.device_status_df is not None:
            return self.device_status_df
        else:
            device_status_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.device_status_files]
            self.device_status_df = pd.concat(device_status_dfs, axis=0)
            return self.device_status_df 
    
    def get_treatment_shapes(self):
        if self.treatment_shapes is not None:
            return self.treatment_shapes
        else:
            self.treatment_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.treatment_files]
            return self.treatment_shapes
        
    def get_treatment_df(self):
        if self.treatment_df is not None:
            return self.treatment_df
        else:
            treatment_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.treatment_files]
            self.treatment_df = pd.concat(treatment_dfs, axis=0)
            return self.treatment_df
        
    def get_entries_shapes(self):
        if self.entries_shapes is not None:
            return self.entries_shapes
        else:
            self.entries_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.entries_files]
            return self.entries_shapes
    
    def get_entries_df(self):
        if self.entries_df is not None:
            return self.entries_df
        else:
            self.entries_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.entries_files]
            return self.entries_df
            
    def check_equivalence(self, treatment_df=None, check_unique=False):
        """Check rather treatment data is a subset of or equivalent to device status data"""
        if treatment_df is None:
            treatment_df = self.get_treatment_df()
        ds_df = self.get_device_status_df()
        if treatment_df.shape == ds_df.shape:
            treat_array = self.to_string_and_numpy(treatment_df)
            ds_array = self.to_string_and_numpy(ds_df)
            if np.array_equal(treat_array, ds_array):
                print("Equivalent full treatment and device status")
                return 1
            else:
                print("Equivalent full treatment and device status shape but not elements")
                return 0
        elif treatment_df.shape in self.get_device_status_shapes():
            idx = self.get_device_status_shapes().index(treatment_df.shape)
            ds_df = pd.read_csv(self.device_status_files[idx], low_memory=False)
            ds_array = self.to_string_and_numpy(ds_df)
            treat_array = self.to_string_and_numpy(treatment_df)
            if np.array_equal(treat_array, ds_array):
                print("Treatment equivalent to subset of device status")
                return 1 
            else:
                print("Treatment has equivalent shape to subset of device status but different elements")
                return 0 
        elif not check_unique:
            # Make a recursive check on the treatment dataframe with duplicates dropped 
            print('here')
            self.check_equivalence(treatment_df.drop_duplicates(), check_unique=True)
        else:
            print("Treatment data is not a copy of device status data")
    
    @staticmethod
    def to_string_and_numpy(df):
        return df.as_type('str').to_numpy()

**Subject 01352464: No duplicated treatment and device status**
Here, we validate that subject 01352464 does not have any duplicates from the treatment data frame and that the check equivalence function  

In [8]:
subject = subject_dirs[3]
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
# ds_shapes = sub_1.get_device_status_shapes()
# treat_shapes = sub_1.get_treatment_shapes()
treatment_df = sub_1.get_treatment_df()
ds_df = sub_1.get_device_status_df()

In [9]:
print(treatment_df.shape)
print(ds_df.shape)

(140527, 403)
(167799, 1222)


In [10]:
test = []
for t_col in treatment_df.columns:
    if t_col in ds_df.columns:
        test.append((t_col, 1))
    else:
        test.append((t_col, 0))
print(sum([x[1] for x in test]))

2


In [11]:
t = [x for x in test if x[1] ==1]
t

[('created_at', 1), ('_id', 1)]

In [12]:
subject = subject_dirs[3]
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
sub_3.check_equivalence()

here
no unique match


In [13]:
# Clean up variables
del sub_3
del treatment_df
del ds_df

**Subject 00221634: No duplicated treatment and device status**
In NightScoutJoinAnalysis, we saw that subject 00221634 has duplicate treatment and device status data. Evaluate rather `check_equivalence()` identifies the duplicate.

Written after running the below: For reasons, it appears that this data isn't as duplicated as I found in NightScoutJoinAnalysis. So, I'm just going to full join everything. Then, we can compress data across the relavent columns to shrink the dataset horizontally. Then, we'll still have multiple rows for each blood glucose entry. We can then groupby blood glucose entry and perform aggregations on the relevant columns.

In [28]:
subject = [s for s in subject_dirs if s.subject_id=="00221634"][0]
subject
sub_3 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)

In [24]:
print(subject.treatments)

['C:/Users/spenc/Documents/Berkeley/Capstone/n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED/00221634/direct-sharing-31/00221634_treatments_2018-03-01_to_2018-08-05_csv/00221634_treatments_2018-03-01_to_2018-08-05_aa.csv', 'C:/Users/spenc/Documents/Berkeley/Capstone/n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED/00221634/direct-sharing-31/00221634_treatments_2018-03-01_to_2018-08-05_csv/00221634_treatments_2018-03-01_to_2018-08-05_ab.csv']


In [22]:
sub_3.check_equivalence()

here
Treatment data is not a copy of device status data


In [23]:
treatment_df = sub_3.get_treatment_df()
print(treatment_df.shape)
ds_df = sub_3.get_device_status_df()
print(ds_df.shape)

(26689, 59)
(53877, 654)


In [72]:
pd.to_datetime(treatment_df.created_at)

0       2018-08-04 23:47:05+00:00
1       2018-08-04 23:01:08+00:00
2       2018-08-04 22:10:26+00:00
3       2018-08-04 22:10:26+00:00
4       2018-08-04 21:49:05+00:00
                   ...           
11684   2018-03-01 17:37:00+00:00
11685   2018-03-01 15:01:00+00:00
11686   2018-03-01 14:17:00+00:00
11687   2018-03-01 12:17:51+00:00
11688   2018-03-01 06:37:58+00:00
Name: created_at, Length: 26689, dtype: datetime64[ns, UTC]

In [25]:
len(treatment_df._id.drop_duplicates())

26689

## Verify all entries have the same number of columns

This verifies all entry files have the same shape and can be naively unioned.

In [None]:
subject_objs = []
for x in subject_dirs:
    subject_objs.append(
        Subject(x.subject_id, x.path, x.treatments, x.devicestatus, x.entries)
    )
for x in subject_objs:
    shapes = x.get_entries_shapes()
    for shape in shapes:
        if shape[1] != 2:
            print(f"{x.subject_id} has malconformed shape. {shapes}")
            break 

## Raw join of the data 

In [3]:
class Subject:
    
    def __init__(self, subject_id, subject_path, treatment_files, device_status_files, entries_files):
        self.subject_id = subject_id
        self.subject_path = subject_path
        
        self.treatment_files = treatment_files
        self.treatment_shapes = None
        self.treatment_df = None
        
        self.device_status_files = device_status_files
        self.device_status_shapes = None
        self.device_status_df = None
        
        self.entries_files = entries_files
        self.entries_shapes = None
        self.entries_df = None 
        
        self.join_table = None 
        
    def get_device_status_shapes(self):
        if self.device_status_shapes is not None:
            return self.device_status_shapes
        else:
            self.device_status_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.device_status_files]
            return self.device_status_shapes
    
    def get_device_status_df(self):
        if self.device_status_df is not None:
            return self.device_status_df
        else:
            device_status_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.device_status_files]
            self.device_status_df = pd.concat(device_status_dfs, axis=0)
            self.device_status_df['timestamp'] = pd.to_datetime(self.device_status_df['created_at'])
            self.device_status_df['devicestatusid'] = [i for i in range(len(self.device_status_df))]
            return self.device_status_df 
    
    def get_treatment_shapes(self):
        if self.treatment_shapes is not None:
            return self.treatment_shapes
        else:
            self.treatment_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.treatment_files]
            return self.treatment_shapes
        
    def get_treatment_df(self):
        if self.treatment_df is not None:
            return self.treatment_df
        else:
            treatment_dfs = [pd.read_csv(file, low_memory=False).reset_index(drop=True) for file in self.treatment_files]
            self.treatment_df = pd.concat(treatment_dfs, axis=0)
            try:
                self.treatment_df['timestamp'] = pd.to_datetime(self.treatment_df["created_at"])
            except ValueError:
                # Somewhat randomly, for unclear reasons, we receive the following error:
                # ValueError: cannot reindex from a duplicate axis
                # Resetting index seems to resolve this.
                self.treatment_df = self.treatment_df.reset_index()
                self.treatment_df['timestamp'] = pd.to_datetime(self.treatment_df["created_at"])
            self.treatment_df['treatmentid'] = [i for i in range(len(self.treatment_df))]
            return self.treatment_df
        
    def get_entries_shapes(self):
        if self.entries_shapes is not None:
            return self.entries_shapes
        else:
            self.entries_shapes = [pd.read_csv(file, low_memory=False).shape for file in self.entries_files]
            return self.entries_shapes
    
    def get_entries_df(self):
        if self.entries_df is not None:
            return self.entries_df
        else:
            entries_dfs = [pd.read_csv(file, low_memory=False, header=None).reset_index(drop=True) for file in self.entries_files]
            self.entries_df = pd.concat(entries_dfs, axis=0)
            self.entries_df.columns = ["time", "bg"]
            self.entries_df['timestamp'] = pd.to_datetime(self.entries_df['time'])
            self.entries_df['entryid']  = [i for i in range(len(self.entries_df))]
            return self.entries_df

    def temporal_join(self):
        
        # Load tables and convert relevant columns to date times
        entries = self.get_entries_df()
        treatments = self.get_treatment_df()
        device_status = self.get_device_status_df()
        
        # Store timestamp and entries in zipped list
        timestamp_keys = entries['timestamp'].to_list()
        entry_id_list = entries['entryid'].to_list()
        zipped = list(zip(timestamp_keys, entry_id_list))

        # fill in standard python dictionary with entry data; convert to SortedDict sorted on entry timestamps
        index_dict = SortedDict({timestamp: (entry_id, {"device_status": [], "treatment": []}) for timestamp, entry_id in zipped})

        # Generate list of tuples for (devicetimestamp, deviceid) 
        device_tuples = list(zip(device_status['timestamp'], device_status['devicestatusid']))
        
        # Generate list of tuples for (devicetimestamp, deviceid) 
        treatments_tuples = list(zip(treatments['timestamp'], treatments['treatmentid']))
        
        # Set constants from index_dict
        index_keys = index_dict.keys()
        max_idx = index_dict.index(index_keys[len(index_keys)-1])
        
        for comparison_timestamp, comparison_id in device_tuples:
            # Left idx is the index of the entry timestamp the comparison timestamp is less than or equal to 
            left_idx = index_dict.bisect_left(comparison_timestamp) 
            
            # Assign comparison timestamps greater than the last entry to the last entry
            # (Comparisons < min(entry timestamp) will naturally be joined to min(entry timestamp))
            if left_idx >= max_idx:
                left_idx = max_idx
            
            # Get the index_dict key associated with the bisect_left operation
            assignment_key = index_keys[left_idx]
            
            # Assign the comparison_id to the assignment key of the index_dict
            index_dict[assignment_key][1]['device_status'].append(comparison_id)
            
        # Equivalent to the above for-loop but for treatments_tuples    
        for comparison_timestamp, comparison_id in device_tuples:
            # Left idx is the index of the entry timestamp the comparison timestamp is less than or equal to 
            left_idx = index_dict.bisect_left(comparison_timestamp) 

            # Assign comparison timestamps greater than the last entry to the last entry
            # (Comparisons < min(entry timestamp) will naturally be joined to min(entry timestamp))
            if left_idx >= max_idx:
                left_idx = max_idx

            # Get the index_dict key associated with the bisect_left operation
            assignment_key = index_keys[left_idx]

            # Assign the comparison_id to the assignment key of the index_dict
            index_dict[assignment_key][1]['treatment'].append(comparison_id)
        
        return index_dict

In [4]:
# subject = [s for s in subject_dirs if s.subject_id=="00221634"][0]
subject = subject_dirs[5]
sub = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
print(sub.subject_id)

02199852


In [5]:
# print(treatment_df.columns.is_unique)
# print(treatment_df.index.is_unique)
# treatment_df=  treatment_df.reset_index()
# print(treatment_df.index.is_unique)

NameError: name 'treatment_df' is not defined

In [6]:
treatment_df = sub.get_treatment_df()
treatment_df['created_at'] = pd.to_datetime(treatment_df.created_at)
treatment_df.reset_index().index.is_unique

True

In [7]:
test= sub.temporal_join()

In the below cell, it's curious that we see the same treatment and device status ID's given that these are independently created. 

In [8]:
i = 0
for k,v in test.items():
    if len(v[1]["device_status"]) and len(v[1]['treatment']):
        print(v)
        i +=1
    if i > 20:
        break

(49331, {'device_status': [121681, 121682, 121683], 'treatment': [121681, 121682, 121683]})
(49330, {'device_status': [121675, 121676, 121677, 121678, 121679, 121680], 'treatment': [121675, 121676, 121677, 121678, 121679, 121680]})
(49329, {'device_status': [121667, 121668, 121669, 121670, 121671, 121672, 121673, 121674], 'treatment': [121667, 121668, 121669, 121670, 121671, 121672, 121673, 121674]})
(49328, {'device_status': [121663, 121664, 121665, 121666], 'treatment': [121663, 121664, 121665, 121666]})
(49327, {'device_status': [121654, 121655, 121656, 121657, 121658, 121659, 121660, 121661, 121662], 'treatment': [121654, 121655, 121656, 121657, 121658, 121659, 121660, 121661, 121662]})
(49326, {'device_status': [121645, 121646, 121647, 121648, 121649, 121650, 121651, 121652, 121653], 'treatment': [121645, 121646, 121647, 121648, 121649, 121650, 121651, 121652, 121653]})
(49325, {'device_status': [121637, 121638, 121639, 121640, 121641, 121642, 121643, 121644], 'treatment': [121637

Based on the cell below, it appears the device status and treatment ID's are never different where there are matched values. It appears that device status and treatment data both align with a portion of the entries data, AND they align with the same portion of the entries data. Therefore, we have some subset of entries data with treatment and device status data most of the time. (I re-ran this process for several subjects). 

In [9]:
# Evalute 
i = 0
matched =0 
not_empty_unmatched = 0
matched_empty = 0
for k,v in test.items():
    if (v[1]["device_status"] == v[1]['treatment']) and len(v[1]["device_status"]) >=1:
        matched += 1 
    elif len(v[1]["device_status"]) >1 or len(v[1]["treatment"]) >1:
        print("Not matched \n", v)
        not_matched +=1
    elif (v[1]["device_status"] == v[1]['treatment']) and len(v[1]["device_status"]) <1:
        matched_empty +=1 
    else:
        print(v)
#     if i > 10:
#         break
print(len(test))
print("not empty matched: ", matched)
print("not empty unmatched:", not_empty_unmatched)
print("matched empty:" ,matched_empty)

51753
not empty matched:  39469
not empty unmatched: 0
matched empty: 12284


Now, let's look into making the join dataframe from the temporal_join output.

In [10]:
keys = test.keys()
for k in keys[15000:15005]:
    print(test[k])

(36686, {'device_status': [94551], 'treatment': [94551]})
(36685, {'device_status': [94550], 'treatment': [94550]})
(36684, {'device_status': [94549], 'treatment': [94549]})
(36683, {'device_status': [94548], 'treatment': [94548]})
(36682, {'device_status': [94547], 'treatment': [94547]})


In [11]:
ds_and_treatment_id = []
for k in keys:
    device_ids = test[k][1]['device_status']
    treatment_ids = test[k][1]['treatment']
    if len(device_ids) != len(treatment_ids):
        raise Exception(f"Nonconformant device and treatment lengths ({len(device_ids)}, {len(treatment_ids)}")
    
    entry_ids = [test[k][0] for _ in range(len(device_ids))]
    ds_and_treatment_id.extend(list(zip(entry_ids, devices, treatments)))
join_dict = {"entryid": [i[0] for  i in ds_and_treatment_id],
             "devicestatusid": [i[1] for i in ds_and_treatment_id],
             "treatmentid": [i[2] for i in ds_and_treatment_id]}
join_df = pd.DataFrame(join_dict)
join_df.head()

NameError: name 'devices' is not defined

In [None]:
entry_df.loc[entry_df['entryid'] == 51690, : ]

In [None]:
ds_df = sub.get_device_status_df()
treat_df = sub.get_treatment_df()
entry_df = sub.get_entries_df()
join_df.merge(entry_df, how='left', left_on ='entryid', right_on='entryid')


In [None]:
ds_df.head()

In [None]:
joined = (join_df
          .merge(entry_df, how='left', left_on ='entryid', right_on='entryid', suffixes=("_x","_ent"))
          .merge(ds_df, how='left', left_on="devicestatusid", right_on="devicestatusid", suffixes=("_y","_ds"))
          .merge(treat_df, how='left', left_on="treatmentid", right_on="treatmentid", suffixes=("_z","_tre"))
         )
joined.head()

In [294]:
joined.shape

(65104, 546)

In [292]:
joined.isna().sum()

entryid               0
devicestatusid        0
treatmentid           0
time                  0
bg                    0
                  ...  
ratio             65104
units             65104
glucoseType       65104
glucose           65104
isAnnouncement    65104
Length: 546, dtype: int64