# Aggregate Night Scout Statistics
Author: Spencer Weston

This notebook exists to begin evaluating the nightscout/openaps data as a whole. This notebook should help determine which variables we can drop and generate some metadata. We also need to validate some assumptions across all subjects. We'll get set up by accumulating all the file paths into a single list.

In [24]:
import os
import numpy as np
import pandas as pd 
import re 
from datetime import datetime, timedelta, timezone
import pytz
from collections import namedtuple

In [2]:
# Display the kernel's current working directory
os.getcwd()

'C:\\Users\\spenc\\Documents\\Berkeley\\Capstone\\BGPredict\\Notebooks'

In [56]:
SubjectInfo = namedtuple("SubjectInfo", ['path', 'subject_id', 'devicestatus', 'entries', 'treatments'])

def generate_subject_info(data_dir, subject_number, direct_sharing_folder='direct-sharing-31'):
    """Collect the treatments, devicestatus and entries CSV file paths of one subject.

    Parameters
    ----------
    data_dir : str
        Directory that holds one folder per subject.
    subject_number : str
        Name of the subject's folder inside ``data_dir``; stored as ``subject_id``.
    direct_sharing_folder : str, optional
        Name of the sub-folder that must exist inside the subject folder.

    Returns
    -------
    SubjectInfo or None
        ``None`` when the subject has no ``direct_sharing_folder``. Otherwise a
        ``SubjectInfo`` whose ``devicestatus``, ``entries`` and ``treatments``
        fields are lists of file paths (possibly empty), in sorted order.
    """
    path = f"{data_dir}/{subject_number}"
    # Some individuals have multiple direct-sharing folders. Ensure the correct folder exists and add it to path
    # For example, one individual has a folder with menstruation data
    if direct_sharing_folder not in os.listdir(path):
        return None
    path = f"{path}/{direct_sharing_folder}"

    relevant_dir_names = ["treatments", "devicestatus", "entries"]
    relevant_dirs = {k: [] for k in relevant_dir_names}
    # os.listdir returns names in arbitrary order; sorting keeps the file lists
    # (and anything that indexes into them) reproducible across runs and machines.
    for folder in sorted(os.listdir(path)):
        dir_path = f"{path}/{folder}"
        # Only "*_csv" directories hold the files of interest; skip stray files with that suffix
        if not folder.endswith("_csv") or not os.path.isdir(dir_path):
            continue
        files = [f"{dir_path}/{file}" for file in sorted(os.listdir(dir_path))]
        # Identify every file associated with treatments, device status, or entries and store them in a dictionary
        for dir_name in relevant_dir_names:
            if dir_name in folder:
                relevant_dirs[dir_name].extend(files)
    return SubjectInfo(path=path, subject_id=subject_number, devicestatus=relevant_dirs['devicestatus'],
                      entries=relevant_dirs['entries'], treatments=relevant_dirs['treatments'])
    
# Written with forward slashes so that no backslash sequence in the Windows path
# (for example the "\n" in "\n=183_...") is interpreted as an escape.
data_dir = "C:/Users/spenc/Documents/Berkeley/Capstone/n=183_OpenAPS_Data_Commons_August_2021_UNZIPPED"
# isnumeric validates that the folder comes from openaps/nightscout
# sorted() makes positional access such as subject_dirs[3] reproducible (os.listdir order is arbitrary)
subject_dirs = [generate_subject_info(data_dir, folder) for folder in sorted(os.listdir(data_dir)) if folder.isnumeric()]

# None is returned when a subject has no 'direct-sharing-31' folder; drop those subjects.
# A filter is used because list.index(None) returns 0 (falsy) when None is the first element.
subject_dirs = [info for info in subject_dirs if info is not None]

## Summary Statistics
Create some rough summary statistics based on the available files

In [71]:
# Number of files found per data category; each list has one count per subject, in subject_dirs order
file_presence = {
    category: [len(getattr(info, category)) for info in subject_dirs]
    for category in ("treatments", "devicestatus", "entries")
}

In [72]:
# Show which data categories are tracked in file_presence
file_presence.keys()

dict_keys(['treatments', 'devicestatus', 'entries'])

Here, we look at subjects with no device status but treatment data and subjects with no folders associated with their data. If they have no folders, that's an easy exclusion criterion. If they have treatment but not device status data, we'll need to validate whether the treatment data appears equivalently formatted to the device status from other folders. 

In [82]:
# Get count of the number of files for each subject.
# Each tuple is (subject_id, n_treatments, n_devicestatus, n_entries); the categories are
# named explicitly so the tuple layout does not depend on the dict's key order.
subjects = [x.subject_id for x in subject_dirs]
file_count = list(zip(
    subjects,
    file_presence["treatments"],
    file_presence["devicestatus"],
    file_presence["entries"],
))

# subjects with no device status but treatment data; will need to be evaluated for data integrity
no_device_but_treatment = []
# Subjects with no folders associated with their data; exclusion criterion
no_folders = []
# No blood glucose data; exclusion criterion
no_entries = []

for subject_id, n_treatments, n_devicestatus, n_entries in file_count:
    # no entries
    if n_entries == 0:
        if n_treatments == 0 and n_devicestatus == 0:
            no_folders.append(subject_id)
        else:
            no_entries.append(subject_id)
        # `continue` (not the no-op expression `next`) so an excluded subject is not classified twice
        continue
    # no device status but treatment
    if n_devicestatus == 0 and n_treatments > 0:
        no_device_but_treatment.append(subject_id)

print(f"NDT: {no_device_but_treatment} \n \n NF: {no_folders} \n \n NE: {no_entries}")

NDT: ['42052178', '50311906', '61179686', '66773091'] 
 
 NF: ['32635618', '51359431'] 
 
 NE: []


### Evaluate Subjects with No Device Status but Treatment Tables

In [83]:
# Functions from NightScoutJoinAnalysis
def define_column_superset(dataframes: list):
    """Return the set of every column name that appears in at least one dataframe."""
    return set().union(*(frame.columns for frame in dataframes))

def apply_superset(df, superset):
    """Return a copy of ``df`` padded with an all-None column for every name in
    ``superset`` that ``df`` does not already have.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pad; it is not modified.
    superset : iterable of column names
        Columns the result must contain (typically from ``define_column_superset``).

    Returns
    -------
    pandas.DataFrame
        ``df``'s columns followed by the missing ones, with the same rows and index as ``df``.
    """
    existing = set(df.columns)
    missing = [col for col in superset if col not in existing]
    # The filler is built on df's own index: pd.concat(axis=1) aligns on index labels, so a
    # default 0..n-1 index would misalign (and add rows) whenever df is indexed differently.
    filler = pd.DataFrame({col: [None] * len(df) for col in missing}, index=df.index)
    return pd.concat([df, filler], axis=1)

def concat_dfs(dataframes: list):
    """Stack the given dataframes row-wise into a single dataframe."""
    stacked = pd.concat(dataframes, axis="index")
    return stacked

In [95]:
# Load every treatment file of the subjects that have treatment data but no device status data
ndt_dfs = {}
ndt_subjects = [info for info in subject_dirs if info.subject_id in no_device_but_treatment]
for subj in ndt_subjects:
    print(f"Subject: {subj.subject_id}, Num_files: {len(subj.treatments)}")
    # dtype=str reads every column as text, so no per-file type inference takes place
    ndt_dfs[subj.subject_id] = [pd.read_csv(file, dtype=str) for file in subj.treatments]
    for df in ndt_dfs[subj.subject_id]:
        print(df.shape)

Subject: 42052178, Num_files: 6
(15000, 53)
(15000, 56)
(15000, 57)
(15000, 61)
(15000, 54)
(5911, 348)
Subject: 50311906, Num_files: 3
(205, 48)
(15000, 88)
(10835, 81)
Subject: 61179686, Num_files: 5
(15000, 258)
(15000, 16)
(15000, 16)
(15000, 16)
(1272, 18)
Subject: 66773091, Num_files: 6
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(15000, 16)
(10132, 16)


In [101]:
# Pad each treatment file of subject 42052178 to the shared column superset, then union them row-wise
df_list = list(ndt_dfs["42052178"])
superset = define_column_superset(df_list)
print('supersetting')
superset_dfs = list(map(lambda frame: apply_superset(frame, superset), df_list))
print('concatting')
union_df = concat_dfs(superset_dfs)

supersetting
concatting


In [116]:
# Rebuild the union from frames whose index has been reset, timing only the concat step
test = [df.reset_index(drop=True) for df in superset_dfs]
start = datetime.now()
union_df = pd.concat(test, axis = 0)
end = datetime.now()
# end - start is a timedelta, printed as H:MM:SS.ffffff
print("Unioned in:", end-start)

Unioned in: 0:00:00.359388


In [108]:
## Prints columns (disabled helper that lists the column names in small batches)
# cols = []
# for i,col in enumerate(union_df.columns):
#     cols.append(col)
#     if i % 5 == 0:
#         print(cols)
#         cols = []
# Summary statistics (count / unique / top / freq) for every column of the unioned treatments
union_df.describe()

Unnamed: 0,created_at,absolute,carbs,_id,duration,NSCLIENT_ID,rate,eventType,enteredBy,insulin,...,boluscalc/basaliobused,appended/0/_head,stale/insulin_sensitivies/4/_offset,key600,appended/0/_type,square/appended/0/_description,changed/carb_ratios/0/x,boluscalc/carbsneeded,wizard/_head,bolus/unabsorbed
count,80911,17261,6968,80911,25493,65482,8631,80911,47836,34599.0,...,901,3,6,1437,3,1,6,10,79,79
unique,66145,769,59,80911,151,64475,385,20,12,110.0,...,2,3,2,1437,1,1,1,1,56,44
top,2018-09-25T08:07:37Z,0,20,5cd69da3131daf5594f593e1,30,1545469901242,0,Correction Bolus,S6MIS6,0.1,...,true,5c0b965e8426cc843cd684,0,BG81D16AB6,UnabsorbedInsulinBolus,"UnabsorbedInsulinBolus unknown head[5], body[0...",0,0,5b00,0
freq,16,7488,1112,1,14730,9,5083,30533,21872,11987.0,...,892,1,4,1,3,1,6,10,10,22


We can see that these treatment tables hold a lot of relevant information. We'd love to keep this info if possible. We'll have to do a lot of work to process them, though.

## Evaluate Treatment Uniqueness
In Nightscout join analysis, we saw that a union of treatments with duplicates dropped exactly matched a device status file. Let's see how common that is. 

In [158]:
class Subject:
    """Lazy, cached access to one subject's treatment, device status and entries CSV files.

    Per-file shapes and concatenated dataframes are read from disk on first request
    and stored on the instance, so repeated calls do not re-read the files.
    """

    def __init__(self, subject_id, subject_path, treatment_files, device_status_files, entries_files):
        """Store the subject's identifiers and file lists; no file is read here."""
        self.subject_id = subject_id
        self.subject_path = subject_path

        self.treatment_files = treatment_files
        self.treatment_shapes = None
        self.treatment_df = None

        self.device_status_files = device_status_files
        self.device_status_shapes = None
        self.device_status_df = None

        self.entries_files = entries_files
        self.entries_shapes = None
        self.entries_df = None

    @staticmethod
    def _read_shapes(files):
        """Return the (rows, columns) shape of every CSV in ``files``, in file order."""
        return [pd.read_csv(file, low_memory=False).shape for file in files]

    @staticmethod
    def _read_and_concat(files):
        """Read every CSV in ``files`` and stack them row-wise under one fresh 0..N-1 index."""
        frames = [pd.read_csv(file, low_memory=False) for file in files]
        if not frames:
            # pd.concat raises ValueError on an empty list; subjects without files get an empty frame
            return pd.DataFrame()
        return pd.concat(frames, axis=0, ignore_index=True)

    def get_device_status_shapes(self):
        """Return (and cache) the shape of each device status file."""
        if self.device_status_shapes is None:
            self.device_status_shapes = self._read_shapes(self.device_status_files)
        return self.device_status_shapes

    def get_device_status_df(self):
        """Return (and cache) all device status files concatenated row-wise."""
        if self.device_status_df is None:
            self.device_status_df = self._read_and_concat(self.device_status_files)
        return self.device_status_df

    def get_treatment_shapes(self):
        """Return (and cache) the shape of each treatment file."""
        if self.treatment_shapes is None:
            self.treatment_shapes = self._read_shapes(self.treatment_files)
        return self.treatment_shapes

    def get_treatment_df(self):
        """Return (and cache) all treatment files concatenated row-wise."""
        if self.treatment_df is None:
            self.treatment_df = self._read_and_concat(self.treatment_files)
        return self.treatment_df

    def get_entries_shapes(self):
        """Return (and cache) the shape of each entries file."""
        if self.entries_shapes is None:
            self.entries_shapes = self._read_shapes(self.entries_files)
        return self.entries_shapes

    def get_entries_df(self):
        """Return (and cache) all entries files concatenated row-wise."""
        if self.entries_df is None:
            self.entries_df = self._read_and_concat(self.entries_files)
        return self.entries_df

    def check_equivalence(self, treatment_df=None, check_unique=False):
        """Check whether treatment data is a subset of or equivalent to device status data.

        The treatment frame is compared, as strings, first against the full device
        status frame and then against the single device status file with the same
        shape. If neither shape matches, the check is retried once with duplicate
        treatment rows dropped.

        Parameters
        ----------
        treatment_df : pandas.DataFrame, optional
            Frame to compare; defaults to ``self.get_treatment_df()``.
        check_unique : bool
            True when duplicates have already been dropped, which stops further retries.

        Returns
        -------
        int
            1 when an element-wise match is found, 0 otherwise.
        """
        if treatment_df is None:
            treatment_df = self.get_treatment_df()
        ds_df = self.get_device_status_df()
        treat_array = None

        if treatment_df.shape == ds_df.shape:
            treat_array = self.to_string_and_numpy(treatment_df)
            ds_array = self.to_string_and_numpy(ds_df)
            if np.array_equal(treat_array, ds_array):
                print("Equivalent full treatment and device status")
                return 1
            print("Equivalent full treatment and device status shape but not elements")
            return 0

        # Per-file shapes are only loaded once the full-frame comparison does not apply
        ds_shapes = self.get_device_status_shapes()
        if treatment_df.shape in ds_shapes:
            idx = ds_shapes.index(treatment_df.shape)
            ds_df = pd.read_csv(self.device_status_files[idx], low_memory=False)
            ds_array = self.to_string_and_numpy(ds_df)
            treat_array = self.to_string_and_numpy(treatment_df)
            if np.array_equal(treat_array, ds_array):
                print("Treatment equivalent to subset of device status")
                return 1
            print("Treatment has equivalent shape to subset of device status but different elements")
            return 0

        if not check_unique:
            # Debug marker for the de-duplicated retry
            print('here')
            # Propagate the retry's verdict instead of discarding it
            return self.check_equivalence(treatment_df.drop_duplicates(), check_unique=True)

        print("Treatment shape matches neither the full device status nor any single device status file")
        return 0

    @staticmethod
    def to_string_and_numpy(df):
        """Return ``df`` as a numpy array with every value cast to str."""
        return df.astype('str').to_numpy()

In [163]:
# Build a Subject for the entry at index 3 of subject_dirs and load its concatenated frames
subject = subject_dirs[3]
sub_1 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
# ds_shapes = sub_1.get_device_status_shapes()
# treat_shapes = sub_1.get_treatment_shapes()
# Each call reads and concatenates every file of that category for this subject
treatment_df = sub_1.get_treatment_df()
ds_df = sub_1.get_device_status_df()

**Duplicated treatment and device status**

This is great. There will be multiple ways that treatment files and device status files are equivalent. But so far, they always seem to be equivalent.

In [164]:
# Compare the overall (rows, columns) shapes of the concatenated treatment and device status frames
print(treatment_df.shape)
print(ds_df.shape)

(140527, 403)
(167799, 1222)


In [166]:
# Pair each treatment column with 1 if the device status frame has a column of the same name, else 0
test = [(t_col, int(t_col in ds_df.columns)) for t_col in treatment_df.columns]
# The sum of the flags is the number of shared column names
print(sum(flag for _, flag in test))

2


In [167]:
# Keep only the (column, flag) pairs whose flag is 1, i.e. columns present in both frames
t = [x for x in test if x[1] ==1]
t

[('created_at', 1), ('_id', 1)]

In [165]:
# Cast every value to str and compare the two frames as numpy arrays.
# NOTE(review): the shapes printed earlier differ, so np.array_equal could only return False here;
# in the recorded run this cell did not finish and ended in a TypeError inside to_numpy().
treatment_array = treatment_df.astype('str').to_numpy()
ds_array = ds_df.astype('str').to_numpy()
np.array_equal(treatment_array, ds_array)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\spenc\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\spenc\AppData\Local\Temp/ipykernel_12712/1806158811.py", line 1, in <module>
    treatment_array = treatment_df.astype('str').to_numpy()
  File "C:\Users\spenc\anaconda3\lib\site-packages\pandas\core\frame.py", line 1655, in to_numpy
    self._consolidate_inplace()
  File "C:\Users\spenc\anaconda3\lib\site-packages\pandas\core\generic.py", line 5565, in _consolidate_inplace
    self._protect_consolidate(f)
  File "C:\Users\spenc\anaconda3\lib\site-packages\pandas\core\generic.py", line 5553, in _protect_consolidate
    result = f()
  File "C:\Users\spenc\anaconda3\lib\site-packages\pandas\core\generic.py", line 5563, in f
    self._mgr = self._mgr.consolidate()
  File "C:\Users\spenc\anaconda3\lib\site-packages\pandas\core\internals\managers.py", line 619, in consolidate
   

TypeError: object of type 'NoneType' has no len()

## Verify all entries have the same number of columns

This verifies all entry files have the same shape and can be naively unioned.

In [147]:
# Wrap every subject's file lists in a Subject object
subject_objs = [
    Subject(info.subject_id, info.path, info.treatments, info.devicestatus, info.entries)
    for info in subject_dirs
]
for x in subject_objs:
    shapes = x.get_entries_shapes()
    # Every entries file is expected to have exactly 2 columns; report an offending subject once
    if any(shape[1] != 2 for shape in shapes):
        print(f"{x.subject_id} has malconformed shape. {shapes}")

## Check for equivalence of all treatment tables to all device status tables

In [159]:
# Run the treatment vs. device status equivalence check for the subject at index 3 of subject_dirs
subject = subject_dirs[3]
sub_1 = Subject(subject.subject_id, subject.path, subject.treatments, subject.devicestatus, subject.entries)
sub_1.check_equivalence()

here


In [None]:
# NOTE(review): `results` is not assigned anywhere in the visible notebook and this cell was never
# executed; on a fresh kernel it would presumably raise NameError — confirm what it should display.
results