# <center> 1. Data Cleaning and Preprocessing </center>

In [1]:
# importing all necessary libraries

import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
# Merging the CGM data for all 25 patients into one data frame, adding a patient_id
# column as identifier and then joining with the demographic and sleep info.

# Step 1: Load all CGM files
# glob.glob() returns matches in arbitrary, OS-dependent order; sorting keeps the
# row order of the combined frame reproducible across machines and re-runs.
path = "../data/raw/HUPA*.csv"
files = sorted(glob.glob(path))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the pattern matches nothing.
if not files:
    raise FileNotFoundError(f"No CGM files found matching pattern: {path}")

df_list = []
for f in files:
    temp = pd.read_csv(f, delimiter=";", header=0)  # adjust delimiter if needed
    # The file name without its extension (e.g. "HUPA0001P") is used as the patient identifier.
    temp['patient_id'] = os.path.splitext(os.path.basename(f))[0]
    df_list.append(temp)

dfCGMData = pd.concat(df_list, ignore_index=True)

# Step 2: Load patient metadata and align its key column name with the CGM data
dfDemographicsData = pd.read_csv(
    "../data/raw/T1DM_patient_sleep_demographics_with_race.csv"
).rename(columns={"Patient_ID": "patient_id"})

# Step 3: Merge on patient_id
# how="left" keeps every CGM row even when no metadata row matches.
# validate="many_to_one" raises pandas.errors.MergeError if a patient_id occurs more
# than once in the metadata, which would otherwise silently duplicate CGM rows.
dfT1DData = dfCGMData.merge(
    dfDemographicsData,
    on="patient_id",
    how="left",
    validate="many_to_one",
)

# Step 4: Check result
dfT1DData.head()


Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,patient_id,Age,Gender,Race,Average Sleep Duration (hrs),Sleep Quality (1-10),% with Sleep Disturbances
0,2018-06-13T18:40:00,332.0,6.3595,82.322835,34.0,0.091667,0.0,0.0,HUPA0001P,34,Male,Other,6.3,4.5,80
1,2018-06-13T18:45:00,326.0,7.728,83.740157,0.0,0.091667,0.0,0.0,HUPA0001P,34,Male,Other,6.3,4.5,80
2,2018-06-13T18:50:00,330.0,4.7495,80.52518,0.0,0.091667,0.0,0.0,HUPA0001P,34,Male,Other,6.3,4.5,80
3,2018-06-13T18:55:00,324.0,6.3595,89.129032,20.0,0.091667,0.0,0.0,HUPA0001P,34,Male,Other,6.3,4.5,80
4,2018-06-13T19:00:00,306.0,5.152,92.495652,0.0,0.075,0.0,0.0,HUPA0001P,34,Male,Other,6.3,4.5,80


In [43]:
# Checking that each row in the CGM data found a corresponding patient demographics
# and sleep data row.
# The join key itself is compared instead of looking for NaN in a merged column such
# as 'Age': a NaN there could also come from a value that is blank in the metadata
# file, not only from a patient that has no metadata row at all.

has_demographics = dfT1DData['patient_id'].isin(dfDemographicsData['patient_id'])
missing_demographics = (~has_demographics).sum()
print(f"Number of CGM rows without matching demographics: {missing_demographics}")

Number of CGM rows without matching demographics: 0


In [47]:
# Sanity checks on the merged frame: how many distinct patients each source
# contains, which patient ids are present, and the per-column count of nulls.

for frame, label in (
    (dfT1DData, "unique patients in CGM data"),
    (dfDemographicsData, "unique patients in metadata"),
):
    print(frame['patient_id'].nunique(), label)

patient_ids = dfT1DData['patient_id'].unique()
print(patient_ids)

# Last expression of the cell: rendered as the cell output.
dfT1DData.isnull().sum()

25 unique patients in CGM data
25 unique patients in metadata
['HUPA0001P' 'HUPA0002P' 'HUPA0003P' 'HUPA0004P' 'HUPA0005P' 'HUPA0006P'
 'HUPA0007P' 'HUPA0009P' 'HUPA0010P' 'HUPA0011P' 'HUPA0014P' 'HUPA0015P'
 'HUPA0016P' 'HUPA0017P' 'HUPA0018P' 'HUPA0019P' 'HUPA0020P' 'HUPA0021P'
 'HUPA0022P' 'HUPA0023P' 'HUPA0024P' 'HUPA0025P' 'HUPA0026P' 'HUPA0027P'
 'HUPA0028P']


time                            0
glucose                         0
calories                        0
heart_rate                      0
steps                           0
basal_rate                      0
bolus_volume_delivered          0
carb_input                      0
patient_id                      0
Age                             0
Gender                          0
Race                            0
Average Sleep Duration (hrs)    0
Sleep Quality (1-10)            0
% with Sleep Disturbances       0
dtype: int64