So this file is for getting the relevant CGM data range of values that fall within the measurement window for OGTT, HFMM etc.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
import re

In [2]:
def simplify_id(id_string):
    # Extract the numeric part
    num_part = re.search(r'\d+', id_string).group()
    # Remove leading zeros
    simplified_num = str(int(num_part))
    # Keep only the last two digits (or one if it's a single digit)
    simplified_num = simplified_num[-2:] if len(simplified_num) > 1 else simplified_num
    return f"Ex{simplified_num}"

In [3]:
def parse_date_flexible(date_string):
    date_formats = ['%m/%d/%Y', '%d/%m/%Y', '%Y-%m-%d']
    for fmt in date_formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unable to parse date: {date_string}")


In [4]:
def parse_time_flexible(time_string):
    time_formats = ['%I:%M:%S %p', '%H:%M:%S', '%I:%M %p', '%H:%M']
    for fmt in time_formats:
        try:
            return datetime.strptime(time_string, fmt).time()
        except ValueError:
            continue
    raise ValueError(f"Unable to parse time: {time_string}")

In [6]:
# Read the input files
time_hfmm = pd.read_csv('Time_OGTT_MMT.csv')
cgm_data = pd.read_csv('CGM_Data_Table.csv')

In [7]:
# Convert timestamp columns to datetime
cgm_data['Timestamp'] = pd.to_datetime(cgm_data['Timestamp'])

In [8]:
# Process HFMM data
processed_data = []
missing_cgm_data = []

In [9]:
for _, row in time_hfmm.iterrows():
    participant_id = row['c11_rand_nr']
    hfmm_date = row['c13_date']
    start_time = row['c13_HFMM_startdrink']
    end_time = row['c13_HFMM_act240']
    
    # Skip if any required data is missing
    if pd.isna(hfmm_date) or pd.isna(start_time) or pd.isna(end_time):
        continue
    
    try:
        hfmm_date = parse_date_flexible(hfmm_date)
        start_time = parse_time_flexible(start_time)
        end_time = parse_time_flexible(end_time)
        
        start_datetime = datetime.combine(hfmm_date, start_time)
        end_datetime = datetime.combine(hfmm_date, end_time)
        
        # Handle case where end time is on the next day
        if end_datetime <= start_datetime:
            end_datetime += timedelta(days=1)
        
        print(f"Processing ID: {participant_id}, HFMM Date: {hfmm_date.date()}")
        
        # Simplify the ID
        simplified_id = simplify_id(participant_id)
        
        # Filter CGM data for this participant and time window
        participant_cgm = cgm_data[
            (cgm_data['Participant'] == participant_id) &
            (cgm_data['Timestamp'] >= start_datetime) &
            (cgm_data['Timestamp'] <= end_datetime)
        ]
        
        print(f"ID {participant_id}: {len(participant_cgm)} CGM readings")
        
        if len(participant_cgm) == 0:
            missing_cgm_data.append(participant_id)
        
        # Process each CGM reading
        for _, cgm_row in participant_cgm.iterrows():
            time_diff = (cgm_row['Timestamp'] - start_datetime).total_seconds() / 60  # Convert to minutes
            processed_data.append({
                'ID': simplified_id,
                'Condition': 'HFMM',
                'test': 'CGM',
                'metab': 'gluc',
                'VAL': cgm_row['Sensor Glucose (mmol/L)'],
                'time': round(time_diff)  # Rounding to nearest minute
            })
    
    except ValueError as e:
        print(f"Error processing ID {participant_id}: {str(e)}")

# Create a DataFrame from the processed data
output_df = pd.DataFrame(processed_data)

# Sort the DataFrame by ID and time
output_df = output_df.sort_values(['ID', 'time'])

# Save to CSV
output_df.to_csv('processed_cgm_hfmm_data.csv', index=False)
print("Processed HFMM data saved to processed_cgm_hfmm_data.csv")

# Report on missing CGM data
if missing_cgm_data:
    print("\nParticipants with missing CGM data for HFMM:")
    for participant in missing_cgm_data:
        print(participant)
else:
    print("\nAll participants have CGM data for HFMM")

# Additional analysis
total_participants = time_hfmm['c11_rand_nr'].nunique()
participants_with_data = output_df['ID'].nunique()

print(f"\nTotal participants in Time_OGTT_MMT: {total_participants}")
print(f"Participants with HFMM CGM data: {participants_with_data}")
print(f"Participants missing HFMM CGM data: {total_participants - participants_with_data}")

# Check for participants in Time_OGTT_MMT but not in the final output
time_hfmm_ids = set(simplify_id(id) for id in time_hfmm['c11_rand_nr'])
output_ids = set(output_df['ID'])
missing_ids = time_hfmm_ids - output_ids

if missing_ids:
    print("\nParticipants in Time_OGTT_MMT but not in final output:")
    print(missing_ids)

Processing ID: PS010001, HFMM Date: 2018-07-23
ID PS010001: 36 CGM readings
Processing ID: PS010002, HFMM Date: 2019-01-30
ID PS010002: 47 CGM readings
Processing ID: PS010003, HFMM Date: 2018-09-25
ID PS010003: 0 CGM readings
Processing ID: PS010004, HFMM Date: 2018-11-09
ID PS010004: 0 CGM readings
Processing ID: PS010005, HFMM Date: 2018-04-09
ID PS010005: 0 CGM readings
Processing ID: PS010007, HFMM Date: 2018-10-29
ID PS010007: 0 CGM readings
Processing ID: PS010006, HFMM Date: 2018-10-23
ID PS010006: 48 CGM readings
Processing ID: PS010008, HFMM Date: 2018-10-22
ID PS010008: 48 CGM readings
Processing ID: PS010010, HFMM Date: 2018-12-11
ID PS010010: 0 CGM readings
Error processing ID PS010009: Unable to parse time: -95
Processing ID: PS010012, HFMM Date: 2018-11-28
ID PS010012: 0 CGM readings
Processing ID: PS010014, HFMM Date: 2019-01-16
ID PS010014: 0 CGM readings
Processing ID: PS010011, HFMM Date: 2018-04-12
ID PS010011: 0 CGM readings
Processing ID: PS010013, HFMM Date: 2019

In [10]:
# DataFrame from the processed data
output_df = pd.DataFrame(processed_data)

In [11]:
# Sort DataFrame by ID and time
output_df = output_df.sort_values(['ID', 'time'])

In [12]:
# Save to CSV
output_df.to_csv('processed_cgm_data.csv', index=False)
print("Processed data saved to processed_cgm_data.csv")

Processed data saved to processed_cgm_data.csv


In [13]:
def extract_last_two(id_string):
    return re.search(r'\d+', id_string).group()[-2:].zfill(2)

In [15]:
ogtt_ids = set(extract_last_two(id) for id in time_ogtt['c11_rand_nr'])
processed_ids = set(id[2:] for id in output_df['ID'])  # Remove 'Ex' prefix
missing_ids = ogtt_ids - processed_ids

print("Sample OGTT IDs:", list(ogtt_ids)[:5])
print("Sample Processed IDs:", list(processed_ids)[:5])
print("Number of OGTT IDs:", len(ogtt_ids))
print("Number of Processed IDs:", len(processed_ids))
print("Number of Missing IDs:", len(missing_ids))

if missing_ids:
    print("\nMissing IDs (last two digits):")
    print(missing_ids)
    
    # To get full original IDs of missing entries
    missing_full_ids = [id for id in time_ogtt['c11_rand_nr'] if extract_last_two(id) in missing_ids]
    print("\nMissing full IDs:")
    print(missing_full_ids)

NameError: name 'time_ogtt' is not defined

**Note:** I tried to implement a method of comparison between the c12_date and c13_date as the c13 date typically was 4-5 days later however the date formatting is inconsistent between the two for many entries. 