So this file is for getting the relevant CGM data range of values that fall within the measurement window for OGTT, HFMM etc.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import defaultdict
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def simplify_id(id_string):
    # Extract the numeric part
    num_part = re.search(r'\d+', id_string).group()
    # Remove leading zeros
    simplified_num = str(int(num_part))
    # Keep only the last two digits (or one if it's a single digit)
    simplified_num = simplified_num[-2:] if len(simplified_num) > 1 else simplified_num
    return f"Ex{simplified_num}"

In [3]:
def parse_date_flexible(date_string):
    date_formats = ['%m/%d/%Y', '%d/%m/%Y']
    for fmt in date_formats:
        try:
            return datetime.strptime(date_string, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unable to parse date: {date_string}")

In [4]:
def is_date_ambiguous(date_obj):
    return date_obj.day <= 12

In [5]:
def get_alternate_date(date_obj):
    try:
        return date_obj.replace(month=date_obj.day, day=date_obj.month)
    except ValueError:
        return None  # Return None if the alternate date is invalid

In [6]:
# Read the input files
time_ogtt = pd.read_csv('updated_Time_OGTT_MMT.csv')
cgm_data = pd.read_csv('CGM_Data_Table.csv')

In [7]:
# Convert timestamp columns to datetime
time_ogtt['c12_date'] = pd.to_datetime(time_ogtt['c12_date'].apply(parse_date_flexible))
time_ogtt['c12_OGTT_startdrink'] = pd.to_datetime(time_ogtt['c12_date'].dt.strftime('%Y-%m-%d') + ' ' + time_ogtt['c12_OGTT_startdrink'], format='%Y-%m-%d %I:%M:%S %p')
time_ogtt['c12_OGTT_act120'] = pd.to_datetime(time_ogtt['c12_date'].dt.strftime('%Y-%m-%d') + ' ' + time_ogtt['c12_OGTT_act120'], format='%Y-%m-%d %I:%M:%S %p')
cgm_data['Timestamp'] = pd.to_datetime(cgm_data['Timestamp'])

In [11]:
# empty list to store the processed data
processed_data = []
ambiguous_dates_list = []
ambiguous_dates_with_data = defaultdict(list)

In [12]:
# Process each participant
for _, row in time_ogtt.iterrows():
    participant_id = row['c11_rand_nr']
    ogtt_date = row['c12_date']
    start_time = row['c12_OGTT_startdrink']
    end_time = row['c12_OGTT_act120']

    print(f"Processing ID: {participant_id}, OGTT Date: {ogtt_date.date()}")

    # Check for ambiguous dates
    if is_date_ambiguous(ogtt_date):
        alt_date = get_alternate_date(ogtt_date)
        if alt_date:
            ambiguous_dates_list.append((participant_id, ogtt_date.strftime('%m/%d/%Y'), [ogtt_date, alt_date]))
            print(f"Ambiguous date for ID {participant_id}: {ogtt_date.strftime('%m/%d/%Y')}")
            print(f"  Interpretation 1: {ogtt_date.strftime('%Y-%m-%d')}")
            print(f"  Interpretation 2: {alt_date.strftime('%Y-%m-%d')}")
            
            # Check CGM data for both interpretations
            cgm_data_available = []
            for date in [ogtt_date, alt_date]:
                cgm_for_date = cgm_data[
                    (cgm_data['Participant'] == participant_id) &
                    (cgm_data['Timestamp'].dt.date == date.date())
                ]
                cgm_count = len(cgm_for_date)
                print(f"  CGM readings for {date.strftime('%Y-%m-%d')}: {cgm_count}")
                if cgm_count > 0:
                    cgm_data_available.append(date)
            
            # Record if both dates have CGM data
            if len(cgm_data_available) == 2:
                ambiguous_dates_with_data[participant_id].append({
                    'original_date': ogtt_date.strftime('%m/%d/%Y'),
                    'interpretation1': ogtt_date.strftime('%Y-%m-%d'),
                    'interpretation2': alt_date.strftime('%Y-%m-%d'),
                    'cgm_count1': len(cgm_data[(cgm_data['Participant'] == participant_id) & (cgm_data['Timestamp'].dt.date == ogtt_date.date())]),
                    'cgm_count2': len(cgm_data[(cgm_data['Participant'] == participant_id) & (cgm_data['Timestamp'].dt.date == alt_date.date())])
                })

    # Simplify the ID
    simplified_id = simplify_id(participant_id)

    # Filter CGM data for this participant and time window
    participant_cgm = cgm_data[
        (cgm_data['Participant'] == participant_id) &
        (cgm_data['Timestamp'] >= start_time) &
        (cgm_data['Timestamp'] <= end_time)
    ]

    print(f"ID {participant_id}: {len(participant_cgm)} CGM readings")

    # Process each CGM reading
    for _, cgm_row in participant_cgm.iterrows():
        time_diff = (cgm_row['Timestamp'] - start_time).total_seconds() / 60  # Convert to minutes
        processed_data.append({
            'ID': simplified_id,
            'Condition': np.nan,
            'test': 'CGM',
            'metab': 'gluc',
            'VAL': cgm_row['Sensor Glucose (mmol/L)'],
            'time': round(time_diff)  # Rounding to nearest minute
        })

# Create a DataFrame from the processed data
output_df = pd.DataFrame(processed_data)

# Sort the DataFrame by ID and time
output_df = output_df.sort_values(['ID', 'time'])

output_df.to_csv('processed_cgm_data.csv', index=False)
print("Processed data saved to processed_cgm_data.csv")

# Report on ambiguous dates
print("\nAmbiguous dates with CGM data for both interpretations:")
for participant_id, dates in ambiguous_dates_with_data.items():
    print(f"Participant ID: {participant_id}")
    for date_info in dates:
        print(f"  Original date: {date_info['original_date']}")
        print(f"    Interpretation 1: {date_info['interpretation1']} (CGM readings: {date_info['cgm_count1']})")
        print(f"    Interpretation 2: {date_info['interpretation2']} (CGM readings: {date_info['cgm_count2']})")
    print()

# Save ambiguous dates
# ambiguous_dates_df = pd.DataFrame([
#     {
#         'Participant_ID': pid,
#         'Original_Date': date_info['original_date'],
#         'Interpretation1': date_info['interpretation1'],
#         'CGM_Count1': date_info['cgm_count1'],
#         'Interpretation2': date_info['interpretation2'],
#         'CGM_Count2': date_info['cgm_count2']
#     }
#     for pid, dates in ambiguous_dates_with_data.items()
#     for date_info in dates
# ])
# ambiguous_dates_df.to_csv('ambiguous_dates_with_data.csv', index=False)
# print("Ambiguous dates with CGM data saved to 'ambiguous_dates_with_data.csv'")

Processing ID: PS010001, OGTT Date: 2018-07-18
ID PS010001: 24 CGM readings
Processing ID: PS010002, OGTT Date: 2019-01-25
ID PS010002: 24 CGM readings
Processing ID: PS010003, OGTT Date: 2018-09-21
ID PS010003: 24 CGM readings
Processing ID: PS010004, OGTT Date: 2018-09-07
Ambiguous date for ID PS010004: 09/07/2018
  Interpretation 1: 2018-09-07
  Interpretation 2: 2018-07-09
  CGM readings for 2018-09-07: 288
  CGM readings for 2018-07-09: 0
ID PS010004: 24 CGM readings
Processing ID: PS010005, OGTT Date: 2018-08-31
ID PS010005: 24 CGM readings
Processing ID: PS010007, OGTT Date: 2018-10-25
ID PS010007: 0 CGM readings
Processing ID: PS010006, OGTT Date: 2018-10-19
ID PS010006: 24 CGM readings
Processing ID: PS010008, OGTT Date: 2018-10-18
ID PS010008: 25 CGM readings
Processing ID: PS010010, OGTT Date: 2018-11-08
Ambiguous date for ID PS010010: 11/08/2018
  Interpretation 1: 2018-11-08
  Interpretation 2: 2018-08-11
  CGM readings for 2018-11-08: 288
  CGM readings for 2018-08-11: 0


In [13]:
# DataFrame from the processed data
output_df = pd.DataFrame(processed_data)

In [14]:
# Sort DataFrame by ID and time
output_df = output_df.sort_values(['ID', 'time'])

In [15]:
# Save to CSV
output_df.to_csv('processed_cgm_data.csv', index=False)
print("Processed data saved to processed_cgm_data.csv")

Processed data saved to processed_cgm_data.csv


In [16]:
def extract_last_two(id_string):
    return re.search(r'\d+', id_string).group()[-2:].zfill(2)

In [17]:
ogtt_ids = set(extract_last_two(id) for id in time_ogtt['c11_rand_nr'])
processed_ids = set(id[2:] for id in output_df['ID'])  # Remove 'Ex' prefix
missing_ids = ogtt_ids - processed_ids

print("Sample OGTT IDs:", list(ogtt_ids)[:5])
print("Sample Processed IDs:", list(processed_ids)[:5])
print("Number of OGTT IDs:", len(ogtt_ids))
print("Number of Processed IDs:", len(processed_ids))
print("Number of Missing IDs:", len(missing_ids))

if missing_ids:
    print("\nMissing IDs (last two digits):")
    print(missing_ids)
    
    # To get full original IDs of missing entries
    missing_full_ids = [id for id in time_ogtt['c11_rand_nr'] if extract_last_two(id) in missing_ids]
    print("\nMissing full IDs:")
    print(missing_full_ids)

Sample OGTT IDs: ['29', '63', '10', '86', '14']
Sample Processed IDs: ['53', '19', '62', '71', '52']
Number of OGTT IDs: 88
Number of Processed IDs: 75
Number of Missing IDs: 13

Missing IDs (last two digits):
{'86', '87', '85', '60', '44', '15', '90', '88', '89', '63', '07', '35', '66'}

Missing full IDs:
['PS010007', 'PS010015', 'PS010035', 'PS010044', 'PS010060 ', 'PS010063 ', 'PS010066 ', 'ps010085', 'ps010086', 'ps010087', 'ps010088', 'ps010089', 'ps010090']


**Note:** I tried to implement a method of comparison between the c12_date and c13_date as the c13 date typically was 4-5 days later however the date formatting is inconsistent between the two for many entries. 