# Libraries

In [None]:
import json
import re

import numpy as np
import pandas as pd

from collections import Counter
from pprint import pprint

# Load Data

In [None]:
# Location of the pilot-study CSV export; the whole file is read into `df`.
directory_path = '/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/PilotStudyToEvaluate_DATA_LABELS_2024-08-12_0908.csv'
df = pd.read_csv(filepath_or_buffer=directory_path)

# Notebook preview of the first five rows.
df.head(5)

# Sorting Test, Visit, and Discharge

In [None]:
# Find the column index at which every section of the export begins.
#
# The export is one wide row per subject: a leading block of columns,
# followed by repeated "test" and "visit" blocks and a "discharge" block.
# Each block is recognised by the date column that opens it.
# `list_pointers` collects (column_index, section_label) tuples in column
# order; a section later spans columns[start:next_start].
columns = list(df.columns)

list_pointers = [(0, 'start')]

counter_test = 0
counter_visit = 0
counter_discharge = 0

for i, column in enumerate(columns):

    # Test blocks use two different spellings for their date column.
    if ('Date and time of test' in column) or ('Date/time of test' in column):
        list_pointers.append((i, f'test_{counter_test}'))
        counter_test += 1

    if 'Date of visit' in column:
        list_pointers.append((i, f'visit_{counter_visit}'))
        counter_visit += 1

    if 'Date of discharge' in column:
        list_pointers.append((i, 'discharge'))
        counter_discharge += 1

# Sections are sliced with an exclusive upper bound, so the closing sentinel
# has to sit one past the last column; using the last index would silently
# drop the final column from the last section.
list_pointers.append((len(columns), 'end'))
pprint(list_pointers)

In [None]:
# Print every section's label, its number of columns and the column names it
# covers. Consecutive pointers are paired up: each section runs from its own
# start index up to (but not including) the start index of the next one.
for (pointer_start, title), (pointer_end, _) in zip(list_pointers, list_pointers[1:]):
    column_subset = columns[pointer_start:pointer_end]

    print('')
    print(title, '-', len(column_subset))
    pprint(column_subset)

# Preprocessing Data

In [None]:
def remove_suffix_from_dict_keys(data):
    """Return a new dict whose keys have a trailing ``.<digits>`` removed.

    After the numeric suffix is removed, surrounding whitespace is stripped
    from the key. Values are passed through untouched. If several keys
    collapse to the same name, the value of the last one wins.

    NOTE(review): presumably this undoes the ``.1``, ``.2``, ... suffixes
    pandas appends to repeated CSV headers -- confirm against the export.
    """
    cleaned = {}
    for key, value in data.items():
        base_name = re.sub(r'\.\d+$', '', key)
        cleaned[base_name.strip()] = value
    return cleaned

# Build a nested lookup of every subject's data:
#
#   dict_subject[study_id] = {
#       'Visit':       {'visit_<n>': {column: value, ...}, ...},
#       'Test':        {'test_<n>':  {column: value, ...}, ...},
#       'Information': {<other section label>: {column: value, ...}, ...},
#   }
#
# Sections whose opening date field is empty are treated as "not recorded"
# and left out.
list_subject = df['Study ID'].tolist()
dict_subject = {}

# Date fields that mark a test/visit section as actually filled in.
required_date_keys = (
    'Date/time of test',
    'Date and time of test',
    'Date of visit',
)

for subject in list_subject:

    dict_subject[subject] = {
        'Visit': {},
        'Test': {},
        'Information': {},
    }

    df_subject = df[df['Study ID'] == subject].reset_index(drop=True)

    for (pointer_start, title), (pointer_end, _) in zip(list_pointers, list_pointers[1:]):
        column_subset = columns[pointer_start:pointer_end]

        # Round-trip through JSON so that missing cells come back as None;
        # only the first row matching the subject is used.
        json_load = df_subject[column_subset].to_json(orient='records')
        json_load = json.loads(json_load)[0]

        json_load = remove_suffix_from_dict_keys(json_load)

        # Skip a test/visit section that has a date field but no date in it.
        if any(key in json_load and json_load[key] is None for key in required_date_keys):
            continue

        if 'test' in title:
            dict_subject[subject]['Test'][title] = json_load
        elif 'visit' in title:
            dict_subject[subject]['Visit'][title] = json_load
        else:
            dict_subject[subject]['Information'][title] = json_load

# Get Subject Information

In [None]:
# One row per subject: the fields of the leading 'start' section merged with
# the fields of the 'discharge' section (discharge wins on a name clash).
df_summary = []
for subject in list_subject:
    information = dict_subject[subject]['Information']
    merged_row = {**information['start'], **information['discharge']}
    df_summary.append(merged_row)

df_summary = pd.DataFrame(df_summary)
df_summary.to_csv(
    f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_summary.csv',
    index=False,
)
df_summary.head(5)

# Get Visits

In [None]:
def _clean_value(value, replacements=()):
    """Return ``value`` as stripped text with each (old, new) pair applied in order.

    Missing values (None or NaN) are returned unchanged so that they are
    still recognised by the final ``fillna('')``. Checking only for None is
    not enough: pandas stores None as NaN in columns whose other values are
    all numeric, and ``str(NaN)`` would put the literal text 'nan' into the
    exported file.
    """
    if pd.isna(value):
        return value
    text = str(value)
    for old, new in replacements:
        text = text.replace(old, new)
    return text.strip()


# One row per recorded visit, prefixed with the subject and visit identifiers.
df_summary = []
for subject in list_subject:
    for key, value in dict_subject[subject]['Visit'].items():
        df_summary.append({
            'Subject ID': subject,
            'Visit ID': key.replace('visit_', ''),
            **value,
        })

df_summary = pd.DataFrame(df_summary)

# Preprocess numeric details: undo typing slips in hand-entered measurements.
# Weight: a stray 'kg' unit and doubled decimal points.
df_summary['Weight'] = df_summary['Weight'].apply(
    _clean_value, args=((('kg', ''), ('..', '.')),)
)

# SPO2: a stray percent sign.
df_summary['SPO2'] = df_summary['SPO2'].apply(_clean_value, args=((('%', ''),),))

leg_columns_all = [
    'Left foot length (cm)',
    'Left foot width (cm)',
    'Left foot circumference at center (cm)',
    'Left foot ankle circumference (cm)',
    'Right foot length (cm)',
    'Right foot width (cm)',
    'Right foot circumference at center (cm)',
    'Right foot ankle circumference (cm)',
]

# Foot measurements: doubled decimal points and '/' typed instead of '.'.
for col in leg_columns_all:
    df_summary[col] = df_summary[col].apply(
        _clean_value, args=((('..', '.'), ('/', '.')),)
    )

# Preprocess comments: lower-case the free-text answer, leaving gaps as gaps.
column = 'How is the subject feeling today'
df_summary[column] = df_summary[column].apply(
    lambda x: x if pd.isna(x) else str(x).lower()
)

# Save
df_summary = df_summary.fillna('')
df_summary.to_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv', index=False)
df_summary.head(5)

# Get Tests

In [None]:
# (field that identifies a test panel, label written to 'Test Type').
# Checked in this order; the first field present in a record decides the type.
test_type_markers = (
    ('Troponin', 'Heart'),
    ('Sodium', 'Kidney'),
    ('Hemoglobin', 'Blood'),
    ('NT-ProBNP', 'Clinical Trials'),
)

# One row per recorded test, prefixed with subject id, test id and test type.
df_summary = []
for subject in list_subject:
    for key, value in dict_subject[subject]['Test'].items():

        # Store every test date under the same column name.
        # NOTE: this renames the key inside dict_subject itself, because
        # `value` is the stored record, not a copy.
        if 'Date/time of test' in value:
            value['Date and time of test'] = value.pop('Date/time of test')

        # Unrecognised panels get an empty type.
        test_type = next(
            (label for marker, label in test_type_markers if marker in value),
            '',
        )

        df_summary.append({
            'Subject ID': subject,
            'Test ID': key.replace('test_', ''),
            'Test Type': test_type,
            **value,
        })

df_summary = pd.DataFrame(df_summary)

df_summary = df_summary.fillna('')
df_summary['Date and time of test'] = pd.to_datetime(df_summary['Date and time of test'], format="%d/%m/%Y %H:%M")

# Order chronologically within each subject. A single two-key sort is used
# instead of groupby(...).apply(sort_values): groupby.apply acting on the
# grouping column is deprecated in recent pandas, and once that column is
# excluded the following reset_index(drop=True) would lose 'Subject ID'.
df_summary = df_summary.sort_values(by=['Subject ID', 'Date and time of test']).reset_index(drop=True)

print('Distribution of tests')
pprint(Counter(df_summary['Test Type'].tolist()))

print(f'No. of tests: {len(df_summary)}')

df_summary.to_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_test.csv', index=False)
df_summary.head(5)