# Libraries

In [None]:
import os
import json
import re
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime


from tqdm import tqdm
from collections import Counter
from pprint import pprint

In [None]:
# Path to the exported label CSV for the pilot study.
directory_path = f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/PilotStudyToEvaluate_DATA_LABELS_2024-08-12_0908.csv'

# Read the export and collect every 'Study ID' value into `list_subject`.
df = pd.read_csv(directory_path)
list_subject = df['Study ID'].tolist()
# list_subject

# Get Subject Information

In [None]:
# Load the per-subject summary table and collect the subject identifiers.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_summary.csv')
list_subject = df_summary['Study ID'].tolist()

# 2x2 grid: three categorical bar charts plus one age histogram.
fig, axes = plt.subplots(2, 2, figsize=(8, 6))

# (source column, grid position, bar colour, title and x-label)
categorical_panels = [
    ('Sex', (0, 0), 'skyblue', 'Gender'),
    ('Ethnicity', (0, 1), 'lightgreen', 'Ethnicity'),
    ('Smoking History', (1, 0), 'salmon', 'Smoking History'),
]
for column_name, position, bar_color, panel_label in categorical_panels:
    ax = axes[position]
    df_summary[column_name].value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(panel_label)
    ax.set_xlabel(panel_label)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=0)

# Plot distribution of Age
df_summary['Age at Admission'].plot(kind='hist', bins=10, ax=axes[1, 1], color='#B8860B')
axes[1, 1].set_title('Age')
axes[1, 1].set_xlabel('Age')
axes[1, 1].set_ylabel('Count')

# Super title
plt.suptitle('Test Subject Distribution')
plt.tight_layout()

# Save BEFORE showing: once plt.show() has displayed (and released) the figure,
# plt.savefig() would write a fresh, empty figure instead of this one.
fig.savefig(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/Vitals/subject_distribution.png')
plt.show()


In [None]:
# Summary statistics of the subject table, rounded to two decimals.
df_summary.describe().round(2)

## Medical History

In [None]:
# Reload the subject summary so this cell works on its own.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_summary.csv')
list_subject = df_summary['Study ID'].tolist()

# Choices of the "relevant medical history" checkbox question, grouped by system.
list_diseases = [
    'None',
    # Cardiovascular
    'Hypertension',
    'Ischemic Heart Disease',
    'Atrial fibrillation',
    'Valvular Heart disease',
    'Chronic Venous Insufficiency',
    'Prior Episode of Heart Failure',
    # Endocrine
    'Diabetes Mellitus',
    # Renal
    'Chronic Kidney Disease',
    # Metabolic
    'Dyslipidemia',
    # Medication use
    'Diuretic Use Medication',
]

# One row per choice: how many subjects have it 'Checked' vs 'Unchecked'.
tally_rows = []
for diseases in list_diseases:
    answers = Counter(
        df_summary[
            f'Does the patient have any relevant medical history? (check all that apply) (choice={diseases})'
        ].tolist()
    )
    tally_rows.append([diseases, answers['Checked'], answers['Unchecked']])

df_diseases = pd.DataFrame(tally_rows, columns=['medical history', 'yes', 'no'])
df_diseases

In [None]:
# Subjects whose "None" medical-history choice is checked, restricted to the
# columns whose name contains 'Does' (the medical-history checkbox columns).
df_summary[df_summary['Does the patient have any relevant medical history? (check all that apply) (choice=None)'] == 'Checked'].filter(like='Does')

# Get Visits

## Clean "How is the subject feeling today"

In [None]:
# Load the per-visit table (note: rebinds `df_summary`) and parse the
# day/month/year visit dates into datetimes.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')
df_summary['Date of visit'] = pd.to_datetime(df_summary['Date of visit'], format='%d/%m/%Y')

# Column preprocessing: normalise the free text, then map it to a feeling score
def preprocessing_str(text):
    """Normalise a free-text cell.

    The value is converted to ``str`` and stripped, every hyphen is padded
    with spaces (``'a-b'`` -> ``'a - b'``), and runs of spaces are collapsed
    to a single space. Case is left untouched.
    """
    padded = str(text).strip().replace('-', ' - ')
    return re.sub(' +', ' ', padded)

def preprocess_subject_feelings(text):
    """Map a free-text "how is the subject feeling" answer to a score.

    Scores: 0 unknown, 1 very bad, 2 bad, 3 neutral, 4 good, 5 very good.
    Matching is by case-insensitive substring, evaluated in a fixed priority
    order; the first group with a matching keyword wins.

    Args:
        text: The answer text (any value; it is converted with ``str``).

    Returns:
        The integer score of the first matching group, or ``text`` unchanged
        when no keyword matches.
    """
    # Priority order matters, do not rearrange: negative phrases are tested
    # first so 'unwell' / 'not so good' are not captured by 'well' / 'good',
    # and 'very well' is tested before the plain 'well' in the good group.
    priority_keywords = (
        (2, ('headache', 'unwell', 'worse', 'weak', 'not so good', 'nausea')),  # bad
        (1, ('passed away', 'deteriorated', 'icu')),                            # very bad
        (5, ('very well', 'much better')),                                      # very good
        (3, ('ok', 'asymptomatic')),                                            # neutral
        (0, ('nan', 'n/a', 'not done', 'keen to go home')),                     # unknown
        (4, ('well', 'better', 'good')),                                        # good
    )

    # Keywords are lowercase, so compare against a lowercased copy; otherwise
    # answers such as 'Very well' or 'OK' would never match.
    lowered = str(text).lower()

    for score, keywords in priority_keywords:
        if any(keyword in lowered for keyword in keywords):
            return score

    return text

# Normalise the free-text answers in place, then score each one.
column = 'How is the subject feeling today'
df_summary[column] = df_summary[column].apply(preprocessing_str)

# Missing answers arrive as the string 'nan'; relabel them 'n/a' before scoring.
list_feelings = [
    preprocess_subject_feelings('n/a' if str(answer) == 'nan' else answer)
    for answer in df_summary[column].tolist()
]
df_summary['feelings_score'] = list_feelings
Counter(list_feelings)

In [None]:
# Visits whose feeling score is 1.
df_summary[df_summary['feelings_score']==1]

In [None]:
# Getting examples
dictionary_test = {}

for i, row in df_summary.iterrows():
    text = row['How is the subject feeling today']
    value = row['feelings_score']
    if value not in dictionary_test.keys():
        dictionary_test[value] = []
    dictionary_test[value].append(text)

for i in range(6):
    print(i, '-', len(dictionary_test[i]))
    dictionary_test[i] = set(dictionary_test[i])
    
columns = ['feeling_score', 'examples']
df_test = pd.DataFrame(dictionary_test.items(), columns=columns)
df_test

## Clinical team assessment of pedal edema

In [None]:
# Reload the per-visit table so this section starts from the raw text, and
# parse the day/month/year visit dates into datetimes.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')
df_summary['Date of visit'] = pd.to_datetime(df_summary['Date of visit'], format='%d/%m/%Y')

# Column preprocessing: normalise the free text, then map it to edema category labels
def preprocessing_str(text):
    """Normalise a free-text edema assessment.

    The value is converted to ``str`` and stripped, hyphens are padded with
    spaces, the text is lowercased, the spelling 'oedema' is standardised to
    'edema', and runs of spaces are collapsed to one.
    """
    cleaned = (
        str(text)
        .strip()
        .replace('-', ' - ')
        .lower()
        .replace('oedema', 'edema')  # Standardizing
    )
    return re.sub(' +', ' ', cleaned)

def preprocess_subject_assessment_improve(text):
    """Return 'Yes' when the assessment mentions improvement, else 'No'.

    The check is a case-insensitive search for the stem 'improv', so any text
    containing it (including negated phrasing) yields 'Yes'.
    """
    return 'Yes' if 'improv' in text.lower() else 'No'

def preprocess_subject_assessment_pitting(text):
    """Return 'Yes' when the assessment reports pitting, else 'No'.

    Explicit negations ('no pitting', 'nil pitting') are checked first and
    give 'No'; otherwise any mention of 'pitting' gives 'Yes'. Matching is
    case-insensitive.
    """
    lowered = text.lower()

    negated_phrases = ('no pitting', 'nil pitting')
    if any(phrase in lowered for phrase in negated_phrases):
        return 'No'

    return 'Yes' if 'pitting' in lowered else 'No'

def preprocess_subject_assessment(text):
    """Assign an edema assessment text to a single category label.

    Rules are tried in order and the first one with a keyword contained in
    the lowercased text wins: explicit "no edema" phrasing, then location
    (knee, thigh, shin, ankle, feet), then severity (mild, trace), then any
    remaining mention of edema. Everything else is labelled 'others'.
    """
    # Sorted based on priority checks, do not rearrange
    ordered_rules = (
        ('no edema', (
            'no edema', 'no pedal edema',
            'nil edema', 'nil pedal edema',
            'nil pitting edema', '0 pedal edema', 'no pitting edema',
            'no obvious edema',
        )),
        ('knee', ('knee',)),
        ('thigh', ('thigh',)),
        ('shin', ('mid shin', 'shin')),
        ('ankle', ('ankle',)),
        ('feet', ('feet',)),
        ('mild', ('mild',)),
        ('trace', ('minimal', 'trace')),
        ('edema', ('edema', 'supple')),
        ('others', ('n/a subject passed away', 'nan', 'Not done')),
    )

    lowered = text.lower()
    for label, keywords in ordered_rules:
        for keyword in keywords:
            # Keywords are compared case-insensitively as well.
            if keyword.lower() in lowered:
                return label

    return 'others'

# Normalise the assessment text in place, then derive three labels per visit.
column = "Clinical team assessment of pedal edema"
df_summary[column] = df_summary[column].apply(preprocessing_str)

list_assessment = df_summary[column].tolist()
list_assessment_preprocessed = list(map(preprocess_subject_assessment, list_assessment))
list_assessment_improved = list(map(preprocess_subject_assessment_improve, list_assessment))
list_assessment_pitting = list(map(preprocess_subject_assessment_pitting, list_assessment))

df_summary['clinical_assessment'] = list_assessment_preprocessed
df_summary['clinical_assessment_improve'] = list_assessment_improved
df_summary['clinical_assessment_pitting'] = list_assessment_pitting

# Frequency of each label.
print('\nGeneral Assessments')
pprint(Counter(list_assessment_preprocessed))

print('\nImprovements?')
print(Counter(list_assessment_improved))

print('\nPitting?')
print(Counter(list_assessment_pitting))

In [None]:
# Group the normalised assessment texts by the category they were assigned.
# (Swap 'clinical_assessment' for 'clinical_assessment_improve' or
# 'clinical_assessment_pitting' to inspect the other two labels.)
dictionary_test = {}

for text, value in zip(df_summary["Clinical team assessment of pedal edema"], df_summary['clinical_assessment']):
    dictionary_test.setdefault(value, []).append(text)

# Print each category's size and its distinct example texts.
for key in list(dictionary_test):
    examples = dictionary_test[key]
    print()
    print(key, '-', len(examples))
    dictionary_test[key] = set(examples)
    pprint(dictionary_test[key])

columns = ['clinical_assessment', 'examples']
df_test = pd.DataFrame(dictionary_test.items(), columns=columns)
df_test

## Others

In [None]:
# Answer frequencies for the remaining yes/no questions; each column name is
# also printed as the section heading, with a blank line between sections.
for position, question in enumerate((
    "Was the patient's vitals taken?",
    "Was Subject's foot measurements taken?",
    "If no, why",
)):
    if position:
        print('')
    print(question)
    pprint(Counter(df_summary[question].tolist()))

# Plot Combined

In [None]:
# Matplotlib colour code for each vital-sign series.
color_dict = dict(Weight='k', HR='r', SPO2='g')

list(color_dict.values())

In [None]:
# Load data
# Reload the per-visit table and parse the day/month/year visit dates.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')
df_summary['Date of visit'] = pd.to_datetime(df_summary['Date of visit'], format='%d/%m/%Y')



def _describe_edema_change(percentage_change):
    """Format a signed percentage as 'Improved' (<0), 'Worsen' (>0) or 'Unchanged' (0)."""
    if percentage_change < 0:
        return f'Improved: {percentage_change}%'
    if percentage_change > 0:
        return f'Worsen: {percentage_change}%'
    return f'Unchanged: {percentage_change}%'


def plot_all_measurements(df, subject):
    """Plot one subject's vitals and foot circumferences and save the figure.

    Only visits flagged "Was Subject's foot measurements taken?" == 'Yes' are
    used, with duplicate visit dates dropped (first kept). Three panels are
    drawn against the day number since the first such visit: Weight/HR/SPO2,
    left foot, right foot. For each foot, the mean of the foot-centre and
    ankle circumference at the first and last visit gives a percentage change.

    Args:
        df: Visit table with 'Subject ID', a datetime 'Date of visit', the
            vitals columns and the four circumference columns used below.
        subject: Value of 'Subject ID' to plot.

    Returns:
        The mean of the left and right percentage changes, rounded to one
        decimal (negative means the circumference shrank).

    Notes:
        The figure is written to a fixed path under SHAPE_Details/Vitals and is
        not closed here. A subject with no measured visit is not handled: the
        min/max and first/last lookups assume at least one row.
    """
    time_column = 'Date of visit'
    subject_column = 'Subject ID'

    list_columns = ['Left', 'Right']

    # One row of three panels: vitals, left foot, right foot.
    fig, axes = plt.subplots(
        1,
        3,
        figsize=(8, 3),
        sharex=True
        )

    # Keep only this subject's visits where foot measurements were taken.
    subject_data = df.loc[df[subject_column] == subject].copy()
    subject_data = subject_data[subject_data["Was Subject's foot measurements taken?"] == 'Yes'].reset_index(drop=True)

    # In case there are duplicate dates
    subject_data = subject_data.drop_duplicates(subset=[time_column], keep='first', inplace=False)

    # Create a new column for day number starting from day 0
    subject_data.loc[:, 'Day'] = (subject_data[time_column] - subject_data[time_column].min()).dt.days

    ########################################################################
    # Plot health signs
    ########################################################################
    label_health = ['Weight', 'HR', 'SPO2']
    label_health_edit = ['Weight (kg)', 'HR', 'SPO2']

    color_dict = {
        'Weight': 'k',
        'HR': 'r',
        'SPO2': 'g',
        }

    # Coerce to numeric BEFORE drawing the lines, so a column holding text
    # entries cannot break or silently drop out of the line plot.
    for column in label_health:
        subject_data[column] = pd.to_numeric(subject_data[column], errors='coerce')

    subject_data.plot(x='Day', y=label_health, ax=axes[0],
                      label=label_health_edit,
                      color=list(color_dict.values()),
                      legend=False)

    # Overlay markers and collect all values for a shared y-range.
    list_values_lim = []
    for column in label_health:
        axes[0].scatter(subject_data['Day'], subject_data[column], color=color_dict[column])
        list_values_lim = list_values_lim + subject_data[column].tolist()

    lim_max = np.nanmax(list_values_lim)
    lim_min = np.nanmin(list_values_lim)

    axes[0].set_title('Health Metrics\n(Weight, HR, SPO2)')
    axes[0].set_xlabel('Day')
    axes[0].set_ylabel('Values')
    axes[0].set_ylim([lim_min*0.95, lim_max*1.05])

    ########################################################################
    # Plot legs
    ########################################################################
    leg_columns_all = [
        'Left foot circumference at center (cm)',
        'Left foot ankle circumference (cm)',
        'Right foot circumference at center (cm)',
        'Right foot ankle circumference (cm)'
        ]

    # Both foot panels share one y-range, computed before missing values are filled.
    list_values_lim = []
    for col in leg_columns_all:
        subject_data[col] = subject_data[col].apply(float)
        list_values_lim = list_values_lim + subject_data[col].tolist()

    # NOTE(review): missing values become 0 here, which pulls the plotted
    # lines, the per-foot average and the percentage change towards 0 --
    # confirm this is intended.
    subject_data.fillna(0, inplace=True)

    lim_max = np.nanmax(list_values_lim)
    lim_min = np.nanmin(list_values_lim)

    total_edema_change = 0
    for i, col in enumerate(list_columns, start=1):  # axes[1] = Left, axes[2] = Right

        leg_columns = [
            f'{col} foot circumference at center (cm)',
            f'{col} foot ankle circumference (cm)',
            ]

        circumference_average = subject_data[leg_columns].mean(axis=1)
        subject_data[f'{col} circumference average'] = circumference_average
        leg_columns.append(f'{col} circumference average')

        # Percentage change of the average circumference, first vs last visit.
        list_average = circumference_average.to_list()
        c_avg_first = list_average[0]
        c_avg_last = list_average[-1]
        try:
            percentage_edema_change = round((c_avg_last - c_avg_first) / c_avg_first * 100, 1)
        except ZeroDivisionError:
            # A zero baseline (e.g. from the fill above) counts as no change.
            percentage_edema_change = 0

        total_edema_change = total_edema_change + percentage_edema_change
        target = _describe_edema_change(percentage_edema_change)

        label_legs = ['Foot Ø', 'Ankle Ø', 'Average Ø']
        subject_data.plot(x='Day', y=leg_columns, ax=axes[i],
                          label=label_legs,
                          legend=False)

        for column in leg_columns:
            subject_data[column] = pd.to_numeric(subject_data[column], errors='coerce')
            axes[i].scatter(subject_data['Day'], subject_data[column])

        axes[i].set_title(f'{col} Feet\nEdema {target}')
        axes[i].set_xlabel('Day')
        axes[i].set_ylabel('Ø (cm)')
        axes[i].set_ylim([lim_min*0.95, lim_max*1.05])

        # With a single visit, centre the lone point; otherwise pad the range.
        if len(subject_data) == 1:
            axes[i].set_xlim([-1, 1])
            axes[i].set_xticks([0])
            axes[i].xaxis.set_major_locator(plt.MaxNLocator(integer=True))
        else:
            axes[i].set_xlim([-0.5, max(subject_data['Day'].tolist())+0.5])
            axes[i].xaxis.set_major_locator(plt.MaxNLocator(integer=True))

    # One figure-level legend: vitals entries plus one set of foot entries
    # (taken from the right-foot panel, which uses the same labels as the left).
    handles_0_plot, labels_0_plot = axes[0].get_legend_handles_labels()
    handles_2_plot, labels_2_plot = axes[2].get_legend_handles_labels()

    handles_plot = handles_0_plot + handles_2_plot
    labels_plot = labels_0_plot + labels_2_plot

    fig.legend(
        handles_plot,
        labels_plot,
        loc='center left',
        bbox_to_anchor=(1, 0.5),
        ncol=1,
        )

    list_date = subject_data[time_column].tolist()
    list_date = ', '.join([str(x.date()) for x in list_date])

    # Overall change = mean of the left and right changes.
    total_edema_change = round(total_edema_change/2, 1)
    target = _describe_edema_change(total_edema_change)

    plt.suptitle(f'Subject: {subject}\nVisitation Date: {list_date}\nOverall Edema {target}')
    plt.tight_layout()
    plt.savefig(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/Vitals/subject_{subject}_details.png', bbox_inches='tight')
    # NOTE(review): the figure is left open (no plt.close()), so one figure
    # per subject accumulates -- confirm this is wanted for inline display.

    return total_edema_change



# Plot every subject and collect the overall edema change returned for each,
# in the same order as `list_subject`.
list_total_edema_change = []
for subject in tqdm(list_subject):
    # print('Subject:', subject)
    total_edema_change = plot_all_measurements(df_summary, subject)
    list_total_edema_change.append(total_edema_change)



In [None]:
# Display the per-subject overall edema change values.
list_total_edema_change

In [None]:
# Pair each subject with its overall edema change and export the table.
df_edema_status = pd.DataFrame({
    'Subject': list_subject,
    'Edema Change': list_total_edema_change,
})
df_edema_status.to_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/subject_edema_status.csv', index=False)

In [None]:
# Display the subject identifiers.
list_subject

In [None]:
# Red = worsened (positive change), green = improved (negative), black = unchanged.
colors = ['red' if val > 0 else 'green' if val < 0 else 'black' for val in list_total_edema_change]

# Create bar chart. Keep a handle on THIS figure so the save below cannot pick
# up a stale `fig` left behind by an earlier cell.
fig = plt.figure(figsize=(20, 4))
bars = plt.bar(list_subject, list_total_edema_change, color=colors)

# Zero-height bars are invisible; give them a black outline so they show.
for bar, val in zip(bars, list_total_edema_change):
    if val == 0:
        bar.set_edgecolor('black')
        bar.set_linewidth(1.5)

plt.xlabel('Subject')
plt.ylabel('% Improvement / Worsen')
plt.title('Total Edema change in both legs\nImprovement (-) / Worsen (+) % by Subject')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.grid(axis='x', linestyle='--', alpha=0.7) # Add vertical grid lines

plt.xlim([0, 71])
plt.xticks(np.arange(1, 71, step=1))

# Add legend using proxy lines, one per colour.
legend_labels = ['Worsen', 'Improvement', 'Unchanged']
legend_colors = ['red', 'green', 'black']
patches = [plt.Line2D([0], [0], color=color, lw=4) for color in legend_colors]
plt.legend(patches, legend_labels, loc='upper right')

save_path = f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/subject_edema_improvement.png'
fig.savefig(save_path, dpi=300)  # Save at high resolution

plt.show()


In [None]:
# Number of subjects whose overall change is positive, zero, or negative.
count_positive = sum(1 for x in list_total_edema_change if x > 0)
count_zero = sum(1 for x in list_total_edema_change if x == 0)
count_negative = sum(1 for x in list_total_edema_change if x < 0)

print('+ :', count_positive)
print('0 :', count_zero)
print('- :', count_negative)

In [None]:
# Median overall change within each sign group.
positive_changes = [x for x in list_total_edema_change if x > 0]
zero_changes = [x for x in list_total_edema_change if x == 0]
negative_changes = [x for x in list_total_edema_change if x < 0]

print('+ :', np.median(positive_changes))
print('0 :', np.median(zero_changes))
print('- :', np.median(negative_changes))

## Plot Weight, HR, SPO2

In [None]:
# df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')
# df_summary['Date of visit'] = pd.to_datetime(df_summary['Date of visit'], format='%d/%m/%Y')


# # Assuming df is the dataframe with the data
# # Group by 'Subject' column and plot each column as a time series

# def plot_time_series(df, subject):
    
#     time_column = 'Date of visit'

#     columns = ['Weight', 'HR', 'SPO2']
    
#     subject_data = df[df_summary['Subject ID'].isin([subject])].copy()
    
#     # Create a new column for day number starting from day 0
#     subject_data.loc[:, 'Day'] = (subject_data[time_column] - subject_data[time_column].min()).dt.days

#     # print(subject_data[columns].values)

#     # Create subplots for each subject
#     fig, axes = plt.subplots(
#         1, 
#         len(columns),
#         figsize=(8, 3), 
#         sharex=True
#         )

#     for i, column in enumerate(columns): # df.columns
#         subject_data_column = subject_data[['Day', column]]
#         subject_data_column = subject_data_column.dropna(subset=[column])
        
#         clean_list = [x for x in subject_data_column[column].tolist() if x is not None and not math.isnan(x)]
        
#         subject_data_column.plot(x='Day', y=column, ax=axes[i], legend=False)
#         axes[i].scatter(subject_data_column['Day'], subject_data_column[column])
#         axes[i].set_title(column)
#         axes[i].set_xlabel('Day')
#         # axes[i].set_ylabel('Values')
        
#         axes[i].set_ylim([min(clean_list)*0.9, max(clean_list)*1.1])
#         axes[i].set_xlim([-0.5, max(subject_data_column['Day'].tolist())+0.5])
#         axes[i].xaxis.set_major_locator(plt.MaxNLocator(integer=True))

#     plt.suptitle(f'Subject: {subject}')
#     plt.tight_layout()
#     plt.savefig(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/Health_Statistics/subject_{subject}_health.png')
#     # plt.show()
#     plt.close()
    
#     return None


# for subject in tqdm(list_subject):
#     # print('Subject:', subject)
#     plot_time_series(df_summary, subject)


## Plot Feet

In [None]:
# df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')
# df_summary['Date of visit'] = pd.to_datetime(df_summary['Date of visit'], format='%d/%m/%Y')


# # Function to plot time series for left and right foot measurements
# def plot_foot_measurements(df, subject):
    
#     time_column = 'Date of visit'
#     subject_column = 'Subject ID'

#     list_columns = ['Left', 'Right']
    
#     # Create subplots for each subject
#     fig, axes = plt.subplots(
#         1, 
#         2,
#         figsize=(8, 3), 
#         sharex=True
#         )

#     subject_data = df.loc[df[subject_column] == subject].copy()
    
#     subject_data = subject_data[subject_data["Was Subject's foot measurements taken?"] == 'Yes'].reset_index(drop=True)
#     foot_columns = subject_data.filter(like='foot')

#     # print('Subject', subject)
#     # print(foot_columns.values)
    
    
#     # Create a new column for day number starting from day 0
#     subject_data.loc[:, 'Day'] = (subject_data[time_column] - subject_data[time_column].min()).dt.days

#     # Find % change in leg measurements
#     leg_columns_all = [
#         # 'Left foot length (cm)', 
#         # 'Left foot width (cm)', 
#         'Left foot circumference at center (cm)', 
#         'Left foot ankle circumference (cm)',
#         # 'Right foot length (cm)',
#         # 'Right foot width (cm)', 
#         'Right foot circumference at center (cm)',
#         'Right foot ankle circumference (cm)'
#         ]

#     for col in leg_columns_all:
#         subject_data[col] = subject_data[col].apply(lambda x: float(x))
#         # subject_data[col] = subject_data[col].pct_change(fill_method='bfill')*100
#     subject_data.fillna(0, inplace=True)

#     for i, col in enumerate(list_columns): # df.columns
        
#         leg_columns = [
#             # f'{col} foot length (cm)', 
#             # f'{col} foot width (cm)', 
#             f'{col} foot circumference at center (cm)', 
#             f'{col} foot ankle circumference (cm)',
#             ]

#         average_change = subject_data[leg_columns].mean(axis=1)
#         subject_data['{col} foot average change'] = average_change
#         leg_columns.append('{col} foot average change')

#         # label_legs = ['Length', 'Width', 'Foot Ø', 'Ankle Ø', 'Average']
#         label_legs = ['Foot Ø', 'Ankle Ø', 'Average']
#         subject_data.plot(x='Day', y=leg_columns, ax=axes[i], label=label_legs, legend=False)
        
#         for _, column in enumerate(leg_columns): 
#             subject_data[column] = pd.to_numeric(subject_data[column], errors='coerce')
#             axes[i].scatter(subject_data['Day'], subject_data[column])
            
#         # axes[i].axhline(y=0, color='grey', linestyle='--')    
#         axes[i].set_title(f'{col} Feet')
#         axes[i].set_xlabel('Day')
#         axes[i].set_ylabel('Circumference (cm)')
        
        
#         # axes[i].set_ylim([-25, 25])
#         # axes[i].set_ylim([-15, 15])
        
#         if len(subject_data) == 1:
#             axes[i].set_xlim([-1, 1])
#             axes[i].set_xticks([0])
#             axes[i].xaxis.set_major_locator(plt.MaxNLocator(integer=True))
#         else:
#             axes[i].set_xlim([-0.5, max(subject_data['Day'].tolist())+0.5])
#             axes[i].xaxis.set_major_locator(plt.MaxNLocator(integer=True))

            
#         # Add a single legend only to the right plot
#         handles_right_plot, labels_right_plot = axes[i].get_legend_handles_labels()
#         fig.legend(
#             handles_right_plot, 
#             labels_right_plot, 
            
#             # loc='upper center', 
#             # bbox_to_anchor=(0.5,1.025), 
#             # ncol=len(label_legs)
            
#             loc='center left', 
#             bbox_to_anchor=(1, 0.5),
#             ncol=1,
            
#             )
        
#     list_date = subject_data[time_column].tolist()
#     list_date = ', '.join([str(x.date()) for x in list_date])
    
#     plt.suptitle(f'Subject: {subject}\n{list_date}')
#     plt.tight_layout()
#     plt.savefig(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/Feet/subject_{subject}_foot.png', bbox_inches='tight')
#     plt.show()
#     plt.close()



# for subject in tqdm(list_subject):
#     # print('Subject:', subject)
#     plot_foot_measurements(df_summary, subject)


# Group dates

In [None]:
# Reload the per-visit table. Dates are deliberately left as 'dd/mm/YYYY'
# strings here; convert_dates_to_days() parses them with strptime later.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_visit.csv')

# One row per subject, with every other column collapsed into a list of that
# subject's visit values.
df_summary_group = (
    df_summary
    .groupby('Subject ID')
    .agg(lambda values: list(values))
    .reset_index()
)

# Number of recorded visits per subject.
df_summary_group['count'] = df_summary_group['Date of visit'].apply(len)
df_summary_group.to_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/data_summary_grouped.csv', index=False)
df_summary_group['count'].describe().round(0)

In [None]:
# Per-subject list of the foot-measurement flags, one entry per visit.
df_summary_group["Was Subject's foot measurements taken?"]

In [None]:
# Columns available in the grouped table.
df_summary_group.columns

In [None]:

def convert_dates_to_days(dates):
    """Convert 'dd/mm/YYYY' date strings to day offsets from the earliest date.

    Args:
        dates: Iterable of date strings in ``%d/%m/%Y`` format.

    Returns:
        A list of integers, in input order, giving each date's distance in
        days from the smallest date. Empty input gives an empty list.

    Raises:
        ValueError: If a string does not match ``%d/%m/%Y``.
    """
    # Convert string dates to datetime objects
    date_objects = [datetime.strptime(date, '%d/%m/%Y') for date in dates]

    # Nothing to measure against; min() would raise on an empty sequence.
    if not date_objects:
        return []

    # Find the smallest date
    min_date = min(date_objects)

    # Calculate the number of days from the smallest date
    return [(date - min_date).days for date in date_objects]



# Columns shown at the end of this cell.
columns = ['Subject ID',
           "Was Subject's foot measurements taken?",
           'Date of visit',
           'Date of visit relative',
           ]

# Number of visits per subject with foot measurements taken.
df_summary_group['measurement_counts'] = df_summary_group["Was Subject's foot measurements taken?"].apply(lambda x: x.count('Yes'))

# Day offsets are computed over ALL visits first, so they stay relative to the
# subject's first visit even after the lists are cropped below.
df_summary_group['Date of visit relative'] = df_summary_group['Date of visit'].apply(convert_dates_to_days)

# Crop every list-valued column to the visits where foot measurements were taken.
for i, row in df_summary_group.iterrows():

    list_measurement = row["Was Subject's foot measurements taken?"]
    list_index_true = [j for j, flag in enumerate(list_measurement) if flag == 'Yes']

    for col in df_summary_group.columns:
        if isinstance(row[col], list):
            # .at writes straight into the frame; chained `df[col][i] = ...`
            # may write to a temporary copy and is ignored under copy-on-write.
            df_summary_group.at[i, col] = [row[col][k] for k in list_index_true]
            

# for i in range(len(df_summary_group)):
#     list_date = df_summary_group['Date of visit'][i]
#     list_measurement = df_summary_group["Was Subject's foot measurements taken?"][i]
#     list_date_relative = df_summary_group['Date of visit relative'][i]
    
#     # list_date_relative = convert_dates_to_days(list_date)
    
#     # print('')
#     # print('Subject', i+1)
#     # print(list_date)
#     # print(list_date_relative)
    
#     list_measurement_visit_date = []
#     list_measurement_visit_date_relative = []
#     for j in range(len(list_date)):
#         if list_measurement[j] == 'Yes':
#             list_measurement_visit_date.append(list_date[j])
#             list_measurement_visit_date_relative.append(list_date_relative[j])
#     df_summary_group['measurement_date'][i] = list_measurement_visit_date
#     df_summary_group['measurement_date_relative'][i] = list_measurement_visit_date_relative
            
# Display the subject, measurement flags and visit dates after cropping.
df_summary_group[columns]

# Get videos for subjects

In [None]:
def get_folders_in_directory(directory):
    """Return the names of the sub-directories directly inside ``directory``.

    Names (not full paths) are returned in the order given by ``os.listdir``;
    regular files are excluded.
    """
    return [
        name
        for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    ]

# Specify the directory path
# Root directory that holds one folder per subject.
directory_path = f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE'

# Sub-folders whose name contains 'Subject', in sorted (lexicographic) order.
folders = sorted(
    name for name in get_folders_in_directory(directory_path) if 'Subject' in name
)


def list_directory_contents(directory):
    """Return the names of all entries (files and folders) in ``directory``."""
    return os.listdir(directory)


# Map each subject number (the integer before the first '.' in the folder
# name) to the sorted list of entries found in that subject's folder.
dictionary_subject_video = {}

for folder in folders:
    folder_path = os.path.join(directory_path, folder)

    # Skip ':Zone.Identifier' side files and any name containing '('.
    contents = [x for x in list_directory_contents(folder_path) if ':Zone.Identifier' not in x]

    subject = int(folder.split('.')[0])
    dictionary_subject_video[subject] = sorted(content for content in contents if '(' not in content)

columns = ['Subject ID', 'Videos']
df_subject_video = pd.DataFrame(dictionary_subject_video.items(), columns=columns)
df_subject_video = df_subject_video.sort_values('Subject ID')
df_subject_video['video_counts'] = df_subject_video['Videos'].apply(len)

# Derive, per subject, the day tokens encoded in the video file names.
# Results are collected in row order and assigned as whole columns, which
# avoids chained `df[col][i] = ...` writes (these can land on a temporary copy).
list_video_date = []
list_video_date_LR = []
for subject_id, list_videos in zip(df_subject_video['Subject ID'], df_subject_video['Videos']):
    # Print the real subject ID; a running counter does not match the ID once
    # the rows have been sorted.
    print('Subject', subject_id)

    # Upper-case the d / l / r markers and drop the extension.
    list_date = [
        x.replace('d', 'D').replace('l', 'L').replace('r', 'R').replace('.mp4', '')
        for x in list_videos
    ]
    # Keep the part after the first 'D', e.g. '<day>L' or '<day>R'.
    list_date = [x for x in list_date if 'D' in x]
    list_date = [re.split(r'[Dd]', x)[1] for x in list_date]
    list_video_date_LR.append(list_date)

    # Strip the side marker and keep the distinct day numbers, sorted.
    list_video_date.append(sorted({int(x.split('R')[0].split('L')[0]) for x in list_date}))

df_subject_video['video_date'] = list_video_date
df_subject_video['video_date_LR'] = list_video_date_LR

df_subject_video

# Join Video count and visit measurement record count

In [None]:
# Keep only subjects that have both visit records and a video folder.
df_join = pd.merge(df_summary_group, df_subject_video, on='Subject ID', how='inner')

# Compare, per subject, the relative visit days against the days found in the
# video file names. Results are collected and assigned as one column instead
# of chained `df_join['match'][i] = ...` writes.
list_match = []
for _, row in df_join.iterrows():

    count_measurement = row['measurement_counts']
    count_video = row['video_counts']

    print('')
    print('#'*60)
    # Double-quoted f-string: reusing the outer quote character inside the
    # braces is a syntax error before Python 3.12.
    print(f"Subject {row['Subject ID']} - {count_measurement} - {count_video}")

    print('Date (all)         :', row['Date of visit'])
    print('Date relative (all):', row['Date of visit relative'])
    print('')

    print('Date   :', row['Date of visit relative'])
    print('Videos :', row['Videos'])

    print('Videos :', row['video_date'])
    print('Videos :', row['video_date_LR'])

    print('')
    is_match = row['Date of visit relative'] == row['video_date']
    if not is_match:
        print('NOT MATCH')
    list_match.append('Yes' if is_match else 'No')

df_join['match'] = list_match

In [None]:
# Columns available after the join.
df_join.columns

In [None]:
# Columns of interest in the joined table: visit answers, vitals, foot
# measurements, then the video-derived fields.
columns = [
    'Subject ID',
    'Date of visit',
    'How is the subject feeling today',
    'Is the subject feeling breathless?',
    'Did the Subject sleep well?',
    'Was the height taken',
    'Height',
    'Was weight taken?',
    'Weight',
    'BMI',
    "Was the patient's vitals taken?",
    'Blood Pressure',
    'HR',
    'SPO2',
    'Clinical team assessment of pedal edema',
    'Left foot length (cm)',
    'Left foot width (cm)',
    'Left foot circumference at center (cm)',
    'Left foot ankle circumference (cm)',
    'Right foot length (cm)',
    'Right foot width (cm)',
    'Right foot circumference at center (cm)',
    'Right foot ankle circumference (cm)',
    'Date of visit relative',
    'Videos',
    'video_counts',
    'video_date',
    'video_date_LR',
    'match',
]
# The selection result is not stored; the full table is what gets exported.
df_join[columns]

df_join.to_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE_Details/subject_summary_video_join.csv', index=False)

In [None]:
# How many subjects have matching / non-matching visit and video days.
Counter(df_join['match'].tolist())

# Get Tests

In [None]:
# Load the per-subject test records and print their summary statistics.
df_summary = pd.read_csv(f'/home/weiyanpeh/Git/SFM_Related/CADENCE/SHAPE/subject_test.csv')
# `describe` must be called: printing the bare attribute only shows the
# bound-method repr, not the statistics.
print(df_summary.describe())