In [1]:
import pandas as pd 
import numpy as np
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 

## Set input and output folders 

In [2]:
# Version tag of the analysis folder; it is a component of every input and output path in this notebook.
analysis_version = "006"

In [3]:
# Folder that receives the CSV summaries and histograms written by this notebook.
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version,
                        '001_histograms_counts')

# exist_ok=True creates the folder (and any missing parents) only when needed,
# without the check-then-create gap of a separate os.path.exists() test.
os.makedirs(out_path, exist_ok=True)

### Load Clean Data - no missing BW Data 
Rows in these files may still be missing video-derived metrics.

In [4]:
# Folder holding the merged, cleaned CSVs for this analysis version.
# The file names below are passed to os.path.join as their own component:
# the earlier '000_merged_cleaned_data\zv_...' / '...\hv_...' literals were
# not raw strings, so they only worked because '\z' and '\h' are invalid
# (deprecated) escape sequences that Python currently leaves untouched.
clean_data_dir = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                              analysis_version,
                              '000_merged_cleaned_data')

# PWS
zv_pws_bw_clean_path = os.path.join(clean_data_dir, 'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col = 0)

# FW
zv_fw_bw_clean_path = os.path.join(clean_data_dir, 'zv_bw_merged_gait_vertical_FW_1_clean.csv')
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col = 0)

# Home Videos
hv_bw_clean_path = os.path.join(clean_data_dir, 'hv_bw_merged_clean.csv')
hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0)

## Function - count number of non-missing values in each column and plot histogram 

In [5]:
def hist_and_shapiro(df, out_path, task_subfolder): 
    """Count non-missing values per column and save a histogram per numeric column.

    Despite the name, no Shapiro-Wilk (or other normality) test is run here;
    the function only counts values and writes histogram images.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    out_path : str
        Base output folder. Images are written to
        ``<out_path>/histograms/<task_subfolder>/<column>.png``.
    task_subfolder : str
        Name of the sub-folder for this dataset; also used as the figure
        super-title.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``column_name`` and
        ``non_missing_rows`` (the number of non-missing values in that column).
    """
    histogram_folder = os.path.join(out_path, 'histograms', task_subfolder)
    os.makedirs(histogram_folder, exist_ok=True)

    column_counts = []  # one record per column: name + number of non-missing values

    for column in df.columns:
        series = df[column]
        column_counts.append({'column_name' : column,
                              'non_missing_rows' : series.count()})

        # Only float64 / int64 columns get a histogram.
        if not ((series.dtype == 'float64') or (series.dtype == 'int64')):
            continue

        # Drop missing values before plotting. A column with no values at all
        # is skipped: there is nothing to draw, and a histogram of only-NaN
        # data has no finite range to bin over.
        values = series.dropna()
        if values.empty:
            continue

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            ax.hist(values, bins = 30, color='skyblue', edgecolor='black')
            fig.suptitle(task_subfolder)
            ax.set_title(f'Histogram of {column}')
            ax.set_xlabel(column)
            ax.set_ylabel('Frequency')
            fig.savefig(os.path.join(histogram_folder, f'{column}.png'))
        finally:
            # Always release the figure, even if plotting or saving failed,
            # so figures do not accumulate across the loop.
            plt.close(fig)

    # Explicit columns keep the result well-formed even when df has no columns.
    column_counts_df = pd.DataFrame(column_counts,
                                    columns=['column_name', 'non_missing_rows'])

    return column_counts_df

### Run histogram and counting functions 

In [6]:
# PWS: write the histograms, then save the per-column non-missing counts
zv_pws_bw_counts = hist_and_shapiro(df=zv_pws_bw_clean_df,
                                    out_path=out_path,
                                    task_subfolder='zeno_pws')
zv_pws_counts_csv = os.path.join(out_path, 'zeno_pws_column_counts.csv')
zv_pws_bw_counts.to_csv(zv_pws_counts_csv)

In [7]:
# FW: write the histograms, then save the per-column non-missing counts
zv_fw_bw_counts = hist_and_shapiro(df=zv_fw_bw_clean_df,
                                   out_path=out_path,
                                   task_subfolder='zeno_fw')
zv_fw_counts_csv = os.path.join(out_path, 'zeno_fw_column_counts.csv')
zv_fw_bw_counts.to_csv(zv_fw_counts_csv)

In [8]:
# Home videos: write the histograms, then save the per-column non-missing counts
hv_bw_counts = hist_and_shapiro(df=hv_bw_clean_df,
                                out_path=out_path,
                                task_subfolder='home')
hv_counts_csv = os.path.join(out_path, 'home_column_counts.csv')
hv_bw_counts.to_csv(hv_counts_csv)

### Number of unique IDs 

In [10]:
# One-row table: number of distinct 'id_video' values in each dataset.
id_sources = {"PWS" : zv_pws_bw_clean_df,
              "FW" : zv_fw_bw_clean_df,
              "Home" : hv_bw_clean_df}
unique_ids_df = pd.DataFrame(data = {label : [frame['id_video'].nunique()]
                                     for label, frame in id_sources.items()})
unique_ids_df.to_csv(os.path.join(out_path, 'all_unique_id_counts.csv'))
unique_ids_df

Unnamed: 0,PWS,FW,Home
0,154,154,31


### Breakdown by REDCap visit type 

In [11]:
# Frequency of each 'redcap_event_name' value per dataset; each table is saved to its own CSV.

# PWS Videos
pws_event_counts = zv_pws_bw_clean_df['redcap_event_name'].value_counts()
pws_visit_types = pd.DataFrame(pws_event_counts)
pws_visit_types.to_csv(os.path.join(out_path, 'zeno_pws_redcap_event_counts.csv'))

# FW Videos
fw_event_counts = zv_fw_bw_clean_df['redcap_event_name'].value_counts()
fw_visit_types = pd.DataFrame(fw_event_counts)
fw_visit_types.to_csv(os.path.join(out_path, 'zeno_fw_redcap_event_counts.csv'))

# Home Videos
home_event_counts = hv_bw_clean_df['redcap_event_name'].value_counts()
home_visit_types = pd.DataFrame(home_event_counts)
home_visit_types.to_csv(os.path.join(out_path, 'home_redcap_event_counts.csv'))

In [12]:
# Display the PWS 'redcap_event_name' counts.
pws_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),138
Year 2 Visit (Arm 1: Baseline visit),78
Year 3 Visit (Arm 1: Baseline visit),7
Year 4 Visit (Arm 1: Baseline visit),1


In [13]:
# Display the FW 'redcap_event_name' counts.
fw_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),139
Year 2 Visit (Arm 1: Baseline visit),75
Year 3 Visit (Arm 1: Baseline visit),7
Year 4 Visit (Arm 1: Baseline visit),1


In [14]:
# Display the home-video 'redcap_event_name' counts.
home_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),38
Year 2 Visit (Arm 1: Baseline visit),23
Year 3 Visit (Arm 1: Baseline visit),4


### Home videos - # of right vs left 

In [15]:
# Frequency of each 'task_pose_hv' value in the home-video table, saved and displayed.
task_pose_counts = hv_bw_clean_df['task_pose_hv'].value_counts()
home_rightleft_df = pd.DataFrame(task_pose_counts)
home_rightleft_csv = os.path.join(out_path, 'home_right_left_counts.csv')
home_rightleft_df.to_csv(home_rightleft_csv)
home_rightleft_df

Unnamed: 0_level_0,count
task_pose_hv,Unnamed: 1_level_1
gait_vertical_right,33
gait_vertical_left,32


### Number of videos with all video metrics present (no missing values)

In [1]:
# Video metrics to check for missing values.
# The two column lists share the same metric stems and differ only in their
# trailing suffix ('_zv' vs '_hv'), so both are derived from a single list.
video_metric_stems = [
    'delta_pix_h_rel_median_pose',
    'stride_time_median_sec_pose',
    'mean_cadence_step_per_min_pose',
    'stride_width_median_cm_pose',
    'singlesupport_per_median_pose',
    'singlesupport_time_sec_median_pose',
    'stance_time_per_median_pose',
    'stance_time_sec_median_pose',
    'swing_time_per_median_pose',
    'swing_time_sec_median_pose',
    'tot_dsupport_per_median_pose',
    'tot_dsupport_time_sec_median_pose',
]

subset_columns = [f'{stem}_zv' for stem in video_metric_stems]

subset_columns_hv = [f'{stem}_hv' for stem in video_metric_stems]

In [22]:
# Count the rows (videos) in which every listed video metric was calculated,
# i.e. rows that survive dropping any row with a missing value in those columns.
pws_complete_rows = zv_pws_bw_clean_df.dropna(subset = subset_columns)
fw_complete_rows = zv_fw_bw_clean_df.dropna(subset = subset_columns)
home_complete_rows = hv_bw_clean_df.dropna(subset = subset_columns_hv)

pws_count_non_missing = pws_complete_rows.shape[0]
fws_count_non_missing = fw_complete_rows.shape[0]
home_count_non_missing = home_complete_rows.shape[0]

# One-row summary table, saved to CSV and displayed.
complete_counts = {"PWS" : [pws_count_non_missing],
                   "FW" : [fws_count_non_missing],
                   "Home" : [home_count_non_missing]}
non_missing_video_metrics_df = pd.DataFrame(data = complete_counts)

non_missing_video_metrics_df.to_csv(os.path.join(out_path, 'all_non_missing_video_metrics_count.csv'))
non_missing_video_metrics_df

Unnamed: 0,PWS,FW,Home
0,178,189,56
