In [1]:
import pandas as pd 
import numpy as np
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 

## Set input and output folders 

In [2]:
# Version label of the analysis folder; it is used as one path component
# when the input and output locations are assembled.
analysis_version = "005"

In [3]:
# Output folder for this notebook's histograms and count tables:
# <analysis root>\<analysis_version>\001_histograms_counts
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version, 
                        '001_histograms_counts')

# exist_ok=True creates the folder when needed and is a no-op when it is
# already there, so there is no gap between "check" and "create".
os.makedirs(out_path, exist_ok=True)

### Load clean data - no missing BW data
Video data may still be missing.

In [4]:
# Folder that holds the merged + cleaned CSVs for this analysis version.
# The sub-folder and the file names are handed to os.path.join as separate
# components instead of being glued together with a backslash inside a
# non-raw string ('\z' and '\h' are invalid escape sequences in Python).
merged_clean_dir = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                                analysis_version,
                                '000_merged_cleaned_data')

# PWS
zv_pws_bw_clean_path = os.path.join(merged_clean_dir,
                                    'zv_bw_merged_gait_vertical_PWS_1_clean.csv')

# index_col = 0: the first CSV column is used as the DataFrame index
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col = 0)

# FW
zv_fw_bw_clean_path = os.path.join(merged_clean_dir,
                                   'zv_bw_merged_gait_vertical_FW_1_clean.csv')
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col = 0)

# Home Videos
hv_bw_clean_path = os.path.join(merged_clean_dir,
                                'hv_bw_merged_clean.csv')

hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0)

## Function - count number of non-missing values in each column and plot histogram 

In [5]:
def hist_and_shapiro(df, out_path, task_subfolder): 
    """Count non-missing values per column and save one histogram per numeric column.

    Despite the name, no Shapiro-Wilk (or any other normality) test is run
    here; the function only counts values and writes histogram images.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarize. Every column is counted; only columns whose dtype
        is ``float64`` or ``int64`` get a histogram.
    out_path : str
        Root output folder. Histograms are written to
        ``<out_path>/histograms/<task_subfolder>/<column>.png``.
    task_subfolder : str
        Name of the sub-folder for this dataset; also used as the figure
        suptitle.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``column_name`` and
        ``non_missing_rows``. Both columns are present even when ``df`` has
        no columns at all.

    Notes
    -----
    Missing values are dropped before plotting. A numeric column without any
    non-missing value is still counted (as 0) but no image is written for it,
    because a histogram of an all-NaN column cannot be binned and would abort
    the whole run.
    """
    histogram_folder = os.path.join(out_path, 'histograms', task_subfolder)
    os.makedirs(histogram_folder, exist_ok=True)

    column_counts = []  # one record per column: number of non-missing rows

    for column in df.columns:
        # count the non-missing rows of this column
        column_counts.append({'column_name' : column, 
                               'non_missing_rows' : df[column].count()})

        # histograms are only drawn for float64 / int64 columns
        if not ((df[column].dtype == 'float64') or (df[column].dtype == 'int64')):
            continue

        values = df[column].dropna()
        if values.empty:
            # nothing to bin - skip instead of letting the plot call raise
            continue

        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            ax.hist(values, bins = 30, color='skyblue', edgecolor='black')
            fig.suptitle(task_subfolder)
            ax.set_title(f'Histogram of {column}')
            ax.set_xlabel(column)
            ax.set_ylabel('Frequency')
            fig.savefig(os.path.join(histogram_folder, f'{column}.png'))
        finally:
            # always release the figure, even if drawing or saving failed
            plt.close(fig)

    # explicit columns keep the result shape stable for an empty input frame
    column_counts_df = pd.DataFrame(column_counts,
                                    columns=['column_name', 'non_missing_rows'])

    return column_counts_df

### Run histogram and counting functions 

In [6]:
# PWS: write the histograms, then save the per-column non-missing counts
zv_pws_bw_counts = hist_and_shapiro(df = zv_pws_bw_clean_df,
                                    out_path = out_path,
                                    task_subfolder = 'zeno_pws')
zv_pws_bw_counts.to_csv(path_or_buf = os.path.join(out_path, 'zeno_pws_column_counts.csv'))

In [7]:
# FW: write the histograms, then save the per-column non-missing counts
zv_fw_bw_counts = hist_and_shapiro(df = zv_fw_bw_clean_df,
                                   out_path = out_path,
                                   task_subfolder = 'zeno_fw')
zv_fw_bw_counts.to_csv(path_or_buf = os.path.join(out_path, 'zeno_fw_column_counts.csv'))

In [8]:
# Show the dtype of every home-video column; hist_and_shapiro only draws
# histograms for columns whose dtype is float64 or int64.
hv_bw_clean_df.dtypes

video_id_date_name_pose_hv           object
id_date_pose_hv                      object
task_pose_hv                         object
frames_per_second_pose_hv             int64
total_video_duration_sec_pose_hv    float64
                                     ...   
t25fw_group_num                     float64
t25fw_group_cat                      object
race_ethnicity_clean                 object
ms_dx_condensed                      object
bw_hv_date_diff_days                  int64
Length: 85, dtype: object

In [9]:
# Home videos: write the histograms, then save the per-column non-missing counts
hv_bw_counts = hist_and_shapiro(df = hv_bw_clean_df,
                                out_path = out_path,
                                task_subfolder = 'home')
hv_bw_counts.to_csv(path_or_buf = os.path.join(out_path, 'home_column_counts.csv'))

### Number of unique IDs 

In [10]:
# One-row table with the number of distinct 'id_video' values per dataset
unique_ids_df = pd.DataFrame(
    {
        label: [frame['id_video'].nunique()]
        for label, frame in (
            ("PWS", zv_pws_bw_clean_df),
            ("FW", zv_fw_bw_clean_df),
            ("Home", hv_bw_clean_df),
        )
    }
)
unique_ids_df.to_csv(os.path.join(out_path, 'all_unique_id_counts.csv'))

### Breakdown by REDCap visit type 

In [11]:
# For each dataset: how many rows fall under each 'redcap_event_name' value,
# saved as a one-column table.

# PWS videos
pws_visit_types = zv_pws_bw_clean_df['redcap_event_name'].value_counts().to_frame()
pws_visit_types.to_csv(os.path.join(out_path, 'zeno_pws_redcap_event_counts.csv'))

# FW videos
fw_visit_types = zv_fw_bw_clean_df['redcap_event_name'].value_counts().to_frame()
fw_visit_types.to_csv(os.path.join(out_path, 'zeno_fw_redcap_event_counts.csv'))

# Home videos
home_visit_types = hv_bw_clean_df['redcap_event_name'].value_counts().to_frame()
home_visit_types.to_csv(os.path.join(out_path, 'home_redcap_event_counts.csv'))

### Home videos - # of right vs left 

In [12]:
# Number of home-video rows per 'task_pose_hv' value, saved as a one-column table
home_rightleft_df = hv_bw_clean_df['task_pose_hv'].value_counts().to_frame()
home_rightleft_df.to_csv(os.path.join(out_path, 'home_right_left_counts.csv'))

### Breakdown of videos with all vs any missing video metrics 

In [13]:
# Video-metric columns that must all be present for a row to count as complete.
# Double and single support are left out for now.
subset_columns = [
    'delta_pix_h_rel_median_pose_zv',
    'stride_time_median_sec_pose_zv',
    'mean_cadence_step_per_min_pose_zv',
    'stride_width_median_cm_pose_zv',
]

# Same four metrics, with the '_hv' suffix used in the home-video table
subset_columns_hv = [
    'delta_pix_h_rel_median_pose_hv',
    'stride_time_median_sec_pose_hv',
    'mean_cadence_step_per_min_pose_hv',
    'stride_width_median_cm_pose_hv',
]

In [14]:
# Count the videos (rows) for which every metric in the subset is present:
# rows with a missing value in any of the subset columns are dropped first.
pws_count_non_missing = zv_pws_bw_clean_df.dropna(subset = subset_columns).shape[0]
fws_count_non_missing = zv_fw_bw_clean_df.dropna(subset = subset_columns).shape[0]
home_count_non_missing = hv_bw_clean_df.dropna(subset = subset_columns_hv).shape[0]

# One-row summary table, one column per dataset
non_missing_video_metrics_df = pd.DataFrame(
    {
        "PWS" : [pws_count_non_missing],
        "FW" : [fws_count_non_missing],
        "Home" : [home_count_non_missing],
    }
)

non_missing_video_metrics_df.to_csv(os.path.join(out_path, 'all_non_missing_video_metrics_count.csv'))