In [1]:
import pandas as pd 
import numpy as np
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 

## Set input and output folders 

In [2]:
# Version tag of the analysis run; it is used as a folder name when the input and output paths are built.
analysis_version = "007"

In [3]:
# Output folder for this notebook's histograms and count tables, nested under the
# versioned analysis folder.
# NOTE(review): the base path is an absolute, machine-specific location - consider
# moving it into a single configurable constant.
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version,
                        '001_histograms_counts')

# exist_ok=True creates the folder (and any missing parents) when needed and does
# nothing when it already exists, so no separate existence check is required.
os.makedirs(out_path, exist_ok=True)

### Load Clean Data - no missing BW Data 
Rows in these files may still be missing video data.

In [4]:
# Folder that holds the merged and cleaned input files for this analysis version.
# The folder and file names are passed to os.path.join as separate parts instead of
# being written with a backslash inside a normal (non-raw) string literal: '\z' and
# '\h' are invalid escape sequences, which Python warns about and may reject in a
# future release.
merged_clean_dir = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                                analysis_version,
                                '000_merged_cleaned_data')

# PWS
zv_pws_bw_clean_path = os.path.join(merged_clean_dir, 'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col = 0)

# FW
zv_fw_bw_clean_path = os.path.join(merged_clean_dir, 'zv_bw_merged_gait_vertical_FW_1_clean.csv')
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col = 0)

# Home Videos
hv_bw_clean_path = os.path.join(merged_clean_dir, 'hv_bw_merged_clean.csv')
hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0)

## Function - count number of non-missing values in each column and plot histogram 

In [5]:
def hist_and_shapiro(df, out_path, task_subfolder, exclude_columns=('EDSS_same_before_after_MM',)):
    """Count non-missing values per column and save a histogram of each numeric column.

    Despite the name, no Shapiro test is performed here; the function only counts
    values and writes histogram images.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise; every column is counted.
    out_path : str
        Base output folder. Images are written to
        ``<out_path>/histograms/<task_subfolder>/<column>.png``.
    task_subfolder : str
        Name of the sub-folder for this data set; also used as the figure suptitle.
    exclude_columns : str or iterable of str, optional
        Columns that are counted but never plotted. The default keeps the
        exclusion that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``column_name`` and
        ``non_missing_rows``.
    """
    histogram_folder = os.path.join(out_path, 'histograms', task_subfolder)
    # exist_ok avoids the separate "does it exist" check before creating the folder
    os.makedirs(histogram_folder, exist_ok=True)

    # accept a single column name as well as a collection of names
    if isinstance(exclude_columns, str):
        exclude_columns = (exclude_columns,)
    excluded = set(exclude_columns)

    column_counts = []  # number of non-missing values in each column

    for column in df.columns:
        values = df[column]
        non_missing = values.count()
        column_counts.append({'column_name' : column,
                              'non_missing_rows' : non_missing})

        # only float64 / int64 columns that are not explicitly excluded are plotted
        is_plotted_dtype = (values.dtype == 'float64') or (values.dtype == 'int64')
        if not is_plotted_dtype or column in excluded:
            continue

        # A column with no values at all has nothing to plot, so skip it instead
        # of passing all-missing data to the histogram call.
        if non_missing == 0:
            continue

        fig = plt.figure(figsize=(8, 6))
        # drop missing values explicitly so only real observations are binned
        plt.hist(values.dropna(), bins = 30, color='skyblue', edgecolor='black')
        plt.suptitle(task_subfolder)
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.savefig(os.path.join(histogram_folder, f'{column}.png'))
        # close the figure so figures do not accumulate in memory across the loop
        plt.close(fig)

    # naming the columns keeps the result well-formed even when df has no columns
    column_counts_df = pd.DataFrame(column_counts, columns=['column_name', 'non_missing_rows'])

    return column_counts_df

In [6]:
# Number of non-missing values in the 'EDSS_same_before_after_MM' column of the home-video data
hv_bw_clean_df.loc[:, 'EDSS_same_before_after_MM'].count()

0

### Run histogram and counting functions 

In [7]:
# PWS: save histograms under 'zeno_pws' and write the per-column counts table
zv_pws_bw_counts = hist_and_shapiro(df=zv_pws_bw_clean_df,
                                    out_path=out_path,
                                    task_subfolder='zeno_pws')
pws_counts_csv = os.path.join(out_path, 'zeno_pws_column_counts.csv')
zv_pws_bw_counts.to_csv(pws_counts_csv)

In [8]:
# FW: save histograms under 'zeno_fw' and write the per-column counts table
zv_fw_bw_counts = hist_and_shapiro(df=zv_fw_bw_clean_df,
                                   out_path=out_path,
                                   task_subfolder='zeno_fw')
fw_counts_csv = os.path.join(out_path, 'zeno_fw_column_counts.csv')
zv_fw_bw_counts.to_csv(fw_counts_csv)

In [9]:
# Home videos: save histograms under 'home' and write the per-column counts table
hv_bw_counts = hist_and_shapiro(df=hv_bw_clean_df,
                                out_path=out_path,
                                task_subfolder='home')
home_counts_csv = os.path.join(out_path, 'home_column_counts.csv')
hv_bw_counts.to_csv(home_counts_csv)

### Number of unique IDs 

In [10]:
# Number of distinct 'id_video' values in each data set, as a one-row table
frames_by_label = {"PWS" : zv_pws_bw_clean_df,
                   "FW" : zv_fw_bw_clean_df,
                   "Home" : hv_bw_clean_df}
unique_ids_df = pd.DataFrame(data = {label : [frame['id_video'].nunique()]
                                     for label, frame in frames_by_label.items()})

unique_ids_csv = os.path.join(out_path, 'all_unique_id_counts.csv')
unique_ids_df.to_csv(unique_ids_csv)
unique_ids_df

Unnamed: 0,PWS,FW,Home
0,142,142,31


## Check - are all participants in FW and PWS the same? 

In [11]:
# Stack the 'bw_id' and 'task_pose_zv' columns of the PWS and FW data
id_task_columns = ['bw_id', 'task_pose_zv']
fw_pws_id_date_pose_zv = pd.concat([zv_pws_bw_clean_df[id_task_columns],
                                    zv_fw_bw_clean_df[id_task_columns]])

# Keep each bw_id at most once per task before looking for unmatched IDs.
# Without this step an ID that occurs several times within one task counts as
# "duplicated" and would be hidden even if the other task has no row for it.
fw_pws_one_row_per_task = pd.concat([zv_pws_bw_clean_df[id_task_columns].drop_duplicates(subset = 'bw_id'),
                                     zv_fw_bw_clean_df[id_task_columns].drop_duplicates(subset = 'bw_id')])

# IDs found in only one of the two tasks - have pws video but no fw or vice versa
fw_pws_one_row_per_task[~fw_pws_one_row_per_task['bw_id'].duplicated(keep = False)]

# An empty table means every bw_id appears in both the PWS and the FW data.
# Author's note: think they are the same participants, just maybe a few with different follow up videos

Unnamed: 0,bw_id,task_pose_zv


## Dates 

In [12]:
# PWS: earliest and latest value of 'visit_date_video'
pws_visit_dates = zv_pws_bw_clean_df['visit_date_video']
print(pws_visit_dates.min(), pws_visit_dates.max(), sep='\n')

2022-09-12
2024-10-03


In [13]:
# Home: earliest and latest value of 'visit_date_video'
home_visit_dates = hv_bw_clean_df['visit_date_video']
print(home_visit_dates.min(), home_visit_dates.max(), sep='\n')

2023-05-05
2025-03-11


### Breakdown by REDCap visit type 

In [14]:
# Count rows per 'redcap_event_name' in each data set and save each table as a CSV

# PWS videos
pws_event_counts = zv_pws_bw_clean_df['redcap_event_name'].value_counts()
pws_visit_types = pd.DataFrame(pws_event_counts)
pws_visit_types.to_csv(os.path.join(out_path, 'zeno_pws_redcap_event_counts.csv'))

# FW videos
fw_event_counts = zv_fw_bw_clean_df['redcap_event_name'].value_counts()
fw_visit_types = pd.DataFrame(fw_event_counts)
fw_visit_types.to_csv(os.path.join(out_path, 'zeno_fw_redcap_event_counts.csv'))

# Home videos
home_event_counts = hv_bw_clean_df['redcap_event_name'].value_counts()
home_visit_types = pd.DataFrame(home_event_counts)
home_visit_types.to_csv(os.path.join(out_path, 'home_redcap_event_counts.csv'))

In [15]:
# Display the REDCap event counts for the PWS videos
pws_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),133
Year 2 Visit (Arm 1: Baseline visit),69
Year 3 Visit (Arm 1: Baseline visit),5
Year 4 Visit (Arm 1: Baseline visit),1


In [16]:
# Display the REDCap event counts for the FW videos
fw_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),136
Year 2 Visit (Arm 1: Baseline visit),66
Year 3 Visit (Arm 1: Baseline visit),5
Year 4 Visit (Arm 1: Baseline visit),1


In [17]:
# Display the REDCap event counts for the home videos
home_visit_types

Unnamed: 0_level_0,count
redcap_event_name,Unnamed: 1_level_1
Brainwalk: Baseline visit (Arm 1: Baseline visit),44
Year 2 Visit (Arm 1: Baseline visit),17
Year 3 Visit (Arm 1: Baseline visit),2


### IDs with Multiple vs Single visits 

In [18]:
# Home videos: keep one row per ('id_video', 'visit_date_video') combination.
# Dropping duplicates removes the repeated row that comes from the right and left videos.
id_date_columns = ['id_video', 'visit_date_video']
hv_unique_vid_date_df = hv_bw_clean_df.loc[:, id_date_columns].drop_duplicates()
hv_unique_vid_date_df.head()

Unnamed: 0,id_video,visit_date_video
0,BW-0018,2023-10-24
2,BW-0023,2023-05-05
6,BW-0025,2023-11-28
12,BW-0050,2024-02-09
14,BW-0053,2024-04-17


In [19]:
# Rows per 'id_video' in each data set (home videos use the de-duplicated id/date table)
pws_rows_per_id = zv_pws_bw_clean_df['id_video'].value_counts()
fw_rows_per_id = zv_fw_bw_clean_df['id_video'].value_counts()
home_rows_per_id = hv_unique_vid_date_df['id_video'].value_counts()

# One-row summary: IDs that occur exactly once vs. IDs that occur more than once
single_vs_mult_visits_df = pd.DataFrame(data = {
    "PWS_single_visit" : [(pws_rows_per_id == 1).sum()],
    "PWS_multiple_visits" : [(pws_rows_per_id > 1).sum()],
    "FW_single_visit" : [(fw_rows_per_id == 1).sum()],
    "FW_multiple_visits" : [(fw_rows_per_id > 1).sum()],
    "Home_single_visit" : [(home_rows_per_id == 1).sum()],
    "Home_multiple_visits" : [(home_rows_per_id > 1).sum()],
})

single_vs_mult_csv = os.path.join(out_path, 'single_vs_multiple_visits.csv')
single_vs_mult_visits_df.to_csv(single_vs_mult_csv)
single_vs_mult_visits_df

Unnamed: 0,PWS_single_visit,PWS_multiple_visits,FW_single_visit,FW_multiple_visits,Home_single_visit,Home_multiple_visits
0,80,62,80,62,28,3


### Home videos - # of right vs left 

In [20]:
# Count home-video rows per 'task_pose_hv' value and save the table
task_pose_counts = hv_bw_clean_df['task_pose_hv'].value_counts()
home_rightleft_df = pd.DataFrame(task_pose_counts)

home_rightleft_csv = os.path.join(out_path, 'home_right_left_counts.csv')
home_rightleft_df.to_csv(home_rightleft_csv)
home_rightleft_df

Unnamed: 0_level_0,count
task_pose_hv,Unnamed: 1_level_1
gait_vertical_right,32
gait_vertical_left,31


In [21]:
# Home videos: 'task_pose_hv' counts restricted to rows where 'walking_segmets_n_pose_hv' is above zero
has_walking_segment = hv_bw_clean_df['walking_segmets_n_pose_hv'] > 0
task_pose_with_walking = hv_bw_clean_df.loc[has_walking_segment, 'task_pose_hv']
home_rightleft_w_walking_identified = pd.DataFrame(task_pose_with_walking.value_counts())

walking_identified_csv = os.path.join(out_path, 'home_right_left_with_walking_identified.csv')
home_rightleft_w_walking_identified.to_csv(walking_identified_csv)
home_rightleft_w_walking_identified

Unnamed: 0_level_0,count
task_pose_hv,Unnamed: 1_level_1
gait_vertical_right,30
gait_vertical_left,27


### Breakdown of participants with all vs any missing video metrics 

In [22]:
# Video-metric columns that must all be present for a row to count as complete.

# Columns carrying the '_pose_zv' suffix (used for the PWS and FW data)
subset_columns = [
    'delta_pix_h_rel_median_pose_zv',
    'stride_time_median_sec_pose_zv',
    'mean_cadence_step_per_min_pose_zv',
    'stride_width_median_cm_pose_zv',
    'singlesupport_per_median_pose_zv',
    'singlesupport_time_sec_median_pose_zv',
    'stance_time_per_median_pose_zv',
    'stance_time_sec_median_pose_zv',
    'swing_time_per_median_pose_zv',
    'swing_time_sec_median_pose_zv',
    'tot_dsupport_per_median_pose_zv',
    'tot_dsupport_time_sec_median_pose_zv',
]

# The same metrics with the '_pose_hv' suffix (used for the home-video data)
subset_columns_hv = [
    'delta_pix_h_rel_median_pose_hv',
    'stride_time_median_sec_pose_hv',
    'mean_cadence_step_per_min_pose_hv',
    'stride_width_median_cm_pose_hv',
    'singlesupport_per_median_pose_hv',
    'singlesupport_time_sec_median_pose_hv',
    'stance_time_per_median_pose_hv',
    'stance_time_sec_median_pose_hv',
    'swing_time_per_median_pose_hv',
    'swing_time_sec_median_pose_hv',
    'tot_dsupport_per_median_pose_hv',
    'tot_dsupport_time_sec_median_pose_hv',
]

In [23]:
# Count the rows in each data set that have every listed video metric present
pws_complete_rows = zv_pws_bw_clean_df.dropna(subset = subset_columns)
fw_complete_rows = zv_fw_bw_clean_df.dropna(subset = subset_columns)
home_complete_rows = hv_bw_clean_df.dropna(subset = subset_columns_hv)

pws_count_non_missing = pws_complete_rows.shape[0]
fws_count_non_missing = fw_complete_rows.shape[0]
home_count_non_missing = home_complete_rows.shape[0]

# One-row summary table, saved and displayed
non_missing_video_metrics_df = pd.DataFrame(data = {"PWS" : [pws_count_non_missing],
                                                    "FW" : [fws_count_non_missing],
                                                    "Home" : [home_count_non_missing]})

non_missing_csv = os.path.join(out_path, 'all_non_missing_video_metrics_count.csv')
non_missing_video_metrics_df.to_csv(non_missing_csv)
non_missing_video_metrics_df

Unnamed: 0,PWS,FW,Home
0,157,158,51
