In [None]:
# Reminder: copy and paste the home video "instruction 1" output
# if it has not yet been merged with the "instruction 2" output.

In [1]:
import pandas as pd 
import numpy as np
import os 

In [None]:
# Version label of the analysis output folder; it is appended to the
# analysis output path when the output folders are created.
analysis_version = '006'

# Version label of the video pose-metric outputs; it is used to build the
# zeno and home metric file paths and is compared as an integer when
# deciding whether to drop the 'foot' columns.
version = '005'

## Define functions

In [2]:
# create ordinal value of EDSS severity 
# 0-2 (mild), 2.5-4 (moderate), 4.5+ (severe)

# Function to categorize EDSS severity
def categorize_edss(edss_value):
    """Map an EDSS score to an ordinal severity code and label.

    Returns (1, 'mild') for 0-2, (2, 'moderate') for 2.5-4 and
    (3, 'severe') for 4.5 and above. Any value that falls in none of these
    ranges (negative, NaN, or between the bands) gives (None, None).
    """
    # The three bands are disjoint, so the order of the checks is irrelevant.
    if edss_value >= 4.5:
        return 3, 'severe'
    if 2.5 <= edss_value <= 4:
        return 2, 'moderate'
    if 0 <= edss_value <= 2:
        return 1, 'mild'
    # Nothing matched: value is outside the defined ranges
    return None, None

In [3]:
# create ordinal value for T25FW 
def categorize_t25fw(t25fw_value):
    """Map a timed 25-foot walk value to an ordinal group code and label.

    Returns (1, 'under_6') for 0 <= value < 6, (2, '6_to_8') for
    6 <= value < 8 and (3, 'over_8') for value >= 8. Negative or NaN values
    give (None, None).
    """
    if 0 <= t25fw_value < 6:
        return 1, 'under_6'
    # Half-open upper bound: a value such as 7.995 must land in this group
    # instead of falling between "<= 7.99" and ">= 8".
    elif 6 <= t25fw_value < 8:
        return 2, '6_to_8'
    elif t25fw_value >= 8:
        return 3, 'over_8'
    else:
        return None, None  # Handle cases outside the defined ranges

In [4]:
# task = gait_vertical_PWS_1 or gait_vertical_FW_1
def merge_bw_zv(bw_df, zv_df, task, out_path):
    """Pair each zeno video of one task with the brainwalk row of the same id and date.

    Parameters
    ----------
    bw_df : pandas.DataFrame
        Brainwalk table; the columns 'bw_id' and 'trialdate' are read here.
    zv_df : pandas.DataFrame
        Zeno video metrics; the columns 'task_pose_zv', 'id_video' and
        'visit_date_video' are read here.
    task : str
        Value of 'task_pose_zv' to keep, e.g. 'gait_vertical_PWS_1' or
        'gait_vertical_FW_1'.
    out_path : str
        Folder that receives 'zv_id_in_bw_df_<task>.csv' (brainwalk rows whose
        id appears in the video data) and 'zv_bw_merged_<task>_raw.csv'
        (the merged result).

    Returns
    -------
    pandas.DataFrame
        One row per matched video, holding the video columns followed by the
        brainwalk columns, with a fresh 0..n-1 index. Videos without a
        brainwalk row for the same id and date are reported with print and
        left out. When nothing matches, an empty frame with the merged
        columns is returned instead of raising.
    """
    # keep only the videos of the requested task
    zv_task_df = zv_df[zv_df['task_pose_zv'] == task]
    print('confirm all one task')
    print(pd.unique(zv_task_df['task_pose_zv']))

    print('total zeno videos')
    print(len(zv_task_df))

    # keep only brainwalk rows whose id appears in the video data set
    zv_in_bw_df = bw_df[bw_df['bw_id'].isin(zv_task_df['id_video'])]
    zv_in_bw_df.to_csv(os.path.join(out_path, 'zv_id_in_bw_df_' + task + '.csv'))

    print('total bw rows with id in video dataset')
    print(len(zv_in_bw_df))

    # one single-row merged frame per matched video
    merged_rows = []

    for _, zv_row in zv_task_df.iterrows():
        current_id = zv_row['id_video']
        current_date = zv_row['visit_date_video']

        # brainwalk rows with the same id AND the same date as this video
        same_visit_rows = zv_in_bw_df[(zv_in_bw_df['bw_id'] == current_id) &
                                      (zv_in_bw_df['trialdate'] == current_date)]

        if same_visit_rows.empty:
            print('No matching id and daterow from video vs mat')
            print(current_id)
            print(current_date)
            continue

        if len(same_visit_rows) > 1:
            # several rows for the id and date: keep the one with the fewest
            # missing values. The pick is positional so that a duplicated
            # index label cannot select more than one row.
            most_complete_pos = same_visit_rows.isna().sum(axis=1).to_numpy().argmin()
            same_visit_rows = same_visit_rows.iloc[[most_complete_pos]]

            print('multiple rows for the id and date combo')
            print(current_id)
            print(current_date)

        merged_rows.append(
            pd.DataFrame([zv_row]).merge(same_visit_rows, left_on='id_video', right_on='bw_id')
        )

    if merged_rows:
        merged_bw_zv_df = pd.concat(merged_rows).reset_index(drop=True)
    else:
        # pd.concat raises on an empty list; merging two empty slices yields
        # an empty frame that still carries the merged column layout.
        merged_bw_zv_df = zv_task_df.iloc[:0].merge(
            zv_in_bw_df.iloc[:0], left_on='id_video', right_on='bw_id'
        ).reset_index(drop=True)

    # sanity checks: both counts should print 0
    print('mismatched zeno video vs brainwalk id')
    print((merged_bw_zv_df['id_video'] != merged_bw_zv_df['bw_id']).sum())

    print('mismatched zeno video vs brainwalk date')
    print((merged_bw_zv_df['visit_date_video'] != merged_bw_zv_df['trialdate']).sum())

    # save merged df for future reference
    merged_bw_zv_df.to_csv(os.path.join(out_path, 'zv_bw_merged_' + task + '_raw.csv'))

    return merged_bw_zv_df

In [5]:
# merge home video data with preferred walking speed mat data 
# participants walk at preferred pace at home 

def merge_bw_hv(bw_df, hv_df, task, out_path):
    """Pair each home video of one task with the closest-dated brainwalk row of the same id.

    Parameters
    ----------
    bw_df : pandas.DataFrame
        Brainwalk table; the columns 'bw_id' and 'trialdate' are read here.
    hv_df : pandas.DataFrame
        Home video metrics; the columns 'task_pose_hv', 'id_video' and
        'visit_date_video' are read here.
    task : str
        Value of 'task_pose_hv' to keep.
    out_path : str
        Folder that receives 'hv_id_in_bw_df.csv'. The file name does not
        include the task, so a later call with the same folder overwrites it.

    Returns
    -------
    pandas.DataFrame
        The matched home video rows (original index kept) with the columns of
        the closest brainwalk row appended, plus 'bw_hv_date_diff_days'
        (video date minus brainwalk 'trialdate', in days). Videos without any
        brainwalk row carrying a usable date are reported with print and left
        out. The same brainwalk row may be paired with more than one video;
        no uniqueness is enforced.
    """
    # keep only the videos of the requested task
    hv_task_df = hv_df[hv_df['task_pose_hv'] == task]
    print('confirm all one task')
    print(pd.unique(hv_task_df['task_pose_hv']))

    print('total home videos')
    print(len(hv_task_df))

    # keep only brainwalk rows whose id appears in the video data set
    hv_in_bw_df = bw_df[bw_df['bw_id'].isin(hv_task_df['id_video'])]
    hv_in_bw_df.to_csv(os.path.join(out_path, 'hv_id_in_bw_df.csv'))

    print('total bw rows with id in video dataset')
    print(len(hv_in_bw_df))

    # Unique 0..n-1 labels so the label returned by idxmin identifies exactly
    # one brainwalk row, even if bw_df carries duplicated index labels.
    bw_lookup_df = hv_in_bw_df.reset_index(drop=True)

    def closest_bw_label(video_row):
        """Return the bw_lookup_df label closest in date to the video, or None."""
        candidates = bw_lookup_df[bw_lookup_df['bw_id'] == video_row['id_video']]
        # Rows with a missing date (or a missing video date) cannot be ranked.
        date_gap = (candidates['trialdate'] - video_row['visit_date_video']).abs().dropna()
        if date_gap.empty:
            return None
        return date_gap.idxmin()

    has_match = []
    closest_labels = []
    for _, video_row in hv_task_df.iterrows():
        bw_label = closest_bw_label(video_row)
        has_match.append(bw_label is not None)
        if bw_label is None:
            print('No brainwalk row with a usable date for home video')
            print(video_row['id_video'])
            print(video_row['visit_date_video'])
        else:
            closest_labels.append(bw_label)

    matched_hv_df = hv_task_df[np.array(has_match, dtype=bool)]

    # Give the chosen brainwalk rows the index of their video so the two
    # tables can be joined row-for-row on the index.
    closest_rows = bw_lookup_df.loc[closest_labels]
    closest_rows.index = matched_hv_df.index

    merged_bw_hv_df = matched_hv_df.merge(closest_rows, left_index=True, right_index=True,
                                          suffixes=('', '_closest'))

    # add column for date diff
    merged_bw_hv_df['bw_hv_date_diff_days'] = (merged_bw_hv_df['visit_date_video'] -
                                               merged_bw_hv_df['trialdate']).dt.days

    # sanity check: should print 0
    print('mismatched home video vs brainwalk id')
    print((merged_bw_hv_df['id_video'] != merged_bw_hv_df['bw_id']).sum())

    return merged_bw_hv_df

In [6]:
# df input should be merged df - both video and bw data 

def print_video_counts(df):
    """Print video-level and participant-level counts for a merged table.

    Reads the columns 'demographic_diagnosis' and 'id_video' of df and prints
    totals overall and for the 'HC' and 'MS' diagnoses, followed by how many
    participants have several videos versus exactly one. Returns nothing.
    """
    diagnosis = df['demographic_diagnosis']
    participant_ids = df['id_video']
    is_hc = diagnosis == 'HC'
    is_ms = diagnosis == 'MS'
    separator = '------'

    # video-level counts
    print(f'total videos - df length: {len(df)}')
    print(f'unique demographic_diagnosis in df: {pd.unique(diagnosis)}')
    print(f'num videos with demographic_diagnosis == HC: {len(df[is_hc])}')
    print(f'num videos demographic_diagnosis == MS: {len(df[is_ms])}')

    print(separator)

    # participant-level counts (distinct id_video values)
    n_participants = len(pd.unique(participant_ids))
    n_hc_participants = len(pd.unique(participant_ids[is_hc]))
    n_ms_participants = len(pd.unique(participant_ids[is_ms]))
    print(f'unique id_video (participants) in df: {n_participants}')
    print(f'num participants with demographic_diagnosis == HC: {n_hc_participants}')
    print(f'num participants demographic_diagnosis == MS: {n_ms_participants}')

    print(separator)

    # ids that occur again after their first appearance -> more than one video
    repeated_ids = participant_ids[participant_ids.duplicated()]
    print(f'number of participants with multiple videos in dataset: {repeated_ids.nunique()}')

    videos_per_participant = participant_ids.value_counts()
    print(f'number of participants with one video: {(videos_per_participant == 1).sum()}')

In [7]:
# merge race and ethnicity according to Bove lab patterns 

def merge_race_ethnicity(df):
    """Return a copy of df with a combined 'race_ethnicity_clean' column.

    The column is derived from 'clean_Race' and 'clean_Ethnicity', applying
    the rules in order:

    1. White and 'Not Hispanic Or Latino'        -> 'White Not Hispanic'
    2. otherwise ethnicity 'Hispanic Or Latino'  -> 'Hispanic or Latino'
    3. otherwise race 'Asian'                    -> 'Asian'
    4. otherwise race 'Black Or African American' -> 'Black Or African American'
    5. everything still unassigned               -> 'Other/Unknown/Declined'

    The input frame is left untouched: the copy is taken before the new
    column is added.
    """
    out_df = df.copy()
    out_df['race_ethnicity_clean'] = ''

    # White race and not hispanic or latino ethnicity = White Not Hispanic
    out_df.loc[(out_df['clean_Race'] == 'White') &
               (out_df['clean_Ethnicity'] == 'Not Hispanic Or Latino'),
               'race_ethnicity_clean'] = 'White Not Hispanic'

    # Hispanic or Latino ethnicity wins over race for everyone not assigned above
    out_df.loc[(out_df['race_ethnicity_clean'] != 'White Not Hispanic') &
               (out_df['clean_Ethnicity'] == 'Hispanic Or Latino'),
               'race_ethnicity_clean'] = 'Hispanic or Latino'

    # Individuals not assigned yet fall back to the clean_Race column
    for race_label in ('Asian', 'Black Or African American'):
        out_df.loc[(out_df['race_ethnicity_clean'] == '') &
                   (out_df['clean_Race'] == race_label),
                   'race_ethnicity_clean'] = race_label

    # if not yet assigned -> other, unknown, declined
    out_df.loc[out_df['race_ethnicity_clean'] == '',
               'race_ethnicity_clean'] = 'Other/Unknown/Declined'
    return out_df

In [8]:
def condense_ms_dx(df):
    """Return a copy of df with a condensed MS diagnosis column.

    Adds 'ms_dx_condensed', derived from 'bingoEHR_DX_MS DX':

    * 'RRMS (...)'                         -> 'RRMS'
    * 'PPMS (...)', 'SPMS (...)', 'PRMS (...)' -> 'Progressive MS'
    * 'MS, Subtype Not Specified'          -> 'MS, Subtype Not Specified'
    * any other value with 'demographic_diagnosis' == 'MS'
                                           -> 'MS, Subtype Not Specified'
    * everything else                      -> None

    Rows that received a condensed MS label also get 'demographic_diagnosis'
    set to 'MS' in the returned copy. The input frame is left untouched: the
    copy is taken before any column is added or changed.
    """
    out_df = df.copy()
    out_df['ms_dx_condensed'] = ''

    ehr_dx = out_df['bingoEHR_DX_MS DX']

    # RRMS
    out_df.loc[ehr_dx == 'RRMS (Relapsing-remitting Multiple Sclerosis)', 'ms_dx_condensed'] = 'RRMS'

    # Progressive MS
    progressive_labels = [
        'PPMS (Primary-progressive Multiple Sclerosis)',
        'SPMS (Secondary-progressive Multiple Sclerosis)',
        'PRMS (Progressive-relapsing Multiple Sclerosis)',
    ]
    out_df.loc[ehr_dx.isin(progressive_labels), 'ms_dx_condensed'] = 'Progressive MS'

    # subtype not specified
    out_df.loc[ehr_dx == 'MS, Subtype Not Specified', 'ms_dx_condensed'] = 'MS, Subtype Not Specified'

    # still blank but demographic diagnosis is MS -> 'MS, Subtype Not Specified'
    out_df.loc[(out_df['ms_dx_condensed'] == '') & (out_df['demographic_diagnosis'] == 'MS'),
               'ms_dx_condensed'] = 'MS, Subtype Not Specified'

    # everyone else gets no condensed label (likely healthy control, per the original note)
    out_df.loc[out_df['ms_dx_condensed'] == '', 'ms_dx_condensed'] = None

    # make 'demographic_diagnosis' agree with the condensed MS label
    condensed_ms_labels = ['RRMS', 'Progressive MS', 'MS, Subtype Not Specified']
    out_df.loc[out_df['ms_dx_condensed'].isin(condensed_ms_labels), 'demographic_diagnosis'] = 'MS'

    return out_df

## Load data and set output folders 

In [10]:
# output folders
analysis_out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                                analysis_version)

out_path = os.path.join(analysis_out_path, '000_merged_cleaned_data')

# os.makedirs creates missing parent folders as well, so creating out_path
# also creates analysis_out_path; exist_ok=True makes the cell safe to re-run.
os.makedirs(out_path, exist_ok=True)

In [12]:
# load data frames 
# NOTE(review): all three inputs are read from absolute paths on one local machine;
# the notebook will not run elsewhere without editing them.

# zeno video metrics: <base>/gait_bw_zeno_outputs_<version>/gait_bw_zeno_outputs_<version>_pose_metrics_all.csv
zv_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code',
                       'gait_bw_zeno_outputs_' + version, 
                       'gait_bw_zeno_outputs_' + version +'_pose_metrics_all.csv')

# the first CSV column is used as the row index
zv_df = pd.read_csv(zv_path, index_col = 0)

# home video metrics: same folder/file naming pattern as the zeno metrics, with 'home' in place of 'zeno'
hv_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code',
                       'gait_bw_home_outputs_' + version, 
                       'gait_bw_home_outputs_' + version + '_pose_metrics_all.csv')
                       
hv_df = pd.read_csv(hv_path, index_col = 0)

# BW and zeno mat metrics 
# decrypted file - may need to decrypt again if the read fails because the file doesn't exist 
# a copy of the file is saved in the Megan Project folder in the Brainwalk box. 
# if there are issues decrypting, try copying the original file again and then decrypting 
# only the columns listed in usecols are loaded; no column is used as the index
bw_df = pd.read_excel(r'C:\Users\mmccu\AppData\Local\Temp\ccsecure\2025_03_26_BrainWalk_AllData_Long_MM.xlsx', 
                     index_col = None, 
                     usecols = ['bw_id', 'trialdate', 'visit_date', 'redcap_event_name', 'demoEHR_DiseaseDuration',
                                'clean_Sex', 'clean_Age', 'demographic_diagnosis','bingoEHR_DX_MS DX', 'bingoEHR_first_MS DX', 'clean_Race',	
                                'clean_Ethnicity', 'bingoEHR_Vitals_height(in)', 'clean_EDSS', 'clean_T25FW_Avg',
                                'PWS_stridetimesecmean', 'PWS_stridetimeseccv','PWS_cadencestepsminmean','PWS_totaldsupportmean', 
                                'PWS_singlesupportmean','PWS_totaldsupportratiolr', 'PWS_singlesupportratiolr', 
                                'PWS_stridewidthcmmean','PWS_stridewidthcmsd', 'PWS_velocitycmsecmean', 
                                'FW_stridetimesecmean', 'FW_stridetimeseccv','FW_cadencestepsminmean','FW_totaldsupportmean', 
                                'FW_singlesupportmean','FW_totaldsupportratiolr', 'FW_singlesupportratiolr', 
                                'FW_stridewidthcmmean','FW_stridewidthcmsd', 'FW_velocitycmsecmean', 
                                'demoGait_dateDiff', 'msfcEHR_T25FW SPEED TRIAL 1 vDate Diff', 'msfcEHR_T25FW SPEED TRIAL 2 vDate Diff',
                                'demoEHR_DX_dateDiff', 'demoEHR_EDSS_dateDiff', 'demoEHR_Vitals_dateDiff'])



bw_df.head()

Unnamed: 0,bw_id,trialdate,visit_date,demoGait_dateDiff,demoEHR_DiseaseDuration,bingoEHR_DX_MS DX,demographic_diagnosis,clean_Race,clean_Sex,clean_Ethnicity,...,PWS_totaldsupportratiolr,PWS_velocitycmsecmean,redcap_event_name,msfcEHR_T25FW SPEED TRIAL 1 vDate Diff,msfcEHR_T25FW SPEED TRIAL 2 vDate Diff,demoEHR_DX_dateDiff,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX
0,BW-0146,NaT,2022-05-18,,,,HC,White,Male,Not Hispanic Or Latino,...,,,Brainwalk: Baseline visit (Arm 1: Baseline visit),,,,,,,
1,BW-0146,2022-08-17,2022-08-17,0.0,,,HC,White,Male,Not Hispanic Or Latino,...,1.004,84.99,Year 2 Visit (Arm 1: Baseline visit),,,,,,,
2,BW-0001,2022-08-26,2022-08-26,0.0,,,PD,White,Male,Not Hispanic Or Latino,...,0.918,74.593,Brainwalk: Baseline visit (Arm 1: Baseline visit),,,,,,,
3,BW-0002,2022-09-12,2022-09-12,0.0,0.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,Other,Male,Hispanic Or Latino,...,1.091,66.053,Brainwalk: Baseline visit (Arm 1: Baseline visit),0.0,0.0,97.0,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis)
4,BW-0004,2022-09-19,2022-09-19,0.0,17.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,White,Female,Not Hispanic Or Latino,...,1.216,101.165,Brainwalk: Baseline visit (Arm 1: Baseline visit),0.0,0.0,6289.0,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis)


In [23]:
zv_df.columns 
#columns_to_drop  = zv_df.columns.str.contains('foot')


Unnamed: 0,video_id_date_name_pose,id_date_pose,task_pose,frames_per_second_pose,total_video_duration_sec_pose,delta_pix_h_rel_median_pose,walking_segmets_n_pose,walking_segments_duration_mean_pose,walking_segments_duration_median_pose,stride_time_mean_sec_pose,...,frameDiff_hs1a_to2_median_pose,frameDiff_hs2_to1b_median_pose,frameDiff_to1a_hs1a_median_pose,frameDiff_to1b_hs1b_median_pose,frameDiff_to2_hs2_median_pose,frameDiff_hs1a_to2_std_pose,frameDiff_hs2_to1b_std_pose,frameDiff_to1a_hs1a_std_pose,frameDiff_to1b_hs1b_std_pose,frameDiff_to2_hs2_std_pose
0,gait_vertical_PWS_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_PWS_1,25,42.760000,0.15,,,,,...,,,,,,,,,,
1,gait_vertical_FW_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_FW_1,25,28.960000,0.28,,,,,...,,,,,,,,,,
2,gait_vertical_PWS_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_PWS_1,30,46.633333,0.22,4.0,4.46,4.32,1.354,...,,,,,,,,,,
3,gait_vertical_FW_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_FW_1,30,36.500000,0.25,3.0,6.71,7.23,1.015,...,4.5,5.5,12.0,11.5,11.5,3.33,3.75,3.97,3.91,3.70
4,gait_vertical_PWS_1_BW-0003_2022_10_24,BW-0003\2022_10_24,gait_vertical_PWS_1,30,26.033333,0.26,4.0,4.53,4.87,1.192,...,3.0,2.0,13.0,13.0,12.0,6.58,4.50,3.28,3.25,3.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,gait_vertical_FW_1_BW-0353_2024_08_21,BW-0353\2024_08_21,gait_vertical_FW_1,30,38.166667,0.51,3.0,2.76,2.83,0.947,...,3.0,3.5,12.5,10.0,10.0,1.37,0.55,4.00,2.40,1.64
613,gait_vertical_PWS_1_BW-0356_2024_09_25,BW-0356\2024_09_25,gait_vertical_PWS_1,30,30.700000,0.09,3.0,6.40,6.43,1.038,...,4.0,4.0,11.5,12.0,12.0,4.26,2.42,2.61,2.97,2.29
614,gait_vertical_FW_1_BW-0356_2024_09_25,BW-0356\2024_09_25,gait_vertical_FW_1,30,20.566667,0.29,1.0,4.47,4.47,0.883,...,4.0,3.0,9.0,10.0,10.0,1.68,1.68,1.83,2.43,2.54
615,gait_vertical_PWS_1_BW-0357_2024_09_25,BW-0357\2024_09_25,gait_vertical_PWS_1,30,33.666667,1.83,4.0,4.82,5.10,1.156,...,3.0,2.0,15.0,16.0,15.0,2.93,3.24,3.91,4.39,3.61


In [29]:
# For video metric version 005 or later (int(version) >= 5), drop every column
# whose name contains 'foot' from both the zeno and the home video tables.
# Per the original note these are extra support metrics, replaced by a new
# calculation for support.
if int(version) >= 5: 
    zv_df = zv_df.loc[:, ~zv_df.columns.str.contains('foot')]
    hv_df = hv_df.loc[:, ~hv_df.columns.str.contains('foot')]

In [30]:
# rename zeno video variables 
# add the suffix '_zv' to all zeno video column names 
# NOTE: re-running this cell without reloading zv_df appends the suffix a second time
zv_df = zv_df.add_suffix('_zv')

# add bw id and video date to zv df 
# id: the 'BW-' + 4 digit pattern found in the 'id_date_pose_zv' string
zv_df['id_video'] = zv_df['id_date_pose_zv'].str.extract(r'(BW-\d{4})')
# date: everything after the first 8 characters
# (assumes the id plus its separator is always exactly 8 characters, as in 'BW-0002\2022_09_12')
zv_df['visit_date_video'] = zv_df['id_date_pose_zv'].str[8:]
# 'YYYY_MM_DD' -> 'YYYY-MM-DD' -> datetime; a value in any other format raises
zv_df['visit_date_video'] = pd.to_datetime(zv_df['visit_date_video'].str.replace('_', '-'), format='%Y-%m-%d')
zv_df.head()

Unnamed: 0,video_id_date_name_pose_zv,id_date_pose_zv,task_pose_zv,frames_per_second_pose_zv,total_video_duration_sec_pose_zv,delta_pix_h_rel_median_pose_zv,walking_segmets_n_pose_zv,walking_segments_duration_mean_pose_zv,walking_segments_duration_median_pose_zv,stride_time_mean_sec_pose_zv,...,frameDiff_to1a_hs1a_median_pose_zv,frameDiff_to1b_hs1b_median_pose_zv,frameDiff_to2_hs2_median_pose_zv,frameDiff_hs1a_to2_std_pose_zv,frameDiff_hs2_to1b_std_pose_zv,frameDiff_to1a_hs1a_std_pose_zv,frameDiff_to1b_hs1b_std_pose_zv,frameDiff_to2_hs2_std_pose_zv,id_video,visit_date_video
0,gait_vertical_PWS_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_PWS_1,25,42.76,0.15,,,,,...,,,,,,,,,BW-0002,2022-09-12
1,gait_vertical_FW_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_FW_1,25,28.96,0.28,,,,,...,,,,,,,,,BW-0002,2022-09-12
2,gait_vertical_PWS_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_PWS_1,30,46.633333,0.22,4.0,4.46,4.32,1.354,...,,,,,,,,,BW-0002,2023-09-12
3,gait_vertical_FW_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_FW_1,30,36.5,0.25,3.0,6.71,7.23,1.015,...,12.0,11.5,11.5,3.33,3.75,3.97,3.91,3.7,BW-0002,2023-09-12
4,gait_vertical_PWS_1_BW-0003_2022_10_24,BW-0003\2022_10_24,gait_vertical_PWS_1,30,26.033333,0.26,4.0,4.53,4.87,1.192,...,13.0,13.0,12.0,6.58,4.5,3.28,3.25,3.39,BW-0003,2022-10-24


In [31]:
# add the suffix '_hv' to all home video column names 
# NOTE: re-running this cell without reloading hv_df appends the suffix a second time
hv_df = hv_df.add_suffix('_hv')

# add bw id and video date to hv df 
# id: the 'BW-' + 4 digit pattern found in the 'id_date_pose_hv' string
hv_df['id_video'] = hv_df['id_date_pose_hv'].str.extract(r'(BW-\d{4})')
# date: everything after the first 8 characters
# (assumes the id plus its separator is always exactly 8 characters, as in 'BW-0018\10-24-23')
hv_df['visit_date_video'] = hv_df['id_date_pose_hv'].str[8:]
# No explicit format is given, so pandas infers it; values it cannot parse become NaT (errors='coerce').
# NOTE(review): the displayed rows look like MM-DD-YY; if every row follows that pattern,
# passing format='%m-%d-%y' would make the parsing explicit - confirm against the full data first.
hv_df['visit_date_video'] = pd.to_datetime(hv_df['visit_date_video'], errors = 'coerce')
hv_df.head()

  hv_df['visit_date_video'] = pd.to_datetime(hv_df['visit_date_video'], errors = 'coerce')


Unnamed: 0,video_id_date_name_pose_hv,id_date_pose_hv,task_pose_hv,frames_per_second_pose_hv,total_video_duration_sec_pose_hv,delta_pix_h_rel_median_pose_hv,walking_segmets_n_pose_hv,walking_segments_duration_mean_pose_hv,walking_segments_duration_median_pose_hv,stride_time_mean_sec_pose_hv,...,frameDiff_to1a_hs1a_median_pose_hv,frameDiff_to1b_hs1b_median_pose_hv,frameDiff_to2_hs2_median_pose_hv,frameDiff_hs1a_to2_std_pose_hv,frameDiff_hs2_to1b_std_pose_hv,frameDiff_to1a_hs1a_std_pose_hv,frameDiff_to1b_hs1b_std_pose_hv,frameDiff_to2_hs2_std_pose_hv,id_video,visit_date_video
0,gait_vertical_left_BW-0018_10-24-23,BW-0018\10-24-23,gait_vertical_left,30,26.8,0.22,6.0,2.88,2.92,1.182,...,,,,,,,,,BW-0018,2023-10-24
1,gait_vertical_right_BW-0018_10-24-23,BW-0018\10-24-23,gait_vertical_right,30,24.4,0.31,6.0,3.15,3.2,1.185,...,,,,,,,,,BW-0018,2023-10-24
2,gait_vertical_left_BW-0023_05-05-23,BW-0023\05-05-23,gait_vertical_left,30,35.966667,0.43,5.0,4.43,4.47,1.054,...,8.5,12.0,9.5,4.6,3.55,2.85,3.81,2.99,BW-0023,2023-05-05
3,gait_vertical_right_BW-0023_05-05-23,BW-0023\05-05-23,gait_vertical_right,30,38.466667,0.39,6.0,4.79,4.77,1.091,...,11.0,12.0,10.0,5.18,4.24,3.44,4.36,3.85,BW-0023,2023-05-05
4,gait_vertical_left_BW-0023_10-23-23,BW-0023\10-23-23,gait_vertical_left,30,36.1,0.3,3.0,4.27,3.93,1.008,...,10.0,10.0,10.0,1.13,1.13,3.2,1.72,1.39,BW-0023,2023-10-23


In [32]:
# add columns for ordinal EDSS severity and t25fw to bw_df
# categorize_edss returns a (number, label) tuple per row; zip(*...) splits the tuples into two columns
bw_df['edss_severity_num'], bw_df['edss_severity_cat'] = zip(*bw_df['clean_EDSS'].apply(categorize_edss))
# store the label as an ordered categorical: mild < moderate < severe
bw_df['edss_severity_cat'] = pd.Categorical(bw_df['edss_severity_cat'], categories=["mild", "moderate", "severe"], ordered=True)

# same pattern for the T25FW average: (number, label) tuples split into two columns
bw_df['t25fw_group_num'], bw_df['t25fw_group_cat'] = zip(*bw_df['clean_T25FW_Avg'].apply(categorize_t25fw))
# store the label as an ordered categorical: under_6 < 6_to_8 < over_8
bw_df['t25fw_group_cat'] = pd.Categorical(bw_df['t25fw_group_cat'], categories=["under_6", "6_to_8", "over_8"], ordered=True)
bw_df.head()

Unnamed: 0,bw_id,trialdate,visit_date,demoGait_dateDiff,demoEHR_DiseaseDuration,bingoEHR_DX_MS DX,demographic_diagnosis,clean_Race,clean_Sex,clean_Ethnicity,...,msfcEHR_T25FW SPEED TRIAL 2 vDate Diff,demoEHR_DX_dateDiff,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat
0,BW-0146,NaT,2022-05-18,,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,
1,BW-0146,2022-08-17,2022-08-17,0.0,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,
2,BW-0001,2022-08-26,2022-08-26,0.0,,,PD,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,
3,BW-0002,2022-09-12,2022-09-12,0.0,0.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,Other,Male,Hispanic Or Latino,...,0.0,97.0,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6
4,BW-0004,2022-09-19,2022-09-19,0.0,17.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,White,Female,Not Hispanic Or Latino,...,0.0,6289.0,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6


In [33]:
# combine 'clean_Race' and 'clean_Ethnicity' into the new column 'race_ethnicity_clean'
# bw_df is rebound to the frame returned by merge_race_ethnicity
bw_df = merge_race_ethnicity(bw_df)
bw_df.head()

Unnamed: 0,bw_id,trialdate,visit_date,demoGait_dateDiff,demoEHR_DiseaseDuration,bingoEHR_DX_MS DX,demographic_diagnosis,clean_Race,clean_Sex,clean_Ethnicity,...,demoEHR_DX_dateDiff,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean
0,BW-0146,NaT,2022-05-18,,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,White Not Hispanic
1,BW-0146,2022-08-17,2022-08-17,0.0,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,White Not Hispanic
2,BW-0001,2022-08-26,2022-08-26,0.0,,,PD,White,Male,Not Hispanic Or Latino,...,,,,,,,,,,White Not Hispanic
3,BW-0002,2022-09-12,2022-09-12,0.0,0.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,Other,Male,Hispanic Or Latino,...,97.0,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Hispanic or Latino
4,BW-0004,2022-09-19,2022-09-19,0.0,17.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,White,Female,Not Hispanic Or Latino,...,6289.0,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic


In [34]:
# counts before condensing: EHR MS diagnosis, demographic diagnosis,
# and the EHR MS diagnosis restricted to rows whose demographic diagnosis is MS
print(bw_df['bingoEHR_DX_MS DX'].value_counts())
print(bw_df['demographic_diagnosis'].value_counts())
print('---- MS only -----')
print(bw_df.loc[bw_df['demographic_diagnosis'] == 'MS', 'bingoEHR_DX_MS DX'].value_counts())

# condense MS DX columns in bw_df (adds 'ms_dx_condensed')
# also sets 'demographic_diagnosis' = MS for rows that received a condensed MS label
bw_df = condense_ms_dx(bw_df)

# counts after condensing, to compare against the counts printed above
print('---- condensed ----') 
print(bw_df['ms_dx_condensed'].value_counts())
print(bw_df['demographic_diagnosis'].value_counts())

bingoEHR_DX_MS DX
RRMS (Relapsing-remitting Multiple Sclerosis)                       293
SPMS (Secondary-progressive Multiple Sclerosis)                      47
PPMS (Primary-progressive Multiple Sclerosis)                        46
MS, Subtype Not Specified                                            35
abnormal MRI                                                          4
PRMS (Progressive-relapsing Multiple Sclerosis)                       2
pending                                                               1
CIS (Clinically Isolated Syndrome), with high risk MS (MRI, CSF)      1
MOGAD                                                                 1
Name: count, dtype: int64
demographic_diagnosis
MS                                                                                 462
HC                                                                                  68
Mild TBI                                                                            37
PD                       

In [35]:
bw_df.head()

Unnamed: 0,bw_id,trialdate,visit_date,demoGait_dateDiff,demoEHR_DiseaseDuration,bingoEHR_DX_MS DX,demographic_diagnosis,clean_Race,clean_Sex,clean_Ethnicity,...,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean,ms_dx_condensed
0,BW-0146,NaT,2022-05-18,,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,White Not Hispanic,
1,BW-0146,2022-08-17,2022-08-17,0.0,,,HC,White,Male,Not Hispanic Or Latino,...,,,,,,,,,White Not Hispanic,
2,BW-0001,2022-08-26,2022-08-26,0.0,,,PD,White,Male,Not Hispanic Or Latino,...,,,,,,,,,White Not Hispanic,
3,BW-0002,2022-09-12,2022-09-12,0.0,0.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,Other,Male,Hispanic Or Latino,...,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Hispanic or Latino,RRMS
4,BW-0004,2022-09-19,2022-09-19,0.0,17.0,RRMS (Relapsing-remitting Multiple Sclerosis),MS,White,Female,Not Hispanic Or Latino,...,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,RRMS


## Merge video data and brainwalk data 

In [36]:
# zeno videos - preferred walking speed 
# matches each 'gait_vertical_PWS_1' video to the brainwalk row with the same id and date;
# the CSV files written by merge_bw_zv go to out_path
bw_zv_pws_df = merge_bw_zv(bw_df, zv_df, 'gait_vertical_PWS_1', out_path) 

confirm all one task
['gait_vertical_PWS_1']
total zeno videos
305
total bw rows with id in video dataset
369
No matching id and daterow from video vs mat
BW-0036
2024-04-02 00:00:00
No matching id and daterow from video vs mat
BW-0121
2022-07-20 00:00:00
No matching id and daterow from video vs mat
BW-0322
2024-06-10 00:00:00
mismatched zeno video vs brainwalk id
0
mismatched zeno video vs brainwalk date
0


In [37]:
bw_zv_pws_df.head()

Unnamed: 0,video_id_date_name_pose_zv,id_date_pose_zv,task_pose_zv,frames_per_second_pose_zv,total_video_duration_sec_pose_zv,delta_pix_h_rel_median_pose_zv,walking_segmets_n_pose_zv,walking_segments_duration_mean_pose_zv,walking_segments_duration_median_pose_zv,stride_time_mean_sec_pose_zv,...,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean,ms_dx_condensed
0,gait_vertical_PWS_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_PWS_1,25,42.76,0.15,,,,,...,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Hispanic or Latino,RRMS
1,gait_vertical_PWS_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_PWS_1,30,46.633333,0.22,4.0,4.46,4.32,1.354,...,0.0,,-11.391412,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,2.0,6_to_8,Hispanic or Latino,RRMS
2,gait_vertical_PWS_1_BW-0003_2022_10_24,BW-0003\2022_10_24,gait_vertical_PWS_1,30,26.033333,0.26,4.0,4.53,4.87,1.192,...,61.0,,-7.471505,"CIS (Clinically Isolated Syndrome), with high ...",1.0,mild,,,Asian,"MS, Subtype Not Specified"
3,gait_vertical_PWS_1_BW-0004_2022_09_19,BW-0004\2022_09_19,gait_vertical_PWS_1,30,30.4,0.37,,,,,...,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,RRMS
4,gait_vertical_PWS_1_BW-0006_2022_09_26,BW-0006\2022_09_26,gait_vertical_PWS_1,30,24.4,0.36,2.0,4.63,4.63,1.148,...,0.0,67.0,-0.428669,PPMS (Primary-progressive Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,Progressive MS


In [38]:
print('----- print bw_zv_pws_df video counts ----')
print_video_counts(bw_zv_pws_df)

----- print bw_zv_pws_df video counts ----
total videos - df length: 302
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 44
num videos demographic_diagnosis == MS: 258
------
unique id_video (participants) in df: 206
num participants with demographic_diagnosis == HC: 39
num participants demographic_diagnosis == MS: 167
------
number of participants with multiple videos in dataset: 88
number of participants with one video: 118


In [39]:
# zeno videos - fast walking speed 
# matches each 'gait_vertical_FW_1' video to the brainwalk row with the same id and date;
# the CSV files written by merge_bw_zv go to out_path
bw_zv_fw_df = merge_bw_zv(bw_df, zv_df, 'gait_vertical_FW_1', out_path)

confirm all one task
['gait_vertical_FW_1']
total zeno videos
312
total bw rows with id in video dataset
372
No matching id and daterow from video vs mat
BW-0036
2024-04-02 00:00:00
No matching id and daterow from video vs mat
BW-0322
2024-06-10 00:00:00
mismatched zeno video vs brainwalk id
0
mismatched zeno video vs brainwalk date
0


In [40]:
bw_zv_fw_df.head()

Unnamed: 0,video_id_date_name_pose_zv,id_date_pose_zv,task_pose_zv,frames_per_second_pose_zv,total_video_duration_sec_pose_zv,delta_pix_h_rel_median_pose_zv,walking_segmets_n_pose_zv,walking_segments_duration_mean_pose_zv,walking_segments_duration_median_pose_zv,stride_time_mean_sec_pose_zv,...,demoEHR_EDSS_dateDiff,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean,ms_dx_condensed
0,gait_vertical_FW_1_BW-0002_2022_09_12,BW-0002\2022_09_12,gait_vertical_FW_1,25,28.96,0.28,,,,,...,0.0,66.0,-0.454109,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Hispanic or Latino,RRMS
1,gait_vertical_FW_1_BW-0002_2023_09_12,BW-0002\2023_09_12,gait_vertical_FW_1,30,36.5,0.25,3.0,6.71,7.23,1.015,...,0.0,,-11.391412,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,2.0,6_to_8,Hispanic or Latino,RRMS
2,gait_vertical_FW_1_BW-0003_2022_10_24,BW-0003\2022_10_24,gait_vertical_FW_1,30,28.233333,0.35,1.0,2.33,2.33,,...,61.0,,-7.471505,"CIS (Clinically Isolated Syndrome), with high ...",1.0,mild,,,Asian,"MS, Subtype Not Specified"
3,gait_vertical_FW_1_BW-0004_2022_09_19,BW-0004\2022_09_19,gait_vertical_FW_1,30,25.3,0.36,2.0,3.15,3.15,1.433,...,0.0,67.5,-0.453264,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,RRMS
4,gait_vertical_FW_1_BW-0006_2022_09_26,BW-0006\2022_09_26,gait_vertical_FW_1,30,24.966667,0.27,2.0,4.63,4.63,0.967,...,0.0,67.0,-0.428669,PPMS (Primary-progressive Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,Progressive MS


In [41]:
print('---- print bw_zv_fw_df video counts ----')
print_video_counts(bw_zv_fw_df)

---- print bw_zv_fw_df video counts ----
total videos - df length: 310
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 46
num videos demographic_diagnosis == MS: 264
------
unique id_video (participants) in df: 209
num participants with demographic_diagnosis == HC: 41
num participants demographic_diagnosis == MS: 168
------
number of participants with multiple videos in dataset: 92
number of participants with one video: 117


In [42]:
# home videos - merge home videos and BW 
# one merge per home video task ('gait_vertical_right', then 'gait_vertical_left')
# NOTE: merge_bw_hv writes 'hv_id_in_bw_df.csv' to out_path without the task in the
# file name, so the file from the 'right' call is overwritten by the 'left' call
print('----- right -------') 
bw_hv_r_pws_df = merge_bw_hv(bw_df, hv_df, 'gait_vertical_right', out_path)
print('----- left -------') 
bw_hv_l_pws_df = merge_bw_hv(bw_df, hv_df, 'gait_vertical_left', out_path)

----- right -------
confirm all one task
['gait_vertical_right']
total home videos
37
total bw rows with id in video dataset
72
mismatched home video vs brainwalk id
0
----- left -------
confirm all one task
['gait_vertical_left']
total home videos
36
total bw rows with id in video dataset
70
mismatched home video vs brainwalk id
0


In [43]:
# Stack the right- and left-task home-video merges row-wise, then order the
# combined rows by their index labels.
bw_hv_pws_df = pd.concat((bw_hv_r_pws_df, bw_hv_l_pws_df)).sort_index()

# Store EDSS severity as an ordered categorical (mild < moderate < severe);
# any label outside these three becomes missing.
bw_hv_pws_df['edss_severity_cat'] = bw_hv_pws_df['edss_severity_cat'].astype(
    pd.CategoricalDtype(categories=["mild", "moderate", "severe"], ordered=True)
)

# Summarize how many videos / participants ended up in the combined frame.
print('---- print bw_hv_pws_df video counts ----')
print_video_counts(bw_hv_pws_df)

# Persist the merged (not yet cleaned) frame.
bw_hv_pws_df.to_csv(path_or_buf=os.path.join(out_path, 'hv_bw_merged_raw.csv'))

---- print bw_hv_pws_df video counts ----
total videos - df length: 73
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 73
------
unique id_video (participants) in df: 35
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 35
------
number of participants with multiple videos in dataset: 30
number of participants with one video: 5


In [44]:
# Preview the first rows of the combined home-video frame (rendered as the cell output).
bw_hv_pws_df.head()

Unnamed: 0,video_id_date_name_pose_hv,id_date_pose_hv,task_pose_hv,frames_per_second_pose_hv,total_video_duration_sec_pose_hv,delta_pix_h_rel_median_pose_hv,walking_segmets_n_pose_hv,walking_segments_duration_mean_pose_hv,walking_segments_duration_median_pose_hv,stride_time_mean_sec_pose_hv,...,bingoEHR_Vitals_height(in),demoEHR_Vitals_dateDiff,bingoEHR_first_MS DX,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean,ms_dx_condensed,bw_hv_date_diff_days
0,gait_vertical_left_BW-0018_10-24-23,BW-0018\10-24-23,gait_vertical_left,30,26.8,0.22,6.0,2.88,2.92,1.182,...,67.0,-0.347106,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,RRMS,0
1,gait_vertical_right_BW-0018_10-24-23,BW-0018\10-24-23,gait_vertical_right,30,24.4,0.31,6.0,3.15,3.2,1.185,...,67.0,-0.347106,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,White Not Hispanic,RRMS,0
2,gait_vertical_left_BW-0023_05-05-23,BW-0023\05-05-23,gait_vertical_left,30,35.966667,0.43,5.0,4.43,4.47,1.054,...,66.0,15.552766,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Other/Unknown/Declined,RRMS,9
3,gait_vertical_right_BW-0023_05-05-23,BW-0023\05-05-23,gait_vertical_right,30,38.466667,0.39,6.0,4.79,4.77,1.091,...,66.0,15.552766,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Other/Unknown/Declined,RRMS,9
4,gait_vertical_left_BW-0023_10-23-23,BW-0023\10-23-23,gait_vertical_left,30,36.1,0.3,3.0,4.27,3.93,1.008,...,66.0,-0.560845,RRMS (Relapsing-remitting Multiple Sclerosis),2.0,moderate,1.0,under_6,Other/Unknown/Declined,RRMS,0


## Drop rows with missing brainwalk data 
Rows may still be missing some video metrics after this step; those are filtered within each relevant analysis step.

!!! 2/24/2025 - this step drops healthy controls. I think that might be better and cleaner for writing...

3/4 - not dropping by 

In [45]:
# Columns that must all be present (non-null) for a row to be kept:
# a row missing any of these brainwalk values is dropped.
cols_to_check = (
    # demographic / clinical columns
    [
        'demoEHR_DiseaseDuration',
        'clean_Sex',
        'race_ethnicity_clean',
        'clean_Age',
        'clean_EDSS',
        'clean_T25FW_Avg',
    ]
    # FW_* gait columns
    + [
        'FW_cadencestepsminmean',
        'FW_singlesupportmean',
        'FW_singlesupportratiolr',
        'FW_stridetimeseccv',
        'FW_stridetimesecmean',
        'FW_stridewidthcmmean',
        'FW_stridewidthcmsd',
        'FW_totaldsupportmean',
        'FW_totaldsupportratiolr',
        'FW_velocitycmsecmean',
    ]
    # PWS_* gait columns (same metrics, same order as the FW_* group)
    + [
        'PWS_cadencestepsminmean',
        'PWS_singlesupportmean',
        'PWS_singlesupportratiolr',
        'PWS_stridetimeseccv',
        'PWS_stridetimesecmean',
        'PWS_stridewidthcmmean',
        'PWS_stridewidthcmsd',
        'PWS_totaldsupportmean',
        'PWS_totaldsupportratiolr',
        'PWS_velocitycmsecmean',
    ]
)

In [46]:
# PWS: show group sizes before cleaning, keep only rows with a value in every
# cols_to_check column, save that subset, then show group sizes again.
print(f"Videos in raw data frame: {len(bw_zv_pws_df)}")
print(bw_zv_pws_df.loc[:, 'demographic_diagnosis'].value_counts())

# A row is removed as soon as any one of the checked columns is missing.
bw_zv_pws_clean_df = bw_zv_pws_df.dropna(axis='index', how='any', subset=cols_to_check)
bw_zv_pws_clean_df.to_csv(
    path_or_buf=os.path.join(out_path, 'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
)
print(f"Videos in clean data frame: {len(bw_zv_pws_clean_df)}")
print(bw_zv_pws_clean_df.loc[:, 'demographic_diagnosis'].value_counts())

Videos in raw data frame: 302
demographic_diagnosis
MS    258
HC     44
Name: count, dtype: int64
Videos in clean data frame: 224
demographic_diagnosis
MS    224
Name: count, dtype: int64


In [47]:
# FW: show group sizes before cleaning, keep only rows with a value in every
# cols_to_check column, save that subset, then show the cleaned group sizes.
print(f"Videos in raw data frame: {len(bw_zv_fw_df)}")
print(bw_zv_fw_df.loc[:, 'demographic_diagnosis'].value_counts())

# A row is removed as soon as any one of the checked columns is missing.
bw_zv_fw_clean_df = bw_zv_fw_df.dropna(axis='index', how='any', subset=cols_to_check)
bw_zv_fw_clean_df.to_csv(
    path_or_buf=os.path.join(out_path, 'zv_bw_merged_gait_vertical_FW_1_clean.csv')
)
print(f"Videos in clean data frame: {len(bw_zv_fw_clean_df)}")
# Left as the final bare expression so the notebook renders it as the cell result.
bw_zv_fw_clean_df.loc[:, 'demographic_diagnosis'].value_counts()

Videos in raw data frame: 310
demographic_diagnosis
MS    264
HC     46
Name: count, dtype: int64
Videos in clean data frame: 222


demographic_diagnosis
MS    222
Name: count, dtype: int64

In [48]:
# Home videos: show group sizes before cleaning, keep only rows with a value in
# every cols_to_check column, save that subset, then show the cleaned group sizes.
print(f"Videos in raw data frame: {len(bw_hv_pws_df)}")
print(bw_hv_pws_df.loc[:, 'demographic_diagnosis'].value_counts())

# A row is removed as soon as any one of the checked columns is missing.
bw_hv_pws_clean_df = bw_hv_pws_df.dropna(axis='index', how='any', subset=cols_to_check)
bw_hv_pws_clean_df.to_csv(
    path_or_buf=os.path.join(out_path, 'hv_bw_merged_clean.csv')
)
print(f"Videos in clean data frame: {len(bw_hv_pws_clean_df)}")
# Left as the final bare expression so the notebook renders it as the cell result.
bw_hv_pws_clean_df.loc[:, 'demographic_diagnosis'].value_counts()

Videos in raw data frame: 73
demographic_diagnosis
MS    73
Name: count, dtype: int64
Videos in clean data frame: 65


demographic_diagnosis
MS    65
Name: count, dtype: int64