In [1]:
import os 
import pandas as pd 
import numpy as np

In [2]:
# analysis folder version - used as a path component of this notebook's
# output folder and of the cleaned input CSV folders read below
analysis_version = '007'

## Output Path 

In [3]:
# Output folder for every CSV written by this notebook:
# <project root>/<analysis_version>/004_demographics_feasibility
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version, 
                        '004_demographics_feasibility')

# exist_ok=True creates the folder (and parents) only when missing, so the
# cell can be re-run safely without a separate existence check
os.makedirs(out_path, exist_ok=True)

# Functions 

In [4]:
# split dataframe into healthy controls and participants with MS 
# then select first visit date - maybe not baseline, but first visit with data in that dataset 

def split_MS_HC_first_visit(df): 
    """Return the earliest-visit row per participant, split into MS and HC cohorts.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit-level table with at least the columns 'bw_id' (string IDs),
        'visit_date' (anything pd.to_datetime accepts) and
        'demographic_diagnosis'. The input is not modified.

    Returns
    -------
    (MS_1_df, HC_1_df) : tuple of pandas.DataFrame
        One row per 'bw_id' (its earliest 'visit_date'), for rows whose
        'demographic_diagnosis' is 'MS' and 'HC' respectively. 'bw_id' is the
        first column. Rows with any other diagnosis value are in neither frame.

    Side effects: prints ID counts and diagnosis counts as sanity checks.
    """
    df = df.copy()
    df['visit_date'] = pd.to_datetime(df['visit_date'])
    # strip stray whitespace so the same participant is not counted as two IDs
    df['bw_id'] = df['bw_id'].str.strip()

    print('total unique bw_ids in df')
    print(df['bw_id'].nunique())
    print('--------------') 

    # drop full duplicate rows 
    df = df.drop_duplicates(keep='first')

    # drop duplicated bw_id and visit_date - ex: same fw and pws 
    df = df.drop_duplicates(subset=['bw_id', 'visit_date'], keep='first')

    # first visit - keep the whole row of the earliest date for each ID.
    # drop_duplicates keeps complete rows; GroupBy.first() would instead take
    # the first non-null value of each column, which can mix values from
    # different visits when the earliest visit has missing data.
    df_first_visit = (
        df.dropna(subset=['bw_id'])  # rows without an ID cannot be assigned to a participant
        .sort_values(by=['bw_id', 'visit_date'])
        .drop_duplicates(subset='bw_id', keep='first')
        .reset_index(drop=True)
    )

    # keep bw_id as the leading column of the returned frames
    other_cols = [col for col in df_first_visit.columns if col != 'bw_id']
    df_first_visit = df_first_visit[['bw_id'] + other_cols]

    # check if any duplicates 
    duplicates = df_first_visit['bw_id'].duplicated().sum()
    
    print('any duplicate bw_ids in first visit df?') 
    print(duplicates)
    print('--------------')

    print('df_first_visit demographic diagnosis counts') 
    print(df_first_visit['demographic_diagnosis'].value_counts())
    print('--------------') 
    
    MS_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'MS']
    HC_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'HC'] 

    print('MS and HC rows should match from table above') 
    print('rows in final MS df')
    print(len(MS_1_df))
    print('MS df count bw_id rows with data') 
    print(MS_1_df['bw_id'].count()) 

    print('rows in final HC df')
    print(len(HC_1_df))
    print('HC df count bw_id rows with data') 
    print(HC_1_df['bw_id'].count()) 

    return MS_1_df, HC_1_df

In [5]:
def _counts_and_percents(series):
    """Return (counts, percents) of a column's non-null values; percents are on a 0-100 scale."""
    return series.value_counts(), series.value_counts(normalize=True) * 100


def _format_n_pct(counts, percents, category):
    """Format one category as 'N (P%)'; a category absent from the data gives '0 (0%)'."""
    n = counts.get(category, 0)
    pct = percents.get(category, 0)
    return f"{n} ({pct:.0f}%)"


def demographic_summary(df): 
    """Build the demographics summary table and the data-availability counts for one cohort.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per participant. Must contain the columns 'bw_id', 'clean_Age',
        'clean_Sex', 'clean_EDSS', 'clean_T25FW_Avg', 'race_ethnicity_clean',
        'demoEHR_DiseaseDuration', 'ms_dx_condensed' and 'redcap_event_name'.

    Returns
    -------
    (demographics_summary_df, n_data_df) : tuple of pandas.DataFrame
        Both have the columns 'Metric' and 'Statistic'. The first holds the
        formatted summary statistics, the second the non-null count for each
        variable. Both are empty DataFrames when ``df`` has no rows.

    Raises
    ------
    KeyError
        If ``df`` has rows but lacks any required column; the message lists
        every missing column.

    Notes
    -----
    Percentages use the non-null values of each column as the denominator.
    Categories not present in the data are reported as '0 (0%)'.
    """
    # empty cohort: nothing to summarise
    if len(df) == 0: 
        print('no participants') 
        return pd.DataFrame(), pd.DataFrame()

    # fail early with one message naming every missing column
    required_cols = ['bw_id', 'clean_Age', 'clean_Sex', 'clean_EDSS', 'clean_T25FW_Avg',
                     'race_ethnicity_clean', 'demoEHR_DiseaseDuration', 'ms_dx_condensed',
                     'redcap_event_name']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"demographic_summary: missing required column(s): {missing_cols}")

    n_participants = df['bw_id'].nunique()

    # continuous variables 
    age = df['clean_Age']
    edss = df['clean_EDSS']
    t25fw = df['clean_T25FW_Avg']
    duration = df['demoEHR_DiseaseDuration']
    edss_iqr = edss.quantile(0.75) - edss.quantile(0.25)

    # categorical variables 
    sex_n, sex_freq = _counts_and_percents(df['clean_Sex'])
    race_n, race_freq = _counts_and_percents(df['race_ethnicity_clean'])
    ms_dx_n, ms_dx_freq = _counts_and_percents(df['ms_dx_condensed'])
    # first visit - visit type 
    visit_n, visit_freq = _counts_and_percents(df['redcap_event_name'])

    # (row label, statistic) pairs; np.nan marks a section-header row 
    summary_rows = [
        ('N Participants', n_participants),
        ('Age (Years), Mean (SD)', f"{age.mean():.2f} ({age.std():.2f})"),
        ('Sex (Female), N (%)', _format_n_pct(sex_n, sex_freq, 'Female')),
        ('EDSS, Median (IQR)', f"{edss.median():.1f} ({edss_iqr:.1f})"),
        ('T25FW, Mean (SD)', f"{t25fw.mean():.2f} ({t25fw.std():.2f})"),
        ('Race, N (%)', np.nan),
        ('White Not Hispanic', _format_n_pct(race_n, race_freq, 'White Not Hispanic')),
        ('Hispanic or Latino', _format_n_pct(race_n, race_freq, 'Hispanic or Latino')),
        ('Asian', _format_n_pct(race_n, race_freq, 'Asian')),
        ('Black Or African American', _format_n_pct(race_n, race_freq, 'Black Or African American')),
        ('Other/Unknown/Declined', _format_n_pct(race_n, race_freq, 'Other/Unknown/Declined')),
        # label text (including its unbalanced parenthesis) is unchanged so saved tables keep the same row names
        ('Disease Duration (Years, Mean (SD)', f"{duration.mean():.2f} ({duration.std():.2f})"),
        ('MS Subtype, N (%)', np.nan),
        ('Relapsing-remitting', _format_n_pct(ms_dx_n, ms_dx_freq, 'RRMS')),
        ('Progressive', _format_n_pct(ms_dx_n, ms_dx_freq, 'Progressive MS')),
        ('Subtype Not Specified', _format_n_pct(ms_dx_n, ms_dx_freq, 'MS, Subtype Not Specified')),
        ('Visit Type, N (%)', np.nan),
        ('Baseline', _format_n_pct(visit_n, visit_freq, 'Brainwalk: Baseline visit (Arm 1: Baseline visit)')),
        ('Year 2', _format_n_pct(visit_n, visit_freq, 'Year 2 Visit (Arm 1: Baseline visit)')),
        ('Year 3', _format_n_pct(visit_n, visit_freq, 'Year 3 Visit (Arm 1: Baseline visit)')),
    ]
    demographics_summary_df = pd.DataFrame({
        'Metric': [label for label, _ in summary_rows],
        'Statistic': [value for _, value in summary_rows],
    })

    # counts of participants with demographic data (non-null values per variable) 
    count_rows = [
        ('N', n_participants),
        ('n with age data', age.count()),
        ('n with sex data', df['clean_Sex'].count()),
        ('n with edss data', edss.count()),
        ('n with t25fw data', t25fw.count()),
        # label kept as-is; the value is the non-null count of 'race_ethnicity_clean'
        ('n with demoEHR_REC_2 data', df['race_ethnicity_clean'].count()),
        ('n with duration data', duration.count()),
        ('n with ms subtype data', df['ms_dx_condensed'].count()),
    ]
    n_data_df = pd.DataFrame({
        'Metric': [label for label, _ in count_rows],
        'Statistic': [value for _, value in count_rows],
    })

    return demographics_summary_df, n_data_df

# Demographic Groups 

For each group 
- pulling demographic data from first brainwalk visit if participant has multiple visits 
- save .csv file for each group
- save summary table as .csv file 

Zeno 
1. All participants with videos: MS + HC
2. All participants with included videos: MS + HC
3. Participants with included PWS videos: MS + HC – is there a diff between groups?
4. Participants with included FW videos: MS + HC – is there a diff between groups?

Home Videos 
1. all BW participants with MS (TBD, maybe not all approached) 
2. All BW participants consented to home vids
3. All BW participants who sent home vids
4. All BW participants who sent usable/included home vids  

In [6]:
# Columns that must all be non-missing for a row to be kept
# (passed as the `subset` of dropna when cleaning the BrainWalk data)
cols_to_check = [
    # demographic / clinical
    'demoEHR_DiseaseDuration',
    'clean_Sex',
    'clean_Race',
    'clean_Ethnicity',
    'clean_Age',
    'bingoEHR_DX_MS DX',
    'clean_EDSS',
    'clean_T25FW_Avg',
    # FW gait metrics
    'FW_cadencestepsminmean',
    'FW_singlesupportmean',
    'FW_singlesupportratiolr',
    'FW_stridetimeseccv',
    'FW_stridetimesecmean',
    'FW_stridewidthcmmean',
    'FW_stridewidthcmsd',
    'FW_totaldsupportmean',
    'FW_totaldsupportratiolr',
    'FW_velocitycmsecmean',
    # PWS gait metrics
    'PWS_cadencestepsminmean',
    'PWS_singlesupportmean',
    'PWS_singlesupportratiolr',
    'PWS_stridetimeseccv',
    'PWS_stridetimesecmean',
    'PWS_stridewidthcmmean',
    'PWS_stridewidthcmsd',
    'PWS_totaldsupportmean',
    'PWS_totaldsupportratiolr',
    'PWS_velocitycmsecmean',
    # height and diagnosis group
    'bingoEHR_Vitals_height(in)',
    'demographic_diagnosis',
]

# All BW participants w demographic data and walking tasks 
- Just MS participants
- Need to have all demographic data - drop rows missing any column in `cols_to_check`

In [7]:
# all Brainwalk data - read only the columns listed in bw_usecols

bw_path = r'C:\Users\mmccu\AppData\Local\Temp\ccsecure\2025_03_26_BrainWalk_AllData_Long_MM.xlsx'

bw_usecols = [
    # identifiers and visit info
    'bw_id', 'trialdate', 'visit_date', 'redcap_event_name',
    # demographics and clinical measures
    'demoEHR_DiseaseDuration', 'clean_Sex', 'clean_Age', 'demographic_diagnosis',
    'bingoEHR_DX_MS DX', 'bingoEHR_first_MS DX', 'clean_Race', 'clean_Ethnicity',
    'bingoEHR_Vitals_height(in)', 'clean_EDSS', 'clean_T25FW_Avg',
    # PWS gait metrics
    'PWS_stridetimesecmean', 'PWS_stridetimeseccv', 'PWS_cadencestepsminmean',
    'PWS_totaldsupportmean', 'PWS_singlesupportmean', 'PWS_totaldsupportratiolr',
    'PWS_singlesupportratiolr', 'PWS_stridewidthcmmean', 'PWS_stridewidthcmsd',
    'PWS_velocitycmsecmean',
    # FW gait metrics
    'FW_stridetimesecmean', 'FW_stridetimeseccv', 'FW_cadencestepsminmean',
    'FW_totaldsupportmean', 'FW_singlesupportmean', 'FW_totaldsupportratiolr',
    'FW_singlesupportratiolr', 'FW_stridewidthcmmean', 'FW_stridewidthcmsd',
    'FW_velocitycmsecmean',
    # date-difference columns
    'demoGait_dateDiff', 'msfcEHR_T25FW SPEED TRIAL 1 vDate Diff',
    'msfcEHR_T25FW SPEED TRIAL 2 vDate Diff', 'demoEHR_DX_dateDiff',
    'demoEHR_EDSS_dateDiff', 'demoEHR_Vitals_dateDiff',
]

bw_df = pd.read_excel(bw_path, index_col=None, usecols=bw_usecols)

In [8]:
# select only participants with MS, then drop rows with any missing demographic or gait data.
# The two steps are chained so the dropna runs on the MS-filtered rows
# (calling dropna on bw_df directly would discard the MS filter).
bw_ms_clean_df = (
    bw_df.loc[bw_df['demographic_diagnosis'] == 'MS']
    .dropna(subset=cols_to_check)
)
print('------ clean ----')
print(bw_ms_clean_df['demographic_diagnosis'].value_counts())

------ clean ----
demographic_diagnosis
MS    259
Name: count, dtype: int64


In [9]:
# All BW Dataset 
# NOTE(review): the recorded output of this cell ends in KeyError: 'race_ethnicity_clean'.
# demographic_summary reads the columns 'race_ethnicity_clean' and 'ms_dx_condensed',
# and neither is in the usecols list used to load bw_df, so they need to be
# added/derived on bw_ms_clean_df before this call - TODO confirm where they are created.
clean_bw_ms_first_visit, test_HC = split_MS_HC_first_visit(bw_ms_clean_df)
clean_bw_ms_dem, clean_bw_ms_counts = demographic_summary(clean_bw_ms_first_visit)
clean_bw_ms_first_visit.to_csv(os.path.join(out_path, 'clean_ms_all_first_visit.csv')) 
clean_bw_ms_dem.to_csv(os.path.join(out_path, 'clean_bw_ms_all_dem.csv'))

total unique bw_ids in df
166
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    166
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
166
MS df count bw_id rows with data
166
rows in final HC df
0
HC df count bw_id rows with data
0


KeyError: 'race_ethnicity_clean'

# Zeno Videos 

### All videos in clean df 
PWS vs FW below 

In [10]:
# clean dataframes contain all videos processed with BW data  

# PWS 
# folder and file name are passed as separate path components so that no
# backslash sits inside a non-raw string literal (avoids an invalid escape sequence)
zv_pws_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                    analysis_version, 
                                    '000_merged_cleaned_data',
                                    'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col = 0)
print(zv_pws_bw_clean_df['demographic_diagnosis'].value_counts())

demographic_diagnosis
MS    224
Name: count, dtype: int64


In [11]:
# PWS Demographics - All Videos in clean dataset 
# first-visit row per participant, then the summary table and data counts
clean_zv_pws_first_visit, test_HC = split_MS_HC_first_visit(zv_pws_bw_clean_df)
clean_zv_pws_dem, clean_zv_pws_counts = demographic_summary(clean_zv_pws_first_visit)

# save the first-visit cohort and its summary table to the output folder
for frame, file_name in [(clean_zv_pws_first_visit, 'clean_zv_pws_first_visit.csv'),
                         (clean_zv_pws_dem, 'clean_zv_pws_dem.csv')]:
    frame.to_csv(os.path.join(out_path, file_name))

total unique bw_ids in df
154
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    154
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
154
MS df count bw_id rows with data
154
rows in final HC df
0
HC df count bw_id rows with data
0


In [12]:
clean_zv_pws_dem

Unnamed: 0,Metric,Statistic
0,N Participants,154
1,"Age (Years), Mean (SD)",48.17 (12.95)
2,"Sex (Female), N (%)",115 (75%)
3,"EDSS, Median (IQR)",2.5 (2.5)
4,"T25FW, Mean (SD)",5.88 (4.61)
5,"Race, N (%)",
6,White Not Hispanic,105 (68%)
7,Hispanic or Latino,15 (10%)
8,Asian,11 (7%)
9,Black Or African American,11 (7%)


In [13]:
clean_zv_pws_counts

Unnamed: 0,Metric,Statistic
0,N,154
1,n with age data,154
2,n with sex data,154
3,n with edss data,154
4,n with t25fw data,154
5,n with demoEHR_REC_2 data,154
6,n with duration data,154
7,n with ms subtype data,154


In [14]:
# FW 
# folder and file name are passed as separate path components so that no
# backslash sits inside a non-raw string literal (avoids an invalid escape sequence)
zv_fw_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                   analysis_version, 
                                   '000_merged_cleaned_data',
                                   'zv_bw_merged_gait_vertical_FW_1_clean.csv') 
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col = 0) 
print(zv_fw_bw_clean_df['demographic_diagnosis'].value_counts())

demographic_diagnosis
MS    222
Name: count, dtype: int64


In [16]:
# FW Demographics - All Videos in clean dataset 
# first-visit row per participant, then the summary table and data counts
clean_zv_fw_first_visit, test_HC = split_MS_HC_first_visit(zv_fw_bw_clean_df)
clean_zv_fw_dem, clean_zv_fw_counts = demographic_summary(clean_zv_fw_first_visit)

# save the first-visit cohort and its summary table to the output folder
for frame, file_name in [(clean_zv_fw_first_visit, 'clean_zv_fw_first_visit.csv'),
                         (clean_zv_fw_dem, 'clean_zv_fw_dem.csv')]:
    frame.to_csv(os.path.join(out_path, file_name))

total unique bw_ids in df
154
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    154
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
154
MS df count bw_id rows with data
154
rows in final HC df
0
HC df count bw_id rows with data
0


In [17]:
clean_zv_fw_dem

Unnamed: 0,Metric,Statistic
0,N Participants,154
1,"Age (Years), Mean (SD)",48.17 (12.94)
2,"Sex (Female), N (%)",115 (75%)
3,"EDSS, Median (IQR)",2.5 (2.5)
4,"T25FW, Mean (SD)",5.89 (4.61)
5,"Race, N (%)",
6,White Not Hispanic,105 (68%)
7,Hispanic or Latino,15 (10%)
8,Asian,11 (7%)
9,Black Or African American,11 (7%)


In [18]:
clean_zv_fw_counts

Unnamed: 0,Metric,Statistic
0,N,154
1,n with age data,154
2,n with sex data,154
3,n with edss data,154
4,n with t25fw data,154
5,n with demoEHR_REC_2 data,154
6,n with duration data,154
7,n with ms subtype data,154


### All videos w gait metrics + demographic data and walking tasks 
old "included" : linear walking segment identified. 
PWS vs FW  - split videos with and without segment number 

# Home Videos 

### All Participants who consented to home videos 
See old demographics table script

### All videos in clean df 

In [19]:
# Home Videos 
# folder and file name are passed as separate path components so that no
# backslash sits inside a non-raw string literal (avoids an invalid escape sequence)
hv_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                analysis_version, 
                                '000_merged_cleaned_data',
                                'hv_bw_merged_clean.csv') 

hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0) 

In [20]:
# Home Demographics - All Videos in clean dataset 
# first-visit row per participant, then the summary table and data counts
clean_home_first_visit, test_HC = split_MS_HC_first_visit(hv_bw_clean_df)
clean_home_dem, clean_home_counts = demographic_summary(clean_home_first_visit)

# save the first-visit cohort and its summary table to the output folder
for frame, file_name in [(clean_home_first_visit, 'clean_home_first_visit.csv'),
                         (clean_home_dem, 'clean_home_dem.csv')]:
    frame.to_csv(os.path.join(out_path, file_name))

total unique bw_ids in df
32
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    32
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
32
MS df count bw_id rows with data
32
rows in final HC df
0
HC df count bw_id rows with data
0


In [21]:
clean_home_dem

Unnamed: 0,Metric,Statistic
0,N Participants,32
1,"Age (Years), Mean (SD)",50.22 (12.77)
2,"Sex (Female), N (%)",25 (78%)
3,"EDSS, Median (IQR)",3.0 (3.6)
4,"T25FW, Mean (SD)",5.85 (3.13)
5,"Race, N (%)",
6,White Not Hispanic,25 (78%)
7,Hispanic or Latino,1 (3%)
8,Asian,2 (6%)
9,Black Or African American,1 (3%)


In [22]:
clean_home_counts

Unnamed: 0,Metric,Statistic
0,N,32
1,n with age data,32
2,n with sex data,32
3,n with edss data,32
4,n with t25fw data,32
5,n with demoEHR_REC_2 data,32
6,n with duration data,32
7,n with ms subtype data,32
