In [1]:
import os 
import pandas as pd 
import numpy as np

In [2]:
# Analysis folder version: the versioned sub-folder name that out_path and the
# BrainWalk / Zeno input paths below are joined under.
analysis_version = '007'

## Output Path 

In [3]:
# Versioned output folder that receives every CSV written by this notebook.
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version, 
                        '004_demographics_feasibility')

# exist_ok=True makes folder creation idempotent and removes the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(out_path, exist_ok=True)

# Functions 

In [4]:
# Split a dataframe into participants with MS and healthy controls (HC),
# keeping each participant's first visit - maybe not baseline, but the first
# visit with data in that dataset.

def split_MS_HC_first_visit(df, visit_date_col): 
    """Return the earliest-visit row per participant, split into MS and HC.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'bw_id' (strings), 'demographic_diagnosis' and
        `visit_date_col`. The input frame is not modified.
    visit_date_col : str
        Name of the column holding the visit date; its values must be
        parseable by `pd.to_datetime`.

    Returns
    -------
    (MS_1_df, HC_1_df) : tuple of pandas.DataFrame
        One row per 'bw_id' (that participant's earliest visit) for rows whose
        'demographic_diagnosis' is 'MS' and 'HC' respectively. 'bw_id' is the
        first column and the parsed date is stored in 'visit_date_video'.

    Diagnostic counts are printed along the way.
    """
    df = df.copy()
    df['visit_date_video'] = pd.to_datetime(df[visit_date_col])
    df['bw_id'] = df['bw_id'].str.strip()

    print('total unique bw_ids in df')
    print(df['bw_id'].nunique())
    print('--------------') 

    # drop full duplicate rows 
    df = df.drop_duplicates(keep='first')

    # drop duplicated bw_id and visit_date - ex: same fw and pws 
    df = df.drop_duplicates(subset=['bw_id', visit_date_col], keep='first')

    # first visit - keep the earliest date for each ID.
    # Sort on the parsed datetime column ('visit_date_video'), not the raw
    # column, so non-ISO date strings are ordered chronologically.
    # Keep the whole first ROW per ID: groupby().first() would instead return
    # the first non-null value of every column and could blend later visits
    # into the "first visit" record.
    df_first_visit = (
        df.dropna(subset=['bw_id'])  # rows without an ID cannot be assigned to a participant
        .sort_values(by=['bw_id', 'visit_date_video'])
        .drop_duplicates(subset='bw_id', keep='first')
        .reset_index(drop=True)
    )
    # keep 'bw_id' as the leading column (same layout as the previous output)
    other_cols = [col for col in df_first_visit.columns if col != 'bw_id']
    df_first_visit = df_first_visit[['bw_id'] + other_cols]

    # check if any duplicates 
    duplicates = df_first_visit['bw_id'].duplicated().sum()
    
    print('any duplicate bw_ids in first visit df?') 
    print(duplicates)
    print('--------------')

    print('df_first_visit demographic diagnosis counts') 
    print(df_first_visit['demographic_diagnosis'].value_counts())
    print('--------------') 
    
    MS_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'MS']
    HC_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'HC'] 

    print('MS and HC rows should match from table above') 
    print('rows in final MS df')
    print(len(MS_1_df))
    print('MS df count bw_id rows with data') 
    print(MS_1_df['bw_id'].count()) 

    print('rows in final HC df')
    print(len(HC_1_df))
    print('HC df count bw_id rows with data') 
    print(HC_1_df['bw_id'].count()) 

    return MS_1_df, HC_1_df

In [5]:
def _counts_and_percents(series):
    """Return (counts, percents) per category of `series`; missing values are excluded."""
    return series.value_counts(), series.value_counts(normalize=True) * 100


def _n_and_percent(counts, percents, label):
    """Return (n, percent) for `label`, or (0, 0) when that category is absent."""
    return counts.get(label, 0), percents.get(label, 0)


def demographic_summary(df): 
    """Build the demographic summary table and the data-availability table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per participant. Must contain 'bw_id', 'clean_Age',
        'clean_Sex', 'clean_EDSS', 'clean_T25FW_Avg', 'race_ethnicity_clean',
        'demoEHR_DiseaseDuration', 'ms_dx_condensed', 'redcap_event_name' and
        'tc_Examinee_Education'.

    Returns
    -------
    (demographics_summary_df, n_data_df) : tuple of pandas.DataFrame
        Both have 'Metric' and 'Statistic' columns. The first holds formatted
        summary statistics, the second the non-missing count per field.
        Two empty frames are returned (and 'no participants' is printed) when
        `df` has no rows.

    Percentages are relative to rows with a non-missing value for that field.
    A category that does not occur is reported as "0 (0%)".
    """
    if len(df) == 0: 
        print('no participants') 
        return pd.DataFrame(), pd.DataFrame()

    # age 
    age_count = df['clean_Age'].count()
    age_mean = df['clean_Age'].mean()
    age_sd = df['clean_Age'].std()

    # sex - guarded like the other categories so a cohort without any
    # 'Female' entry yields "0 (0%)" instead of raising KeyError
    sex_count = df['clean_Sex'].count()
    sex_n, sex_freqs = _counts_and_percents(df['clean_Sex'])
    sex_n_female, sex_freq_female = _n_and_percent(sex_n, sex_freqs, 'Female')

    # edss 
    edss_count = df['clean_EDSS'].count()
    edss_median = df['clean_EDSS'].median()
    edss_iqr = df['clean_EDSS'].quantile(0.75) - df['clean_EDSS'].quantile(0.25)

    # t25fw 
    t25fw_count = df['clean_T25FW_Avg'].count()
    t25fw_mean = df['clean_T25FW_Avg'].mean()
    t25fw_sd = df['clean_T25FW_Avg'].std()

    # race - only the categories below are reported in the table
    race_count = df['race_ethnicity_clean'].count()
    race_n, race_freq = _counts_and_percents(df['race_ethnicity_clean'])
    white_n, white_freq = _n_and_percent(race_n, race_freq, 'White Non Hispanic')
    black_n, black_freq = _n_and_percent(race_n, race_freq, 'Black Or African American')
    asian_n, asian_freq = _n_and_percent(race_n, race_freq, 'Asian')
    hispanic_n, hispanic_freq = _n_and_percent(race_n, race_freq, 'Hispanic or Latino')
    decl_n, decl_freq = _n_and_percent(race_n, race_freq, 'Other/Unknown/Declined')

    # disease duration 
    duration_count = df['demoEHR_DiseaseDuration'].count()
    duration_mean = df['demoEHR_DiseaseDuration'].mean()
    duration_sd = df['demoEHR_DiseaseDuration'].std()

    # MS subtype (condensed categories)
    ms_dx_count = df['ms_dx_condensed'].count()
    ms_dx_n, ms_dx_freq = _counts_and_percents(df['ms_dx_condensed'])
    rrms_n, rrms_freq = _n_and_percent(ms_dx_n, ms_dx_freq, 'RRMS')
    pms_n, pms_freq = _n_and_percent(ms_dx_n, ms_dx_freq, 'Progressive MS')
    sns_n, sns_freq = _n_and_percent(ms_dx_n, ms_dx_freq, 'MS, Subtype Not Specified')

    # first visit - visit type. The baseline lookup is guarded too, so a
    # cohort whose first visits are all follow-ups no longer raises KeyError.
    visit_n, visit_freq = _counts_and_percents(df['redcap_event_name'])
    base_n, base_freq = _n_and_percent(
        visit_n, visit_freq, 'Brainwalk: Baseline visit (Arm 1: Baseline visit)')
    # year 2
    yr2_n, yr2_freq = _n_and_percent(
        visit_n, visit_freq, 'Year 2 Visit (Arm 1: Baseline visit)')
    # year 3
    yr3_n, yr3_freq = _n_and_percent(
        visit_n, visit_freq, 'Year 3 Visit (Arm 1: Baseline visit)')

    # education 
    edu_count = df['tc_Examinee_Education'].count()
    edu_mean = df['tc_Examinee_Education'].mean()
    edu_sd = df['tc_Examinee_Education'].std()

    # summary data - section header rows ('Race, N (%)' etc.) carry NaN.
    # Metric labels are kept verbatim so exported tables stay comparable with
    # earlier runs.
    summary_data = {'Metric': ['N Participants', 
                               'Age (Years), Mean (SD)',
                               'Sex (Female), N (%)',
                               'EDSS, Median (IQR)', 
                               'T25FW, Mean (SD)',
                               'Race, N (%)',
                               'White Not Hispanic',
                               'Hispanic or Latino',
                               'Asian',
                               'Black Or African American',
                               'Other/Unknown/Declined',
                               'Disease Duration (Years, Mean (SD)',
                               'MS Subtype, N (%)',
                               'Relapsing-remitting',
                               'Progressive',
                               'Subtype Not Specified',
                               'Visit Type, N (%)',
                               'Baseline', 
                               'Year 2', 
                               'Year 3',
                               'Education (Years), Mean (SD)'],
                    'Statistic': [df['bw_id'].nunique(),
                                  f"{age_mean:.2f} ({age_sd:.2f})",  # Mean (SD)
                                  f"{sex_n_female} ({sex_freq_female:.0f}%)",
                                  f"{edss_median:.1f} ({edss_iqr:.1f})", 
                                  f"{t25fw_mean:.2f} ({t25fw_sd:.2f})",
                                  np.nan, 
                                  f"{white_n} ({white_freq:.0f}%)",
                                  f"{hispanic_n} ({hispanic_freq:.0f}%)",
                                  f"{asian_n} ({asian_freq:.0f}%)",
                                  f"{black_n} ({black_freq:.0f}%)",
                                  f"{decl_n} ({decl_freq:.0f}%)",
                                  f"{duration_mean:.2f} ({duration_sd:.2f})",  # Mean (SD)
                                  np.nan,
                                  f"{rrms_n} ({rrms_freq:.0f}%)",
                                  f"{pms_n} ({pms_freq:.0f}%)",
                                  f"{sns_n} ({sns_freq:.0f}%)", 
                                  np.nan, 
                                  f"{base_n} ({base_freq:.0f}%)",
                                  f"{yr2_n} ({yr2_freq:.0f}%)",
                                  f"{yr3_n} ({yr3_freq:.0f}%)",
                                  f"{edu_mean:.2f} ({edu_sd:.2f})"
                                  ]
                    }

    demographics_summary_df = pd.DataFrame(summary_data)

    # counts of participants with demographic data 
    # ('n with demoEHR_REC_2 data' reports the race_ethnicity_clean count)
    n_data = {'Metric': ['N', 
                         'n with age data',
                         'n with sex data',
                         'n with edss data',
                         'n with t25fw data',
                         'n with demoEHR_REC_2 data',
                         'n with duration data',
                         'n with ms subtype data',
                         'n with education data'],
              'Statistic': [df['bw_id'].nunique(),
                            age_count, 
                            sex_count, 
                            edss_count,
                            t25fw_count, 
                            race_count,
                            duration_count,
                            ms_dx_count,
                            edu_count
                            ]}

    n_data_df = pd.DataFrame(n_data) 

    return demographics_summary_df, n_data_df

# Demographic Groups 

For each group 
- pulling demographic data from first brainwalk visit if participant has multiple visits 
- save .csv file for each group
- save summary table as .csv file 

Zeno 
1. All participants with videos: MS + HC
2. All participants with included videos: MS + HC
3. Participants with included PWS videos: MS + HC – is there a diff between groups?
4. Participants with included FW videos: MS + HC – is there a diff between groups?

Home Videos 
1. all BW participants with MS (TBD, maybe not all approached) 
2. All BW participants consented to home vids
3. All BW participants who sent home vids
4. All BW participants who sent usable/included home vids  

# All BW participants w demographic data and walking tasks 
- Just MS participants
- Need to have all demographic data - drop if missing any 'cols_to_check'

In [6]:
# All BW data (merged long-format export).
# Folder and file name are passed as separate components: the previous single
# non-raw string contained the invalid escape sequence '\m'.
bw_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                       analysis_version, 
                       '000_merged_cleaned_data',
                       'mergedCols_2025_04_17_BrainWalk_AllData_Long_MM.csv')

# the first CSV column is used as the index
bw_df = pd.read_csv(bw_path, index_col = 0)

In [7]:
# All BW dataset: first visit per MS participant, then its summary table
bw_ms_first_visit, test_HC = split_MS_HC_first_visit(df=bw_df, visit_date_col='visit_date')
bw_ms_dem, bw_ms_counts = demographic_summary(df=bw_ms_first_visit)

# export participant-level rows and the summary table
bw_ms_first_visit.to_csv(path_or_buf=os.path.join(out_path, 'bw_ms_first_visit_all.csv'))
bw_ms_dem.to_csv(path_or_buf=os.path.join(out_path, 'bw_ms_dem_all.csv'))

# display the summary table
bw_ms_dem

total unique bw_ids in df
491
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS                                                                                 312
HC                                                                                  56
PD                                                                                  19
Mild TBI                                                                            18
bvFTD                                                                               15
nfvPPA                                                                              10
CBS                                                                                  5
Mild complicated TBI                                                                 3
svPPA                                                                                3
Pending                                          

Unnamed: 0,Metric,Statistic
0,N Participants,312
1,"Age (Years), Mean (SD)",53.01 (13.17)
2,"Sex (Female), N (%)",215 (70%)
3,"EDSS, Median (IQR)",2.5 (2.0)
4,"T25FW, Mean (SD)",5.74 (3.84)
5,"Race, N (%)",
6,White Not Hispanic,212 (68%)
7,Hispanic or Latino,24 (8%)
8,Asian,14 (4%)
9,Black Or African American,14 (4%)


In [8]:
# Clean BW data - has all fields used in the analysis within range.
# Folder and file name are passed as separate components: the previous single
# non-raw string contained the invalid escape sequence '\c'.
bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                            analysis_version, 
                            '000_merged_cleaned_data',
                            'clean_mergedCols_2025_04_17_BrainWalk_AllData_Long_MM.csv')

# the first CSV column is used as the index
bw_clean_df = pd.read_csv(bw_clean_path, index_col = 0)

In [9]:
# Clean BW dataset: first visit per MS participant, then its summary table
clean_bw_ms_first_visit, test_HC = split_MS_HC_first_visit(df=bw_clean_df, visit_date_col='visit_date')
clean_bw_ms_dem, clean_bw_ms_counts = demographic_summary(df=clean_bw_ms_first_visit)

# export participant-level rows and the summary table
clean_bw_ms_first_visit.to_csv(path_or_buf=os.path.join(out_path, 'bw_ms_first_visit_clean.csv'))
clean_bw_ms_dem.to_csv(path_or_buf=os.path.join(out_path, 'bw_ms_dem_clean.csv'))

# display the summary table
clean_bw_ms_dem

total unique bw_ids in df
158
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    158
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
158
MS df count bw_id rows with data
158
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,158
1,"Age (Years), Mean (SD)",48.43 (13.13)
2,"Sex (Female), N (%)",114 (72%)
3,"EDSS, Median (IQR)",2.5 (2.5)
4,"T25FW, Mean (SD)",5.91 (4.65)
5,"Race, N (%)",
6,White Not Hispanic,105 (66%)
7,Hispanic or Latino,18 (11%)
8,Asian,10 (6%)
9,Black Or African American,9 (6%)


# Zeno Videos 
Videos paired with BW dataset 

### PWS 

In [10]:
# clean dataframes contain all videos processed with BW data  

# PWS 
# Folder and file name are passed as separate components: the previous single
# non-raw string contained the invalid escape sequence '\z'.
zv_pws_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                    analysis_version, 
                                    '000_merged_cleaned_data',
                                    'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col = 0)
# diagnosis counts across all rows (before reducing to first visits)
print(zv_pws_bw_clean_df['demographic_diagnosis'].value_counts())

demographic_diagnosis
MS    208
Name: count, dtype: int64


In [11]:
# PWS demographics - all videos in clean dataset, reduced to first visit per MS participant
clean_zv_pws_first_visit, test_HC = split_MS_HC_first_visit(df=zv_pws_bw_clean_df,
                                                            visit_date_col='visit_date_video')
clean_zv_pws_dem, clean_zv_pws_counts = demographic_summary(df=clean_zv_pws_first_visit)

# export participant-level rows and the summary table
clean_zv_pws_first_visit.to_csv(path_or_buf=os.path.join(out_path, 'clean_zv_pws_first_visit.csv'))
clean_zv_pws_dem.to_csv(path_or_buf=os.path.join(out_path, 'clean_zv_pws_dem.csv'))

total unique bw_ids in df
142
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    142
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
142
MS df count bw_id rows with data
142
rows in final HC df
0
HC df count bw_id rows with data
0


In [12]:
# Display the demographic summary table for the clean Zeno PWS video dataset
clean_zv_pws_dem

Unnamed: 0,Metric,Statistic
0,N Participants,142
1,"Age (Years), Mean (SD)",48.14 (12.88)
2,"Sex (Female), N (%)",104 (73%)
3,"EDSS, Median (IQR)",2.5 (2.4)
4,"T25FW, Mean (SD)",5.82 (4.72)
5,"Race, N (%)",
6,White Not Hispanic,96 (68%)
7,Hispanic or Latino,14 (10%)
8,Asian,9 (6%)
9,Black Or African American,9 (6%)


### Fast walk 

In [13]:
# FW 
# Folder and file name are passed as separate components: the previous single
# non-raw string contained the invalid escape sequence '\z'.
zv_fw_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                   analysis_version, 
                                   '000_merged_cleaned_data',
                                   'zv_bw_merged_gait_vertical_FW_1_clean.csv') 
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col = 0) 
# diagnosis counts across all rows (before reducing to first visits)
print(zv_fw_bw_clean_df['demographic_diagnosis'].value_counts())

demographic_diagnosis
MS    208
Name: count, dtype: int64


In [14]:
# FW demographics - all videos in clean dataset, reduced to first visit per MS participant
clean_zv_fw_first_visit, test_HC = split_MS_HC_first_visit(df=zv_fw_bw_clean_df,
                                                           visit_date_col='visit_date_video')
clean_zv_fw_dem, clean_zv_fw_counts = demographic_summary(df=clean_zv_fw_first_visit)

# export participant-level rows and the summary table
clean_zv_fw_first_visit.to_csv(path_or_buf=os.path.join(out_path, 'clean_zv_fw_first_visit.csv'))
clean_zv_fw_dem.to_csv(path_or_buf=os.path.join(out_path, 'clean_zv_fw_dem.csv'))

total unique bw_ids in df
142
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    142
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
142
MS df count bw_id rows with data
142
rows in final HC df
0
HC df count bw_id rows with data
0


In [15]:
# Display the demographic summary table for the clean Zeno FW video dataset
clean_zv_fw_dem

Unnamed: 0,Metric,Statistic
0,N Participants,142
1,"Age (Years), Mean (SD)",48.12 (12.90)
2,"Sex (Female), N (%)",104 (73%)
3,"EDSS, Median (IQR)",2.5 (2.4)
4,"T25FW, Mean (SD)",5.82 (4.73)
5,"Race, N (%)",
6,White Not Hispanic,96 (68%)
7,Hispanic or Latino,14 (10%)
8,Asian,9 (6%)
9,Black Or African American,9 (6%)


# Home Videos  

## consent to home videos 

### All Participants who consented to home videos 
See old demographics table script

In [16]:
# REDCap report paths - participants consented to home vids
redcap_reports_path = r'C:\Users\mmccu\Box\Brainwalk\Home Video Walking\Megan Project\bw_data_and_code\home_video_feasibility\2025_04_28 RedCap Reports'

# One report per visit (baseline, year 2, year 3) and consent form version
# (v1, v2); the files are named homevid_<visit>_consent_<version>.csv.
(consent_base_v1_path, consent_base_v2_path,
 consent_y2_v1_path, consent_y2_v2_path,
 consent_y3_v1_path, consent_y3_v2_path) = (
    os.path.join(redcap_reports_path, f'homevid_{visit}_consent_{version}.csv')
    for visit in ('baseline', 'year2', 'year3')
    for version in ('v1', 'v2')
)


In [17]:
# participants consented to home vids - format and merge dataframes 

def _read_consent_report(csv_path, consent_version):
    """Read one REDCap consent report and bring it to the v1 column layout.

    Adds a 'consent_version' column, renames 'falls_visit_date' to
    'visit_date' (and, for version 2, the '*_v2' consent columns to their v1
    names), then parses 'visit_date' with unparseable values becoming NaT.
    """
    # v1 col names:
    # record_id	redcap_event_name	bw_id	falls_visit_date	walking_consent_date	walking_consent_sig
    column_map = {'falls_visit_date' : 'visit_date'}
    if consent_version == 2:
        column_map['walking_consent_date_v2'] = 'walking_consent_date'
        column_map['walking_consent_sig_v2'] = 'walking_consent_sig'

    report_df = pd.read_csv(csv_path)
    report_df['consent_version'] = consent_version
    report_df = report_df.rename(columns = column_map)
    report_df['visit_date'] = pd.to_datetime(report_df['visit_date'], errors = 'coerce')
    return report_df


consent_base_v1_df = _read_consent_report(consent_base_v1_path, 1)
consent_base_v2_df = _read_consent_report(consent_base_v2_path, 2)
consent_y2_v1_df = _read_consent_report(consent_y2_v1_path, 1)
consent_y2_v2_df = _read_consent_report(consent_y2_v2_path, 2)
consent_y3_v1_df = _read_consent_report(consent_y3_v1_path, 1)
consent_y3_v2_df = _read_consent_report(consent_y3_v2_path, 2)

# concatenate (original row indices of each report are kept)
consent_all_df = pd.concat([consent_base_v1_df,
                            consent_base_v2_df,
                            consent_y2_v1_df,
                            consent_y2_v2_df,
                            consent_y3_v1_df,
                            consent_y3_v2_df])

In [18]:
# Rename the consent-side event column so it does not collide with the BW
# column of the same name in the merge, then drop rows missing an ID
# (error in report, duplicated with other row, maybe signed twice?).
consent_all_df = (
    consent_all_df
    .rename(columns={'redcap_event_name': 'redcap_event_name_1'})
    .dropna(subset=['bw_id'])
)

consent_all_df.to_csv(os.path.join(out_path, 'home_video_consent_df_all.csv'))

In [19]:
# convert bw dates to datetime so they can be matched against the consent
# visit dates; values that cannot be parsed become NaT
for _bw_frame in (bw_df, bw_clean_df):
    _bw_frame['visit_date'] = pd.to_datetime(_bw_frame['visit_date'], errors = 'coerce')

In [20]:
# merge bw_df with consented: every consent row is kept (left join) and BW
# columns are attached where participant ID and visit date both match
consent_all_w_bw_df = pd.merge(consent_all_df, bw_df,
                               how = 'left',
                               on = ['bw_id', 'visit_date'])
consent_all_w_bw_df.to_csv(os.path.join(out_path, 'home_video_consent_df_merged_w_bw_all.csv'))

In [21]:
# number of participants who consented at one visit, vs at multiple visits 
rows_per_participant = consent_all_w_bw_df['bw_id'].value_counts()
print((rows_per_participant == 1).sum())   # IDs appearing in exactly one row
print((rows_per_participant > 1).sum())    # IDs appearing in more than one row

59
10


In [22]:
# selecting first visit = someone who consented to home videos at at least one visit 
consent_first_visit, test_HC = split_MS_HC_first_visit(df=consent_all_w_bw_df, visit_date_col='visit_date')
consent_dem, consent_counts = demographic_summary(df=consent_first_visit)

# export participant-level rows and the summary table
consent_first_visit.to_csv(path_or_buf=os.path.join(out_path, 'home_video_consent_first_visit_all.csv'))
consent_dem.to_csv(path_or_buf=os.path.join(out_path, 'home_video_consent_dem_all.csv'))

# display the summary table
consent_dem

total unique bw_ids in df
69
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS          68
Mild TBI     1
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
68
MS df count bw_id rows with data
68
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,68
1,"Age (Years), Mean (SD)",48.43 (13.38)
2,"Sex (Female), N (%)",50 (74%)
3,"EDSS, Median (IQR)",2.8 (2.1)
4,"T25FW, Mean (SD)",5.79 (2.78)
5,"Race, N (%)",
6,White Not Hispanic,42 (62%)
7,Hispanic or Latino,6 (9%)
8,Asian,6 (9%)
9,Black Or African American,3 (4%)


### Consent with BW data 
Merge participants who have consented with the clean BW data. The clean BW data only contains participants with full datasets (no missing assessments, and EDSS and T25FW in range).

In [23]:
# merge: every consent row is kept (left join); clean BW columns are attached
# where participant ID and visit date both match
consent_clean_w_bw_df = pd.merge(consent_all_df, bw_clean_df,
                                 how = 'left',
                                 on = ['bw_id', 'visit_date'])
consent_clean_w_bw_df.to_csv(os.path.join(out_path, 'home_video_consent_df_merged_w_bw_clean.csv'))

In [24]:
# drop if missing BW data: after the left merge, a missing 'trialdate' marks a
# consent row that found no match in the clean BW data
has_bw_data = consent_clean_w_bw_df['trialdate'].notna()
consent_clean_w_bw_df = consent_clean_w_bw_df[has_bw_data]

In [25]:
# Keep each participant's first visit among the consent visits that have
# paired clean BW data. Only the MS frame is used below.
consent_clean_first_visit, test_HC = split_MS_HC_first_visit(consent_clean_w_bw_df, visit_date_col = 'visit_date')
consent_clean_dem, consent_clean_counts = demographic_summary(consent_clean_first_visit)
consent_clean_first_visit.to_csv(os.path.join(out_path, 'home_video_consent_first_visit_clean.csv'))
# Write the clean-cohort summary (consent_clean_dem). consent_dem holds the
# all-consent summary and must not be saved under the "_clean" file name.
consent_clean_dem.to_csv(os.path.join(out_path, 'home_video_consent_dem_clean.csv'))
consent_clean_dem

total unique bw_ids in df
57
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    57
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
57
MS df count bw_id rows with data
57
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,57
1,"Age (Years), Mean (SD)",47.49 (13.26)
2,"Sex (Female), N (%)",43 (75%)
3,"EDSS, Median (IQR)",2.5 (2.0)
4,"T25FW, Mean (SD)",5.56 (2.63)
5,"Race, N (%)",
6,White Not Hispanic,36 (63%)
7,Hispanic or Latino,5 (9%)
8,Asian,4 (7%)
9,Black Or African American,3 (5%)


## Did Not consent to home videos 

### Did not consent to home videos, all/raw 

In [26]:
# Rows of the raw BW dataset whose bw_id never appears in the merged
# consent frame, i.e. participants with no home-video consent record.
no_consent_all_w_bw_df = bw_df.loc[
    ~bw_df['bw_id'].isin(consent_all_w_bw_df['bw_id'])
]

# First visit per participant and demographic summary for this group.
no_consent_ms_first_visit, test_HC = split_MS_HC_first_visit(
    no_consent_all_w_bw_df,
    visit_date_col='visit_date',
)
no_consent_ms_dem, no_consent_ms_counts = demographic_summary(no_consent_ms_first_visit)

# Save the first-visit rows and the summary table, then display the table.
no_consent_ms_first_visit.to_csv(os.path.join(out_path, 'home_video_no_consent_ms_first_visit_all.csv'))
no_consent_ms_dem.to_csv(os.path.join(out_path, 'home_video_no_consent_ms_dem_all.csv'))

no_consent_ms_dem

total unique bw_ids in df
422
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS                                                                                 244
HC                                                                                  56
PD                                                                                  19
Mild TBI                                                                            17
bvFTD                                                                               15
nfvPPA                                                                              10
CBS                                                                                  5
Mild complicated TBI                                                                 3
svPPA                                                                                3
Pending                                          

Unnamed: 0,Metric,Statistic
0,N Participants,244
1,"Age (Years), Mean (SD)",54.38 (12.79)
2,"Sex (Female), N (%)",165 (68%)
3,"EDSS, Median (IQR)",2.5 (2.5)
4,"T25FW, Mean (SD)",5.73 (4.14)
5,"Race, N (%)",
6,White Not Hispanic,170 (70%)
7,Hispanic or Latino,18 (7%)
8,Asian,8 (3%)
9,Black Or African American,11 (5%)


### Did not consent to home videos, from clean with all BW data  

In [27]:
# Rows of the clean BW dataset whose bw_id does not appear in the clean
# consent frame.
# NOTE(review): the exclusion list is consent_clean_w_bw_df, which holds only
# consent visits that matched a clean BW row. A participant who consented but
# whose consent visit has no clean BW match would still land in this
# "did not consent" group if they have another row in bw_clean_df. Confirm
# this is intended, or exclude on consent_all_df['bw_id'] instead.
no_consent_clean_w_bw_df = bw_clean_df.loc[
    ~bw_clean_df['bw_id'].isin(consent_clean_w_bw_df['bw_id'])
]

# First visit per participant and demographic summary for this group.
no_consent_clean_first_visit, test_HC = split_MS_HC_first_visit(
    no_consent_clean_w_bw_df,
    visit_date_col='visit_date',
)
no_consent_clean_dem, no_consent_clean_counts = demographic_summary(no_consent_clean_first_visit)

# Save the first-visit rows and the summary table, then display the table.
no_consent_clean_first_visit.to_csv(os.path.join(out_path, 'home_video_no_consent_ms_first_visit_clean.csv'))
no_consent_clean_dem.to_csv(os.path.join(out_path, 'home_video_no_consent_ms_dem_clean.csv'))

no_consent_clean_dem

total unique bw_ids in df
101
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    101
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
101
MS df count bw_id rows with data
101
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,101
1,"Age (Years), Mean (SD)",49.07 (13.07)
2,"Sex (Female), N (%)",71 (70%)
3,"EDSS, Median (IQR)",2.5 (2.5)
4,"T25FW, Mean (SD)",6.13 (5.48)
5,"Race, N (%)",
6,White Not Hispanic,69 (68%)
7,Hispanic or Latino,13 (13%)
8,Asian,6 (6%)
9,Black Or African American,6 (6%)


## Consented and sent video 

### All Videos sent - raw data frame 
Sent video, not necessarily matched to BW data 

In [28]:
# All home videos (raw): merged home-video + BW file for this analysis version.
# The folder and file name are passed as separate components so that no
# backslash sits inside a non-raw string literal ('\h' is an invalid escape
# sequence and raises a SyntaxWarning on recent Python versions).
hv_bw_raw_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                                analysis_version,
                                '000_merged_cleaned_data',
                                'hv_bw_merged_raw.csv')

# The first CSV column is read back as the index.
hv_bw_raw_df = pd.read_csv(hv_bw_raw_path, index_col = 0)

In [29]:
# Demographics for everyone with a video in the raw home-video dataset,
# taking each participant's first video visit.
raw_home_first_visit, test_HC = split_MS_HC_first_visit(
    hv_bw_raw_df,
    visit_date_col='visit_date_video',
)
raw_home_dem, raw_home_counts = demographic_summary(raw_home_first_visit)

# Save the first-visit rows and the summary table, then display the table.
raw_home_first_visit.to_csv(os.path.join(out_path, 'home_video_sent_first_visit_all.csv'))
raw_home_dem.to_csv(os.path.join(out_path, 'home_video_sent_dem_all.csv'))

raw_home_dem

total unique bw_ids in df
37
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    37
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
37
MS df count bw_id rows with data
37
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,37
1,"Age (Years), Mean (SD)",49.74 (13.71)
2,"Sex (Female), N (%)",30 (81%)
3,"EDSS, Median (IQR)",3.0 (3.5)
4,"T25FW, Mean (SD)",5.75 (2.98)
5,"Race, N (%)",
6,White Not Hispanic,28 (76%)
7,Hispanic or Latino,1 (3%)
8,Asian,3 (8%)
9,Black Or African American,1 (3%)


### All videos in clean df - sent video and had paired BW data 

In [30]:
# Home videos (clean): merged home-video + BW file for this analysis version.
# The folder and file name are passed as separate components so that no
# backslash sits inside a non-raw string literal ('\h' is an invalid escape
# sequence and raises a SyntaxWarning on recent Python versions).
hv_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                                analysis_version,
                                '000_merged_cleaned_data',
                                'hv_bw_merged_clean.csv')

# The first CSV column is read back as the index.
hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0)

In [31]:
# Demographics for everyone with a video in the clean home-video dataset,
# taking each participant's first video visit.
clean_home_first_visit, test_HC = split_MS_HC_first_visit(
    hv_bw_clean_df,
    visit_date_col='visit_date_video',
)
clean_home_dem, clean_home_counts = demographic_summary(clean_home_first_visit)

# Save the first-visit rows and the summary table, then display the table.
clean_home_first_visit.to_csv(os.path.join(out_path, 'home_video_sent_first_visit_clean.csv'))
clean_home_dem.to_csv(os.path.join(out_path, 'home_video_sent_dem_clean.csv'))

clean_home_dem

total unique bw_ids in df
31
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    31
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
31
MS df count bw_id rows with data
31
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,31
1,"Age (Years), Mean (SD)",47.15 (13.32)
2,"Sex (Female), N (%)",24 (77%)
3,"EDSS, Median (IQR)",3.0 (2.2)
4,"T25FW, Mean (SD)",5.64 (3.18)
5,"Race, N (%)",
6,White Not Hispanic,22 (71%)
7,Hispanic or Latino,1 (3%)
8,Asian,3 (10%)
9,Black Or African American,1 (3%)


## Consented, but did not send videos 

### Did not send videos, raw/all 

In [32]:
# Consent rows whose bw_id never appears in the raw home-video data,
# i.e. consented but no video on file.
raw_no_videos_w_bw_df = consent_all_w_bw_df.loc[
    ~consent_all_w_bw_df['bw_id'].isin(hv_bw_raw_df['bw_id'])
]

# First visit per participant and demographic summary for this group.
no_video_ms_first_visit, test_HC = split_MS_HC_first_visit(
    raw_no_videos_w_bw_df,
    visit_date_col='visit_date',
)
no_video_ms_dem, no_video_ms_counts = demographic_summary(no_video_ms_first_visit)

# Save the first-visit rows and the summary table, then display the table.
no_video_ms_first_visit.to_csv(os.path.join(out_path, 'home_video_not_sent_first_visit_all.csv'))
no_video_ms_dem.to_csv(os.path.join(out_path, 'home_video_not_sent_dem_all.csv'))

no_video_ms_dem

total unique bw_ids in df
36
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS          35
Mild TBI     1
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
35
MS df count bw_id rows with data
35
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,35
1,"Age (Years), Mean (SD)",48.70 (13.31)
2,"Sex (Female), N (%)",24 (69%)
3,"EDSS, Median (IQR)",2.5 (2.0)
4,"T25FW, Mean (SD)",5.81 (2.44)
5,"Race, N (%)",
6,White Not Hispanic,18 (51%)
7,Hispanic or Latino,5 (14%)
8,Asian,3 (9%)
9,Black Or African American,2 (6%)


### Did not send videos, clean with BW data 

In [33]:
# Clean consent rows whose bw_id never appears in the clean home-video data.
# NOTE(review): ids are excluded only when present in hv_bw_clean_df. A
# participant whose videos exist only in the raw video file would be counted
# here as not having sent videos. Confirm this is the intended definition.
clean_no_videos_w_bw_df = consent_clean_w_bw_df.loc[
    ~consent_clean_w_bw_df['bw_id'].isin(hv_bw_clean_df['bw_id'])
]

# First visit per participant and demographic summary for this group.
no_video_clean_first_visit, test_HC = split_MS_HC_first_visit(
    clean_no_videos_w_bw_df,
    visit_date_col='visit_date',
)
no_video_clean_dem, no_video_clean_counts = demographic_summary(no_video_clean_first_visit)

# Save the first-visit rows and the summary table, then display the table.
no_video_clean_first_visit.to_csv(os.path.join(out_path, 'home_video_not_sent_first_visit_clean.csv'))
no_video_clean_dem.to_csv(os.path.join(out_path, 'home_video_not_sent_dem_clean.csv'))

no_video_clean_dem

total unique bw_ids in df
27
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    27
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
27
MS df count bw_id rows with data
27
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,27
1,"Age (Years), Mean (SD)",48.63 (13.55)
2,"Sex (Female), N (%)",20 (74%)
3,"EDSS, Median (IQR)",2.5 (2.0)
4,"T25FW, Mean (SD)",5.43 (1.82)
5,"Race, N (%)",
6,White Not Hispanic,15 (56%)
7,Hispanic or Latino,4 (15%)
8,Asian,1 (4%)
9,Black Or African American,2 (7%)


## Videos with walking segment 

### Of all videos sent - # with at least 1 linear walking segment identified, all/raw

In [34]:
# Raw home-video rows with one or more walking segments identified; rows with
# a missing segment count fail the comparison and are left out.
hv_bw_raw_wWalking_df = hv_bw_raw_df[hv_bw_raw_df['walking_segmets_n_pose_hv'].gt(0)]

# Distribution of the number of walking segments per video.
hv_bw_raw_wWalking_df['walking_segmets_n_pose_hv'].value_counts()

walking_segmets_n_pose_hv
5.0     18
6.0     17
4.0     14
3.0     12
7.0      4
2.0      3
1.0      2
8.0      1
11.0     1
12.0     1
Name: count, dtype: int64

In [35]:
# Show the raw home-video rows where the walking-segment count is missing.
hv_bw_raw_df[hv_bw_raw_df['walking_segmets_n_pose_hv'].isna()]

Unnamed: 0,video_id_date_name_pose_hv,id_date_pose_hv,task_pose_hv,frames_per_second_pose_hv,total_video_duration_sec_pose_hv,delta_pix_h_rel_median_pose_hv,walking_segmets_n_pose_hv,walking_segments_duration_mean_pose_hv,walking_segments_duration_median_pose_hv,stride_time_mean_sec_pose_hv,...,EDSS_same_before_after_MM,demoEHR_Vitals_dateDiff,tc_Examinee_Education,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,race_ethnicity_clean,ms_dx_condensed,bw_hv_date_diff_days
6,gait_vertical_left_BW-0025_11-28-2023,BW-0025\11-28-2023,gait_vertical_left,30,48.6,0.03,,,,,...,,-0.472106,25.0,3.0,severe,1.0,under_6,White Non Hispanic,RRMS,-1
8,gait_vertical_right_BW-0025_11-28-2023,BW-0025\11-28-2023,gait_vertical_right,30,58.766667,0.08,,,,,...,,-0.472106,25.0,3.0,severe,1.0,under_6,White Non Hispanic,RRMS,-1
39,gait_vertical_left_BW-0230_09-19-2023,BW-0230\09-19-2023,gait_vertical_left,30,53.466667,0.1,,,,,...,,-0.463947,19.0,1.0,mild,1.0,under_6,White Non Hispanic,RRMS,0
40,gait_vertical_left_BW-0232_10-09-2023,BW-0232\10-09-2023,gait_vertical_left,10,22.6,,,,,,...,,-0.522859,14.0,2.0,moderate,2.0,6_to_8,Asian,RRMS,5
43,gait_vertical_left_BW-0237_10-09-2023,BW-0237\10-09-2023,gait_vertical_left,30,32.833333,,,,,,...,,-0.553484,14.0,2.0,moderate,1.0,under_6,White Non Hispanic,RRMS,4
71,gait_vertical_right_BW-0276_01-17-2024,BW-0276\01-17-2024,gait_vertical_right,30,44.833333,,,,,,...,,26.529606,18.0,3.0,severe,3.0,over_8,Black Or African American,RRMS,-1


In [36]:
# First video visit per participant among raw videos with a walking segment,
# plus the demographic summary for that group.
raw_home_wWalking_first, test_HC = split_MS_HC_first_visit(
    hv_bw_raw_wWalking_df,
    visit_date_col='visit_date_video',
)
raw_home_wWalking_dem, raw_home_wWalking_counts = demographic_summary(raw_home_wWalking_first)

# Save the first-visit rows and the summary table, then display the table.
raw_home_wWalking_first.to_csv(os.path.join(out_path, 'home_wWalking_first_visit_all.csv'))
raw_home_wWalking_dem.to_csv(os.path.join(out_path, 'home_wWalking_dem_all.csv'))

raw_home_wWalking_dem

total unique bw_ids in df
34
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    34
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
34
MS df count bw_id rows with data
34
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,34
1,"Age (Years), Mean (SD)",50.19 (13.96)
2,"Sex (Female), N (%)",27 (79%)
3,"EDSS, Median (IQR)",3.0 (3.9)
4,"T25FW, Mean (SD)",5.88 (3.08)
5,"Race, N (%)",
6,White Not Hispanic,25 (74%)
7,Hispanic or Latino,1 (3%)
8,Asian,3 (9%)
9,Black Or African American,1 (3%)


### Home videos with walking segment identified and paired BW data, clean

In [37]:
# Clean home-video rows with one or more walking segments identified; rows
# with a missing segment count fail the comparison and are left out.
hv_bw_clean_wWalking_df = hv_bw_clean_df[hv_bw_clean_df['walking_segmets_n_pose_hv'].gt(0)]

# Distribution of the number of walking segments per video.
hv_bw_clean_wWalking_df['walking_segmets_n_pose_hv'].value_counts()

walking_segmets_n_pose_hv
6.0     15
5.0     14
3.0     11
4.0      7
7.0      4
1.0      2
2.0      2
11.0     1
12.0     1
Name: count, dtype: int64

In [38]:
# First video visit per participant among clean videos with a walking segment,
# plus the demographic summary for that group.
clean_home_wWalking_first, test_HC = split_MS_HC_first_visit(
    hv_bw_clean_wWalking_df,
    visit_date_col='visit_date_video',
)
clean_home_wWalking_dem, clean_home_wWalking_counts = demographic_summary(clean_home_wWalking_first)

# Save the first-visit rows and the summary table, then display the table.
clean_home_wWalking_first.to_csv(os.path.join(out_path, 'home_wWalking_first_visit_clean.csv'))
clean_home_wWalking_dem.to_csv(os.path.join(out_path, 'home_wWalking_dem_clean.csv'))

clean_home_wWalking_dem

total unique bw_ids in df
28
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    28
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
28
MS df count bw_id rows with data
28
rows in final HC df
0
HC df count bw_id rows with data
0


Unnamed: 0,Metric,Statistic
0,N Participants,28
1,"Age (Years), Mean (SD)",47.42 (13.68)
2,"Sex (Female), N (%)",21 (75%)
3,"EDSS, Median (IQR)",3.0 (2.4)
4,"T25FW, Mean (SD)",5.78 (3.32)
5,"Race, N (%)",
6,White Not Hispanic,19 (68%)
7,Hispanic or Latino,1 (4%)
8,Asian,3 (11%)
9,Black Or African American,1 (4%)
