In [1]:
# import packages 
import os 
import pandas as pd 
import numpy as np

# Functions 

In [2]:
def merge_race_ethnicity(df): 
    df_2 = df
    df_2['race_ethnicity_clean'] = ''
    df_2 = df_2.copy()
    # White race and not hispanic or latino ethnicity = White Not Hispanic 
    df_2.loc[(df_2['clean_race'] == 'White') & (df_2['clean_ethnicity'] == 'Not Hispanic Or Latino'),
            'race_ethnicity_clean'] = 'White Not Hispanic' 

    # Exclude White Not Hispanic and ethnicity is Hispanic or Latino = 'Hispanic or Latino'
    df_2.loc[(df_2['race_ethnicity_clean'] != 'White Not Hispanic') & (df_2['clean_ethnicity'] == 'Hispanic Or Latino'),
            'race_ethnicity_clean'] = 'Hispanic or Latino'

    # Of individuals that haven't been assignd race_ethnicity_clean, go off clean_race column 
    # asian 
    df_2.loc[(df_2['race_ethnicity_clean'] == '') & (df_2['clean_race'] == 'Asian'),
            'race_ethnicity_clean'] = 'Asian' 

    # American Indian Or Alaska Native
    df_2.loc[(df_2['race_ethnicity_clean'] == '') & (df_2['clean_race'] == 'American Indian Or Alaska Native'),
            'race_ethnicity_clean'] = 'American Indian Or Alaska Native'
    
   # Black Or African American 
    df_2.loc[(df_2['race_ethnicity_clean'] == '') & (df_2['clean_race'] == 'Black Or African American'),
            'race_ethnicity_clean'] = 'Black Or African American'

    # Other Pacific Islander
    df_2.loc[(df_2['race_ethnicity_clean'] == '') & (df_2['clean_race'] == 'Other Pacific Islander'),
            'race_ethnicity_clean'] = 'Other Pacific Islander'

    # if not yet assigned -> other, unknown, Declined 
    df_2.loc[df_2['race_ethnicity_clean'] == '',
            'race_ethnicity_clean'] = 'Other/Unknown/Declined'
    return df_2


In [3]:
# split dataframe into healthy controls and participant swith MS 
# then select first visit date - maybe not baseline, but first visit with data in that dataset 

def split_MS_HC_first_visit(df): 
    df = df.copy()
    df['visit_date'] = pd.to_datetime(df['visit_date'])
    df['bw_id'] = df['bw_id'].str.strip()

    print('total unique bw_ids in df')
    print(df['bw_id'].nunique())
    print('--------------') 

    # drop full duplicate rows 
    df = df.drop_duplicates(keep='first')

    # drop dupllicated bw_id and visit_date - ex: same fw and pws 
    df = df.drop_duplicates(subset=['bw_id', 'visit_date'], keep='first')

    # first visit - keep the earliest date for each ID 
    df_first_visit = (
        df.sort_values(by=['bw_id', 'visit_date'])
        .groupby('bw_id')
        .first()
        .reset_index()
    )
    

    # check if any duplicates 
    duplicates = df_first_visit['bw_id'].duplicated().sum()
    
    print('any duplicate bw_ids in first visit df?') 
    print(duplicates)
    print('--------------')

    print('df_first_visit demographic diagnosis counts') 
    print(df_first_visit['demographic_diagnosis'].value_counts())
    print('--------------') 
    
    MS_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'MS']
    HC_1_df = df_first_visit.loc[df_first_visit['demographic_diagnosis'] == 'HC'] 

    print('MS and HC rows should match from table above') 
    print('rows in final MS df')
    print(len(MS_1_df))
    print('MS df count bw_id rows with data') 
    print(MS_1_df['bw_id'].count()) 

    print('rows in final HC df')
    print(len(HC_1_df))
    print('HC df count bw_id rows with data') 
    print(HC_1_df['bw_id'].count()) 

    return MS_1_df, HC_1_df

In [4]:
def demographic_summary(df): 
    # summary statistics for  cohort 
    if len(df) > 0: 
        # age 
        age_count = df['demoEHR_Age'].count()
        age_mean = df['demoEHR_Age'].mean()
        age_sd = df['demoEHR_Age'].std()
        
        # sex 
        sex_count = df['clean_sex'].count()
        sex_n = df['clean_sex'].value_counts()
        sex_n_female = sex_n['Female']
        sex_freqs = df['clean_sex'].value_counts(normalize=True) * 100
        sex_freq_female = sex_freqs['Female'] 
        
        # edss 
        edss_count = df['bingoEHR_EDSS_measure_value'].count()
        edss_median = df['bingoEHR_EDSS_measure_value'].median()
        edss_iqr = df['bingoEHR_EDSS_measure_value'].quantile(0.75) - df['bingoEHR_EDSS_measure_value'].quantile(0.25)

        # t25fw 
        t25fw_count = df['msfcEHR_T25FW SPEED AVG'].count()
        t25fw_mean= df['msfcEHR_T25FW SPEED AVG'].mean()
        t25fw_sd = df['msfcEHR_T25FW SPEED AVG'].std()

        #race 
        race_count = df['race_ethnicity_clean'].count()
        race_n = df['race_ethnicity_clean'].value_counts()
        print(race_n)
        race_freq = df['race_ethnicity_clean'].value_counts(normalize = True) * 100

        try:
            white_n = race_n['White Not Hispanic']
            white_freq = race_freq['White Not Hispanic']
        except KeyError:
            white_n = 0
            white_freq = 0

        try: 
            black_n = race_n['Black Or African American']
            black_freq = race_freq['Black Or African American']
        except KeyError:
            black_n = 0
            black_freq = 0
        
        try: 
            asian_n = race_n['Asian']
            asian_freq = race_freq['Asian']
        except KeyError:
            asian_n = 0
            asian_freq = 0

        try: 
            ai_an_n = race_n['American Indian Or Alaska Native']
            ai_an_freq = race_freq['American Indian Or Alaska Native']
        except KeyError:
            ai_an_n = 0
            ai_an_freq = 0

        try: 
            opi_n = race_n['Other Pacific Islander']
            opi_freq = race_freq['Other Pacific Islander']
        except KeyError:
            opi_n = 0
            opi_freq = 0
             
        try: 
            hispanic_n = race_n['Hispanic or Latino']
            hispanic_freq = race_freq['Hispanic or Latino']
        except KeyError: 
            hispanic_n = 0
            hispanic_freq = 0

        try: 
            decl_n = race_n['Other/Unknown/Declined']
            decl_freq = race_freq['Other/Unknown/Declined']
        except KeyError: 
            decl_n = 0
            decl_freq = 0

        # disease duration 
        duration_count = df['demoEHR_DiseaseDuration'].count()
        duration_mean = df['demoEHR_DiseaseDuration'].mean()
        duration_sd = df['demoEHR_DiseaseDuration'].std()

        # MS subtype 
        #'bingoEHR_DX_MS DX'
        ms_dx_count = df['bingoEHR_DX_MS DX'].count()
        ms_dx_n = df['bingoEHR_DX_MS DX'].value_counts()
        ms_dx_freq = df['bingoEHR_DX_MS DX'].value_counts(normalize = True) * 100

        try:
            rrms_n = ms_dx_n['RRMS (Relapsing-remitting Multiple Sclerosis)']
            rrms_freq = ms_dx_freq['RRMS (Relapsing-remitting Multiple Sclerosis)']
        except KeyError:
            rrms_n = 0
            rrms_freq = 0

        try: 
            spms_n = ms_dx_n['SPMS (Secondary-progressive Multiple Sclerosis)']
            spms_freq = ms_dx_freq['SPMS (Secondary-progressive Multiple Sclerosis)']
        except KeyError:
            spms_n = 0
            spms_freq = 0

        try: 
            ppms_n = ms_dx_n['PPMS (Primary-progressive Multiple Sclerosis)']
            ppms_freq = ms_dx_freq['PPMS (Primary-progressive Multiple Sclerosis)']
        except KeyError:
            ppms_n = 0
            ppms_freq = 0

        try: 
            prms_n = ms_dx_n['PRMS (Progressive-relapsing Multiple Sclerosis)']
            prms_freq = ms_dx_freq['PRMS (Progressive-relapsing Multiple Sclerosis)']
        except KeyError:
            prms_n = 0
            prms_freq = 0 

        try: 
            sns_n = ms_dx_n['MS, Subtype Not Specified']
            sns_freq = ms_dx_freq['MS, Subtype Not Specified']
        except KeyError:
            sns_n = 0
            sns_freq = 0

        try: 
            pend_n = ms_dx_n['pending']
            pend_freq = ms_dx_freq['pending']
        except KeyError:
            pend_n = 0
            pend_freq = 0

        try: 
            abn_n = ms_dx_n['abnormal MRI']
            abn_freq = ms_dx_freq['abnormal MRI']
        except KeyError:
            abn_n = 0
            abn_freq = 0


        
        # summary data 
        summary_data = {'Metric': ['N', 
                                      'Age (Years, Mean (SD))',
                                      'Sex (Female, n (%))',
                                      'EDSS (Median (IQR))', 
                                      'T25FW (Seconds, Mean (SD))',
                                      'Race (n, %)',
                                      'White Not Hispanic',
                                      'Hispanic or Latino',
                                      'Asian',
                                      'Black Or African American',
                                      'American Indian Or Alaska Native',
                                      'Other Pacific Islander',
                                      'Other/Unknown/Declined',
                                      'Disease Duration (Years, Mean (SD))',
                                      'MS Subtype (n, %)',
                                      'RRMS (Relapsing-remitting Multiple Sclerosis)',
                                      'SPMS (Secondary-progressive Multiple Sclerosis)',
                                      'PPMS (Primary-progressive Multiple Sclerosis)',
                                      'PRMS (Progressive-relapsing Multiple Sclerosis)',
                                      'MS, Subtype Not Specified',
                                      'pending',
                                      'abnormal MRI'],
                         'Statistic': [df['bw_id'].nunique(),
                                       f"{age_mean:.2f} ({age_sd:.2f})",  # Mean (SD)
                                       f"{sex_n_female} ({sex_freq_female:.0f}%)",
                                       f"{edss_median:.1f} ({edss_iqr:.1f})", 
                                       f"{t25fw_mean:.2f} ({t25fw_sd:.2f})",
                                       np.nan, 
                                       f"{white_n} ({white_freq:.0f}%)",
                                       f"{hispanic_n} ({hispanic_freq:.0f}%)",
                                       f"{asian_n} ({asian_freq:.0f}%)",
                                       f"{black_n} ({black_freq:.0f}%)",
                                       f"{ai_an_n} ({ai_an_freq:.0f}%)",
                                       f"{opi_n} ({opi_freq:.0f}%)",
                                       f"{decl_n} ({decl_freq:.0f}%)",
                                       f"{duration_mean:.2f} ({duration_sd:.2f})",  # Mean (SD)
                                       np.nan,
                                       f"{rrms_n} ({rrms_freq:.0f}%)",
                                       f"{spms_n} ({spms_freq:.0f}%)",
                                       f"{ppms_n} ({ppms_freq:.0f}%)",
                                       f"{prms_n} ({prms_freq:.0f}%)",
                                       f"{sns_n} ({sns_freq:.0f}%)",
                                       f"{pend_n} ({pend_freq:.0f}%)",
                                       f"{abn_n} ({abn_freq:.0f}%)"]
                       }

        demographics_summary_df = pd.DataFrame(summary_data)


        # counts of participants with demographic data 
        n_data = {'Metric': ['N', 
                             'n with age data',
                             'n with sex data',
                             'n with edss data',
                             'n with t25fw data',
                             'n with demoEHR_REC_2 data',
                             'n with duration data',
                             'n with ms subtype data'],
                         'Statistic': [df['bw_id'].nunique(),
                                       age_count, 
                                       sex_count, 
                                       edss_count,
                                       t25fw_count, 
                                       race_count,
                                       duration_count,
                                       ms_dx_count
                                       ]}

        n_data_df = pd.DataFrame(n_data) 

    else: 
        print('no participants') 
        demographics_summary_df = pd.DataFrame()
        n_data_df = pd.DataFrame()

    return demographics_summary_df, n_data_df

# file paths 

In [5]:
# outputs 
version = '004'
output_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                           version,
                           'demographics')

if not os.path.exists(output_path):
        os.makedirs(output_path)

In [6]:
# input folder 
input_parent_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                 version) 
input_parent_path

'C:\\Users\\mmccu\\Box\\MM_Personal\\5_Projects\\BoveLab\\3_Data_and_Code\\gait_bw_zeno_home_analysis\\004'

In [7]:
# all zeno videos 
all_zv_w_bw_path = os.path.join(input_parent_path, 
                                'video_visit_participant_counts',
                                'all_zv_videos_merged_w_bw.csv')

In [8]:
# all zeno included PWS 
zv_pws_inclu_w_bw_path = os.path.join(input_parent_path, 
                                      'zv_bw_merged_gait_vertical_PWS_1.csv') 
                                

In [9]:
# all zeno included FW 
zv_fw_inclu_w_bw_path = os.path.join(input_parent_path, 
                                      'zv_bw_merged_gait_vertical_FW_1.csv') 


In [10]:
# all bw participants with ms - maybe not using?? 
# tbd if not all partiicpants approached for home videos 
bw_path = r'C:\Users\mmccu\AppData\Local\Temp\ccsecure\2025_01_24_BrainWalk_AllData_Long_MM.xlsx'


In [11]:
# all participants who sent home videos 
all_hv_path = os.path.join(input_parent_path, 
                           'home_feasibility_reliability\home_vids_all_w_bw.csv') 
                           

In [12]:
# home videos included in analysis 
included_hv_w_bw_path = os.path.join(input_parent_path, 'hv_bw_merged.csv') 

In [13]:
# participants consented to home vids
    # need to save (either feas or counting step) - pick which one aligns col names the best 
redcap_reports_path = r'C:\Users\mmccu\Box\Brainwalk\Home Video Walking\Megan Project\bw_data_and_code\home_video_feasibility\2025_01_08 RedCap Reports'
consent_base_v1_path = os.path.join(redcap_reports_path,
                                    'homevid_baseline_consent_v1.csv') 
consent_base_v2_path = os.path.join(redcap_reports_path,
                                    'homevid_baseline_consent_v2.csv') 
consent_y2_v1_path = os.path.join(redcap_reports_path,
                                    'homevid_year2_consent_v1.csv') 
consent_y2_v2_path = os.path.join(redcap_reports_path,
                                    'homevid_year2_consent_v2.csv') 
consent_y3_v1_path = os.path.join(redcap_reports_path,
                                  'homevid_year3_consent_v1.csv') 
consent_y3_v2_path = os.path.join(redcap_reports_path,
                                  'homevid_year3_consent_v2.csv') 

In [14]:
# participants consented to home vids - format and merge dataframes 
consent_base_v1_df = pd.read_csv(consent_base_v1_path) 
consent_base_v2_df = pd.read_csv(consent_base_v2_path)
consent_y2_v1_df = pd.read_csv(consent_y2_v1_path)
consent_y2_v2_df = pd.read_csv(consent_y2_v2_path)
consent_y3_v1_df = pd.read_csv(consent_y3_v1_path)
consent_y3_v2_df = pd.read_csv(consent_y3_v2_path) 

# add consent version column 
consent_base_v1_df['consent_version'] = 1
consent_y2_v1_df['consent_version'] = 1
consent_y3_v1_df['consent_version'] = 1

consent_base_v2_df['consent_version'] = 2
consent_y2_v2_df['consent_version'] = 2
consent_y3_v2_df['consent_version']= 2

# rename all columns to v1 col names 
#record_id	redcap_event_name	bw_id	falls_visit_date	walking_consent_date	walking_consent_sig
consent_base_v1_df = consent_base_v1_df.rename(columns = {'falls_visit_date' : 'visit_date'}) 
consent_y2_v1_df = consent_y2_v1_df.rename(columns = {'falls_visit_date' : 'visit_date'})
consent_y3_v1_df = consent_y3_v1_df.rename(columns = {'falls_visit_date' : 'visit_date'})


consent_base_v2_df = consent_base_v2_df.rename(columns = {'falls_visit_date' : 'visit_date',
                                                          'walking_consent_date_v2' : 'walking_consent_date',
                                                          'walking_consent_sig_v2' : 'walking_consent_sig'}) 
consent_y2_v2_df = consent_y2_v2_df.rename(columns = {'falls_visit_date' : 'visit_date',
                                                      'walking_consent_date_v2' : 'walking_consent_date',
                                                      'walking_consent_sig_v2' : 'walking_consent_sig'}) 
consent_y3_v2_df = consent_y3_v2_df.rename(columns = {'falls_visit_date' : 'visit_date',
                                                      'walking_consent_date_v2' : 'walking_consent_date',
                                                      'walking_consent_sig_v2' : 'walking_consent_sig'})

# convert to date time 
consent_base_v1_df['visit_date'] = pd.to_datetime(consent_base_v1_df['visit_date'], errors = 'coerce')
consent_y2_v1_df['visit_date'] = pd.to_datetime(consent_y2_v1_df['visit_date'], errors = 'coerce')
consent_y3_v1_df['visit_date'] = pd.to_datetime(consent_y3_v1_df['visit_date'], errors = 'coerce')
consent_base_v2_df['visit_date'] = pd.to_datetime(consent_base_v2_df['visit_date'], errors = 'coerce')
consent_y2_v2_df['visit_date'] = pd.to_datetime(consent_y2_v2_df['visit_date'], errors = 'coerce')
consent_y3_v2_df['visit_date'] = pd.to_datetime(consent_y3_v2_df['visit_date'], errors = 'coerce')

# concatenate 
consent_all_df = pd.concat([consent_base_v1_df,
                            consent_base_v2_df,
                            consent_y2_v1_df,
                            consent_y2_v2_df,
                            consent_y3_v1_df,
                            consent_y3_v2_df])

#consent_all_df['visit_date'] = pd.to_datetime(consent_all_df['visit_date'], errors = 'coerce')
consent_all_df.to_csv(os.path.join(redcap_reports_path, 'all_home_vid_consent.csv'))

# run function on datasets below 

Save sumamry table - pulling demographic data from first brainwalk visit if participant has multiple visits

Zeno 
1. All participants with videos: MS + HC
2. All participants with included videos: MS + HC
3. Participants with included PWS videos: MS + HC – is there a diff between groups?
4. Participants with included FW videos: MS + HC – is there a diff between groups?

Home Videos 
1. all BW participants with MS (TBD, maybe not all approached) 
2. All BW participants consented to home vids
3. All BW participants who sent home vids
4. All BW participants who sent usable/included home vids  

### Zeno videos 

In [15]:
# all zeno videos 
all_zv_w_bw_df = pd.read_csv(all_zv_w_bw_path, index_col = 0) 
all_zv_w_bw_df = merge_race_ethnicity(all_zv_w_bw_df)

all_zv_ms_df, all_zv_hc_df = split_MS_HC_first_visit(all_zv_w_bw_df)

all_zv_ms_dem, all_zv_ms_n = demographic_summary(all_zv_ms_df)
all_zv_hc_dem, all_zv_hc_n = demographic_summary(all_zv_hc_df)

# save outputs 
all_zv_ms_dem.to_csv(os.path.join(output_path, 'zeno_all_vids_ms_demographics.csv')) 
all_zv_hc_dem.to_csv(os.path.join(output_path, 'zeno_all_vids_hc_demographics.csv')) 

all_zv_ms_n.to_csv(os.path.join(output_path, 'zeno_all_vids_ms_counts.csv')) 
all_zv_hc_n.to_csv(os.path.join(output_path, 'zeno_all_vids_hc_counts.csv')) 

total unique bw_ids in df
215
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    174
HC     41
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
174
MS df count bw_id rows with data
174
rows in final HC df
41
HC df count bw_id rows with data
41
race_ethnicity_clean
White Not Hispanic                  116
Hispanic or Latino                   17
Asian                                14
Black Or African American            12
Other/Unknown/Declined               12
American Indian Or Alaska Native      2
Other Pacific Islander                1
Name: count, dtype: int64
race_ethnicity_clean
White Not Hispanic        22
Asian                     10
Other/Unknown/Declined     6
Hispanic or Latino         3
Name: count, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [16]:
# all zeno included PWS 
zv_pws_inclu_w_bw_df = pd.read_csv(zv_pws_inclu_w_bw_path, index_col = 0)
zv_pws_inclu_w_bw_df = merge_race_ethnicity(zv_pws_inclu_w_bw_df) 

# rename columns to align with functions above 
zv_pws_inclu_w_bw_df = zv_pws_inclu_w_bw_df.rename(columns = {'visit_date_video' : 'video_date'}) 

# demographics summary 
zv_pws_inclu_ms_df, zv_pws_inclu_hc_df = split_MS_HC_first_visit(zv_pws_inclu_w_bw_df)

zv_pws_inclu_ms_dem, zv_pws_inclu_ms_n = demographic_summary(zv_pws_inclu_ms_df)
zv_pws_inclu_hc_dem,  zv_pws_inclu_hc_n = demographic_summary(zv_pws_inclu_hc_df)

# save outputs 
zv_pws_inclu_ms_dem.to_csv(os.path.join(output_path, 'zeno_pws_inclu_ms_demographics.csv')) 
zv_pws_inclu_hc_dem.to_csv(os.path.join(output_path, 'zeno_pws_inclu_hc_demographics.csv')) 

zv_pws_inclu_ms_n.to_csv(os.path.join(output_path, 'zeno_pws_inclu_ms_counts.csv')) 
zv_pws_inclu_hc_n.to_csv(os.path.join(output_path, 'zeno_pws_inclu_hc_counts.csv')) 

total unique bw_ids in df
179
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    146
HC     33
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
146
MS df count bw_id rows with data
146
rows in final HC df
33
HC df count bw_id rows with data
33
race_ethnicity_clean
White Not Hispanic                  99
Hispanic or Latino                  14
Asian                               11
Black Or African American           10
Other/Unknown/Declined               9
American Indian Or Alaska Native     2
Other Pacific Islander               1
Name: count, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


race_ethnicity_clean
White Not Hispanic        18
Asian                      9
Other/Unknown/Declined     5
Hispanic or Latino         1
Name: count, dtype: int64


In [17]:
# all Zeno included FW 
zv_fw_inclu_w_bw_df = pd.read_csv(zv_fw_inclu_w_bw_path, index_col = 0)
zv_fw_inclu_w_bw_df = merge_race_ethnicity(zv_fw_inclu_w_bw_df) 

# rename columns to align with functions above 
zv_fw_inclu_w_bw_df = zv_fw_inclu_w_bw_df.rename(columns = {'visit_date_video' : 'video_date'}) 

# demographics summary 
zv_fw_inclu_ms_df, zv_fw_inclu_hc_df = split_MS_HC_first_visit(zv_fw_inclu_w_bw_df)

zv_fw_inclu_ms_dem, zv_fw_inclu_ms_n = demographic_summary(zv_fw_inclu_ms_df)
zv_fw_inclu_hc_dem, zv_fw_inclu_hc_n = demographic_summary(zv_fw_inclu_hc_df)

# save outputs
zv_fw_inclu_ms_dem.to_csv(os.path.join(output_path, 'zeno_fw_inclu_ms_demographics.csv')) 
zv_fw_inclu_hc_dem.to_csv(os.path.join(output_path, 'zeno_fw_inclu_hc_demographics.csv'))

zv_fw_inclu_ms_n.to_csv(os.path.join(output_path, 'zeno_fw_inclu_ms_counts.csv')) 
zv_fw_inclu_hc_n.to_csv(os.path.join(output_path, 'zeno_fw_inclu_hc_counts.csv'))

total unique bw_ids in df
177
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    142
HC     35
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
142
MS df count bw_id rows with data
142
rows in final HC df
35
HC df count bw_id rows with data
35
race_ethnicity_clean
White Not Hispanic                  94
Hispanic or Latino                  13
Asian                               12
Black Or African American           10
Other/Unknown/Declined              10
American Indian Or Alaska Native     2
Other Pacific Islander               1
Name: count, dtype: int64
race_ethnicity_clean
White Not Hispanic        20
Asian                      9
Other/Unknown/Declined     5
Hispanic or Latino         1
Name: count, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [18]:
# All participant's included Zeno - merge Zeno FW and PWs 
zv_all_includ_w_bw_df = pd.concat([zv_pws_inclu_w_bw_df, zv_fw_inclu_w_bw_df])
zv_all_includ_w_bw_df = zv_all_includ_w_bw_df.drop_duplicates(keep = 'first')
zv_all_includ_w_bw_df = merge_race_ethnicity(zv_all_includ_w_bw_df) 
                                                                 
# demographics summary 
zv_all_inclu_ms_df, zv_all_inclu_hc_df = split_MS_HC_first_visit(zv_all_includ_w_bw_df)

# demographics
zv_all_inclu_ms_dem, zv_all_inclu_ms_n = demographic_summary(zv_all_inclu_ms_df)
zv_all_inclu_hc_dem, zv_all_inclu_hc_n = demographic_summary(zv_all_inclu_hc_df)

# save outputs 
zv_all_inclu_ms_dem.to_csv(os.path.join(output_path, 'zeno_all_inclu_ms_demographics.csv')) 
zv_all_inclu_hc_dem.to_csv(os.path.join(output_path, 'zeno_all_inclu_hc_demographics.csv'))

zv_all_inclu_ms_n.to_csv(os.path.join(output_path, 'zeno_all_inclu_ms_counts.csv')) 
zv_all_inclu_hc_n.to_csv(os.path.join(output_path, 'zeno_all_inclu_hc_counts.csv'))

total unique bw_ids in df
193
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    156
HC     37
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
156
MS df count bw_id rows with data
156
rows in final HC df
37
HC df count bw_id rows with data
37
race_ethnicity_clean
White Not Hispanic                  106
Hispanic or Latino                   14
Asian                                13
Black Or African American            10
Other/Unknown/Declined               10
American Indian Or Alaska Native      2
Other Pacific Islander                1
Name: count, dtype: int64
race_ethnicity_clean
White Not Hispanic        21
Asian                     10
Other/Unknown/Declined     5
Hispanic or Latino         1
Name: count, dtype: int64


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### Home Videos 

In [19]:
# all bw participants with ms - maybe not using?? 
# tbd if not all partiicpants approached for home videos 
bw_df = pd.read_excel(bw_path, 
                     index_col = None, 
                     usecols = ['bw_id', 'record_id', 'trialdate', 'visit_date', 'demoEHR_DiseaseDuration',
                                'clean_sex', 'demoEHR_Age', 'demographic_diagnosis', 'bingoEHR_DX_MS DX', 'clean_race',	
                                'clean_ethnicity', 'bingoEHR_EDSS_measure_value', 'msfcEHR_T25FW SPEED AVG']) 

# only ms participants have sent back videos 
bw_ms_df = bw_df.loc[bw_df['demographic_diagnosis'] == 'MS']
bw_ms_df = merge_race_ethnicity(bw_ms_df) 

# demographics summary 
all_bw_ms_df, all_bw_hc_df = split_MS_HC_first_visit(bw_ms_df)

all_bw_ms_dem, all_bw_ms_n = demographic_summary(all_bw_ms_df)
all_bw_hc_dem, all_bw_hc_n = demographic_summary(all_bw_hc_df)

all_bw_ms_dem.to_csv(os.path.join(output_path, 'home_all_bw_ms_demographics.csv')) 
all_bw_ms_n.to_csv(os.path.join(output_path, 'home_all_bw_ms_counts.csv')) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['race_ethnicity_clean'] = ''


total unique bw_ids in df
184
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    184
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
184
MS df count bw_id rows with data
184
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic                  122
Hispanic or Latino                   19
Asian                                14
Other/Unknown/Declined               14
Black Or African American            12
American Indian Or Alaska Native      2
Other Pacific Islander                1
Name: count, dtype: int64
no participants


In [20]:
# participants consented to home videos 
# merge w bw data 
consent_all_w_bw_df = consent_all_df.merge(right = bw_df, how = 'left', on = ['bw_id', 'visit_date'])
# one mild TBI? 
consent_all_w_bw_df = consent_all_w_bw_df.loc[consent_all_w_bw_df['demographic_diagnosis'] == 'MS']
# two with signed home walking, but no BW ID or demographics 
consent_all_w_bw_df.dropna(subset=['bw_id'], inplace=True)
consent_all_w_bw_df = consent_all_w_bw_df.sort_values(by='bw_id')
consent_all_w_bw_df.to_csv(os.path.join(redcap_reports_path, 'all_home_vid_ms_consent_w_bw.csv'))

consent_all_w_bw_df = merge_race_ethnicity(consent_all_w_bw_df)
# demographics summary 
consent_all_ms_df, consent_all_hc_df = split_MS_HC_first_visit(consent_all_w_bw_df)
consent_all_ms_dem, consent_all_ms_n = demographic_summary(consent_all_ms_df)

consent_all_ms_dem.to_csv(os.path.join(output_path, 'home_consented_ms_demographics.csv'))
consent_all_ms_n.to_csv(os.path.join(output_path, 'home_consented_ms_counts.csv'))

total unique bw_ids in df
66
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    66
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
66
MS df count bw_id rows with data
66
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic                  42
Other/Unknown/Declined               8
Asian                                6
Hispanic or Latino                   5
Black Or African American            3
Other Pacific Islander               1
American Indian Or Alaska Native     1
Name: count, dtype: int64


In [21]:
# participants that did not consent to home videos 
hv_no_consent_w_bw_df = bw_ms_df.loc[~bw_ms_df['bw_id'].isin(consent_all_w_bw_df['bw_id'])]
hv_no_consent_w_bw_df = merge_race_ethnicity(hv_no_consent_w_bw_df)

hv_no_consent_ms_df, hv_no_consent_hc_df = split_MS_HC_first_visit(hv_no_consent_w_bw_df)
hv_no_consent_ms_dem, hv_no_consent_ms_n = demographic_summary(hv_no_consent_ms_df)

hv_no_consent_ms_dem.to_csv(os.path.join(output_path, 'home_no_consent_ms_demographics.csv'))
hv_no_consent_ms_n.to_csv(os.path.join(output_path, 'home_no_consent_ms_counts.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['race_ethnicity_clean'] = ''


total unique bw_ids in df
118
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    118
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
118
MS df count bw_id rows with data
118
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic                  80
Hispanic or Latino                  14
Black Or African American            9
Asian                                8
Other/Unknown/Declined               6
American Indian Or Alaska Native     1
Name: count, dtype: int64


In [22]:
# all participants who sent home videos 
hv_all_vids_df = pd.read_csv(all_hv_path, index_col = 0) 
hv_all_vids_df = merge_race_ethnicity(hv_all_vids_df)

# demographics summary 
hv_all_vids_ms_df, hv_all_vids_hc_df = split_MS_HC_first_visit(hv_all_vids_df)
hv_all_vids_ms_dem, hv_all_vids_ms_n  = demographic_summary(hv_all_vids_ms_df)

hv_all_vids_ms_dem.to_csv(os.path.join(output_path, 'home_all_vids_ms_demographics.csv')) 
hv_all_vids_ms_n.to_csv(os.path.join(output_path, 'home_all_vids_ms_counts.csv')) 

## ISSUE - some participants who sent videos are not included on RedCap consent reports 

total unique bw_ids in df
35
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    35
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
35
MS df count bw_id rows with data
35
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic           26
Other/Unknown/Declined        3
Asian                         3
Other Pacific Islander        1
Hispanic or Latino            1
Black Or African American     1
Name: count, dtype: int64


In [23]:
# participants that consented but did not send videos 
#ids in consent_all not in  hv_all_vids_df
hv_no_vids_sent_df = consent_all_ms_df.loc[~consent_all_ms_df['bw_id'].isin(hv_all_vids_df['bw_id'])]
hv_no_vids_sent_df = merge_race_ethnicity(hv_no_vids_sent_df)

# demographics summary 
hv_no_vids_sent_ms_df, hv_no_vids_sent_hc_df = split_MS_HC_first_visit(hv_no_vids_sent_df)
hv_no_vids_sent_ms_dem, hv_no_vids_sent_ms_n  = demographic_summary(hv_no_vids_sent_ms_df)

hv_no_vids_sent_ms_dem.to_csv(os.path.join(output_path, 'home_no_vids_ms_demographics.csv')) 
hv_no_vids_sent_ms_n.to_csv(os.path.join(output_path, 'home_no_vids_ms_counts.csv'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['race_ethnicity_clean'] = ''


total unique bw_ids in df
35
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    35
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
35
MS df count bw_id rows with data
35
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic                  20
Other/Unknown/Declined               5
Hispanic or Latino                   4
Asian                                3
Black Or African American            2
American Indian Or Alaska Native     1
Name: count, dtype: int64


In [24]:
# videos included in analysis 
hv_all_inclu_w_bw_df = pd.read_csv(included_hv_w_bw_path, index_col = 0) 
hv_all_inclu_w_bw_df = merge_race_ethnicity(hv_all_inclu_w_bw_df)

# demographics summary 
hv_all_inclu_ms_df, hv_all_inclu_hc_df = split_MS_HC_first_visit(hv_all_inclu_w_bw_df)
hv_all_inclu_ms_dem, hv_all_inclu_ms_n = demographic_summary(hv_all_inclu_ms_df)

hv_all_inclu_ms_dem.to_csv(os.path.join(output_path, 'home_all_inclu_ms_demographics.csv')) 
hv_all_inclu_ms_n.to_csv(os.path.join(output_path, 'home_all_inclu_ms_counts.csv')) 

total unique bw_ids in df
27
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    27
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
27
MS df count bw_id rows with data
27
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic        20
Other/Unknown/Declined     3
Asian                      2
Other Pacific Islander     1
Hispanic or Latino         1
Name: count, dtype: int64


In [25]:
# participants that sent videos but none were included in analysis 
#ids in hv_all_vids_df not in  hv_all_inclu_w_bw_df
hv_excluded_vids_df = hv_all_vids_df.loc[~hv_all_vids_df['bw_id'].isin(hv_all_inclu_w_bw_df['bw_id'])]
hv_excluded_vids_df = merge_race_ethnicity(hv_excluded_vids_df)

# demographics summary 
hv_excluded_vids_ms_df, hv_excluded_vids_hc_df = split_MS_HC_first_visit(hv_excluded_vids_df)
hv_excluded_vids_ms_dem, hv_excluded_vids_ms_n = demographic_summary(hv_excluded_vids_ms_df)

hv_excluded_vids_ms_dem.to_csv(os.path.join(output_path, 'home_all_exclu_ms_demographics.csv')) 
hv_excluded_vids_ms_n.to_csv(os.path.join(output_path, 'home_all_exclu_ms_counts.csv')) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2['race_ethnicity_clean'] = ''


total unique bw_ids in df
8
--------------
any duplicate bw_ids in first visit df?
0
--------------
df_first_visit demographic diagnosis counts
demographic_diagnosis
MS    8
Name: count, dtype: int64
--------------
MS and HC rows should match from table above
rows in final MS df
8
MS df count bw_id rows with data
8
rows in final HC df
0
HC df count bw_id rows with data
0
race_ethnicity_clean
White Not Hispanic           6
Asian                        1
Black Or African American    1
Name: count, dtype: int64
