In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import scipy.stats as stats 
import numpy as np
import os 

# Gait Metric Analysis 
- For zeno videos: accuracy of video metrics vs mat metrics
- For zeno and home videos: associations of video metrics with clinical outcomes

# Define Analysis Functions 

In [2]:
# create ordinal value of EDSS severity 
# 0-2 (mild), 2.5-4 (moderate), 4.5+ (severe)

# Function to categorize EDSS severity
def categorize_edss(edss_value):
    if 0 <= edss_value <= 2:
        return 1, 'mild'
    elif 2.5 <= edss_value <= 4:
        return 2, 'moderate'
    elif edss_value >= 4.5:
        return 3, 'severe'
    else:
        return None, None  # Handle cases outside the defined ranges

In [3]:
# create ordinal value for T25FW 
def categorize_t25fw(t25fw_value):
    if 0 <= t25fw_value < 6:
        return 1, 'under_6'
    elif 6 <= t25fw_value <= 7.99:
        return 2, '6_to_8'
    elif t25fw_value >= 8:
        return 3, 'over_8'
    else:
        return None, None  # Handle cases outside the defined ranges

In [4]:
# task = gait_vertical_PWS_1 or gait_vertical_FW_1
def merge_bw_zv(bw_df, zv_df, task, out_path):

    # filter zv to only include one task (bw drop columns, zv drop rows) 
    zv_task_df = zv_df[zv_df['task_pose_zv'] == task]
    print('confirm all one task')
    print(pd.unique(zv_task_df['task_pose_zv']))

    print('total zeno videos') 
    print(len(zv_task_df))

    # drop bw columns to only include one task 
    if task == 'gait_vertical_PWS_1':
        bw_df = bw_df.drop(['FW_stridetimesecmean', 'FW_stridetimeseccv','FW_cadencestepsminmean','FW_totaldsupportmean', 
                            'FW_singlesupportmean','FW_totaldsupportratiolr', 'FW_singlesupportratiolr', 
                            'FW_stridewidthcmmean','FW_stridewidthcmsd'], axis = 1)
    elif task == 'gait_vertical_FW_1':
         bw_df = bw_df.drop(['PWS_stridetimesecmean', 'PWS_stridetimeseccv','PWS_cadencestepsminmean','PWS_totaldsupportmean', 
                             'PWS_singlesupportmean','PWS_totaldsupportratiolr', 'PWS_singlesupportratiolr', 
                             'PWS_stridewidthcmmean','PWS_stridewidthcmsd'], axis = 1)
        

    # filter bw ids dataset to only those included in video data set 
    zv_in_bw_df = bw_df[bw_df['bw_id'].isin(zv_task_df['id_video'])]
    zv_in_bw_df.to_csv(os.path.join(out_path, 'zv_id_in_bw_df_' + task + '.csv')) # save excel 

    print('total bw rows with id in video dataset') 
    print(len(zv_in_bw_df))

    # merge bw data set rows with zeno videos rows 
        # merge bw data set rows with zeno videos rows 
        # id and date needs to be the same 
        # should only use each brainwalk visit once - once PWS_1 video per person 

    merged_bw_zv = []

    # Loop through each row in zv_task_df
    for index, zv_row in zv_task_df.iterrows():
   
        current_id = zv_row['id_video']
        current_date = zv_row['visit_date_video']
        zv_row_df = pd.DataFrame([zv_row])

        # Find rows in brainwalk data set with same id and same date as current zv data 
        zv_in_bw_current_id_rows = zv_in_bw_df[(zv_in_bw_df['bw_id'] == current_id) & (zv_in_bw_df['visit_date'] == current_date)]
        #zv_in_bw_current_id_date_rows = zv_in_bw_current_id_rows[zv_in_bw_current_id_rows['visit_date'] == current_date]
   
        if len(zv_in_bw_current_id_rows) == 1: 
            bw_row_to_merge = zv_in_bw_current_id_rows
            # merge bw and zv, store merged row
            merged_row = zv_row_df.merge(bw_row_to_merge, left_on='id_video', right_on='bw_id')
            merged_bw_zv.append(merged_row)
        
        # if more than one row for the id and date, pick one with least na values 
        elif len(zv_in_bw_current_id_rows) > 1:
            bw_row_to_merge = zv_in_bw_current_id_rows.loc[[zv_in_bw_current_id_rows.isna().sum(axis=1).idxmin()]]
            # merge bw and zv, store merged row
            merged_row = zv_row_df.merge(bw_row_to_merge, left_on='id_video', right_on='bw_id')
            merged_bw_zv.append(merged_row)

            print('multiple rows for the id and date combo')
            print(current_id)
            print(current_date)

        else: 
            print('No matching id and daterow from video vs mat')
            print(current_id)
            print(current_date)


    # merge all bw and zv data together 
    merged_bw_zv_df = pd.concat(merged_bw_zv)
    merged_bw_zv_df = merged_bw_zv_df.reset_index(drop=True) # reset index 

    # check same ID for each row 
    print('mismatched zeno video vs brainwalk id')
    print(sum(merged_bw_zv_df['id_video'] != merged_bw_zv_df['bw_id']))

    print('mismatched zeno video vs brainwalk date')
    print(sum(merged_bw_zv_df['visit_date_video'] != merged_bw_zv_df['visit_date']))

    # saved merged df for future reference 
    merged_bw_zv_df.to_csv(os.path.join(out_path,  'zv_bw_merged_' + task + '.csv'))

    return merged_bw_zv_df
    

In [5]:
# merge home video data with preferred walking speed mat data 
# participants walk at preferred pace at home 

def merge_bw_hv(bw_df, hv_df, task, out_path):
    # filter zv to only include one task (bw drop columns, zv drop rows) 
    hv_task_df = hv_df[hv_df['task_pose_hv'] == task]
    print('confirm all one task')
    print(pd.unique(hv_task_df['task_pose_hv']))

    print('total home videos') 
    print(len(hv_task_df))

    # drop FW data from bw dataset 
   # bw_df = bw_df.drop(['FW_stridetimesecmean', 'FW_stridetimeseccv','FW_cadencestepsminmean','FW_totaldsupportmean',
                     #   'FW_singlesupportmean','FW_totaldsupportratiolr', 'FW_singlesupportratiolr', 
                     #   'FW_stridewidthcmmean','FW_stridewidthcmsd'], axis = 1)

    # filter bw ids dataset to only those included in video data set 
    hv_in_bw_df = bw_df[bw_df['bw_id'].isin(hv_task_df['id_video'])]
    hv_in_bw_df.to_csv(os.path.join(out_path, 'hv_id_in_bw_df.csv')) # save excel 

    print('total bw rows with id in video dataset') 
    print(len(hv_in_bw_df))

    # Track used rows from zv_in_bw_df
    used_indices = set()

    # Helper function to find the closest date
    def find_closest_date_unique(row, in_bw_df):
        # Filter rows with the same 'bw_id' and not already used
        filtered_df = in_bw_df[(in_bw_df['bw_id'] == row['id_video'])] # & (~in_bw_df.index.isin(used_indices)) - add after video'] for unique only
        if filtered_df.empty:
            return None
        
        # Find the closest date
        closest_idx = (filtered_df['visit_date'] - row['visit_date_video']).abs().idxmin()
       # used_indices.add(closest_idx)  # Mark the row as used
        return filtered_df.loc[closest_idx]

    # Apply the helper function row-wise
    closest_rows = hv_task_df.apply(
        lambda row: find_closest_date_unique(row, hv_in_bw_df), axis=1
    )
    
    # Convert the results into a DataFrame
   # closest_rows_df = pd.DataFrame(closest_rows.tolist(), index=hv_task_df.index)

    # Merge the original `zv_task_df` with `closest_rows_df`
    merged_bw_hv_df = hv_task_df.merge(closest_rows, left_index=True, right_index=True, suffixes=('', '_closest'))

    # add column for date diff 
    merged_bw_hv_df['bw_hv_abs_date_diff'] = abs(merged_bw_hv_df['visit_date'] - merged_bw_hv_df['visit_date_video'])
    
    # check same ID for each row 
    print('mismatched home video vs brainwalk id')
    print(sum(merged_bw_hv_df['id_video'] != merged_bw_hv_df['bw_id']))
    
    return merged_bw_hv_df

In [6]:
# df input should be merged df - both video and bw data 

def print_video_counts(df):
    # number of zeno videos and participants included 
    print('total videos - df length: ' + 
          str(len(df))) 

    print('unique demographic_diagnosis in df: ' + 
         str(pd.unique(df['demographic_diagnosis'])))
    
    print('num videos with demographic_diagnosis == HC: ' + 
         str(len(df[df['demographic_diagnosis'] == 'HC']))) 

    print('num videos demographic_diagnosis == MS: ' + 
         str(len(df[df['demographic_diagnosis'] == 'MS']))) 

    print('------')

    print('unique id_video (participants) in df: ' + 
          str(len(pd.unique(df['id_video'])))) 

    print('num participants with demographic_diagnosis == HC: ' + 
         str(len(pd.unique(df['id_video'][df['demographic_diagnosis'] == 'HC'])))) 
    
    print('num participants demographic_diagnosis == MS: ' + 
         str(len(pd.unique(df['id_video'][df['demographic_diagnosis'] == 'MS']))))

    print('------') 
    
    print('number of participants with multiple videos in dataset: ' + 
         str(df['id_video'][df['id_video'].duplicated()].nunique()))

    print('number of participants with one video: ' + 
          str((df['id_video'].value_counts() == 1).sum()))

In [7]:
# cols to check 
    # string, either - PWS, FW, t25fw, or edss 

def drop_cols_missing_data(df, cols_to_check):
    # drop row if all PWS mat vars are nan
    if cols_to_check == 'pws': 
        subset_columns = ['PWS_stridetimesecmean',
                          'PWS_stridetimeseccv',
                          'PWS_cadencestepsminmean',
                          'PWS_totaldsupportmean',
                          'PWS_singlesupportmean', 
                          'PWS_totaldsupportratiolr',
                          'PWS_singlesupportratiolr',
                          'PWS_stridewidthcmmean', 
                          'PWS_stridewidthcmsd']

    # drop row if all FW mat vars are nan 
    elif cols_to_check == 'fw':
        subset_columns = ['FW_stridetimesecmean',
                          'FW_stridetimeseccv',
                          'FW_cadencestepsminmean',
                          'FW_totaldsupportmean',
                          'FW_singlesupportmean', 
                          'FW_totaldsupportratiolr',
                          'FW_singlesupportratiolr',
                          'FW_stridewidthcmmean', 
                          'FW_stridewidthcmsd']
        
    # drop row if edss is nan
    elif cols_to_check == 'edss':
        subset_columns = ['bingoEHR_EDSS_measure_value']

    # drop row is tw5fw is nan 
    elif cols_to_check == 't25fw':
        subset_columns = ['msfcEHR_T25FW SPEED AVG']

    df_missing_rows_dropped = df.dropna(axis = 0, 
                                        how = 'all',
                                        subset = subset_columns)

    return df_missing_rows_dropped


In [8]:
# test normality: histograms and shapiro test 
def hist_and_shapiro(df, hist_out_path): 
    results = []
    histogram_folder = os.path.join(hist_out_path, 'histograms')
    if not os.path.exists(histogram_folder):
        os.makedirs(histogram_folder)

    for column in df.columns:
        if df[column].dtype == 'float64':
            # histogram 
            plt.figure(figsize=(8, 6))
            plt.hist(df[column], bins=30, color='skyblue', edgecolor='black')
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')
            plt.grid(True)
            plt.savefig(os.path.join(histogram_folder, f'{column}.png'))
            plt.close()

            # shapiro wilks test 
            # check for missing data 
            non_missing_data = df[column].dropna()
            n = len(non_missing_data) 

            if n > 3: 
                # Perform Shapiro-Wilk test
                stat, p_value = stats.shapiro(non_missing_data)  
            else: 
                stat = np.nan
                p_value = np.nan 
            
            results.append({'Column': column, 'non_missing_observations': n, 'Statistic': stat, 'P-value': p_value})

    # Create DataFrame with results
    shapiro_results_df = pd.DataFrame(results)
    
    # if p value less than 0.05, data is not normally distributed 
    shapiro_results_df['normal'] = shapiro_results_df['P-value'].apply(lambda x: 'no' if x < .05 else 'yes')
    shapiro_results_df['test'] = shapiro_results_df['P-value'].apply(lambda x: 'spearman' if x < .05 else 'pearson')
    shapiro_results_df.round(3)
    
    return(shapiro_results_df)

In [9]:
# correlation of same metrics from difference data sources 

def metric_correlation(df, video_columns, bw_columns, output_folder_path): 
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    # create empty list to store results 
    corr_results_all = [] 
    clean_df = pd.DataFrame() 
    
    for metric_i, current_metric in enumerate(video_columns): 
        current_vid_col = video_columns[metric_i]
        current_bw_col = bw_columns[metric_i]

        # Drop rows with NaN values in either column - required to run spearman r 
        clean_df = df.dropna(subset=[current_vid_col, current_bw_col])
        
        # plot 
        #sns.lmplot(x = current_bw_col, y = current_vid_col, data = clean_df, ci = None)
        sns.scatterplot(x = current_bw_col, y = current_vid_col, data = clean_df)
        # Set the x and y axis limits to the same range
     #   plt.axis('square')  
      #  min_val = min(clean_df[current_vid_col].min(), clean_df[current_vid_col].min())  
      #  max_val = max(clean_df[current_vid_col].max(), clean_df[current_vid_col].max())  
      #  plt.xlim(min_val, max_val)
      #  plt.ylim(min_val, max_val) 
        plt.savefig(os.path.join(output_folder_path, str(current_vid_col + '_vs_' + current_bw_col + '.png')))
        plt.close()

        # run spearman correlation and append   
        statistic, p_value = stats.spearmanr(clean_df[current_bw_col], clean_df[current_vid_col])
        corr_results_all.append({'bw_column': current_bw_col, 
                                 'video_column': current_vid_col, 
                                 'corr_method': 'spearman' , 
                                 'rs': statistic, 
                                 'p_value' : p_value,
                                 'n observations': len(clean_df)})

    # Create DataFrame with results
    corr_results_df = pd.DataFrame(corr_results_all)
    corr_results_df = corr_results_df.round(3)

    return corr_results_df
    

In [10]:
# use same columns as metric correlations 
def calculate_metric_mean_error(df, video_columns, bw_columns, units, output_folder_path):
    # one dot = participant 

    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path) 
        
    mean_error_all = [] 

    for metric_i, current_metric in enumerate(video_columns): 
        current_vid_col = video_columns[metric_i]
        current_bw_col = bw_columns[metric_i]
        current_unit = units[metric_i]

        # Drop rows with NaN values in either column 
        clean_df = df.dropna(subset=[current_vid_col, current_bw_col])

        current_metric_diff = clean_df[current_bw_col] - clean_df[current_vid_col]
        current_mean_diff = current_metric_diff.mean()
        current_abs_mean_diff = abs(current_metric_diff).mean()

        # calculate mean ground truth data 
        bw_mean = clean_df[current_bw_col].mean()
        mean_err_per = (current_mean_diff / bw_mean) * 100 
        mae_per = (current_abs_mean_diff / bw_mean) * 100 

        # plot 
        fig, ax1 = plt.subplots()
        sns.boxplot(y=current_metric_diff, ax=ax1, fill = False, dodge = True, fliersize = 0)
        sns.stripplot(y = current_metric_diff, ax = ax1, color = 'black', dodge = True)
        fig.suptitle('Mat Metric - Video Metric')
        ax1.set_title(current_bw_col + ' - ' + current_vid_col)
        # center plot at zero
        ymin, ymax = plt.ylim()
        plt.ylim(min(ymin, -ymax), max(ymax, -ymin))
        plt.ylabel(current_unit)
        # add line at zero
        plt.axhline(y=0, color='grey', linestyle='--')
        plt.tight_layout()
        plt.savefig(os.path.join(output_folder_path, str(current_vid_col + '_vs_' + current_bw_col + '_diff_box.png')))
        plt.close()

        # mean difference 
        mean_error_all.append({'bw_column': current_bw_col, 
                               'video_column': current_vid_col,
                               'n' : len(clean_df), 
                               'bw_metric_mean' : bw_mean,
                               'mean_error': current_mean_diff, 
                               'mean_abs_error' : current_abs_mean_diff, 
                               'mean_error_%_of_mean' : mean_err_per,
                               'mae_%_of_mean' : mae_per})

    
     # Create DataFrame with results
    mean_error_df = pd.DataFrame(mean_error_all)
    mean_error_df = mean_error_df.round(3)
    
    return mean_error_df

In [11]:
# use same columns as metric correlation 

def bland_altman_plot(df, video_columns, bw_columns, units, output_folder_path):
     
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    for metric_i, current_metric in enumerate(video_columns): 
        current_bw_col = bw_columns[metric_i]
        current_vid_col = video_columns[metric_i]
        current_unit = current_unit = units[metric_i]

        clean_df = df.dropna(subset=[current_vid_col, current_bw_col])

    
        # Compute the mean and the difference
        mean_measurements = (clean_df[current_bw_col] + clean_df[current_vid_col]) / 2
        diff_measurements = clean_df[current_bw_col] - clean_df[current_vid_col]  # Difference between measurements

        # Mean difference and standard deviation of the difference
        mean_diff = np.mean(diff_measurements)
        std_diff = np.std(diff_measurements)

        # Plot the data
        plt.figure(figsize=(8, 6))
        plt.scatter(mean_measurements, diff_measurements, alpha=0.5)
    
        # Add mean difference line and limits of agreement (±1.96*std)
        plt.axhline(mean_diff, color='black', linestyle='--', label=f'Mean diff: {mean_diff:.2f}')
        plt.axhline(mean_diff + 1.96 * std_diff, color='red', linestyle='--', label=f'+1.96 SD: {mean_diff + 1.96 * std_diff:.2f}')
        plt.axhline(mean_diff - 1.96 * std_diff, color='blue', linestyle='--', label=f'-1.96 SD: {mean_diff - 1.96 * std_diff:.2f}')
        plt.axhline(y=0, color='grey', linestyle='--')
    
        # Labels and title
        plt.xlabel('Mean of Zeno Mat vs Video Pose Metric (' + current_unit + ')') 
        plt.ylabel('Zeno mat - Video Pose Metric (' + current_unit + ')')
        plt.title(current_bw_col + ' vs ' + current_vid_col)
        plt.legend()
        plt.savefig(os.path.join(output_folder_path,  str(current_vid_col + '_vs_' + current_bw_col + '_blandalt.png')))
        plt.close()

In [12]:
# correlation with clinical outcomes 
def outcome_correlation(df, output_folder_path, video_task_str, outcome_str): 
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    if not os.path.exists(os.path.join(output_folder_path, 'edss_plot')):
        os.makedirs(os.path.join(output_folder_path, 'edss_plot'))

    if not os.path.exists(os.path.join(output_folder_path, 't25fw_plot')):
        os.makedirs(os.path.join(output_folder_path, 't25fw_plot'))

    # drop date time columns 
    if 'bw_hv_abs_date_diff' in df.columns:
        df = df.drop(columns=['bw_hv_abs_date_diff'])
        
    # drop nonnumeric columns 
    numeric_cols = df.select_dtypes(include=['number']).columns
    ordinal_cols = df.select_dtypes(include=['category']).columns

    # Keep only numeric and ordinal columns
    df_num = df[numeric_cols.union(ordinal_cols)]

    # Create an empty DataFrame to store the Spearman correlation coefficients
    n_cols = df_num.shape[1]

    corr_matrix = pd.DataFrame(np.zeros((n_cols, n_cols)), columns=df_num.columns, index=df_num.columns)
    pvalue_matrix = pd.DataFrame(np.zeros((n_cols, n_cols)), columns=df_num.columns, index=df_num.columns)
    n_videos_matrix = pd.DataFrame(np.zeros((n_cols, n_cols)), columns=df_num.columns, index=df_num.columns)
    
    # Compute Spearman correlation for each pair of columns
    for col1 in df_num.columns:
        for col2 in df_num.columns:
            # drop rows if col1 and col2 are both nan
            df_num_clean = df.dropna(subset=[col1, col2])

            # spearman correlation 
            corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])

            # save results in matrix 
            corr_matrix.loc[col1, col2] = corr
            pvalue_matrix.loc[col1, col2] = p_value
            n_videos_matrix.loc[col1, col2] = len(df_num_clean) # number of rows with data for both columns 

            if col1 == 'bingoEHR_EDSS_measure_value':
                sns.scatterplot(data=df_num_clean, x=col1, y=col2) 
                plt.savefig(os.path.join(output_folder_path, 'edss_plot', str(video_task_str + '_' + col2 + '.png')))
                plt.close()

            if col1 == 'msfcEHR_T25FW SPEED AVG': 
                sns.scatterplot(data=df_num_clean, x=col1, y=col2) 
                plt.savefig(os.path.join(output_folder_path, 't25fw_plot', str(video_task_str + '_' + col2 + '.png')))
                plt.close()

    #  Plot and save the heatmap 
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0)
    plt.title("Spearman Rank Correlation Heatmap")
    plt.savefig(os.path.join(output_folder_path,  video_task_str + '_' + outcome_str + '_heatmap.png'))
    plt.close()

    # select specific columns from the three matrices and concatenate into single data frame 
    if outcome_str == 'edss': 
        corr_df = pd.concat([corr_matrix['bingoEHR_EDSS_measure_value'],
                             pvalue_matrix['bingoEHR_EDSS_measure_value'],
                             n_videos_matrix['bingoEHR_EDSS_measure_value'],
                             corr_matrix['edss_severity_cat'],
                             pvalue_matrix['edss_severity_cat'],
                             n_videos_matrix['edss_severity_cat']],
                            axis = 1)
        
        corr_df.columns = ['edss_score_statistic', 'edss_score_p_value', 'edss_score_n_videos',
                           'edss_severity_cat_statistic', 'edss_severity_cat_p_value', 'edss_severity_n_videos']

    elif outcome_str == 't25fw': 
        corr_df = pd.concat([corr_matrix['msfcEHR_T25FW SPEED AVG'],
                             pvalue_matrix['msfcEHR_T25FW SPEED AVG'],
                             n_videos_matrix['msfcEHR_T25FW SPEED AVG']], 
                     axis = 1)
        
        corr_df.columns = ['t25fw_correlation_statistic', 't25fw_correlation_p_value', 't25fw_correlation_n_videos']


    corr_df = corr_df.round(3)

    return corr_df


In [13]:
def check_ttest_anova_assumptions(df, group_col, vid_metric_col_suffix, mat_metric_col_prefix, output_folder_path, video_task_str): 
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # select numeric values with metric_col_suffix (either all zv or hv) - only video metrics 
    df_metrics = df.select_dtypes(include=['number'])
    df_metrics = df_metrics.loc[:, 
        df_metrics.columns.str.endswith(vid_metric_col_suffix) | df_metrics.columns.str.startswith(mat_metric_col_prefix)]

    # count number of cateogries in group_col 
    x_cat_groups = df[group_col].unique()
    print('groups')
    print(x_cat_groups)

    # create save folder for histograms 
    if not os.path.exists(os.path.join(output_folder_path, 'check_normality')):
        os.makedirs(os.path.join(output_folder_path, 'check_normality'))
    
    # loop through each metric column and plot vs group_col variable histogram 
    for col_i, current_metric_col in enumerate(df_metrics.columns): 

        # check normality of each y column, grouped by current x value 
        plt.figure(figsize = (10,6))

        # Save for storing shapiro results and standard deviation 
        results_text_on_plot = []
        
        for current_group in x_cat_groups: 
            current_group_values = df[df[group_col] == current_group][current_metric_col]
            sns.histplot(data = current_group_values, alpha = 0.5, label = f'Group {current_group}')

            if current_group_values.count() > 3:
                # perform shapiro walks test for normality 
                stat, p_value = stats.shapiro(current_group_values, nan_policy='omit')
                # save standard deviation for each group 
                sd = current_group_values.std(skipna = True)
                
            else:  
                stat = np.nan
                p_value = np.nan
                sd = np.nan

            results_text_on_plot.append((current_group, np.round(stat, decimals=3), np.round(p_value, decimals = 3), np.round(sd, decimals = 3)))

        # plot title and legend 
        plt.title('Metric Values by group') 
        plt.legend()
        
        # Annotate Shapiro-Wilk results on the plot
        plt.text(0.05, 0.95, 'Hypothesis test assumptions', transform=plt.gca().transAxes)
        text_y_position = 0.9  # Start near the top of the plot
        for result in results_text_on_plot:
            current_group, stat, p_value, sd = result
            plt.text(
                0.05, text_y_position, 
                f'Group {current_group}: Shapiro Stat ={stat}, Shapiro p={p_value}, SD = {sd}', 
                transform=plt.gca().transAxes
            )
            text_y_position -= 0.05  # Move down for the next annotation
        
        plt.savefig(os.path.join(output_folder_path, 'check_normality', current_metric_col + '_hist.png'))
        plt.close()

In [14]:
def boxplot_cat_allmetrics(df, group_col, metric_col_suffix, output_folder_path, video_task_str): 

    # create save folder for boxplots 
    if not os.path.exists(os.path.join(output_folder_path, 'all_video_metrics')):
        os.makedirs(os.path.join(output_folder_path, 'all_video_metrics'))
    
    # select numeric values with metric_col_suffix (either all zv or hv) - only metrics 
    df_metrics = df.select_dtypes(include=['number'])
    df_metrics = df_metrics.loc[:, df_metrics.columns.str.endswith(metric_col_suffix)]

    # loop through each metric column and plot boxplot with individual data points, grouped by group_col on x axis 
    for col_i, current_metric_col in enumerate(df_metrics.columns): 
        
        # BOXPLOT 
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=df, x=group_col, y=current_metric_col, width = 0.75, fliersize=0,  fill = False)
        # Add in points to show each observation
        sns.stripplot(data=df, x=group_col, y=current_metric_col, size=4, color=".3")
        ax.set_title(f'{current_metric_col} vs {group_col}') 

        ax.set_ylim(bottom=0)

        # Save the plot
        fig_path = os.path.join(output_folder_path, 'all_video_metrics', video_task_str + '_' + current_metric_col + '.png')
        fig.savefig(fig_path)
        plt.close()


In [15]:
# Mann whitney U function - loop through all y vals 
def mannwhitneyu_all_metrics(df, group_col, group_1, group_2, metric_col_suffix, output_folder_path, video_task_str): 
    
    # select numeric values with metric_col_suffix (either all zv or hv) - only metrics 
    df_metrics = df.select_dtypes(include=['number'])
    df_metrics = df_metrics.loc[:, df_metrics.columns.str.endswith(metric_col_suffix)]

    # loop through each metric column, group by group 1 or two, run mann whitney on this group 
    stats_results = []
    for col_i, current_metric_col in enumerate(df_metrics.columns): 
        group1_data = df[df[group_col] == group_1][current_metric_col]
        group1_n = group1_data.count()
        group2_data = df[df[group_col] == group_2][current_metric_col] 
        group2_n = group2_data.count()
        U1, p = stats.mannwhitneyu(group1_data, group2_data, nan_policy='omit')
        stats_results.append((current_metric_col, group_col, 
                              group_1, group1_n, 
                              round(group1_data.mean(),3),round(group1_data.median(),3), round(group1_data.std(),3),
                              group_2, group2_n, 
                              round(group2_data.mean(),3),round(group2_data.median(),3), round(group2_data.std(),3),
                              'mannwhitneyu', round(U1, 3), round(p, 3)))

    # create save folder for excel  
    if not os.path.exists(os.path.join(output_folder_path, 'all_video_metrics')):
        os.makedirs(os.path.join(output_folder_path, 'all_video_metrics'))

    stats_results_df = pd.DataFrame(stats_results, columns = ['metric', group_col, 
                                                              'group1', 'group1_n', 
                                                              'group1_mean', 'group1_median', 'group1_std',
                                                              'group2', 'group2_n', 
                                                              'group2_mean', 'group2_median', 'group2_std',
                                                              'stats_test', 
                                                              'U1', 'p'])
    
    stats_results_df.to_csv(os.path.join(output_folder_path, 'all_video_metrics', 'mannwhitney_' + video_task_str + '.csv'))
    return(stats_results_df)

In [16]:
# Kruskal Wallace and Dunn's test function for three groups 
def kruskalwallace_all_metrics(df, group_col, group_1, group_2, group_3, metric_col_suffix, output_folder_path, video_task_str):
    # select numeric values with metric_col_suffix (either all zv or hv) - only metrics
    df_metrics = df.select_dtypes(include=['number'])
    df_metrics = df_metrics.loc[:, df_metrics.columns.str.endswith(metric_col_suffix)]

    # loop through each metric column, group by three groups and run kruskal wallace 
    stats_results = []
    for col_i, current_metric_col in enumerate(df_metrics.columns):
        group1_data = df[df[group_col] == group_1][current_metric_col]
        group1_n = group1_data.count()
        group2_data = df[df[group_col] == group_2][current_metric_col]
        group2_n = group2_data.count()
        group3_data = df[df[group_col] == group_3][current_metric_col]
        group3_n = group3_data.count()
         
        H_stat, p = stats.kruskal(group1_data, group2_data, group3_data, nan_policy = 'omit') 
        stats_results.append((current_metric_col, group_col, 
                              group_1, group1_n, 
                              round(group1_data.mean(),3),round(group1_data.median(),3), round(group1_data.std(),3),
                              group_2, group2_n,
                              round(group2_data.mean(),3),round(group2_data.median(),3), round(group2_data.std(),3),
                              group_3, group3_n,
                              round(group3_data.mean(),3),round(group3_data.median(),3), round(group3_data.std(),3),
                              'kruskal', round(H_stat, 3), round(p, 3))) 

    # save results for each metric
    stats_results_df = pd.DataFrame(stats_results, 
                                    columns = ['metric', group_col, 
                                               'group1', 'group1_n', 
                                               'group1_mean', 'group1_median', 'group1_std',
                                               'group2', 'group2_n', 
                                               'group2_mean', 'group2_median', 'group2_std',
                                               'group3', 'group3_n',
                                               'group3_mean', 'group3_median', 'group3_std',
                                               'stats_test', 'H', 'p'])


    # create save folder for excel  
    if not os.path.exists(os.path.join(output_folder_path, 'all_video_metrics')):
        os.makedirs(os.path.join(output_folder_path,'all_video_metrics'))
        
    stats_results_df.to_csv(os.path.join(output_folder_path, 'all_video_metrics', 'kruskal_' + video_task_str + '.csv'))
    return(stats_results_df)

In [17]:
def boxplot_vid_and_mat_inperson(video_pws_columns, video_fw_columns, mat_pws_columns, 
                                   mat_fw_columns, pws_data, fw_data, group_col,
                                   output_folder_path):

    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    # preferred walking speed ----------------------------------
    for metric_i, current_metric in enumerate(video_pws_columns):
        current_vid_pws_column = video_pws_columns[metric_i]
        current_mat_pws_column = mat_pws_columns[metric_i]

        # only select vars needed for plotting 
        subset_pws_df = pws_data.loc[:, ['id_date_pose_zv', 'task_pose_zv', group_col, current_vid_pws_column, current_mat_pws_column]]

        # rename var - add pws to be able to differentiate from fw video data 
        new_current_vid_pws_column = 'pws_' + current_vid_pws_column
        subset_pws_df.rename(columns = {current_vid_pws_column : new_current_vid_pws_column}, inplace = True)

        # melt long 
        pws_long_df = pd.melt(subset_pws_df,
                          id_vars=['id_date_pose_zv', group_col],  # Columns to keep
                          value_vars=[new_current_vid_pws_column, current_mat_pws_column],  
                          var_name='data_source',  
                          value_name='metric_value')
        

        # fast walking speed -----------------------------
        current_vid_fw_column = video_fw_columns[metric_i]
        current_mat_fw_column = mat_fw_columns[metric_i]

        # only select vars needed for plotting 
        subset_fw_df = fw_data.loc[:, ['id_date_pose_zv', 'task_pose_zv', group_col, current_vid_fw_column, current_mat_fw_column]]

        # rename var - add fw to be able to differentiate from pws video data 
        new_current_vid_fw_column = 'fw_' + current_vid_fw_column
        subset_fw_df.rename(columns = {current_vid_fw_column : new_current_vid_fw_column}, inplace = True)

        # melt long 
        fw_long_df = pd.melt(subset_fw_df,
                          id_vars=['id_date_pose_zv', group_col],  # Columns to keep
                          value_vars=[new_current_vid_fw_column, current_mat_fw_column],  
                          var_name='data_source',  
                          value_name='metric_value')
        
        # merge pw and fw and plot 
        all_long_df = pd.concat([pws_long_df, fw_long_df], ignore_index = True)

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=all_long_df, x=group_col, y='metric_value', hue='data_source', dodge = True, fliersize=0, fill = False)
        sns.stripplot(data=all_long_df, x=group_col, y='metric_value', hue ='data_source', dodge = True,  size=4, legend = False)
        plt.savefig(os.path.join(output_folder_path, current_metric + '_all_by' + group_col + '.png'))
        plt.close()

In [18]:
def boxplot_vid_and_mat_home(video_pws_columns, mat_pws_columns, mat_fw_columns, 
                             home_df,  group_col, output_folder_path):
    
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
    
    for metric_i, current_metric in enumerate(video_pws_columns):
        current_vid_pws_column = video_pws_columns[metric_i]
        current_mat_pws_column = mat_pws_columns[metric_i]
        current_mat_fw_column = mat_fw_columns[metric_i]

        # only select vars needed for plotting 
        subset_home_df = home_df.loc[:, ['id_date_pose_hv', 'task_pose_hv', group_col, 
                                              current_vid_pws_column, current_mat_pws_column, current_mat_fw_column]]

        # rename var - add pws for legend 
        new_current_vid_pws_column = 'pws_' + current_vid_pws_column
        subset_home_df.rename(columns = {current_vid_pws_column : new_current_vid_pws_column}, inplace = True)

        # melt long 
        home_long_df = pd.melt(subset_home_df,
                          id_vars=['id_date_pose_hv', group_col],  # Columns to keep
                          value_vars=[new_current_vid_pws_column, current_mat_pws_column, current_mat_fw_column],  
                          var_name='data_source',  
                          value_name='metric_value')

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=home_long_df, x=group_col, y='metric_value', hue='data_source', dodge = True, fliersize=0, fill = False)
        sns.stripplot(data=home_long_df, x=group_col, y='metric_value', hue ='data_source', dodge = True,  size=4, legend = False)
        plt.savefig(os.path.join(output_folder_path, current_metric + '_all_by' + group_col + '.png'))
        plt.close()

In [19]:
def metric_quartile_analysis(df, metric_col_suffix, anova_kruskal_cols, output_folder_path): 

    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    # select numeric values with metric_col_suffix (either all zv or hv) - only metrics 
    df_metrics = df.select_dtypes(include=['number'])
    df_metrics = df_metrics.loc[:, df_metrics.columns.str.endswith(metric_col_suffix)]
    
    if metric_col_suffix == 'zv': 
        df_metrics = df_metrics.drop('walk_segment_pose_zv', axis = 1)
    elif metric_col_suffix == 'hv':
        df_metrics = df_metrics.drop('walk_segment_pose_hv', axis = 1)
        
    video_columns = df_metrics.columns 
    
    # make blank df to store new quartile columns names 
    quartile_col_names = []
    
    # add new column with quartile and calculate describe stats for each metric quartiles 
    for metric_i, current_metric in enumerate(video_columns):
    
        # new column - for each metric, 1 = 0-.25 quantile, 2 = .25-.5, 3 = .5-.75, 4 = .75-1
        df = df.copy()
        df.loc[:, current_metric + '_quartile'] = pd.qcut(df[current_metric], 
                                                       q = 4, 
                                                       labels = ['Quartile 1',
                                                                 'Quartile 2',
                                                                 'Quartile 3',
                                                                 'Quartile 4'])
        quartile_col_names.append(current_metric + '_quartile')
        
        # group by the quartile columns and calculate describe stats for each metrics columns; save .csv 
        current_metric_descr_df = df.groupby([current_metric + '_quartile'], sort=True, observed = True).describe().round(3)
        current_metric_descr_df.to_csv(os.path.join(output_folder_path,  current_metric + '_describe_by_quartile.csv'))

    # Compare differences in metrics and outcomes between each video metrics quartile 
    # just run kruskal. describe by summary statistics saved as individual .csv file for each video metric 

    # save blank array to save results 
    print('running kruskal and plotting boxplots for each quartile')
    for quartile_col_i, current_quartile_col in enumerate(quartile_col_names):
        stats_results = []
        for stats_test_i, current_values_col in enumerate(anova_kruskal_cols): 
            # ---------------------------------------------------------
            # compare groups of current values col by quartile of current metric (current_quartile_col) 
            q1_data = df[df[current_quartile_col] == 'Quartile 1'][current_values_col]
            q2_data = df[df[current_quartile_col] == 'Quartile 2'][current_values_col]
            q3_data = df[df[current_quartile_col] == 'Quartile 3'][current_values_col]
            q4_data = df[df[current_quartile_col] == 'Quartile 4'][current_values_col]

            H_stat, p = stats.kruskal(q1_data, q2_data, q3_data, q4_data, nan_policy = 'omit')     
            stats_results.append((current_quartile_col, current_values_col,
                                  'kruskal', round(H_stat, 3), round(p, 3))) 

            # boxplot  ---------------------------------------------------
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(data=df, x=current_quartile_col, y=current_values_col, width = 0.75, fliersize=0,  fill = False)
            # Add in points to show each observation
            sns.stripplot(data=df, x=current_quartile_col, y=current_values_col, size=4, color=".3")
            ax.set_ylim(bottom=0)

            # Save the plot
            fig_path = os.path.join(output_folder_path, 
                                    current_quartile_col + '_' + current_values_col + '_boxplot.png')
            fig.savefig(fig_path)
            plt.close()

        # -----------------------------------------------
        # save on .csv file per metric grouped by quartile 
        stats_results_df = pd.DataFrame(stats_results,
                                        columns = ['quartile_grouped_by', 'values_compared_column',
                                               'stats_test', 'H', 'p'])
        stats_results_df.to_csv(os.path.join(output_folder_path,  current_quartile_col + '_kruskal_by_quartile.csv'))


# Run analysis 

## Manually update output folder, load data, and format columns 


In [20]:
# version 
version = '004'

In [21]:
# output folders 
#merged_data_path = r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis\004'
merged_data_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                version)

# out_path = r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis\004\video_vs_mat_and_outcomes'
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                        version, 
                        'video_vs_mat_and_outcomes') 

In [22]:
# load variables of interest 

# zeno video metrics 
zv_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code',
                       'gait_bw_zeno_outputs_' + version, 
                       'gait_bw_zeno_outputs_' + version +'_pose_metrics_all.csv')
      
zv_df = pd.read_csv(zv_path, index_col = 0)

# home video metrics 
hv_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code',
                       'gait_bw_home_outputs_' + version, 
                       'gait_bw_home_outputs_' + version + '_pose_metrics_all.csv')
                       
hv_df = pd.read_csv(hv_path, index_col = 0)

# BW and zeno mat metrics 
# decrypted file - may need to decrypt again if says file doesn't exit 
# copied file saved in Megan Project folder in Brainwalk box. 
# if issues decrypting, try copying original file again and then decrypting 
bw_df = pd.read_excel(r'C:\Users\mmccu\AppData\Local\Temp\ccsecure\2024_10_15_BrainWalk_AllData_Long_MM.xlsx', 
                     index_col = None, 
                     usecols = ['bw_id', 'record_id', 'visit_date', 'demoEHR_DiseaseDuration',
                                'demoEHR_GENDER', 'demoEHR_Age', 'demographic_diagnosis', 'bingoEHR_DX_MS DX', 'demoEHR_REC_1',	
                                'demoEHR_REC_2', 'bingoEHR_EDSS_measure_value', 'msfcEHR_T25FW SPEED AVG',
                                'falls_number', 'falls_since_lastsurvey', 'near_falls', 'near_falls_no', 
                                'strength_lt_leg', 'strength_max', 'strength_rt_leg', 'sum_total_scores',
                                'PWS_stridetimesecmean', 'PWS_stridetimeseccv','PWS_cadencestepsminmean','PWS_totaldsupportmean', 
                                'PWS_singlesupportmean','PWS_totaldsupportratiolr', 'PWS_singlesupportratiolr', 
                                'PWS_stridewidthcmmean','PWS_stridewidthcmsd',
                                'FW_stridetimesecmean', 'FW_stridetimeseccv','FW_cadencestepsminmean','FW_totaldsupportmean', 
                                'FW_singlesupportmean','FW_totaldsupportratiolr', 'FW_singlesupportratiolr', 
                                'FW_stridewidthcmmean','FW_stridewidthcmsd'])

In [23]:
# rename zeno video variables 
# ad zv to all zeno volumn names 
zv_df = zv_df.add_suffix('_zv')

# add bw id and video date to zv df 
zv_df['id_video'] = zv_df['id_date_pose_zv'].str.extract(r'(BW-\d{4})')
zv_df['visit_date_video'] = zv_df['id_date_pose_zv'].str[8:]
zv_df['visit_date_video'] = pd.to_datetime(zv_df['visit_date_video'].str.replace('_', '-'), format='%Y-%m-%d')

In [24]:
# add hv to all home column names 
hv_df = hv_df.add_suffix('_hv')

# add bw id and video date to hv df 
hv_df['id_video'] = hv_df['id_date_pose_hv'].str.extract(r'(BW-\d{4})')
hv_df['visit_date_video'] = hv_df['id_date_pose_hv'].str[8:]
hv_df['visit_date_video'] = pd.to_datetime(hv_df['visit_date_video'], errors = 'coerce')

  hv_df['visit_date_video'] = pd.to_datetime(hv_df['visit_date_video'], errors = 'coerce')


In [25]:
# add columns for ordinal EDSS severity and t25fw 
bw_df['edss_severity_num'], bw_df['edss_severity_cat'] = zip(*bw_df['bingoEHR_EDSS_measure_value'].apply(categorize_edss))
bw_df['edss_severity_cat'] = pd.Categorical(bw_df['edss_severity_cat'], categories=["mild", "moderate", "severe"], ordered=True)

# Apply the function to create new columns
bw_df['t25fw_group_num'], bw_df['t25fw_group_cat'] = zip(*bw_df['msfcEHR_T25FW SPEED AVG'].apply(categorize_t25fw))
bw_df['t25fw_group_cat'] = pd.Categorical(bw_df['t25fw_group_cat'], categories=["under_6", "6_to_8", "over_8"], ordered=True)
bw_df.head()

Unnamed: 0,bw_id,record_id,visit_date,demoEHR_DiseaseDuration,demoEHR_GENDER,demoEHR_Age,bingoEHR_DX_MS DX,demoEHR_REC_1,demoEHR_REC_2,bingoEHR_EDSS_measure_value,...,near_falls,near_falls_no,strength_lt_leg,strength_max,strength_rt_leg,sum_total_scores,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat
0,BW-0001,1,2022-08-26,,,,,,,,...,,,,,,46.0,,,,
1,BW-0006,10,2022-09-26,17.0,Female,64.0,PPMS (Primary-progressive Multiple Sclerosis),WhiteAsian,WhiteNonHispanic,3.0,...,No,,,,,46.0,2.0,moderate,1.0,under_6
2,BW-0086,100,2023-08-08,,,,,,,,...,No,,,,,46.0,,,,
3,BW-0087,101,2023-01-18,,,,,,,,...,Yes,5.0,,,,46.0,,,,
4,BW-0088,102,2023-01-23,0.0,Female,33.0,pending,Other/Decline/Unknown,Other/Decline/Unknown,,...,Yes,,,,,46.0,,,2.0,6_to_8


### Merge video data with brainwalk data

In [26]:
# zeno videos - preferred walking speed 
bw_zv_pws_df = merge_bw_zv(bw_df, zv_df, 'gait_vertical_PWS_1', merged_data_path) 

confirm all one task
['gait_vertical_PWS_1']
total zeno videos
253
total bw rows with id in video dataset
285
No matching id and daterow from video vs mat
BW-0010
2022-10-05 00:00:00
multiple rows for the id and date combo
BW-0010
2023-10-18 00:00:00
No matching id and daterow from video vs mat
BW-0031
2023-04-21 00:00:00
No matching id and daterow from video vs mat
BW-0044
2024-04-08 00:00:00
No matching id and daterow from video vs mat
BW-0063
2024-07-15 00:00:00
No matching id and daterow from video vs mat
BW-0067
2024-01-09 00:00:00
No matching id and daterow from video vs mat
BW-0092
2024-08-19 00:00:00
No matching id and daterow from video vs mat
BW-0110
2024-09-16 00:00:00
No matching id and daterow from video vs mat
BW-0121
2022-07-20 00:00:00
No matching id and daterow from video vs mat
BW-0162
2024-09-16 00:00:00
No matching id and daterow from video vs mat
BW-0166
2024-06-17 00:00:00
No matching id and daterow from video vs mat
BW-0277
2024-08-23 00:00:00
No matching id and 

In [27]:
print('----- print bw_zv_pws_df video counts ----')
print_video_counts(bw_zv_pws_df)

----- print bw_zv_pws_df video counts ----
total videos - df length: 241
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 34
num videos demographic_diagnosis == MS: 207
------
unique id_video (participants) in df: 177
num participants with demographic_diagnosis == HC: 33
num participants demographic_diagnosis == MS: 144
------
number of participants with multiple videos in dataset: 62
number of participants with one video: 115


In [28]:
# zeno videos - fast walking speed 
bw_zv_fw_df = merge_bw_zv(bw_df, zv_df, 'gait_vertical_FW_1', merged_data_path)

confirm all one task
['gait_vertical_FW_1']
total zeno videos
245
total bw rows with id in video dataset
281
No matching id and daterow from video vs mat
BW-0010
2022-10-05 00:00:00
multiple rows for the id and date combo
BW-0010
2023-10-18 00:00:00
multiple rows for the id and date combo
BW-0023
2023-10-23 00:00:00
No matching id and daterow from video vs mat
BW-0031
2023-04-21 00:00:00
No matching id and daterow from video vs mat
BW-0036
2024-04-02 00:00:00
No matching id and daterow from video vs mat
BW-0044
2023-05-02 00:00:00
multiple rows for the id and date combo
BW-0044
2023-11-30 00:00:00
No matching id and daterow from video vs mat
BW-0044
2024-04-08 00:00:00
No matching id and daterow from video vs mat
BW-0063
2024-07-15 00:00:00
No matching id and daterow from video vs mat
BW-0067
2024-01-09 00:00:00
No matching id and daterow from video vs mat
BW-0110
2024-09-16 00:00:00
No matching id and daterow from video vs mat
BW-0162
2024-09-16 00:00:00
No matching id and daterow fro

In [29]:
print('---- print bw_zv_fw_df video counts ----')
print_video_counts(bw_zv_fw_df)

---- print bw_zv_fw_df video counts ----
total videos - df length: 233
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 37
num videos demographic_diagnosis == MS: 196
------
unique id_video (participants) in df: 176
num participants with demographic_diagnosis == HC: 35
num participants demographic_diagnosis == MS: 141
------
number of participants with multiple videos in dataset: 56
number of participants with one video: 120


In [30]:
# home videos - merge home videos and BW 
bw_hv_r_pws_df = merge_bw_hv(bw_df, hv_df, 'gait_vertical_right', merged_data_path)
bw_hv_l_pws_df = merge_bw_hv(bw_df, hv_df, 'gait_vertical_left', merged_data_path)

confirm all one task
['gait_vertical_right']
total home videos
31
total bw rows with id in video dataset
51
mismatched home video vs brainwalk id
0
confirm all one task
['gait_vertical_left']
total home videos
30
total bw rows with id in video dataset
50
mismatched home video vs brainwalk id
0


In [31]:
# concatenate right and left 
bw_hv_pws_df = pd.concat([bw_hv_r_pws_df, bw_hv_l_pws_df], axis = 0).sort_index()
bw_hv_pws_df['edss_severity_cat'] = pd.Categorical(bw_hv_pws_df['edss_severity_cat'], categories=["mild", "moderate", "severe"], ordered=True)
print('---- print bw_hv_pws_df video counts ----')
print_video_counts(bw_hv_pws_df)

# save merged df  
bw_hv_pws_df.to_csv(os.path.join(merged_data_path,  'hv_bw_merged.csv'))

---- print bw_hv_pws_df video counts ----
total videos - df length: 61
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 61
------
unique id_video (participants) in df: 27
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 27
------
number of participants with multiple videos in dataset: 26
number of participants with one video: 1


In [32]:
bw_hv_pws_df.head()

Unnamed: 0,id_date_pose_hv,video_id_date_name_pose_hv,task_pose_hv,total_video_duration_sec_pose_hv,walking_segmets_n_pose_hv,walking_segments_duration_mean_pose_hv,walking_segments_duration_median_pose_hv,stride_time_mean_sec_pose_hv,stride_time_median_sec_pose_hv,stride_time_std_pose_hv,...,near_falls_no,strength_lt_leg,strength_max,strength_rt_leg,sum_total_scores,edss_severity_num,edss_severity_cat,t25fw_group_num,t25fw_group_cat,bw_hv_abs_date_diff
0,BW-0018\10-24-23,gait_vertical_left_BW-0018_10-24-23,gait_vertical_left,26.8,1,3.17,3.17,1.008,1.05,0.171,...,,I can easily raise it and keep it raised,0.0,I can easily raise it and keep it raised,2.0,2.0,moderate,1.0,under_6,0 days
1,BW-0018\10-24-23,gait_vertical_right_BW-0018_10-24-23,gait_vertical_right,24.4,1,2.4,2.4,,,,...,,I can easily raise it and keep it raised,0.0,I can easily raise it and keep it raised,2.0,2.0,moderate,1.0,under_6,0 days
2,BW-0023\05-05-23,gait_vertical_left_BW-0023_05-05-23,gait_vertical_left,35.97,4,4.42,4.33,1.027,1.033,0.066,...,2.0,I can easily raise it and keep it raised,0.0,I can easily raise it and keep it raised,15.0,2.0,moderate,1.0,under_6,171 days
3,BW-0023\05-05-23,gait_vertical_right_BW-0023_05-05-23,gait_vertical_right,38.47,5,4.78,4.63,1.07,1.033,0.098,...,2.0,I can easily raise it and keep it raised,0.0,I can easily raise it and keep it raised,15.0,2.0,moderate,1.0,under_6,171 days
4,BW-0023\10-23-23,gait_vertical_left_BW-0023_10-23-23,gait_vertical_left,36.1,3,4.19,3.83,0.998,0.983,0.087,...,2.0,I can easily raise it and keep it raised,0.0,I can easily raise it and keep it raised,15.0,2.0,moderate,1.0,under_6,0 days


### Drop rows with missing brainwalk mat data 
- use these data frames when comparing the video metrics to mat metrics 

In [33]:
# zeno preferred walk - participants with pws videos and pws mat metrics
bw_zv_pws_df_2 = drop_cols_missing_data(bw_zv_pws_df, cols_to_check = 'pws')
print_video_counts(bw_zv_pws_df_2)

total videos - df length: 237
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 34
num videos demographic_diagnosis == MS: 203
------
unique id_video (participants) in df: 175
num participants with demographic_diagnosis == HC: 33
num participants demographic_diagnosis == MS: 142
------
number of participants with multiple videos in dataset: 60
number of participants with one video: 115


In [34]:
# zeno fast walk - participants with FWS videos and fWS mat metrics
bw_zv_fw_df_2 = drop_cols_missing_data(bw_zv_fw_df, cols_to_check = 'fw')
print_video_counts(bw_zv_fw_df_2)

total videos - df length: 230
unique demographic_diagnosis in df: ['MS' 'HC']
num videos with demographic_diagnosis == HC: 37
num videos demographic_diagnosis == MS: 193
------
unique id_video (participants) in df: 173
num participants with demographic_diagnosis == HC: 35
num participants demographic_diagnosis == MS: 138
------
number of participants with multiple videos in dataset: 56
number of participants with one video: 117


In [35]:
# home videos - participants with home videos and pws mat metrics 
bw_hv_pws_df_2 = drop_cols_missing_data(bw_hv_pws_df, cols_to_check = 'pws')
print_video_counts(bw_hv_pws_df_2)

total videos - df length: 55
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 55
------
unique id_video (participants) in df: 24
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 24
------
number of participants with multiple videos in dataset: 23
number of participants with one video: 1


In [36]:
# home videos - participants with home videos and fw mat metrics 
bw_hv_fw_df_2 = drop_cols_missing_data(bw_hv_pws_df, cols_to_check = 'fw')
print_video_counts(bw_hv_fw_df_2)

total videos - df length: 61
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 61
------
unique id_video (participants) in df: 27
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 27
------
number of participants with multiple videos in dataset: 26
number of participants with one video: 1


### Drop rows with missing brainwalk clinical outcomes measures 
- use these data frames when comparing video metrics to clinical outcomes (ie - exclude participants with missing edss)

In [37]:
# zeno preferred walk - participants with pws videos and t25fw 
bw_zv_pws_t25fw_df = drop_cols_missing_data(bw_zv_pws_df, cols_to_check = 't25fw')
print_video_counts(bw_zv_pws_t25fw_df) 

total videos - df length: 146
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 146
------
unique id_video (participants) in df: 124
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 124
------
number of participants with multiple videos in dataset: 22
number of participants with one video: 102


In [38]:
# zeno preferred walk - participants with pws videos and edss 
bw_zv_pws_edss_df = drop_cols_missing_data(bw_zv_pws_df, cols_to_check = 'edss')
print_video_counts(bw_zv_pws_edss_df) 

total videos - df length: 198
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 198
------
unique id_video (participants) in df: 137
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 137
------
number of participants with multiple videos in dataset: 59
number of participants with one video: 78


In [39]:
# zeno fast walk - participants with fw videos and t25fw 
bw_zv_fw_t25fw_df = drop_cols_missing_data(bw_zv_fw_df, cols_to_check = 't25fw')
print_video_counts(bw_zv_fw_t25fw_df) 

total videos - df length: 145
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 145
------
unique id_video (participants) in df: 122
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 122
------
number of participants with multiple videos in dataset: 22
number of participants with one video: 100


In [40]:
# zeno fast walk - participants with fw videos and edss 
bw_zv_fw_edss_df = drop_cols_missing_data(bw_zv_fw_df, cols_to_check = 'edss')
print_video_counts(bw_zv_fw_edss_df) 

total videos - df length: 191
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 191
------
unique id_video (participants) in df: 137
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 137
------
number of participants with multiple videos in dataset: 53
number of participants with one video: 84


In [41]:
# home videos - participants with home videos and t25fw 
bw_hv_t25fw_df = drop_cols_missing_data(bw_hv_pws_df, cols_to_check = 't25fw')
print_video_counts(bw_hv_t25fw_df)

total videos - df length: 57
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 57
------
unique id_video (participants) in df: 26
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 26
------
number of participants with multiple videos in dataset: 25
number of participants with one video: 1


In [42]:
# home videos - participants with home videos and edss 
bw_hv_edss_df = drop_cols_missing_data(bw_hv_pws_df, cols_to_check = 'edss')
print_video_counts(bw_hv_edss_df)

total videos - df length: 59
unique demographic_diagnosis in df: ['MS']
num videos with demographic_diagnosis == HC: 0
num videos demographic_diagnosis == MS: 59
------
unique id_video (participants) in df: 26
num participants with demographic_diagnosis == HC: 0
num participants demographic_diagnosis == MS: 26
------
number of participants with multiple videos in dataset: 25
number of participants with one video: 1


## Correlation: Mat Metrics vs Video Metrics 

### Check Normality and save histograms 
Checks normality and plots histogram for each column in dataframe where type == float 

In [43]:
# PWS Zeno Videos - video of walk on mat 
hist_out_path_1 = (os.path.join(out_path, '00_hist_shapiro_all', 'zv_vs_bw_pws'))
if not os.path.exists(hist_out_path_1):
    os.makedirs(hist_out_path_1)
     
bw_zv_pws_2_shapiro = hist_and_shapiro(bw_zv_pws_df_2, hist_out_path_1)

# save in test normality folder
bw_zv_pws_2_shapiro.to_csv(os.path.join(hist_out_path_1, 'zv_vs_bw_pws_shapiro_results.csv'))

In [44]:
# FWS Zeno Videos - video of walk on mat 
hist_out_path_2 = (os.path.join(out_path, '00_hist_shapiro_all', 'zv_vs_bw_fw'))
if not os.path.exists(hist_out_path_2):
    os.makedirs(hist_out_path_2)

bw_zv_fw_2_shapiro = hist_and_shapiro(bw_zv_fw_df_2, hist_out_path_2)

# save in test normality folder
bw_zv_fw_2_shapiro.to_csv(os.path.join(hist_out_path_2, 'zv_vs_bw_fw_shapiro_results.csv'))

In [45]:
# Home videos - video closest to walk on mat date 
hist_out_path_3 = (os.path.join(out_path, '00_hist_shapiro_all', 'hv_all_vs_bw_pws'))
if not os.path.exists(hist_out_path_3):
    os.makedirs(hist_out_path_3)

bw_hv_pws_2_shapiro = hist_and_shapiro(bw_hv_pws_df_2, hist_out_path_3)

# save in test normality folder 
bw_hv_pws_2_shapiro.to_csv(os.path.join(hist_out_path_3, 'hv_all_vs_bw_pws_shapiro_results.csv'))

In [46]:
# Home videos - video closest to walk on mat date 
hist_out_path_4 = (os.path.join(out_path, '00_hist_shapiro_all', 'hv_all_vs_bw_fw'))
if not os.path.exists(hist_out_path_4):
    os.makedirs(hist_out_path_4)

bw_hv_fw_2_shapiro = hist_and_shapiro(bw_hv_fw_df_2, hist_out_path_4)

# save in test normality folder 
bw_hv_fw_2_shapiro.to_csv(os.path.join(hist_out_path_4, 'hv_all_vs_bw_fw_shapiro_results.csv'))

### Run correlation 

In [47]:
# set cols to compare 
# column pairs to evaluate matching metrics 
# for each of the column pairs below (zv 1 vs bw1, zv 2 vs bw 3, etc), run and save correlation 
zv_colnames = ['stride_time_median_sec_pose_zv', 
               'stride_time_mean_sec_pose_zv',
               'foot1_gait_cycle_time_mean_pose_zv',
               'stride_time_cv_pose_zv', 
               'mean_cadence_step_per_min_pose_zv',
               'foot1_double_support_per_mean_pose_zv',
               'foot1_single_support_per_mean_pose_zv',
               'stride_width_median_cm_pose_zv',
               'stride_width_mean_cm_pose_zv',
               'stride_width_std_pose_zv']

hv_colnames = ['stride_time_median_sec_pose_hv', 
               'stride_time_mean_sec_pose_hv',
               'foot1_gait_cycle_time_mean_pose_hv', 
               'stride_time_cv_pose_hv', 
               'mean_cadence_step_per_min_pose_hv',
               'foot1_double_support_per_mean_pose_hv',
               'foot1_single_support_per_mean_pose_hv',
               'stride_width_median_cm_pose_hv',
               'stride_width_mean_cm_pose_hv',
               'stride_width_std_pose_hv']
               
bw_pws_colnames = ['PWS_stridetimesecmean', 
                   'PWS_stridetimesecmean',
                   'PWS_stridetimesecmean',
                   'PWS_stridetimeseccv',
                   'PWS_cadencestepsminmean',
                   'PWS_totaldsupportmean',
                   'PWS_singlesupportmean', 
                   'PWS_stridewidthcmmean',
                   'PWS_stridewidthcmmean',
                   'PWS_stridewidthcmsd']

bw_fw_colnames = ['FW_stridetimesecmean', 
                  'FW_stridetimesecmean', 
                  'FW_stridetimesecmean', 
                  'FW_stridetimeseccv',
                   'FW_cadencestepsminmean',
                   'FW_totaldsupportmean',
                   'FW_singlesupportmean', 
                   'FW_stridewidthcmmean',
                  'FW_stridewidthcmmean',
                   'FW_stridewidthcmsd']

units = ['seconds',
         'seconds',
         'seconds',
         'CV%',
         'steps/min',
         '%',
         '%', 
         'cm',
         'cm',
         'cm']


In [48]:
# PWS Zeno Videos - video of walk on mat 
corr_out_path1 = os.path.join(out_path, 'zv_vs_bw_pws_metric_correlation')

bw_zv_pws_corr_results_df = metric_correlation(bw_zv_pws_df_2, zv_colnames, bw_pws_colnames, corr_out_path1)
bw_zv_pws_corr_results_df.to_csv(os.path.join(corr_out_path1, 'bw_zv_pws_metric_correlation.csv'))

In [49]:
# FW Zeno Videos - video of walk on mat 
corr_out_path2 = os.path.join(out_path, 'zv_vs_bw_fw_metric_correlation')

bw_zv_fw_corr_results_df = metric_correlation(bw_zv_fw_df_2, zv_colnames, bw_fw_colnames, corr_out_path2)
bw_zv_fw_corr_results_df.to_csv(os.path.join(corr_out_path2, 'bw_zv_fw_metric_correlation.csv'))

In [50]:
# Home videos - video closest to PWS walk on mat date 
corr_out_path3 = os.path.join(out_path, 'hv_vs_bw_pws_metric_correlation')

bw_hv_pws_corr_results_df = metric_correlation(bw_hv_pws_df_2, hv_colnames, bw_pws_colnames, corr_out_path3)
bw_hv_pws_corr_results_df.to_csv(os.path.join(corr_out_path3, 'bw_hv_pws_metric_correlation.csv'))

In [51]:
# Home videos - video closest to FW walk on mat date 
corr_out_path4 = os.path.join(out_path, 'hv_vs_bw_fw_metric_correlation')

bw_hv_pws_corr_results_df = metric_correlation(bw_hv_pws_df_2, hv_colnames, bw_fw_colnames, corr_out_path4)
bw_hv_pws_corr_results_df.to_csv(os.path.join(corr_out_path4, 'bw_hv_fw_metric_correlation.csv'))

## Mean Error: BW mat metrics vs Zeno video metrics 

In [52]:
# zeno video vs brainwalk mat: preferred walking speed 
mae_out_path1 = os.path.join(out_path, 'zv_vs_bw_pws_metric_mean_error')

bw_zv_pws_mae_df = calculate_metric_mean_error(bw_zv_pws_df_2, zv_colnames, bw_pws_colnames, units, mae_out_path1)
bw_zv_pws_mae_df.to_csv(os.path.join(mae_out_path1, 'bw_zv_pws_metric_mean_error_mae.csv'))


In [53]:
# zeno video vs brainwalk mat: fast walking speed 
mae_out_path2 = os.path.join(out_path, 'zv_vs_bw_fw_metric_mean_error')

bw_zv_fw_mae_df = calculate_metric_mean_error(bw_zv_fw_df_2, zv_colnames, bw_fw_colnames, units, mae_out_path2)
bw_zv_fw_mae_df.to_csv(os.path.join(mae_out_path2, 'bw_zv_fw_metric_mean_error_mae.csv'))

## Bland Altman: BW mat metrics vs Zeno video metrics 

In [54]:
# preferred walking speed 
bland_alt_path1 = os.path.join(out_path, 'zv_vs_bw_pws_metric_BlandAlt')
bland_altman_plot(bw_zv_pws_df_2, zv_colnames, bw_pws_colnames, units, bland_alt_path1)

In [55]:
# fast walking speed 
# preferred walking speed 
bland_alt_path2 = os.path.join(out_path, 'zv_vs_bw_fw_metric_BlandAlt')
bland_altman_plot(bw_zv_fw_df_2, zv_colnames, bw_fw_colnames, units, bland_alt_path2)

## Correlation of Gait Metrics vs Clinical Outcomes 
- includes both mat and video gait metrics

In [56]:
# In person zeno videos + mat at PWS vs EDSS score and EDSS severity 
outcome_corr_path1 = os.path.join(out_path, 'zv_vs_bw_pws_outcome_corr')
zv_pws_vs_edss_corr = outcome_correlation(bw_zv_pws_edss_df, outcome_corr_path1, 'zv_bw_pws', 'edss')
zv_pws_vs_edss_corr.to_csv(os.path.join(outcome_corr_path1, 'zv_bw_pws_edss_corr.csv'))

In [57]:
# In person zeno videos at PWS vs T25FW 
outcome_corr_path1 = os.path.join(out_path, 'zv_vs_bw_pws_outcome_corr')
zv_pws_vs_t25fw_corr = outcome_correlation(bw_zv_pws_t25fw_df, outcome_corr_path1, 'zv_bw_pws', 't25fw')
zv_pws_vs_t25fw_corr.to_csv(os.path.join(outcome_corr_path1, 'zv_bw_pws_t25fw_corr.csv'))

  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])


In [58]:
# In person zeno videos + mat at FW vs EDSS score and EDSS Severity 
outcome_corr_path2 = os.path.join(out_path, 'zv_vs_bw_fw_outcome_corr')
zv_fw_vs_edss_corr = outcome_correlation(bw_zv_fw_edss_df, outcome_corr_path2, 'zv_bw_fw', 'edss')
zv_fw_vs_edss_corr.to_csv(os.path.join(outcome_corr_path2, 'zv_bw_fw_edss_corr.csv'))

In [59]:
# in person zeno videos + mat at FW vs T25FW 
outcome_corr_path2 = os.path.join(out_path, 'zv_vs_bw_fw_outcome_corr')
zv_fw_vs_t25fw_corr = outcome_correlation(bw_zv_fw_t25fw_df, outcome_corr_path2, 'zv_bw_fw', 't25fw')
zv_fw_vs_t25fw_corr.to_csv(os.path.join(outcome_corr_path2, 'zv_bw_fw_t25fw_corr.csv'))

  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])


### Not sure if enough videos for correlation
Especially for only right or only left 

In [60]:
# bw home videos + in-person pws speed vs EDSS score and EDSS severity 
# all videos - can be two rows per particiant, one for gait vertical right and one for gait vertical left 
outcome_corr_path3 = os.path.join(out_path, 'hv_vs_bw_pws_outcome_corr')
hv_pws_vs_edss_corr = outcome_correlation(bw_hv_edss_df, outcome_corr_path3, 'hv_all_bw_pws', 'edss')
hv_pws_vs_edss_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_all_bw_pws_edss_corr.csv'))


# only "gait_vertical_left" 
bw_hv_edss_df_left = bw_hv_edss_df.loc[bw_hv_edss_df['task_pose_hv'] == 'gait_vertical_left']
hv_left_pws_vs_edss_corr = outcome_correlation(bw_hv_edss_df_left, outcome_corr_path3, 'hv_left_bw_pws', 'edss')
hv_left_pws_vs_edss_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_left_bw_pws_edss_corr.csv'))

# only "gait_vertical_right" 
bw_hv_edss_df_right = bw_hv_edss_df.loc[bw_hv_edss_df['task_pose_hv'] == 'gait_vertical_right']
hv_right_pws_vs_edss_corr = outcome_correlation(bw_hv_edss_df_right, outcome_corr_path3, 'hv_right_bw_pws', 'edss')
hv_right_pws_vs_edss_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_right_bw_pws_edss_corr.csv'))

  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])


In [61]:
# bw_hv_t25fw_df
# bw home videos + in-person pws speed vs T25FW 
outcome_corr_path3 = os.path.join(out_path, 'hv_vs_bw_pws_outcome_corr')
hv_pws_vs_t25fw_corr = outcome_correlation(bw_hv_t25fw_df, outcome_corr_path3, 'hv_all_bw_pws', 't25fw')
hv_pws_vs_t25fw_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_all_bw_pws_t25fw_corr.csv'))

# only "gait_vertical_left" 
bw_hv_t25fw_df_left = bw_hv_t25fw_df.loc[bw_hv_t25fw_df['task_pose_hv'] == 'gait_vertical_left']
hv_left_pws_vs_t25fw_corr = outcome_correlation(bw_hv_t25fw_df_left, outcome_corr_path3, 'hv_left_bw_pws', 't25fw')
hv_left_pws_vs_t25fw_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_left_bw_pws_t25fw_corr.csv'))

# only "gait_vertical_right" 
bw_hv_t25fw_df_right = bw_hv_t25fw_df.loc[bw_hv_t25fw_df['task_pose_hv'] == 'gait_vertical_right']
hv_right_pws_vs_t25fw_corr = outcome_correlation(bw_hv_t25fw_df_right, outcome_corr_path3, 'hv_right_bw_pws', 't25fw')
hv_right_pws_vs_t25fw_corr.to_csv(os.path.join(outcome_corr_path3, 'hv_right_bw_pws_t25fw_corr.csv'))

  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])
  corr, p_value = stats.spearmanr(df_num_clean[col1], df_num_clean[col2])


## Analyze difference between categorical groups (diagnoses, edss severity, etc)
- To-do: currently all non-parametric test, check individually before writing manuscript - most seem non-parametric but some should maybe be t-test or anova

### All video metrics by HC vs MS 

In [62]:
# in person preferred walk 
outpath_1 = os.path.join(out_path, 'zv_vs_bw_pws_metric_by_demographic_diagnosis')
bw_zv_pws_df_2.loc[:, 'demographic_diagnosis'] = pd.Categorical(bw_zv_pws_df_2.loc[:,'demographic_diagnosis'], categories=['MS', 'HC'], ordered=True)

# histograms 
check_ttest_anova_assumptions(bw_zv_pws_df_2, 
                              group_col = 'demographic_diagnosis', 
                              vid_metric_col_suffix = 'zv',
                              mat_metric_col_prefix = 'PWS_',
                              output_folder_path = outpath_1, 
                              video_task_str = 'zv_bw_pws')

# boxplots 
boxplot_cat_allmetrics(bw_zv_pws_df_2, 
            group_col = 'demographic_diagnosis', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_1, 
            video_task_str = 'zv_bw_pws')


# Mannwhitneyu test 
pws_demographic_diagnosis_results =  mannwhitneyu_all_metrics(bw_zv_pws_df_2,
                                group_col = 'demographic_diagnosis',
                                group_1 = 'HC', 
                                group_2 = 'MS', 
                                metric_col_suffix = 'zv', 
                                output_folder_path = outpath_1, 
                                video_task_str = 'zv_bw_pws')

groups
['MS' 'HC']


In [63]:
# in person fast walk
outpath_1b = os.path.join(out_path, 'zv_vs_bw_fw_metric_by_demographic_diagnosis')
bw_zv_fw_df_2.loc[:, 'demographic_diagnosis'] = pd.Categorical(bw_zv_fw_df_2.loc[:,'demographic_diagnosis'], categories=['MS', 'HC'], ordered=True)

# histograms 
check_ttest_anova_assumptions(bw_zv_fw_df_2, 
                              group_col = 'demographic_diagnosis', 
                              vid_metric_col_suffix = 'zv',
                              mat_metric_col_prefix = 'FW_',
                              output_folder_path = outpath_1b, 
                              video_task_str = 'zv_bw_fw')

# boxplots 
boxplot_cat_allmetrics(bw_zv_fw_df_2, 
            group_col = 'demographic_diagnosis', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_1b, 
            video_task_str = 'zv_bw_fw')

# Mannwhitneyu test 
pws_demographic_diagnosis_results =  mannwhitneyu_all_metrics(bw_zv_fw_df_2,
                                group_col = 'demographic_diagnosis',
                                group_1 = 'HC', 
                                group_2 = 'MS', 
                                metric_col_suffix = 'zv', 
                                output_folder_path = outpath_1b, 
                                video_task_str = 'zv_bw_fw')

groups
['MS' 'HC']


  plt.savefig(os.path.join(output_folder_path, 'check_normality', current_metric_col + '_hist.png'))


In [64]:
# home video - not applicable - only MS participants 

### All video metrics by EDSS severity 

In [65]:
print(bw_zv_pws_edss_df['edss_severity_cat'].unique())

['moderate', 'mild', 'severe']
Categories (3, object): ['mild' < 'moderate' < 'severe']


In [66]:
# in person preferred walk 
outpath_2 = os.path.join(out_path, 'zv_vs_bw_pws_metric_by_edss_severity')

# dataset = participants with MS 
check_ttest_anova_assumptions(bw_zv_pws_edss_df, 
                              group_col = 'edss_severity_cat', 
                              vid_metric_col_suffix = 'zv', 
                              mat_metric_col_prefix = 'PWS_',
                              output_folder_path = outpath_2, 
                              video_task_str = 'zv_bw_pws')



boxplot_cat_allmetrics(bw_zv_pws_edss_df, 
            group_col = 'edss_severity_cat', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_2, 
            video_task_str = 'zv_bw_pws')


zv_pws_edss_severity_kruskal = kruskalwallace_all_metrics(df = bw_zv_pws_edss_df,
                                                          group_col = 'edss_severity_cat',
                                                          group_1 = 'mild',
                                                          group_2 = 'moderate',
                                                          group_3 = 'severe',
                                                          metric_col_suffix = 'zv',
                                                          output_folder_path = outpath_2,
                                                          video_task_str = 'zv_bw_pws')


groups
['moderate', 'mild', 'severe']
Categories (3, object): ['mild' < 'moderate' < 'severe']


In [67]:
# in person fast walk 
outpath_2b = os.path.join(out_path, 'zv_vs_bw_fw_metric_by_edss_severity')

# histograms 
check_ttest_anova_assumptions(bw_zv_fw_edss_df, 
                              group_col = 'edss_severity_cat', 
                              vid_metric_col_suffix = 'zv', 
                              mat_metric_col_prefix = 'FW_',
                              output_folder_path = outpath_2b, 
                              video_task_str = 'zv_bw_fw')


# boxplots 
boxplot_cat_allmetrics(bw_zv_fw_edss_df, 
            group_col = 'edss_severity_cat', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_2b, 
            video_task_str = 'zv_bw_fw')

zv_fw_edss_severity_kruskal = kruskalwallace_all_metrics(df = bw_zv_fw_edss_df,
                                                          group_col = 'edss_severity_cat',
                                                          group_1 = 'mild',
                                                          group_2 = 'moderate',
                                                          group_3 = 'severe',
                                                          metric_col_suffix = 'zv',
                                                          output_folder_path = outpath_2b,
                                                          video_task_str = 'zv_bw_fw')

groups
['moderate', 'mild', 'severe']
Categories (3, object): ['mild' < 'moderate' < 'severe']


In [68]:
# home videos 
outpath_2c = os.path.join(out_path, 'hv_vs_bw_pws_metric_by_edss_severity')

# histograms 
check_ttest_anova_assumptions(bw_hv_edss_df, 
                              group_col = 'edss_severity_cat', 
                              vid_metric_col_suffix = 'hv', 
                              mat_metric_col_prefix = 'PWS_',
                              output_folder_path = outpath_2c, 
                              video_task_str = 'hv_bw_pws')


# boxplots 
boxplot_cat_allmetrics(bw_hv_edss_df, 
            group_col = 'edss_severity_cat', 
            metric_col_suffix = 'hv',
            output_folder_path = outpath_2c, 
            video_task_str = 'hv_bw_pws')

hv_pws_edss_severity_kruskal = kruskalwallace_all_metrics(df = bw_hv_edss_df,
                                                          group_col = 'edss_severity_cat',
                                                          group_1 = 'mild',
                                                          group_2 = 'moderate',
                                                          group_3 = 'severe',
                                                          metric_col_suffix = 'hv',
                                                          output_folder_path = outpath_2c,
                                                          video_task_str = 'hv_bw_pws')

groups
['moderate', 'severe', 'mild']
Categories (3, object): ['mild' < 'moderate' < 'severe']


### All Video metrics by T25FW time category  

In [69]:
# in person preferred walk 
outpath_4 = os.path.join(out_path, 'zv_vs_bw_pws_metric_by_t25fw_group')

# dataset = participants with MS 
check_ttest_anova_assumptions(bw_zv_pws_t25fw_df, 
                              group_col = 't25fw_group_cat', 
                              vid_metric_col_suffix = 'zv', 
                              mat_metric_col_prefix = 'PWS_',
                              output_folder_path = outpath_4, 
                              video_task_str = 'zv_bw_pws')



boxplot_cat_allmetrics(bw_zv_pws_t25fw_df, 
            group_col = 't25fw_group_cat', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_4, 
            video_task_str = 'zv_bw_pws')

zv_bw_pws_kruskal = kruskalwallace_all_metrics(df = bw_zv_pws_t25fw_df,
                                                          group_col = 't25fw_group_cat',
                                                          group_1 = 'under_6',
                                                          group_2 = '6_to_8',
                                                          group_3 = 'over_8',
                                                          metric_col_suffix = 'zv',
                                                          output_folder_path = outpath_4,
                                                          video_task_str = 'zv_bw_pws')

groups
['6_to_8', 'under_6', 'over_8']
Categories (3, object): ['under_6' < '6_to_8' < 'over_8']


In [70]:
# in person fast walk 
outpath_4b = os.path.join(out_path, 'zv_vs_bw_fw_metric_by_t25fw_group')

# dataset = participants with MS 
check_ttest_anova_assumptions(bw_zv_fw_t25fw_df,
                              group_col = 't25fw_group_cat', 
                              vid_metric_col_suffix = 'zv',
                              mat_metric_col_prefix = 'FW_',
                              output_folder_path = outpath_4b, 
                              video_task_str = 'zv_bw_fw')


boxplot_cat_allmetrics(bw_zv_fw_t25fw_df, 
            group_col = 't25fw_group_cat', 
            metric_col_suffix = 'zv',
            output_folder_path = outpath_4b, 
            video_task_str = 'zv_bw_fw')

zv_bw_fw_kruskal = kruskalwallace_all_metrics(df = bw_zv_fw_t25fw_df,
                                                          group_col = 't25fw_group_cat',
                                                          group_1 = 'under_6',
                                                          group_2 = '6_to_8',
                                                          group_3 = 'over_8',
                                                          metric_col_suffix = 'zv',
                                                          output_folder_path = outpath_4b,
                                                          video_task_str = 'zv_bw_fw')

groups
['6_to_8', 'under_6', 'over_8']
Categories (3, object): ['under_6' < '6_to_8' < 'over_8']


In [71]:
# home videos 
outpath_4c = os.path.join(out_path, 'hv_vs_bw_pws_metric_by_t25fw_group')

# dataset = participants with MS 
check_ttest_anova_assumptions(bw_hv_t25fw_df, 
                              group_col = 't25fw_group_cat', 
                              vid_metric_col_suffix = 'hv', 
                              mat_metric_col_prefix = 'PWS_', 
                              output_folder_path = outpath_4c, 
                              video_task_str = 'hv_bw_pws')



boxplot_cat_allmetrics(bw_hv_t25fw_df, 
            group_col = 't25fw_group_cat', 
            metric_col_suffix = 'hv',
            output_folder_path = outpath_4c, 
            video_task_str = 'hv_bw_pws')

hv_bw_pws_kruskal = kruskalwallace_all_metrics(df = bw_hv_t25fw_df,
                                                          group_col = 't25fw_group_cat',
                                                          group_1 = 'under_6',
                                                          group_2 = '6_to_8',
                                                          group_3 = 'over_8',
                                                          metric_col_suffix = 'hv',
                                                          output_folder_path = outpath_4c,
                                                          video_task_str = 'hv_bw_pws')

groups
['under_6' '6_to_8' 'over_8']


### Boxplots - trends between video and mat on single plot, only video metrics with corresponding Zeno mat metric 
- in-person: mat pws, mat fw, in-person vid pws, in-perso vid fw 

In [72]:
# in-person videos - EDSS severity 
output_path_1 = os.path.join(out_path, 'zv_bw_pws_and_fw_by_edss_severity')

boxplot_vid_and_mat_inperson(video_pws_columns = zv_colnames,
                               video_fw_columns = zv_colnames, 
                               mat_pws_columns = bw_pws_colnames, 
                               mat_fw_columns = bw_fw_colnames, 
                               pws_data = bw_zv_pws_edss_df,
                               fw_data = bw_zv_fw_edss_df,
                               group_col = 'edss_severity_cat', 
                               output_folder_path = output_path_1)

In [73]:
# in-person videos - T25FW groups 
output_path_2 = os.path.join(out_path, 'zv_bw_pws_and_fw_by_t25fw')

boxplot_vid_and_mat_inperson(video_pws_columns = zv_colnames,
                               video_fw_columns = zv_colnames, 
                               mat_pws_columns = bw_pws_colnames, 
                               mat_fw_columns = bw_fw_colnames, 
                               pws_data = bw_zv_pws_t25fw_df,
                               fw_data = bw_zv_fw_t25fw_df,
                               group_col = 't25fw_group_cat', 
                               output_folder_path = output_path_2)

- home: home vid pws, mat pws + mat fw 

In [74]:
# home videos - EDSS severity 
output_path_3 = os.path.join(out_path, 'hv_bw_pws_and_fw_by_edss_severity')

boxplot_vid_and_mat_home(video_pws_columns = hv_colnames, 
                         mat_pws_columns = bw_pws_colnames, 
                         mat_fw_columns = bw_fw_colnames,
                         home_df = bw_hv_edss_df, 
                         group_col = 'edss_severity_cat',
                         output_folder_path = output_path_3) 


In [75]:
# home videos - T25FW groups 
output_path_4 = os.path.join(out_path, 'hv_bw_pws_and_fw_by_t25fw')

boxplot_vid_and_mat_home(video_pws_columns = hv_colnames, 
                         mat_pws_columns = bw_pws_colnames, 
                         mat_fw_columns = bw_fw_colnames,
                         home_df = bw_hv_t25fw_df, 
                         group_col = 't25fw_group_cat',
                         output_folder_path = output_path_4) 

## Quartiles 
- Group into metric quartiles and calculate summary 
- Kruskal wallace  for each group
- plot after see data 

In [76]:
# in-person preferred walking speed video metrics 
anova_kruskal_cols = zv_colnames + ['bingoEHR_EDSS_measure_value'] + ['msfcEHR_T25FW SPEED AVG'] 
output_1 = os.path.join(out_path, 'zv_vs_bw_pws_quartiles')

metric_quartile_analysis(bw_zv_pws_df_2, 
                         metric_col_suffix = 'zv', 
                         anova_kruskal_cols = anova_kruskal_cols, 
                         output_folder_path = output_1) 

ValueError: Bin edges must be unique: Index([1.0, 2.0, 2.0, 3.0, 8.0], dtype='float64', name='walking_segmets_n_pose_zv').
You can drop duplicate edges by setting the 'duplicates' kwarg

In [None]:
# in-person fast walking speed video metrics 
anova_kruskal_cols = zv_colnames + ['bingoEHR_EDSS_measure_value'] + ['msfcEHR_T25FW SPEED AVG'] 
output_2 = os.path.join(out_path, 'zv_vs_bw_fw_quartiles')

metric_quartile_analysis(bw_zv_fw_df_2, 
                         metric_col_suffix = 'zv', 
                         anova_kruskal_cols = anova_kruskal_cols, 
                         output_folder_path = output_2) 

In [None]:
# home videos 
anova_kruskal_cols = hv_colnames + ['bingoEHR_EDSS_measure_value'] + ['msfcEHR_T25FW SPEED AVG'] 
output_3 = os.path.join(out_path, 'hv_vs_bw_pws_quartiles')

metric_quartile_analysis(bw_hv_pws_df_2, 
                         metric_col_suffix = 'hv', 
                         anova_kruskal_cols = anova_kruskal_cols, 
                         output_folder_path = output_3) 

## Other future ideas
- Remove outliers if needed??
- update to only include one mat metric per participant, if they have two home videso
- cluster, PCA, kernel: predict edss given set of metrics (from zeno, then say X amount of people at home able to collect good videos)
- longitudanl from people with multiple follow ups?
- Random forrest - decision tree
- binary classifier/cutoff score 