In [1]:
import pandas as pd 
import numpy as np
import os 
import scipy.stats as stats
import seaborn as sns 
import matplotlib.pyplot as plt 

## Set input and output folders 

In [2]:
# Version tag of the analysis folder; used as a path component for inputs and outputs below
analysis_version = "005"


In [3]:
# Output folder for everything this notebook writes, nested under the analysis version
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version,
                        '003_scatter_video_vs_outcomes')

# exist_ok=True replaces the separate os.path.exists check, so re-running the cell
# (or a folder appearing between the check and the creation) cannot raise
os.makedirs(out_path, exist_ok=True)

### Load Clean Data - no missing BW Data 
May be missing video data - see excel with counts 

In [4]:
# Folder holding the merged, cleaned tables for this analysis version.
# File names are passed to os.path.join as their own component instead of being written
# after a backslash inside a normal (non-raw) string: '\z' and '\h' are invalid escape
# sequences, which newer Python versions warn about and will eventually reject.
clean_data_dir = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                              analysis_version,
                              '000_merged_cleaned_data')

# PWS
zv_pws_bw_clean_path = os.path.join(clean_data_dir, 'zv_bw_merged_gait_vertical_PWS_1_clean.csv')
zv_pws_bw_clean_df = pd.read_csv(zv_pws_bw_clean_path, index_col=0)  # first CSV column is the index

# FW
zv_fw_bw_clean_path = os.path.join(clean_data_dir, 'zv_bw_merged_gait_vertical_FW_1_clean.csv')
zv_fw_bw_clean_df = pd.read_csv(zv_fw_bw_clean_path, index_col=0)

# Home Videos
hv_bw_clean_path = os.path.join(clean_data_dir, 'hv_bw_merged_clean.csv')
hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col=0)

## Scatter plot, correlation, heatmap 
- scatter of each video metric vs: EDSS |  T25FW | Zeno PWS Velocity | Zeno FW Velocity
- heatmap
- .csv file of correlation matrix 

In [11]:
# correlation with clinical outcomes 
def video_vs_outcome_scatter(df, output_folder_path, subfolder_name, color_col): 
    """Spearman-correlate every column pair and scatter each column against the outcomes.

    Only numeric and categorical columns of ``df`` are analysed (the column
    ``bw_hv_abs_date_diff`` is dropped first if present). For every ordered pair
    of analysed columns, rows with a missing value in either column are removed
    and a Spearman rank correlation is computed on the remainder. If either
    column has at most one distinct value left, the correlation and p-value are
    recorded as NaN instead.

    Whenever the first column of the pair is one of the four outcome columns
    (EDSS, T25FW speed, PWS velocity, FW velocity), a scatterplot of that
    outcome (y) against the second column (x) is saved.

    Files written
    -------------
    ``<output_folder_path>/<subfolder_name>_scatterplots/<outcome>/<column>.png``
        One scatterplot per (outcome, column) pair; ``<outcome>`` is one of
        ``edss``, ``t25fw``, ``PWS_velocity``, ``FW_velocity``. All four
        folders are created even if the outcome column is absent.
    ``<output_folder_path>/<subfolder_name>_heatmap.png``
        Heatmap of the correlation matrix.
    ``<output_folder_path>/<subfolder_name>_corr_matrix.csv``
        Correlation coefficients, rounded to 2 decimals.
    ``<output_folder_path>/<subfolder_name>_pvalue_matrix.csv``
        P-values, rounded to 3 decimals.
    ``<output_folder_path>/<subfolder_name>_n_pairs_matrix.csv``
        Number of rows with data in both columns of each pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to analyse. It is not modified.
    output_folder_path : str
        Folder receiving the heatmap, the CSV files and the scatterplot folder.
    subfolder_name : str
        Prefix for all output names; also used as the title of each scatterplot.
    color_col : str
        Column of ``df`` passed to seaborn as ``hue`` to color the scatter points.

    Returns
    -------
    None
    """
    # outcome column -> subfolder its scatterplots are saved in
    outcome_subfolders = {
        'bingoEHR_EDSS_measure_value': 'edss',
        'msfcEHR_T25FW SPEED AVG': 't25fw',
        'PWS_velocitycmsecmean': 'PWS_velocity',
        'FW_velocitycmsecmean': 'FW_velocity',
    }

    out_plots_path = os.path.join(output_folder_path, subfolder_name + '_scatterplots')
    os.makedirs(out_plots_path, exist_ok=True)
    for outcome_folder in outcome_subfolders.values():
        os.makedirs(os.path.join(out_plots_path, outcome_folder), exist_ok=True)

    # drop date time columns (returns a new frame, the caller's df is untouched)
    df = df.drop(columns=['bw_hv_abs_date_diff'], errors='ignore')

    # Keep only numeric and ordinal columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    ordinal_cols = df.select_dtypes(include=['category']).columns
    df_num = df[numeric_cols.union(ordinal_cols)]
    analysis_cols = df_num.columns
    n_cols = len(analysis_cols)

    # Result matrices, one cell per ordered column pair; every cell is filled in the loop below
    corr_matrix = pd.DataFrame(np.full((n_cols, n_cols), np.nan), columns=analysis_cols, index=analysis_cols)
    pvalue_matrix = pd.DataFrame(np.full((n_cols, n_cols), np.nan), columns=analysis_cols, index=analysis_cols)
    n_videos_matrix = pd.DataFrame(np.zeros((n_cols, n_cols)), columns=analysis_cols, index=analysis_cols)

    for col1 in analysis_cols:
        # None unless col1 is one of the outcome columns
        scatter_folder = outcome_subfolders.get(col1)

        for col2 in analysis_cols:
            # drop rows where col1 OR col2 is nan; taken from df (not df_num) so that
            # color_col stays available for the scatterplot hue
            pair_df = df.dropna(subset=[col1, col2])

            # unless all values the same in one column, run spearman correlation
            if pair_df[col1].nunique() <= 1 or pair_df[col2].nunique() <= 1:
                corr, p_value = np.nan, np.nan
            else:
                corr, p_value = stats.spearmanr(pair_df[col1], pair_df[col2])

            # save results in matrix
            corr_matrix.loc[col1, col2] = round(corr, 2)
            pvalue_matrix.loc[col1, col2] = round(p_value, 3)
            n_videos_matrix.loc[col1, col2] = len(pair_df)  # number of rows with data for both columns

            # scatterplot of the outcome (y) against every analysed column (x)
            if scatter_folder is not None:
                fig, ax = plt.subplots()
                sns.scatterplot(data=pair_df, x=col2, y=col1, hue=color_col, ax=ax)
                ax.set_title(subfolder_name)
                fig.savefig(os.path.join(out_plots_path, scatter_folder, col2 + '.png'))
                plt.close(fig)  # many figures are created in this loop; release each one

    # Plot and save the heatmap
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Spearman Rank Correlation Heatmap")
    fig.savefig(os.path.join(output_folder_path, subfolder_name + '_heatmap.png'))
    plt.close(fig)

    # save correlation matrix
    corr_matrix.to_csv(os.path.join(output_folder_path, subfolder_name + '_corr_matrix.csv'))
    pvalue_matrix.to_csv(os.path.join(output_folder_path, subfolder_name + '_pvalue_matrix.csv'))
    n_videos_matrix.to_csv(os.path.join(output_folder_path, subfolder_name + '_n_pairs_matrix.csv'))

# Plot each numeric column vs EDSS, T25FW, PWS and FW velocity 
Color by one column 

In [12]:
# PWS
# Dots are colored by the walking velocity of the trial the video metrics were derived from
video_vs_outcome_scatter(
    df=zv_pws_bw_clean_df,
    output_folder_path=out_path,
    subfolder_name='zeno_pws',
    color_col='PWS_velocitycmsecmean',
)

In [13]:
# FW
# Dots are colored by the walking velocity of the trial the video metrics were derived from
video_vs_outcome_scatter(
    df=zv_fw_bw_clean_df,
    output_folder_path=out_path,
    subfolder_name='zeno_fw',
    color_col='FW_velocitycmsecmean',
)

In [14]:
# Home Videos
# Dots are colored by the velocity from the most recent preferred walking speed task
video_vs_outcome_scatter(
    df=hv_bw_clean_df,
    output_folder_path=out_path,
    subfolder_name='home',
    color_col='PWS_velocitycmsecmean',
)