In [1]:
import pandas as pd 
import numpy as np
import os 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy.stats as stats 

In [2]:
# analysis version 

In [3]:
# analysis folder version: name of the versioned sub-folder that the input and
# output paths in this notebook are built from
analysis_version = '008'

In [4]:
# Output folder for everything this notebook writes:
# <analysis root>/<analysis_version>/005_home video reliability
out_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis',
                        analysis_version, 
                        '005_home video reliability')

# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, so the cell can be re-run without a separate existence check.
os.makedirs(out_path, exist_ok=True)

## Functions 

In [5]:
# correlation of same metrics from different data sources (left vs right videos)

def metric_correlation_rl(df, left_columns, right_columns, output_folder_path):
    """Spearman correlation between paired left- and right-video metric columns.

    For every pair ``(left_columns[i], right_columns[i])`` the rows with a
    missing value in either column are dropped, a scatter plot (right column on
    x, left column on y, identical axis limits) is saved to
    ``output_folder_path`` as ``'<left column>_.png'``, and Spearman's rank
    correlation is computed on the remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in the two lists.
    left_columns, right_columns : sequence of str
        Column names, paired by position.
    output_folder_path : str
        Folder for the scatter plots; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``right_column``, ``left_column``,
        ``corr_method``, ``rs`` (rounded to 2 decimals), ``p_value`` (rounded
        to 3 decimals) and ``n pairs``. ``rs`` and ``p_value`` are NaN when
        fewer than two complete pairs are available.

    Raises
    ------
    ValueError
        If ``left_columns`` and ``right_columns`` differ in length.
    """
    if len(left_columns) != len(right_columns):
        raise ValueError(
            f"left_columns ({len(left_columns)}) and right_columns "
            f"({len(right_columns)}) must have the same length"
        )

    os.makedirs(output_folder_path, exist_ok=True)

    # one dict per metric pair; converted to a DataFrame once at the end
    corr_results_all = []

    for current_left_col, current_right_col in zip(left_columns, right_columns):
        # Drop rows with NaN values in either column - required to run spearman r
        clean_df = df.dropna(subset=[current_left_col, current_right_col])
        n_pairs = len(clean_df)

        # Nothing to draw without data; min()/max() would be NaN and matplotlib
        # rejects NaN axis limits.
        if n_pairs > 0:
            # explicit figure/axes so the plot never lands on a figure left
            # open by an earlier cell
            fig, ax = plt.subplots()
            sns.scatterplot(x=current_right_col, y=current_left_col, data=clean_df, ax=ax)

            # Set the x and y axis limits to the same range
            min_val = min(clean_df[current_left_col].min(), clean_df[current_right_col].min())
            max_val = max(clean_df[current_left_col].max(), clean_df[current_right_col].max())
            # identical limits (all values equal) are left to autoscaling
            if min_val < max_val:
                ax.set_xlim(min_val, max_val)
                ax.set_ylim(min_val, max_val)

            fig.savefig(os.path.join(output_folder_path, str(current_left_col) + '_.png'))
            plt.close(fig)

        # run spearman correlation and append
        if n_pairs >= 2:
            statistic, p_value = stats.spearmanr(clean_df[current_right_col], clean_df[current_left_col])
        else:
            # a rank correlation is undefined for fewer than two observations
            statistic, p_value = float('nan'), float('nan')

        corr_results_all.append({'right_column': current_right_col,
                                 'left_column': current_left_col,
                                 'corr_method': 'spearman',
                                 'rs': round(statistic, 2),
                                 'p_value': round(p_value, 3),
                                 'n pairs': n_pairs})

    # Create DataFrame with results
    corr_results_df = pd.DataFrame(corr_results_all)

    return corr_results_df

In [6]:
# use same columns as metric correlations 
def calculate_metric_mean_diff(df, left_columns, right_columns, output_folder_path):
    """Mean and mean-absolute difference (right - left) for paired metric columns.

    For every pair ``(left_columns[i], right_columns[i])`` the rows with a
    missing value in either column are dropped and the per-row difference
    ``right - left`` is summarised. A box plot with the individual differences
    overlaid (one dot per row of ``df``) is saved to ``output_folder_path`` as
    ``'<left column>_diff_box.png'``; no plot is written for a pair without any
    complete rows.

    Percentages are computed from the unrounded statistics and rounded only
    afterwards, so metrics with small magnitudes are not distorted (or turned
    into a division by zero) by intermediate rounding.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in the two lists.
    left_columns, right_columns : sequence of str
        Column names, paired by position.
    output_folder_path : str
        Folder for the plots; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the number of complete pairs, the mean of all
        left and right values, the mean and mean-absolute difference, both
        expressed as a percentage of that mean, and two pre-formatted
        ``"value (percent%)"`` strings. Percentages are NaN when there are no
        complete pairs or the overall mean is exactly zero.

    Raises
    ------
    ValueError
        If ``left_columns`` and ``right_columns`` differ in length.
    """
    if len(left_columns) != len(right_columns):
        raise ValueError(
            f"left_columns ({len(left_columns)}) and right_columns "
            f"({len(right_columns)}) must have the same length"
        )

    os.makedirs(output_folder_path, exist_ok=True)

    mean_error_all = []

    for current_left_col, current_right_col in zip(left_columns, right_columns):
        # Drop rows with NaN values in either column
        clean_df = df.dropna(subset=[current_left_col, current_right_col])
        n_pairs = len(clean_df)

        current_metric_diff = clean_df[current_right_col] - clean_df[current_left_col]

        # full-precision statistics; rounding happens only for reporting
        raw_mean_diff = current_metric_diff.mean()
        raw_abs_mean_diff = current_metric_diff.abs().mean()
        # mean of all left and right values (both columns have the same n here)
        raw_rl_mean = clean_df[[current_left_col, current_right_col]].mean().mean()

        current_mean_diff = round(raw_mean_diff, 2)
        current_abs_mean_diff = round(raw_abs_mean_diff, 2)
        rl_mean = round(raw_rl_mean, 2)

        if n_pairs == 0 or raw_rl_mean == 0:
            # a percentage of the mean is undefined here
            mean_err_per = float('nan')
            mae_per = float('nan')
        else:
            mean_err_per = round((raw_mean_diff / raw_rl_mean) * 100, 2)
            mae_per = round((raw_abs_mean_diff / raw_rl_mean) * 100, 2)

        # plot (skipped when there is nothing to draw)
        if n_pairs > 0:
            fig, ax1 = plt.subplots()
            sns.boxplot(y=current_metric_diff, ax=ax1, fill=False, dodge=True, fliersize=0)
            sns.stripplot(y=current_metric_diff, ax=ax1, color='black', dodge=True)
            fig.suptitle('Right - Left Video')
            ax1.set_title(current_right_col)

            # center plot at zero: make the y-limits symmetric around 0
            ymin, ymax = ax1.get_ylim()
            ax1.set_ylim(min(ymin, -ymax), max(ymax, -ymin))
            ax1.set_ylabel('Error')

            # add line at zero
            ax1.axhline(y=0, color='grey', linestyle='--')
            fig.tight_layout()
            fig.savefig(os.path.join(output_folder_path, str(current_left_col + '_diff_box.png')))
            plt.close(fig)

        # mean difference
        mean_error_all.append({'right_col': current_right_col,
                               'left_col': current_left_col,
                               'n': n_pairs,
                               'mean_all_l_and_r_metrics': rl_mean,
                               'mean_difference': current_mean_diff,
                               'mean_diff_%_of_mean': mean_err_per,
                               'mean_abs_difference': current_abs_mean_diff,
                               'mean_abs_diff_%_of_mean': mae_per,
                               'formatted_mean_diff': f"{current_mean_diff} ({mean_err_per}%)",
                               'formatted_abs_mean_diff': f"{current_abs_mean_diff} ({mae_per}%)"})

    # Create DataFrame with results
    mean_error_df = pd.DataFrame(mean_error_all)

    return mean_error_df

In [7]:
# use same columns as metric correlation 

def bland_altman_plot(df, left_columns, right_columns, output_folder_path):
    """Save one Bland-Altman style plot per left/right metric pair.

    For each pair ``(left_columns[i], right_columns[i])`` rows with a missing
    value in either column are dropped; the per-row mean of the two values is
    plotted against the per-row difference (right - left), with horizontal
    lines at the mean difference, at mean +/- 1.96 * SD of the differences and
    at zero. Each figure is written to ``output_folder_path`` as
    ``'<left column>_blandalt.png'``. Nothing is returned.
    """
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)

    for pair_idx in range(len(left_columns)):
        current_left_col = left_columns[pair_idx]
        current_right_col = right_columns[pair_idx]

        paired_df = df.dropna(subset=[current_left_col, current_right_col])
        right_values = paired_df[current_right_col]
        left_values = paired_df[current_left_col]

        # x-axis: average of the two measurements; y-axis: their difference
        mean_measurements = (right_values + left_values) / 2
        diff_measurements = right_values - left_values

        # Bias and spread of the differences.
        # NOTE(review): np.std is called without ddof, i.e. numpy's default -
        # confirm that this is the SD intended for the limits of agreement.
        mean_diff = np.mean(diff_measurements)
        std_diff = np.std(diff_measurements)
        upper_limit = mean_diff + 1.96 * std_diff
        lower_limit = mean_diff - 1.96 * std_diff

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(mean_measurements, diff_measurements, alpha=1)

        # Reference lines: mean difference, limits of agreement, zero
        ax.axhline(mean_diff, color='black', linestyle='--', label=f'Mean diff: {mean_diff:.2f}')
        ax.axhline(upper_limit, color='red', linestyle='--', label=f'+1.96 SD: {upper_limit:.2f}')
        ax.axhline(lower_limit, color='blue', linestyle='--', label=f'-1.96 SD: {lower_limit:.2f}')
        ax.axhline(y=0, color='grey', linestyle='--')

        # Labels and titles
        ax.set_xlabel('Mean of Right and Left Turns Video Metric')
        ax.set_ylabel('Right Video - Left Video Metric')
        fig.suptitle('Right vs Left')
        ax.set_title(current_right_col)
        ax.legend()

        fig.savefig(os.path.join(output_folder_path, str(current_left_col + '_blandalt.png')))
        plt.close(fig)

## Load data 

In [8]:
# load hv clean data 
# Home Videos 
# The folder and file name are passed as separate components so that no
# backslash sits inside a non-raw string literal ('\h' is an invalid escape
# sequence that newer Python versions warn about).
hv_bw_clean_path = os.path.join(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\gait_bw_zeno_home_analysis', 
                                analysis_version, 
                                '000_merged_cleaned_data',
                                'hv_bw_merged_clean.csv') 

# first CSV column is used as the DataFrame index
hv_bw_clean_df = pd.read_csv(hv_bw_clean_path, index_col = 0) 

In [9]:
# clean - right_2 --> right and same for left 

# Fold the '_2' task labels into the plain left/right labels in a single pass
task_label_map = {
    'gait_vertical_left_2': 'gait_vertical_left',
    'gait_vertical_right_2': 'gait_vertical_right',
}
hv_bw_clean_df['task_pose_hv'] = hv_bw_clean_df['task_pose_hv'].replace(task_label_map)

# number of rows per task label after relabelling
hv_bw_clean_df['task_pose_hv'].value_counts()

task_pose_hv
gait_vertical_right    41
gait_vertical_left     40
Name: count, dtype: int64

In [10]:
# Keep only the columns whose name ends in 'hv'
hv_subset_cols = [name for name in hv_bw_clean_df.columns if name.endswith('hv')]
hv_subset_df = hv_bw_clean_df[hv_subset_cols]

# Reshape to wide format: one row per 'id_date_pose_hv' value, with every other
# column split into one column per 'task_pose_hv' value (left / right video)
hv_subset_wide_df = (
    hv_subset_df
    .pivot(index='id_date_pose_hv', columns='task_pose_hv')
    .reset_index()
)

# Flatten the two-level column labels by joining the levels with '_'
hv_subset_wide_df.columns = [
    '_'.join(label) if isinstance(label, tuple) else label
    for label in hv_subset_wide_df.columns
]

In [11]:
# Display the flattened column labels of the wide table
hv_subset_wide_df.columns

Index(['id_date_pose_hv_', 'video_id_date_name_pose_hv_gait_vertical_left',
       'video_id_date_name_pose_hv_gait_vertical_right',
       'frames_per_second_pose_hv_gait_vertical_left',
       'frames_per_second_pose_hv_gait_vertical_right',
       'total_video_duration_sec_pose_hv_gait_vertical_left',
       'total_video_duration_sec_pose_hv_gait_vertical_right',
       'delta_pix_h_rel_median_pose_hv_gait_vertical_left',
       'delta_pix_h_rel_median_pose_hv_gait_vertical_right',
       'walking_segmets_n_pose_hv_gait_vertical_left',
       ...
       'frameDiff_hs2_to1b_std_pose_hv_gait_vertical_left',
       'frameDiff_hs2_to1b_std_pose_hv_gait_vertical_right',
       'frameDiff_to1a_hs1a_std_pose_hv_gait_vertical_left',
       'frameDiff_to1a_hs1a_std_pose_hv_gait_vertical_right',
       'frameDiff_to1b_hs1b_std_pose_hv_gait_vertical_left',
       'frameDiff_to1b_hs1b_std_pose_hv_gait_vertical_right',
       'frameDiff_to2_hs2_std_pose_hv_gait_vertical_left',
       'frameDiff_

In [12]:

# Video-name and frame-rate columns are removed before the table is saved
non_metric_cols = ['video_id_date_name_pose_hv_gait_vertical_left', 'video_id_date_name_pose_hv_gait_vertical_right',
                   'frames_per_second_pose_hv_gait_vertical_right', 'frames_per_second_pose_hv_gait_vertical_left']

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
hv_subset_wide_df = hv_subset_wide_df.drop(columns=non_metric_cols, errors='ignore')
hv_subset_wide_df.to_csv(os.path.join(out_path, 'hv_metrics_only_wide.csv'))

# last expression of the cell, so the preview is actually displayed
hv_subset_wide_df.head()

In [13]:
left_hv_colnames = [col for col in hv_subset_wide_df.columns if col.endswith('left')] 

# Derive each right-video column from its left-video partner so the two lists
# are paired by name rather than by the order the columns happen to appear in.
right_hv_colnames = [col[:-len('left')] + 'right' for col in left_hv_colnames]

# Every derived name must exist, and no right-video column may be left unpaired
missing_right_cols = [col for col in right_hv_colnames if col not in hv_subset_wide_df.columns]
assert not missing_right_cols, f"right-video columns missing for: {missing_right_cols}"

unpaired_right_cols = [col for col in hv_subset_wide_df.columns
                       if col.endswith('right') and col not in right_hv_colnames]
assert not unpaired_right_cols, f"right-video columns without a left partner: {unpaired_right_cols}"

In [14]:
# double check if using this script - left and right are ordered the same 
    # first right metric is same as first left metric 

# Same number of left and right columns is required for positional pairing
assert len(left_hv_colnames) == len(right_hv_colnames), (
    f"{len(left_hv_colnames)} left columns vs {len(right_hv_colnames)} right columns"
)

for left_name, right_name in zip(left_hv_colnames, right_hv_colnames):
    print(left_name)
    print(right_name)
    print('------------')

# Automated version of the visual check above: with the 'left' / 'right'
# suffix removed, both names of a pair must be identical.
mismatched_pairs = [
    (left_name, right_name)
    for left_name, right_name in zip(left_hv_colnames, right_hv_colnames)
    if not (left_name.endswith('left')
            and right_name.endswith('right')
            and left_name[:-len('left')] == right_name[:-len('right')])
]
assert not mismatched_pairs, f"left/right columns are not paired correctly: {mismatched_pairs}"

total_video_duration_sec_pose_hv_gait_vertical_left
total_video_duration_sec_pose_hv_gait_vertical_right
------------
delta_pix_h_rel_median_pose_hv_gait_vertical_left
delta_pix_h_rel_median_pose_hv_gait_vertical_right
------------
walking_segmets_n_pose_hv_gait_vertical_left
walking_segmets_n_pose_hv_gait_vertical_right
------------
walking_segments_duration_mean_pose_hv_gait_vertical_left
walking_segments_duration_mean_pose_hv_gait_vertical_right
------------
walking_segments_duration_median_pose_hv_gait_vertical_left
walking_segments_duration_median_pose_hv_gait_vertical_right
------------
stride_time_mean_sec_pose_hv_gait_vertical_left
stride_time_mean_sec_pose_hv_gait_vertical_right
------------
stride_time_median_sec_pose_hv_gait_vertical_left
stride_time_median_sec_pose_hv_gait_vertical_right
------------
stride_time_std_pose_hv_gait_vertical_left
stride_time_std_pose_hv_gait_vertical_right
------------
stride_time_cv_pose_hv_gait_vertical_left
stride_time_cv_pose_hv_gait_vertic

## Test-Retest Reliability 
- How similar are the metrics from the left and right videos sent by the same person on the same day?

### All pairs 

In [15]:
# Spearman R correlation for each metric (left vs right video)
corr_out_path = os.path.join(out_path, 'right_vs_left_metric_correlation')

hv_lr_corr_results_df = metric_correlation_rl(
    df=hv_subset_wide_df,
    left_columns=left_hv_colnames,
    right_columns=right_hv_colnames,
    output_folder_path=corr_out_path,
)

# Save the per-metric results table in the same folder
corr_results_csv = os.path.join(corr_out_path, 'correlation_results_rl.csv')
hv_lr_corr_results_df.to_csv(corr_results_csv)

In [16]:
# Mean Difference and Mean Absolute Difference Right vs Left 
mean_diff_path = os.path.join(out_path, 'right_vs_left_diff')

mean_diff_df = calculate_metric_mean_diff(
    df=hv_subset_wide_df,
    left_columns=left_hv_colnames,
    right_columns=right_hv_colnames,
    output_folder_path=mean_diff_path,
)

# Save the per-metric difference table in the same folder
mean_diff_csv = os.path.join(mean_diff_path, 'right_left_diff_results_rl.csv')
mean_diff_df.to_csv(mean_diff_csv)

In [17]:
# Bland-Altman plots, one per left/right metric pair
ba_path = os.path.join(out_path,  'right_vs_left_metric_blandalt')

bland_altman_plot(
    df=hv_subset_wide_df,
    left_columns=left_hv_colnames,
    right_columns=right_hv_colnames,
    output_folder_path=ba_path,
)