In [1]:
import pandas as pd 
import numpy as np 
import os 
import seaborn as sns 
import matplotlib.pyplot as plt
import math
import scipy.signal as sig
import scipy.stats as stats 

In [2]:
def merge_bw_zv(bw_df, zv_df):

    print('total bw rows with id in video dataset') 
    print(len(zv_df))

    # merge bw data set rows with zeno videos rows 
        # merge bw data set rows with zeno videos rows 
        # id and date needs to be the same 
        # should only use each brainwalk visit once - once PWS_1 video per person 

    merged_bw_zv = []

    # Loop through each row in av_df
    for index, zv_row in zv_df.iterrows():
   
        current_id = zv_row['id_video']
        current_date = zv_row['visit_date_video']
        zv_row_df = pd.DataFrame([zv_row])

        # Find rows in brainwalk data set with same id and same date as current zv data 
        zv_in_bw_current_id_rows = bw_df[(bw_df['bw_id'] == current_id) & (bw_df['trialdate'] == current_date)]
        #zv_in_bw_current_id_date_rows = zv_in_bw_current_id_rows[zv_in_bw_current_id_rows['visit_date'] == current_date]
   
        if len(zv_in_bw_current_id_rows) == 1: 
            bw_row_to_merge = zv_in_bw_current_id_rows
            # merge bw and zv, store merged row
            merged_row = zv_row_df.merge(bw_row_to_merge, left_on='id_video', right_on='bw_id')
            merged_bw_zv.append(merged_row)
        
        # if more than one row for the id and date, pick one with least na values 
        elif len(zv_in_bw_current_id_rows) > 1:
            bw_row_to_merge = zv_in_bw_current_id_rows.loc[[zv_in_bw_current_id_rows.isna().sum(axis=1).idxmin()]]
            # merge bw and zv, store merged row
            merged_row = zv_row_df.merge(bw_row_to_merge, left_on='id_video', right_on='bw_id')
            merged_bw_zv.append(merged_row)

            print('multiple rows for the id and date combo')
            print(current_id)
            print(current_date)

        else: 
            print('No matching id and daterow from video vs mat')
            print(current_id)
            print(current_date)


    # merge all bw and zv data together 
    merged_bw_zv_df = pd.concat(merged_bw_zv)
    merged_bw_zv_df = merged_bw_zv_df.reset_index(drop=True) # reset index 

    # check same ID for each row 
    print('mismatched zeno video vs brainwalk id')
    print(sum(merged_bw_zv_df['id_video'] != merged_bw_zv_df['bw_id']))

    print('mismatched zeno video vs brainwalk date')
    print(sum(merged_bw_zv_df['visit_date_video'] != merged_bw_zv_df['trialdate']))

    # saved merged df for future reference 
#    merged_bw_zv_df.to_csv(os.path.join(out_path,  'zv_bw_merged_' + task + '.csv'))

    return merged_bw_zv_df

In [3]:
# load brainwalkd data 
bw_df = pd.read_excel(r'C:\Users\mmccu\AppData\Local\Temp\ccsecure\2025_01_24_BrainWalk_AllData_Long_MM.xlsx', 
                     index_col = None, 
                     usecols = ['bw_id', 'record_id', 'trialdate', 'visit_date', 'bingoEHR_EDSS_measure_value', 'msfcEHR_T25FW SPEED AVG',
                                'PWS_velocitycmsecmean', 'FW_velocitycmsecmean'])

In [4]:
# load depth proxy data and 
depth_proxy_describe_all = pd.read_csv(r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\test_velocity_01302025\zeno_all_height_proxy_outputs\depth_proxy_all.csv',
                                      index_col = 0)
depth_proxy_describe_all['visit_date_video'] = pd.to_datetime(depth_proxy_describe_all['visit_date_video'].str.replace('_', '-'), format='%Y-%m-%d')

In [5]:
# add bw data to describe df 
depth_proxy_w_bw = merge_bw_zv(bw_df, depth_proxy_describe_all)

total bw rows with id in video dataset
621
No matching id and daterow from video vs mat
BW-0121
2022-07-20 00:00:00
No matching id and daterow from video vs mat
BW-0322
2024-06-10 00:00:00
No matching id and daterow from video vs mat
BW-0322
2024-06-10 00:00:00
mismatched zeno video vs brainwalk id
0
mismatched zeno video vs brainwalk date
0


In [6]:
def scatterplot_all_vs_one(df, column_name, output_path, folder_name):
    
    for current_col_2 in df.columns:
        clean_df = df.dropna(subset=[column_name, current_col_2])

        # consistent for all plots 
        col_1_data = clean_df[column_name]

        # loop through each other column 
        print(current_col_2)
        col_2_data =  clean_df[current_col_2]

        # scatterplot 
        plt.scatter(col_1_data, col_2_data, alpha = 0.5)
        plt.xlabel(column_name)
        plt.ylabel(current_col_2) 

        if not os.path.exists(os.path.join(output_path, folder_name)): 
            os.makedirs(os.path.join(output_path, folder_name))

        plt.savefig(os.path.join(output_path, folder_name, current_col_2 + '.png')) 
        plt.close()

        # correlation 
        corr_results = stats.spearmanr(col_1_data, col_2_data)
        print(f"r statistic: {corr_results.statistic : .2f}")
        print(f"p value: {corr_results.pvalue :.2f}")
        print('-----------------')
        

In [7]:
output_path = r'C:\Users\mmccu\Box\MM_Personal\5_Projects\BoveLab\3_Data_and_Code\test_velocity_01302025\zeno_all_height_proxy_outputs\scatterplots'
if not os.path.exists(output_path): 
    os.makedirs(output_path)

In [8]:
depth_proxy_w_bw_fw = depth_proxy_w_bw.loc[depth_proxy_w_bw['filename'].str.contains('FW_1')]  
depth_proxy_w_bw_pws = depth_proxy_w_bw.loc[depth_proxy_w_bw['filename'].str.contains('PWS_1')]

In [9]:
# plot all vars from fast walk videos vs t25fw 
scatterplot_all_vs_one(df = depth_proxy_w_bw_fw,
                       column_name = 'msfcEHR_T25FW SPEED AVG', 
                       output_path = output_path, 
                       folder_name = 't25fw_vs_depth_proxies')

delta_pix_h_count
r statistic:  0.48
p value: 0.00
-----------------
delta_pix_h_mean
r statistic: -0.30
p value: 0.00
-----------------
delta_pix_h_std
r statistic: -0.04
p value: 0.57
-----------------
delta_pix_h_min
r statistic: -0.33
p value: 0.00
-----------------
delta_pix_h_25%
r statistic: -0.40
p value: 0.00
-----------------
delta_pix_h_50%
r statistic: -0.42
p value: 0.00
-----------------
delta_pix_h_75%
r statistic: -0.35
p value: 0.00
-----------------
delta_pix_h_max
r statistic: -0.03
p value: 0.69
-----------------
delta_pix_h_rel_count
r statistic:  0.46
p value: 0.00
-----------------
delta_pix_h_rel_mean
r statistic: -0.48
p value: 0.00
-----------------
delta_pix_h_rel_std
r statistic: -0.17
p value: 0.01
-----------------
delta_pix_h_rel_min
r statistic: -0.39
p value: 0.00
-----------------
delta_pix_h_rel_25%
r statistic: -0.52
p value: 0.00
-----------------
delta_pix_h_rel_50%
r statistic: -0.61
p value: 0.00
-----------------
delta_pix_h_rel_75%
r statistic:

DTypePromotionError: The DType <class 'numpy.dtypes.Float64DType'> could not be promoted by <class 'numpy.dtypes.DateTime64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>)

In [10]:
# plot all vars vs PWS velocity 
scatterplot_all_vs_one(df = depth_proxy_w_bw_pws,
                       column_name = 'PWS_velocitycmsecmean', 
                       output_path = output_path, 
                       folder_name = 'pws_velocity_vs_depth_proxies')

delta_pix_h_count
r statistic: -0.34
p value: 0.00
-----------------
delta_pix_h_mean
r statistic:  0.41
p value: 0.00
-----------------
delta_pix_h_std
r statistic:  0.18
p value: 0.00
-----------------
delta_pix_h_min
r statistic:  0.29
p value: 0.00
-----------------
delta_pix_h_25%
r statistic:  0.50
p value: 0.00
-----------------
delta_pix_h_50%
r statistic:  0.52
p value: 0.00
-----------------
delta_pix_h_75%
r statistic:  0.52
p value: 0.00
-----------------
delta_pix_h_max
r statistic:  0.15
p value: 0.01
-----------------
delta_pix_h_rel_count
r statistic: -0.36
p value: 0.00
-----------------
delta_pix_h_rel_mean
r statistic:  0.55
p value: 0.00
-----------------
delta_pix_h_rel_std
r statistic:  0.21
p value: 0.00
-----------------
delta_pix_h_rel_min
r statistic:  0.38
p value: 0.00
-----------------
delta_pix_h_rel_25%
r statistic:  0.65
p value: 0.00
-----------------
delta_pix_h_rel_50%
r statistic:  0.72
p value: 0.00
-----------------
delta_pix_h_rel_75%
r statistic:

DTypePromotionError: The DType <class 'numpy.dtypes.Float64DType'> could not be promoted by <class 'numpy.dtypes.DateTime64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>)

In [11]:
# plot all vars vs FW velocity 
scatterplot_all_vs_one(df = depth_proxy_w_bw_fw,
                       column_name = 'FW_velocitycmsecmean', 
                       output_path = output_path, 
                       folder_name = 'fw_velocity_vs_depth_proxies')

delta_pix_h_count
r statistic: -0.51
p value: 0.00
-----------------
delta_pix_h_mean
r statistic:  0.38
p value: 0.00
-----------------
delta_pix_h_std
r statistic:  0.13
p value: 0.02
-----------------
delta_pix_h_min
r statistic:  0.29
p value: 0.00
-----------------
delta_pix_h_25%
r statistic:  0.43
p value: 0.00
-----------------
delta_pix_h_50%
r statistic:  0.50
p value: 0.00
-----------------
delta_pix_h_75%
r statistic:  0.42
p value: 0.00
-----------------
delta_pix_h_max
r statistic:  0.08
p value: 0.16
-----------------
delta_pix_h_rel_count
r statistic: -0.53
p value: 0.00
-----------------
delta_pix_h_rel_mean
r statistic:  0.49
p value: 0.00
-----------------
delta_pix_h_rel_std
r statistic:  0.21
p value: 0.00
-----------------
delta_pix_h_rel_min
r statistic:  0.34
p value: 0.00
-----------------
delta_pix_h_rel_25%
r statistic:  0.52
p value: 0.00
-----------------
delta_pix_h_rel_50%
r statistic:  0.64
p value: 0.00
-----------------
delta_pix_h_rel_75%
r statistic:

DTypePromotionError: The DType <class 'numpy.dtypes.Float64DType'> could not be promoted by <class 'numpy.dtypes.DateTime64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>)