In [None]:

import pandas as pd
import numpy as np
import polars as pl
import time
import matplotlib.pyplot as plt
from extraction_functions import *

# Root folder for every input file, given relative to this notebook's location.
path = '../../../data'


# Read the label table; the cells below iterate over its rows and attach
# the extracted features to it as new columns.
df_labels = pd.read_csv(filepath_or_buffer=f'{path}/sensor_data.csv')

### Once the file has been loaded, we extract the standard WS features first.
### To compartmentalise the code, each cell focuses on one or more features within a single domain, rather than extracting every feature for one participant at a time.
### We begin with the step-count features:

In [None]:
# Step-count features and the distribution of the steps-to-heart-rate ratio.
#
# For every row of df_labels this cell:
#   1. reads the patient's hourly gait file and stores the average number of
#      daily steps (total steps / days of good-quality wear time) in 'steps';
#   2. stores the number of rows with >= 100 steps, per day, in 'MVPA steps';
#   3. pairs the step rows with the resampled heart-rate series and stores the
#      25th / 50th / 75th / 95th percentiles of steps/HR, taken over rows where
#      both steps and HR are > 0, in 'Q1', 'Q2', 'Q3' and 'Q95'.
#
# Every result column starts as NaN, so a patient that is skipped (missing
# file, no wear time, no walking data) is left as "missing" rather than 0.
for feature_column in ['steps', 'MVPA steps', 'Q1', 'Q2', 'Q3', 'Q95']:
    df_labels[feature_column] = np.nan

# Output column -> quantile level of the steps/HR ratio.
ratio_quantiles = {'Q1': 0.25, 'Q2': 0.50, 'Q3': 0.75, 'Q95': 0.95}

# Loop over each patient file
for index, row in df_labels.iterrows():
    patient_id = row['Patient ID']
    print(f'Processing patient {patient_id}...')

    # Extract numeric patient ID by stripping the 'R' prefix and removing leading zeros
    numeric_patient_id = str(int(patient_id.lstrip('R')))

    # Read the patient's step data
    patient_file = f'{path}/steps/counts/{numeric_patient_id}_gait_hourly.csv'
    try:
        df_patient = pd.read_csv(patient_file)
    except FileNotFoundError:
        print(f"File not found for patient {patient_id}. Skipping.")
        continue  # all feature columns stay NaN for this patient

    # Days of good-quality wear time; guard against 0 / NaN so the per-day
    # averages below never become inf or raise.
    days = row['good_wear_hrs'] / 24
    if not days > 0:
        print(f"No good wear time for patient {patient_id}. Skipping.")
        continue

    # Average number of steps per day
    total_steps = df_patient['steps'].sum()
    df_labels.at[index, 'steps'] = total_steps / days

    # Rows with >= 100 steps, per day, as the estimate of MVPA steps
    mvpa_rows = int((df_patient['steps'] >= 100).sum())
    df_labels.at[index, 'MVPA steps'] = mvpa_rows / days

    # --- Speed-HR ratio ---

    # Keep only the span between the first and last row that has a
    # 'walking_time' value (label-based slice, inclusive on both ends).
    rows_with_walking = df_patient.index[df_patient['walking_time'].notnull()]
    if len(rows_with_walking) == 0:
        print(f"No walking data for patient {patient_id}. Skipping ratio features.")
        continue
    df_patient = df_patient.loc[rows_with_walking[0]:rows_with_walking[-1]]

    # Heart-rate data for this patient
    hr_file = f'{path}/hr_values/{patient_id}.npy'
    try:
        hr_data = np.load(hr_file)
    except FileNotFoundError:
        print(f"HR file not found for patient {patient_id}. Skipping ratio features.")
        continue

    # NOTE(review): the helper name is spelled 'resmaple' here; it comes from
    # the star import of extraction_functions, so the spelling must match the
    # definition there - confirm before renaming.
    hr_resampled = resmaple_hr_data(hr_data)

    # Truncate the HR series to the number of retained step rows.
    # NOTE(review): this pairs the first HR values with the first retained
    # step rows, i.e. it assumes both series start at the same time - confirm.
    df_hr_resampled = (
        pd.DataFrame({'hr': hr_resampled})
        .iloc[:len(df_patient)]
        .reset_index(drop=True)
    )

    # Put steps and HR side by side and compute the ratio of steps to HR
    df_combined = pd.concat([df_patient.reset_index(drop=True), df_hr_resampled], axis=1)
    df_combined['ratio'] = df_combined['steps'] / df_combined['hr']

    # Active periods only: rows where steps > 0 and hr > 0
    df_active = df_combined[(df_combined['steps'] > 0) & (df_combined['hr'] > 0)]

    # Distribution of the ratio over the active periods (NaN if there are none)
    for quantile_column, level in ratio_quantiles.items():
        df_labels.at[index, quantile_column] = df_active['ratio'].quantile(level)

# The following cells refer to this table as `df`, but the notebook never
# defines that name; bind it to the same object so a fresh
# "Restart & Run All" works and the features above are carried forward.
df = df_labels





Processing patient R001...
1
Processing patient R002...
2
Processing patient R003...
3
Processing patient R004...
4
Processing patient R005...
5
Processing patient R006...
6
Processing patient R007...
7
Processing patient R008...
8
Processing patient R009...
9
Processing patient R010...
10
Processing patient R011...
11
Processing patient R012...
12
Processing patient R013...
13
Processing patient R014...
14
Processing patient R015...
15
Processing patient R016...
16
Processing patient R017...
17
Processing patient R018...
18
Processing patient R019...
19
Processing patient R020...
20
Processing patient R021...
21
Processing patient R022...
22
Processing patient R023...
23
Processing patient R024...
24
Processing patient R025...
25
Processing patient R026...
26
Processing patient R027...
27
Processing patient R028...
28
Processing patient R029...
29
Processing patient R030...
30
Processing patient R031...
31
Processing patient R032...
32
Processing patient R033...
33
Processing patient 

### Now that the step features have been extracted, the movement features are extracted next.

In [None]:
# Heart-rate and activity-intensity features.
#
# For every row of df this cell loads the patient's HR values and activity
# classification, aligns them, and stores:
#   - 'Resting HR': mean non-zero HR during sleep, falling back to the mean
#     non-zero HR of sedentary rows when no sleep HR is available;
#   - 'Max HR' / 'Min HR': extremes of the HR column (minimum ignores zeros);
#   - 'Time in MVPA' / 'Time in LPA' / 'Time in SB': minutes per day in each
#     intensity class (each row is 30 s, so rows / 2 = minutes);
#   - 'MVPA HR' / 'LPA HR' / 'SB HR': mean non-zero HR within each class.

# Add the empty result columns that the loop fills in
for hr_feature_column in ['Resting HR', 'Max HR', 'Min HR',
                          'Time in MVPA', 'Time in LPA', 'Time in SB',
                          'MVPA HR', 'LPA HR', 'SB HR']:
    df[hr_feature_column] = np.nan

# Activity-classification column -> (time-per-day column, mean-HR column)
intensity_columns = {
    'moderate-vigorous': ('Time in MVPA', 'MVPA HR'),
    'light': ('Time in LPA', 'LPA HR'),
    'sedentary': ('Time in SB', 'SB HR'),
}

print('starting_loop')
for index, row in df.iterrows():
    patient_id = row['Patient ID']
    print(f'Starting patient {patient_id}')

    # Good-quality wear time for this patient, converted from hours to days
    days = row['good_wear_hrs'] / 24
    print(f'Days: {days}')

    # Load the HR data
    hr_values = np.load(f'{path}/hr_values/{patient_id}.npy')

    # Open the activity classification file
    acc_df = pd.read_csv(
        f'{path}/activity_class/{patient_id}_combined-timeSeries.csv.gz',
        compression='gzip',
    )

    # Bring the HR series onto the 30 s grid of the activity rows.
    # NOTE(review): presumably average_hr_30s averages HR into 30 s bins -
    # confirm in extraction_functions.
    hr_30s_values = average_hr_30s(hr_values)

    # Align HR and accelerometer data; the result carries an 'HR' column
    acc_df = align_hr_and_acc(hr_30s_values, acc_df)

    # HR == 0 is excluded from every HR statistic below
    valid_hr = acc_df['HR'] != 0

    # Resting HR: mean HR over sleep rows that have a non-zero HR.
    # (The result must be bound to `sleep_df`, the name filtered below.)
    sleep_df = extract_sleep_data(acc_df)
    sleep_df = sleep_df[sleep_df['HR'] != 0]
    if len(sleep_df) > 0:
        resting_hr = sleep_df['HR'].mean()
        print(f"Resting HR during sleep:{resting_hr}")
    else:
        print('No resting HR found')
        # Fall back to sedentary rows, again ignoring rows without an HR value
        # so zeros do not drag the average down.
        resting_hr = acc_df.loc[(acc_df['sedentary'] == 1) & valid_hr, 'HR'].mean()
    df.at[index, 'Resting HR'] = resting_hr

    # Maximum HR, and minimum HR that is not 0
    df.at[index, 'Max HR'] = acc_df['HR'].max()
    df.at[index, 'Min HR'] = acc_df.loc[valid_hr, 'HR'].min()

    # Minutes per day and mean HR for each intensity class
    for activity, (time_column, hr_column) in intensity_columns.items():
        in_activity = acc_df[activity] == 1
        df.at[index, time_column] = (in_activity.sum() / 2) / days
        df.at[index, hr_column] = acc_df.loc[in_activity & valid_hr, 'HR'].mean()
    print(f"Time in MVPA: {df.at[index, 'Time in MVPA']}")


Unnamed: 0,Patient ID,Biosensor ID,file_name,Start,ecg_qual,wear_hrs,acc_qual,good_wear_hrs,steps,MVPA steps,age
0,R001,AVNDI,01_10,2022-12-21 14:00:00,75.51448,74.24,98.695732,73.271711,8521.160357,10.481535,57
1,R002,AVNDM,01_10,2023-01-03 12:15:00,74.175622,67.982222,83.084581,56.482744,1669.890529,2.97436,50
2,R003,AVNDG,01_10,2023-01-09 10:32:00,85.782574,94.72,86.894813,82.306767,3841.433855,4.373881,57
3,R004,BBJMP,01_10,2023-01-17 12:50:00,60.794467,79.928889,63.739338,50.946144,6201.372124,26.380799,67
4,R005,BGFJQ,01_10,2023-01-23 16:04:00,54.630006,47.786667,86.210519,41.197133,1131.340854,0.0,75


In [None]:
# Persist the feature table as CSV without the index column.
# Note: the target is the same sensor_data.csv that the first cell reads,
# so running this cell overwrites the input file.
df.to_csv(path_or_buf=f'{path}/sensor_data.csv', index=False)