In [1]:
import ast
import os
import time

import matplotlib.pyplot as plt
import neurokit2 as nk
import numpy as np
import pandas as pd
import polars as pl

# Root of the data directory, relative to this notebook
path = '../../../data'

# Load the patient label sheet (an Excel workbook, not a CSV)
df = pd.read_excel(f'{path}/data_labels.xlsx')

# label_freq.txt is expected to contain a Python dict literal mapping signal
# names to sampling frequencies. Parse it with ast.literal_eval instead of
# eval so that the file content can never execute arbitrary code, and use a
# context manager so the handle is closed even if parsing fails.
with open(f'{path}/label_freq.txt', 'r') as freq_file:
    label_freq = ast.literal_eval(freq_file.read())

# Sampling frequency stored under the 'ECG_A' key
ecg_hz = label_freq['ECG_A']
print(ecg_hz)



244.140625


In [2]:
# Requires df (loaded in the first cell).
# ecg_qual = percentage of samples in each patient's saved HR array that are
# non-zero.
df["ecg_qual"] = np.nan

for row_label, pid in zip(df.index, df["Patient ID"]):
    # Per-patient HR array stored as <path>/hr_values/<Patient ID>.npy
    hr_file = os.path.join(path, f"hr_values/{pid}.npy")
    hr_signal = np.load(hr_file)

    # Share of non-zero samples, expressed in percent
    valid_share = np.sum(hr_signal != 0) / len(hr_signal)
    df.at[row_label, "ecg_qual"] = valid_share * 100

In [3]:
# Expected patient IDs R001 ... R198 (number zero-padded to three digits)
formatted_list = [f"R{number:03d}" for number in range(1, 199)]

# Collect every expected ID that has no row in the 'Patient ID' column
known_ids = df['Patient ID'].values
to_remove = []
for expected_id in formatted_list:
    if expected_id not in known_ids:
        print(f"{expected_id} is not present in the DataFrame")
        to_remove.append(expected_id)

print(to_remove)




R095 is not present in the DataFrame
['R095']


In [4]:
# wear_hrs = length of each patient's ACC_X recording in hours, using a
# sampling rate of 25 Hz (25 samples/s * 60 s * 60 min per hour).
df["wear_hrs"] = np.nan

for row_label, pid, folder in zip(df.index, df['Patient ID'], df['file_name']):
    print(f'Processing patient {pid}...')

    # Read the ACC_X parquet file and convert it to a numpy array
    acc_x_samples = pl.read_parquet(f'{path}/bdf_files/{folder}/{pid}/ACC_X.parquet').to_numpy()

    # Number of samples divided by samples-per-hour gives the duration in hours
    samples_per_hour = 25*60*60
    df.at[row_label, 'wear_hrs'] = len(acc_x_samples) / samples_per_hour

    # A cross-check of this length against the ECG_A length (using ecg_hz)
    # was previously sketched here as commented-out code; it is not performed.




Processing patient R001...
Processing patient R002...
Processing patient R003...
Processing patient R004...
Processing patient R005...
Processing patient R006...
Processing patient R007...
Processing patient R008...
Processing patient R009...
Processing patient R010...
Processing patient R011...
Processing patient R012...
Processing patient R013...
Processing patient R014...
Processing patient R015...
Processing patient R016...
Processing patient R017...
Processing patient R018...
Processing patient R019...
Processing patient R020...
Processing patient R021...
Processing patient R022...
Processing patient R023...
Processing patient R024...
Processing patient R025...
Processing patient R026...
Processing patient R027...
Processing patient R028...
Processing patient R029...
Processing patient R030...
Processing patient R031...
Processing patient R032...
Processing patient R033...
Processing patient R034...
Processing patient R035...
Processing patient R036...
Processing patient R037...
P

In [5]:
# acc_qual      = percentage of accelerometer samples that are NOT flagged as
#                 missing; a sample is flagged when all three axes are < -2.
# good_wear_hrs = wear_hrs scaled by that percentage.
df['acc_qual'] = np.nan
df['good_wear_hrs'] = np.nan

for row_label, pid, folder in zip(df.index, df['Patient ID'], df['file_name']):
    print(f'Processing patient {pid}...')

    # Load the three accelerometer axes as numpy arrays (X, Y, Z order)
    patient_dir = f'{path}/bdf_files/{folder}/{pid}'
    axes = [
        pl.read_parquet(f'{patient_dir}/ACC_{axis}.parquet').to_numpy()
        for axis in ('X', 'Y', 'Z')
    ]

    # Stack the axes along axis 1 so each sample carries its X/Y/Z readings
    combined = np.stack(axes, axis=1)

    # Percentage of samples where every axis is below -2 (treated as missing)
    flagged = np.all(combined < -2, axis=1)
    missing_pct = np.sum(flagged) / len(combined) * 100
    good_pct = 100 - missing_pct
    print(good_pct)

    df.at[row_label, 'acc_qual'] = good_pct

    # Scale the recorded wear time by the fraction of usable samples
    recorded_hrs = df.at[row_label, 'wear_hrs']
    print(recorded_hrs)
    usable_hrs = recorded_hrs * (good_pct/100)
    print(usable_hrs)
    df.at[row_label, 'good_wear_hrs'] = usable_hrs






Processing patient R001...
98.69573156130268
74.24
73.2717111111111
Processing patient R002...
83.08458093619247
67.98222222222222
56.48274444444444
Processing patient R003...
86.89481278153153
94.72
82.30676666666666
Processing patient R004...
63.739337744661924
79.92888888888889
50.94614444444445
Processing patient R005...
86.21051897321429
47.78666666666667
41.19713333333334
Processing patient R006...
100.0
71.96444444444444
71.96444444444444
Processing patient R007...
99.9984313484252
72.24888888888889
72.24775555555556
Processing patient R008...
92.64993866219008
68.83555555555556
63.7761
Processing patient R009...
99.99826009825327
65.13777777777777
65.13664444444443
Processing patient R010...
90.54627732240436
52.053333333333335
47.132355555555556
Processing patient R011...
99.99769315944881
72.24888888888889
72.24722222222222
Processing patient R012...
100.0
43.80444444444444
43.80444444444444
Processing patient R013...
99.99834794960475
71.96444444444444
71.96325555555555
Proc

In [6]:
# Start a fresh removal list containing every patient whose good_wear_hrs is
# below 24. The printed 'Wear time' is the column from the label sheet, not
# the computed wear_hrs.
to_remove = []

short_wear = df.loc[df['good_wear_hrs'] < 24]
for _, record in short_wear.iterrows():
    print(f"Patient ID: {record['Patient ID']}, Wear time: {record['Wear time']}")
    to_remove.append(record['Patient ID'])

print(to_remove)

Patient ID: R018, Wear time: 37.26222222222222
Patient ID: R025, Wear time: 68.55111111111111
Patient ID: R094, Wear time: 10.80888888888889
Patient ID: R112, Wear time: 22.1600005
Patient ID: R162, Wear time: 52.90666666666667
['R018', 'R025', 'R094', 'R112', 'R162']


In [7]:
# Flag patients that have less than 18 h of usable heart rate within the first
# 24 h of their HR signal. `counter` tracks how many patients this cell newly
# adds to to_remove.
counter = 0

for index, row in df.iterrows():
    patient_id = row["Patient ID"]
    print(patient_id)

    reformat_path = os.path.join(path, f"hr_values/{patient_id}.npy")

    # Load the npy file
    hr = np.load(reformat_path)

    # Keep only the first 24 h of the HR signal (per the original notebook
    # note, each HR value covers 10 s, i.e. 6 values per minute)
    hr_data = hr[:6*60*24]

    # Resample to 1-minute values: mean of the non-zero values in each group
    # of 6; a minute whose values are all zero is recorded as 0
    hr_resampled = []
    for i in range(0, len(hr_data), 6):
        chunk = hr_data[i:i+6]
        non_zero_chunk = chunk[chunk != 0]
        minute_hr = np.mean(non_zero_chunk) if len(non_zero_chunk) > 0 else 0
        hr_resampled.append(minute_hr)

    # Number of minutes without any usable HR value
    zero_count = hr_resampled.count(0)
    print(zero_count)

    # Require at least 18 h of valid minutes. Counting valid minutes (rather
    # than only zero minutes) means a recording shorter than 24 h has its
    # absent time treated as missing too; for a full 24 h recording this is
    # exactly the previous `zero_count > 60*6` rule.
    valid_minutes = len(hr_resampled) - zero_count
    if valid_minutes < 18*60:
        if patient_id not in to_remove:
            to_remove.append(patient_id)
            counter += 1

print(f"Number of patients to remove: {counter}")
print(to_remove)

    


R001
53
R002
618
R003
654
R004
917
R005
415
R006
2
R007
40
R008
12
R009
714
R010
10
R011
1
R012
87
R013
48
R014
1263
R015
670
R016
112
R017
6
R018
527
R019
60
R020
42
R021
24
R022
23
R023
55
R024
49
R025
863
R026
138
R027
597
R028
1
R029
47
R030
169
R031
21
R032
185
R033
184
R034
46
R035
121
R036
84
R037
132
R038
6
R039
32
R040
163
R041
8
R042
425
R043
71
R044
7
R045
77
R046
78
R047
819
R048
4
R049
232
R050
10
R051
4
R052
69
R053
2
R054
20
R055
1
R056
112
R057
531
R058
265
R059
96
R060
11
R061
405
R062
365
R063
39
R064
39
R065
1
R066
447
R067
11
R068
9
R069
87
R070
196
R071
10
R072
1
R073
44
R074
39
R075
31
R076
5
R077
481
R078
128
R079
263
R080
60
R081
494
R082
7
R083
47
R084
46
R085
11
R086
64
R087
59
R088
201
R089
3
R090
0
R091
43
R092
7
R093
25
R094
88
R096
65
R097
118
R098
880
R099
10
R100
135
R101
10
R102
41
R103
60
R104
217
R105
2
R106
3
R107
8
R108
317
R109
260
R110
16
R111
6
R112
157
R113
91
R114
8
R115
28
R116
4
R117
23
R118
53
R119
211
R120
963
R121
3
R122
36
R123
10
R124
10

In [8]:
# Number of patient IDs flagged for removal so far
n_flagged = len(to_remove)
print(n_flagged)

33


In [9]:
# Hand-specified patient IDs to add to the removal list
new = ['R026', 'R066', 'R114', 'R174', 'R186']

# Report whether each ID is already flagged; append it when it is not
for candidate in new:
    if candidate not in to_remove:
        print(f"{candidate} is not in the to_remove list")
        to_remove.append(candidate)
        continue
    print(f"{candidate} is already in the to_remove list")
        

R026 is not in the to_remove list
R066 is already in the to_remove list
R114 is not in the to_remove list
R174 is not in the to_remove list
R186 is not in the to_remove list


In [10]:
# Save the to_remove list (as its Python list repr) to 'patients_to_remove3.txt'.
# The context manager guarantees the file is flushed and closed even if the
# write raises.
with open(f'{path}/patients_to_remove3.txt', 'w') as out_file:
    out_file.write(str(to_remove))

In [12]:
# Flag patients whose total amount of non-zero HR data is under 24 h.
for pid in df["Patient ID"]:
    # Load the patient's saved HR array
    hr_signal = np.load(os.path.join(path, f"hr_values/{pid}.npy"))

    # Count of non-zero HR values
    valid_samples = np.sum(hr_signal != 0)

    # Per the original notebook note each value covers 10 s:
    # 6 values per minute, 60 minutes per hour
    valid_minutes = valid_samples / 6
    valid_hours = valid_minutes / 60

    print(f"Patient ID: {pid}, Remaining hours: {valid_hours}")

    # Under 24 h of valid HR: report again and flag if not already flagged
    if valid_hours < 24:
        print(f"Patient ID: {pid}, Remaining hrs: {valid_hours}")
        if pid not in to_remove:
            to_remove.append(pid)

print(to_remove)
print(len(to_remove))

Patient ID: R001, Remaining hours: 56.06111111111111
Patient ID: R002, Remaining hours: 50.425
Patient ID: R003, Remaining hours: 81.25277777777778
Patient ID: R004, Remaining hours: 48.59166666666667
Patient ID: R005, Remaining hours: 26.105555555555554
Patient ID: R006, Remaining hours: 70.63611111111112
Patient ID: R007, Remaining hours: 65.34166666666667
Patient ID: R008, Remaining hours: 57.68333333333333
Patient ID: R009, Remaining hours: 15.56388888888889
Patient ID: R009, Remaining hrs: 15.56388888888889
Patient ID: R010, Remaining hours: 40.9
Patient ID: R011, Remaining hours: 53.66388888888889
Patient ID: R012, Remaining hours: 36.083333333333336
Patient ID: R013, Remaining hours: 56.31388888888889
Patient ID: R014, Remaining hours: 33.577777777777776
Patient ID: R015, Remaining hours: 31.194444444444446
Patient ID: R016, Remaining hours: 65.23055555555555
Patient ID: R017, Remaining hours: 47.25277777777777
Patient ID: R018, Remaining hours: 14.052777777777777
Patient ID: R0

In [8]:
# Write the patient IDs to be removed (as the Python list repr) to
# 'patients_to_remove.txt' in the data folder. The context manager guarantees
# the file is flushed and closed even if the write raises.
with open(f'{path}/patients_to_remove.txt', 'w') as out_file:
    out_file.write(str(to_remove))


In [13]:
print(df.columns)

# Columns from the original label sheet that are dropped before saving
legacy_columns = [
    'Wear time',
    'missing ACC (% of total signal)',
    'missing ECG overall (% of total signal)',
    'Missing ECG from electrode (% of total signal)',
]
df = df.drop(columns=legacy_columns)

# Save the remaining columns to a CSV file without the index
df.to_csv(f'{path}/sensor_data.csv', index=False)

Index(['Patient ID', 'Biosensor ID', 'file_name', 'Start', 'Wear time',
       'missing ACC (% of total signal)',
       'missing ECG overall (% of total signal)',
       'Missing ECG from electrode (% of total signal)', 'ecg_qual',
       'wear_hrs', 'acc_qual', 'good_wear_hrs'],
      dtype='object')
