## 1. Preprocessing the dataset of the accelerations

### Loading the dataset

In [150]:
import pandas as pd

columns = ["time", "x_acceleration", "y_acceleration", "z_acceleration"]

df_left = pd.read_csv('data/left_accs.csv', header=None, names=columns)
df_right = pd.read_csv('data/right_accs.csv', header=None, names=columns)


df_left['time'] = pd.to_datetime(df_left['time'], unit='s')
df_right['time'] = pd.to_datetime(df_right['time'], unit='s')

df_left.set_index('time', inplace=True)
df_right.set_index('time', inplace=True)

In [155]:
print("Dataframe left:")
print(print(df_left.head()))
print("len(df_left):", len(df_left))
print("\n")
print("Dataframe right:")
print(print(df_right.head()))
print("len(df_right):", len(df_right))

Dataframe left:
                               x_acceleration  y_acceleration  z_acceleration
time                                                                         
2024-07-19 20:00:05.059999943             -36             284            -232
2024-07-19 20:00:05.079999924            -224             -92             -60
2024-07-19 20:00:05.099999905            -236             -96             -64
2024-07-19 20:00:05.119999886            -240             -96             -72
2024-07-19 20:00:05.140000105            -252            -108             -80
None
len(df_left): 32789520


Dataframe right:
                               x_acceleration  y_acceleration  z_acceleration
time                                                                         
2024-07-19 20:00:07.420000076             112             232             -96
2024-07-19 20:00:07.440000057            -240              20              56
2024-07-19 20:00:07.460000038            -248              20              60
2

### Fixing the size of our two datasets to the same length

In [156]:
start_time = max(df_left.index[0], df_right.index[0])
end_time = min(df_left.index[-1], df_right.index[-1])

df_left = df_left[start_time:end_time]
df_right = df_right[start_time:end_time]

print("Start time:", start_time)
print("End time:", end_time)

Start time: 2024-07-19 20:00:07.420000076
End time: 2024-07-27 08:00:01.700200081


### Merging left and right datasets

In [106]:
### 2. Merging the datasets
df_left_and_right = pd.merge(df_left, df_right, on='time', suffixes=('_left', '_right'), how='outer')
df_left_and_right.head()

Unnamed: 0_level_0,x_acceleration_left,y_acceleration_left,z_acceleration_left,x_acceleration_right,y_acceleration_right,z_acceleration_right
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-19 20:00:07.420000076,-240.0,-100.0,-84.0,112.0,232.0,-96.0
2024-07-19 20:00:07.440000057,-240.0,-92.0,-84.0,-240.0,20.0,56.0
2024-07-19 20:00:07.460000038,-244.0,-100.0,-80.0,-248.0,20.0,60.0
2024-07-19 20:00:07.479799986,-244.0,-108.0,-80.0,,,
2024-07-19 20:00:07.480000019,,,,-248.0,20.0,56.0


### Interpolating values of the right dataset using time indexes of the left dataset

In [107]:
df_left_and_right['x_acceleration_left'] = df_left_and_right['x_acceleration_left'].interpolate(method='linear')
df_left_and_right['y_acceleration_left'] = df_left_and_right['y_acceleration_left'].interpolate(method='linear')
df_left_and_right['z_acceleration_left'] = df_left_and_right['z_acceleration_left'].interpolate(method='linear')

In [108]:
df_interpolated = df_left_and_right[df_left_and_right.index.isin(df_right.index)]

In [109]:
df_interpolated.shape

(31786550, 6)

In [62]:
df_interpolated.to_csv('data/interpolated_accs.csv', index=True)

In [110]:
df_interpolated

Unnamed: 0_level_0,x_acceleration_left,y_acceleration_left,z_acceleration_left,x_acceleration_right,y_acceleration_right,z_acceleration_right
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-19 20:00:07.420000076,-240.0,-100.0,-84.0,112.0,232.0,-96.0
2024-07-19 20:00:07.440000057,-240.0,-92.0,-84.0,-240.0,20.0,56.0
2024-07-19 20:00:07.460000038,-244.0,-100.0,-80.0,-248.0,20.0,60.0
2024-07-19 20:00:07.480000019,-244.0,-102.0,-82.0,-248.0,20.0,56.0
2024-07-19 20:00:07.500000000,-242.0,-96.0,-84.0,-252.0,20.0,60.0
...,...,...,...,...,...,...
2024-07-27 08:00:01.615000010,36.0,-180.0,-204.0,140.0,-12.0,208.0
2024-07-27 08:00:01.635299921,36.0,-180.0,-204.0,140.0,-12.0,204.0
2024-07-27 08:00:01.655699968,36.0,-180.0,-204.0,140.0,-12.0,208.0
2024-07-27 08:00:01.676000118,34.0,-180.0,-204.0,140.0,-12.0,204.0


### Resampling the dataset

In [111]:
df_resampled = df_interpolated.resample('60S').mean() # resample to 60 seconds

  df_resampled = df_interpolated.resample('60S').mean() # resample to 60 seconds


In [112]:
df_resampled.to_csv('data/resampled_accs.csv', index=True)

In [113]:
df_resampled.head(10)

Unnamed: 0_level_0,x_acceleration_left,y_acceleration_left,z_acceleration_left,x_acceleration_right,y_acceleration_right,z_acceleration_right
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-07-19 20:00:00,-246.499418,-93.416053,-43.676619,-251.978286,13.265607,14.371462
2024-07-19 20:01:00,-174.771166,-124.872492,-11.882353,-167.439646,9.154709,113.678341
2024-07-19 20:02:00,-68.565498,-88.782579,128.174209,-201.974821,9.12147,96.012249
2024-07-19 20:03:00,-62.486909,-92.762326,107.133628,-246.086365,5.434886,3.528052
2024-07-19 20:04:00,-91.68901,-109.599864,60.436203,-242.079619,8.563457,66.384485
2024-07-19 20:05:00,-10.84966,-217.731973,51.010204,-155.610884,2.556463,189.363265
2024-07-19 20:06:00,15.063265,-239.002721,74.834694,-161.429932,3.606803,193.439456
2024-07-19 20:07:00,-80.055763,-129.50561,73.179191,-143.405644,-37.271676,-88.748045
2024-07-19 20:08:00,95.335829,-200.076216,65.210616,-161.882273,12.193263,63.43246
2024-07-19 20:09:00,66.668027,-176.448299,96.757143,-185.383673,61.601361,159.893878


## Preprocessing the dataset of activities

### Patients

In [161]:
import pandas as pd

# Manually input the journal data into a dictionary
patient_activity_data = {  
    'start_time': [
        '20:05', '20:38', '21:09',                                                  # Jour de Pose
        '08:05', '08:15', None, None, '09:15', '08:45', '14:00', '19:30', '20:00',  # Jour 1
        '09:40', None, None, '13:28', '16:45', '17:16', '20:47', '21:01',           # Jour 2
        '07:14', None, '07:44', '13:30', '13:35', '14:25', '24:00',                 # Jour 3
        '07:15', '07:31', '07:40', '13:15', '20:30', '20:20', '08:40',              # Jour 4
        '07:36', '07:45', '08:19', '13:30', None, '16:40', '20:02'                  # Jour 5
    ],
    'end_time': [
        '20:12', '21:05', '21:15',                                                  # Jour de Pose
        '08:15', None, None, '08:45', '10:00', '09:15', None, '20:04', '21:40',     # Jour 1
        '10:50', None, '11:15', '13:48', '17:15', '17:30', '21:00', '21:05',        # Jour 2
         None, None,'08:03', None,'14:24', '14:30', None,                           # Jour 3
        '07:30', '07:35', '08:15', '14:00',None, None, '19:00'                      # Jour 4
        '07:44', '08:18', '08:31', None, '14:10', '18:10', '20:44'                  # Jour 5
    ],
    'activity': [
        'S', 'A', 'P',                                                              # Jour de Pose
        'T', 'Hbas', 'Hhaut', 'S', 'P', 'A', 'A', 'T', 'A',                         # Jour 1
        'A', 'Hhaut', 'Hbas','A', 'M', 'S', 'A', 'P',                               # Jour 2
        'T/S', 'Hhaut /Hbas', 'A', 'P', 'A', 'P', 'Hbas/Hhaut',                     # Jour 3
        'T/S', 'Hhaut /Hbas', 'A', 'A', 'A', 'T', 'Travail',                        # Jour 4
        'T/S', 'A', 'P', 'P', 'A/P', 'Kiné', 'P/A/P'                                # Jour 5
    ]
}

### Partner

In [162]:
partner_activity_data = {  
    'start_time': [
        '20:32', '21:05', None,                     # Jour de Pose
        '08:00', '08:15', '08:23', '08:50',         # Jour 1
        '10:00', '20:42', '21:03',                  # Jour 2
        '07:15', '07:29', '07:44', '08:06',         # Jour 3
        '07:15', '07:30', '07:42',                  # Jour 4
        '07:35', '07:45', '07:55', '08:15'          # Jour 5
    ],
    'end_time': [
        '21:05', '21:12', None,                     # Jour de Pose
        '08:15', '08:23', None, None,               # Jour 1
        '10:30', '21:00', '21:09',                  # Jour 2
        '07:28', '07:39', '08:01', None,            # Jour 3
        '07:30', '07:39', '08:00',                  # Jour 4
        '07:45', '07:50','08:15', '08:30'           # Jour 5
    ],
    'activity': [
        'A', 'P', 'T',                              # Jour de Pose
        'T', 'S', 'H', None,                        # Jour 1
        'A', 'A', 'P',                              # Jour 2
        'S', 'H', 'A', 'S',                         # Jour 3
        'S', 'H', 'A',                              # Jour 4
        'S', 'H', 'A', 'P'                          # Jour 5
    ]
}

First, we apply all the assumptions made in the subject. Then we decided to fill the Nan by looking at the mean of the considered task, in order to complete them. H usually last 8 minutes, A 33 minutes and T 14 minutes. For the S/T situation, we keep only T because it's the main action in this short amount of time (around 10 minutes). We decided to also drop the raw "Travail" which is not assigned to any specific category and plus not very informative as it's lasting for the all day

In [163]:
# Manually input the journal data into a dictionary
combined_activity_data = {  
    'start_time': [
        '20:05', '20:38', '21:09',                                                              # Jour de Pose
        '08:05', '08:15', '08:34', '09:15', '08:45', '14:00', '19:30', '20:00',                 # Jour 1
        '09:40', '11:08', '13:28', '16:45', '17:16', '20:47', '21:01',                          # Jour 2
        '07:14', '07:29', '07:44', '13:30', '13:35', '14:25',                                   # Jour 3
        '00:00','07:15', '07:31', '07:40', '13:15', '20:30', '20:20',                           # Jour 4
        '07:36', '07:45', '08:19', '13:30', '13:32','14:05', '16:40', '20:02', '20:07', '20:39' # Jour 5
    ],
    'end_time': [
        '20:12', '21:05', '21:15',                                                              # Jour de Pose
        '08:15', '8:23', '08:45', '10:00', '09:15', '14:33', '20:04', '21:40',                  # Jour 1
        '10:50', '11:15', '13:48', '17:15', '17:30', '21:00', '21:05',                          # Jour 2
        '07:28', '07:39','08:03', '23:35','14:24', '14:30',                                     # Jour 3
        '00:08','07:30', '07:35', '08:15', '14:00','21:03', '20:34',                            # Jour 4
        '07:44', '08:18', '08:31', '13:35', '14:05','14:10', '18:10','20:07', '20:39', '20:44'  # Jour 5
    ],
    'activity': [
        'S', 'A', 'P',                                                                          # Jour de Pose
        'T', 'H', 'S', 'P', 'A', 'A', 'T', 'A',                                                 # Jour 1
        'A', 'H','A', 'M', 'S', 'A', 'P',                                                       # Jour 2
        'T', 'H', 'A', 'P', 'A', 'P',                                                           # Jour 3
        'H','T', 'H', 'A', 'A', 'A', 'T',                                                       # Jour 4
        'T', 'A', 'P', 'P', 'A', 'P', 'Kiné', 'P', 'A', 'P'                                     # Jour 5
    ]
}

In [171]:
day_labels = ['Jour de Pose'] * 3 + \
             ['Jour 1'] * 8 + \
             ['Jour 2'] * 7 + \
             ['Jour 3'] * 6 + \
             ['Jour 4'] * 7 + \
             ['Jour 5'] * 10  

combined_activity_df = pd.DataFrame(combined_activity_data)

combined_activity_df['day'] = day_labels

combined_activity_df['start_time'] = pd.to_datetime(combined_activity_df['start_time'], format='%H:%M', errors='coerce').dt.time
combined_activity_df['end_time'] = pd.to_datetime(combined_activity_df['end_time'], format='%H:%M', errors='coerce').dt.time

combined_activity_df.to_csv("data/combined_activity_journal.csv", index=False)

In [172]:
combined_activity_df.head(10)

Unnamed: 0,start_time,end_time,activity,day
0,20:05:00,20:12:00,S,Jour de Pose
1,20:38:00,21:05:00,A,Jour de Pose
2,21:09:00,21:15:00,P,Jour de Pose
3,08:05:00,08:15:00,T,Jour 1
4,08:15:00,08:23:00,H,Jour 1
5,08:34:00,08:45:00,S,Jour 1
6,09:15:00,10:00:00,P,Jour 1
7,08:45:00,09:15:00,A,Jour 1
8,14:00:00,14:33:00,A,Jour 1
9,19:30:00,20:04:00,T,Jour 1


## Merging actvities dataset with accelerations dataset

In [197]:
df_activities = pd.read_csv('data/combined_activity_journal.csv', header=0)

In [198]:
df_resampled = pd.read_csv('data/resampled_accs.csv', header=0)

df_resampled['date'] = pd.to_datetime(df_resampled['time']).dt.date
df_resampled['timestamp'] = pd.to_datetime(df_resampled['time']).dt.time

df_resampled.head(4)

Unnamed: 0,time,x_acceleration_left,y_acceleration_left,z_acceleration_left,x_acceleration_right,y_acceleration_right,z_acceleration_right,date,timestamp
0,2024-07-19 20:00:00,-246.499418,-93.416053,-43.676619,-251.978286,13.265607,14.371462,2024-07-19,20:00:00
1,2024-07-19 20:01:00,-174.771166,-124.872492,-11.882353,-167.439646,9.154709,113.678341,2024-07-19,20:01:00
2,2024-07-19 20:02:00,-68.565498,-88.782579,128.174209,-201.974821,9.12147,96.012249,2024-07-19,20:02:00
3,2024-07-19 20:03:00,-62.486909,-92.762326,107.133628,-246.086365,5.434886,3.528052,2024-07-19,20:03:00


In [199]:
dates_mapping = {
    "Jour de Pose": "2024-07-19",
    "Jour 1": "2024-07-20",
    "Jour 2": "2024-07-21",
    "Jour 3": "2024-07-22",
    "Jour 4": "2024-07-23",
    "Jour 5": "2024-07-24",
    "Jour 6": "2024-07-25",
    "Jour 7": "2024-07-26",
    "Jour 8": "2024-07-27",
}

In [200]:
# mapping
df_activities['date'] = df_activities['day'].map(dates_mapping)
df_activities.head()

Unnamed: 0,start_time,end_time,activity,day,date
0,20:05:00,20:12:00,S,Jour de Pose,2024-07-19
1,20:38:00,21:05:00,A,Jour de Pose,2024-07-19
2,21:09:00,21:15:00,P,Jour de Pose,2024-07-19
3,08:05:00,08:15:00,T,Jour 1,2024-07-20
4,08:15:00,08:23:00,H,Jour 1,2024-07-20


In [204]:
df_activities['date'] = pd.to_datetime(df_activities['date']).dt.date
df_resampled['date'] = pd.to_datetime(df_resampled['date']).dt.date

df_activities['start_time'] = pd.to_datetime(df_activities['start_time'], format='%H:%M:%S').dt.time
df_activities['end_time'] = pd.to_datetime(df_activities['end_time'], format='%H:%M:%S').dt.time
df_resampled['time'] = pd.to_datetime(df_resampled['timestamp'], format='%H:%M:%S').dt.time


df_resampled["activity"] = None  

for _, row_activity in df_activities.iterrows():
    for index, row_accel in df_resampled.iterrows():
        if (row_activity['date'] == row_accel['date'] and
            row_accel['timestamp'] >= row_activity['start_time'] and
            row_accel['timestamp'] <= row_activity['end_time']):
            df_resampled.at[index, 'activity'] = row_activity['activity']


df_resampled.head(15)

Unnamed: 0,time,x_acceleration_left,y_acceleration_left,z_acceleration_left,x_acceleration_right,y_acceleration_right,z_acceleration_right,date,timestamp,activity
0,20:00:00,-246.499418,-93.416053,-43.676619,-251.978286,13.265607,14.371462,2024-07-19,20:00:00,
1,20:01:00,-174.771166,-124.872492,-11.882353,-167.439646,9.154709,113.678341,2024-07-19,20:01:00,
2,20:02:00,-68.565498,-88.782579,128.174209,-201.974821,9.12147,96.012249,2024-07-19,20:02:00,
3,20:03:00,-62.486909,-92.762326,107.133628,-246.086365,5.434886,3.528052,2024-07-19,20:03:00,
4,20:04:00,-91.68901,-109.599864,60.436203,-242.079619,8.563457,66.384485,2024-07-19,20:04:00,
5,20:05:00,-10.84966,-217.731973,51.010204,-155.610884,2.556463,189.363265,2024-07-19,20:05:00,S
6,20:06:00,15.063265,-239.002721,74.834694,-161.429932,3.606803,193.439456,2024-07-19,20:06:00,S
7,20:07:00,-80.055763,-129.50561,73.179191,-143.405644,-37.271676,-88.748045,2024-07-19,20:07:00,S
8,20:08:00,95.335829,-200.076216,65.210616,-161.882273,12.193263,63.43246,2024-07-19,20:08:00,S
9,20:09:00,66.668027,-176.448299,96.757143,-185.383673,61.601361,159.893878,2024-07-19,20:09:00,S


In [203]:
df_resampled.to_csv('data/annotated_accs.csv', index=False)