## Preprocessing of the mHealth dataset

- Loading the initial dataset(s)
- Dropping the columns that are not needed
- Renaming the columns
- Removing labels or concatenating them
- Changing the NaN Label values to 0 and converting the labels to int
- Cleaning the data
- Lowpass filter
- Downsampling signal
- Calculating the acceleration magnitude as feature
- Splitting the data into windows and features
- Removing unused labels (after extracting windows)
- Saving the datasets per user and concatenated

In [24]:
# Importing libraries needed
import os
import sys

import pandas as pd
import numpy as np

sys.path.append(os.path.abspath(os.path.join('../src/utils/')))
import feature_extraction
import preprocessing
import downsampling

In [25]:
# Read the raw data: one DataFrame per user CSV file found in dataset_path.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# file names are sorted. The position of a frame in this list is what the
# saving step later writes as User_ID, so the order must be reproducible.
# NOTE(review): plain lexicographic sorting places e.g. "...10.csv" before
# "...2.csv" - confirm this matches the intended user numbering.
dataset_path = r'../data/activity_recognition_mHealth/'
user_id_dataframes = [
    pd.read_csv(os.path.join(dataset_path, file_name))
    for file_name in sorted(os.listdir(dataset_path))
    if file_name.endswith('.csv')
]

In [26]:
# Checking the basic info of the data.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would only add a stray "None"
# line after every summary.
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161281 entries, 0 to 161280
Data columns (total 24 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   acceleration from the chest sensor (X axis)            161280 non-null  float64
 1   acceleration from the chest sensor (Y axis)            161280 non-null  float64
 2   acceleration from the chest sensor (Z axis)            161280 non-null  float64
 3   electrocardiogram signal (lead 1)                      161280 non-null  float64
 4   electrocardiogram signal (lead 2)                      161280 non-null  float64
 5   acceleration from the left-ankle sensor (X axis)       161280 non-null  float64
 6   acceleration from the left-ankle sensor (Y axis)       161280 non-null  float64
 7   acceleration from the left-ankle sensor (Z axis)       161280 non-null  float64
 8   gyro from the left-ankle sensor (X

In [27]:
# Collect the names of the columns that are not needed: every column whose
# name contains one of the keywords below (taken from the first user's frame).
unwanted_keywords = ('gyro', 'magne', 'electro')
drop_columns = [
    column_name
    for column_name in user_id_dataframes[0].columns
    if any(keyword in column_name for keyword in unwanted_keywords)
]
print(drop_columns)

['electrocardiogram signal (lead 1)', 'electrocardiogram signal (lead 2)', 'gyro from the left-ankle sensor (X axis)', 'gyro from the left-ankle sensor (Y axis)', 'gyro from the left-ankle sensor (Z axis)', 'magnetometer from the left-ankle sensor (X axis)', 'magnetometer from the left-ankle sensor (Y axis)', 'magnetometer from the left-ankle sensor (Z axis)', 'gyro from the right-lower-arm sensor (X axis)', 'gyro from the right-lower-arm sensor (Y axis)', 'gyro from the right-lower-arm sensor (Z axis)', 'magnetometer from the right-lower-arm sensor (X axis)', 'magnetometer from the right-lower-arm sensor (Y axis)', 'magnetometer from the right-lower-arm sensor (Z axis)']


In [28]:
# Remove the unneeded sensor columns from every user's frame (in place)
for frame in user_id_dataframes:
    frame.drop(columns=drop_columns, inplace=True)

In [29]:
# Check if the columns are dropped.
# DataFrame.info() prints the summary on its own and returns None, so it is
# called without print() to avoid a stray "None" line after every summary.
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161281 entries, 0 to 161280
Data columns (total 10 columns):
 #   Column                                                 Non-Null Count   Dtype  
---  ------                                                 --------------   -----  
 0   acceleration from the chest sensor (X axis)            161280 non-null  float64
 1   acceleration from the chest sensor (Y axis)            161280 non-null  float64
 2   acceleration from the chest sensor (Z axis)            161280 non-null  float64
 3   acceleration from the left-ankle sensor (X axis)       161280 non-null  float64
 4   acceleration from the left-ankle sensor (Y axis)       161280 non-null  float64
 5   acceleration from the left-ankle sensor (Z axis)       161280 non-null  float64
 6   acceleration from the right-lower-arm sensor (X axis)  161280 non-null  float64
 7   acceleration from the right-lower-arm sensor (Y axis)  161280 non-null  float64
 8   acceleration from the right-lower-

In [30]:
# Give every frame short, uniform column names.
# The names are assigned by position, so this relies on all frames having the
# same ten columns in the same order after the drop step.
short_column_names = [
    'Acc.X.Chest', 'Acc.Y.Chest', 'Acc.Z.Chest',
    'Acc.X.Left_Ankle', 'Acc.Y.Left_Ankle', 'Acc.Z.Left_Ankle',
    'Acc.X.Right_Arm', 'Acc.Y.Right_Arm', 'Acc.Z.Right_Arm',
    'Label',
]
for frame in user_id_dataframes:
    frame.columns = list(short_column_names)

In [31]:
# Preview the first five rows of the first user's frame to verify the renaming
user_id_dataframes[0].head(5)

Unnamed: 0,Acc.X.Chest,Acc.Y.Chest,Acc.Z.Chest,Acc.X.Left_Ankle,Acc.Y.Left_Ankle,Acc.Z.Left_Ankle,Acc.X.Right_Arm,Acc.Y.Right_Arm,Acc.Z.Right_Arm,Label
0,-9.8184,0.009971,0.29563,2.1849,-9.6967,0.63077,-8.6499,-4.5781,0.18776,0.0
1,-9.8489,0.52404,0.37348,2.3876,-9.508,0.68389,-8.6275,-4.3198,0.023595,0.0
2,-9.6602,0.18185,0.43742,2.4086,-9.5674,0.68113,-8.5055,-4.2772,0.27572,0.0
3,-9.6507,0.21422,0.24033,2.1814,-9.4301,0.55031,-8.6279,-4.3163,0.36752,0.0
4,-9.703,0.30389,0.31156,2.4173,-9.3889,0.71098,-8.7008,-4.1459,0.40729,0.0


In [32]:
# Gather the distinct label values that occur across all users.
# Label legend:
#    0 No activity            1 Standing still          2 Sitting and relaxing
#    3 Lying down             4 Walking                 5 Climbing stairs
#    6 Waist bends forward    7 Frontal elevation of arms
#    8 Knees bending (crouching)                        9 Cycling
#   10 Jogging               11 Running                12 Jump front & back
labels = list({
    label_value
    for frame in user_id_dataframes
    for label_value in frame['Label'].unique()
})
print(labels)

[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]


In [33]:
# Treat missing labels as class 0 ("no activity") and store labels as integers.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `frame['Label']` selection: that chained in-place
# form is deprecated in pandas 2.x and no longer updates the frame under
# Copy-on-Write, which would leave the NaNs in place and make astype(int) fail.
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe['Label'] = user_id_dataframe['Label'].fillna(0).astype(int)

In [34]:
# Verify the cast: collect the distinct label values over all users
distinct_labels = set()
for frame in user_id_dataframes:
    distinct_labels.update(frame['Label'].unique())
labels = list(distinct_labels)
print(labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [35]:
# Merge related activities into coarser classes:
#   1 (standing still), 2 (sitting) and 3 (lying down) -> class 1
#   10 (jogging) and 11 (running)                       -> class 2
# All other label values are left untouched.
# The replaced column is assigned back rather than using replace(inplace=True)
# on the `frame['Label']` selection, because that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
label_merge_map = {1: 1, 2: 1, 3: 1, 10: 2, 11: 2}
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe['Label'] = user_id_dataframe['Label'].replace(label_merge_map)

In [36]:
# Verify the merge: list the distinct label values that remain over all users
labels = list({
    label_value
    for frame in user_id_dataframes
    for label_value in frame['Label'].unique()
})
print(labels)

[0, 1, 2, 4, 5, 6, 7, 8, 9, 12]


In [37]:
# Preview the first five rows of the first user's frame after the label changes
user_id_dataframes[0].head(5)

Unnamed: 0,Acc.X.Chest,Acc.Y.Chest,Acc.Z.Chest,Acc.X.Left_Ankle,Acc.Y.Left_Ankle,Acc.Z.Left_Ankle,Acc.X.Right_Arm,Acc.Y.Right_Arm,Acc.Z.Right_Arm,Label
0,-9.8184,0.009971,0.29563,2.1849,-9.6967,0.63077,-8.6499,-4.5781,0.18776,0
1,-9.8489,0.52404,0.37348,2.3876,-9.508,0.68389,-8.6275,-4.3198,0.023595,0
2,-9.6602,0.18185,0.43742,2.4086,-9.5674,0.68113,-8.5055,-4.2772,0.27572,0
3,-9.6507,0.21422,0.24033,2.1814,-9.4301,0.55031,-8.6279,-4.3163,0.36752,0
4,-9.703,0.30389,0.31156,2.4173,-9.3889,0.71098,-8.7008,-4.1459,0.40729,0


In [38]:
# Discard, in place, every row that still contains at least one NaN value
for frame in user_id_dataframes:
    frame.dropna(axis=0, how='any', inplace=True)

In [39]:
# Parameters for the preprocessing (low-pass filtering and downsampling).
# Sampling frequency before / after downsampling (presumably in Hz - confirm)
old_sampling_frequency, new_sampling_frequency = 50, 20
# Cutoff frequency and order passed to the low-pass filter
cutoff_frequency, order = 5, 3

In [40]:
# Lowpass filter and downsample every accelerometer axis, then add one
# magnitude column per sensor and move 'Label' back to the last column.
col_signals_chest = ['Acc.X.Chest','Acc.Y.Chest','Acc.Z.Chest']
col_signals_left_ankle = ['Acc.X.Left_Ankle','Acc.Y.Left_Ankle','Acc.Z.Left_Ankle']
col_signals_right_arm = ['Acc.X.Right_Arm','Acc.Y.Right_Arm','Acc.Z.Right_Arm']
for frame_index, user_id_dataframe in enumerate(user_id_dataframes):
    for col in col_signals_chest + col_signals_left_ankle + col_signals_right_arm:
        user_id_dataframe[col] = preprocessing.lowpass_filter(user_id_dataframe[col], old_sampling_frequency, cutoff_frequency, order)
        # NOTE(review): the 'Label' column is not resampled here; confirm that
        # downsample_signal returns a signal aligned with the existing rows.
        user_id_dataframe[col] = downsampling.downsample_signal(user_id_dataframe[col], old_sampling_frequency, new_sampling_frequency, None)
    for sensor_columns in (col_signals_chest, col_signals_left_ankle, col_signals_right_arm):
        user_id_dataframe = preprocessing.calculate_mag(user_id_dataframe, sensor_columns)
    # Re-append 'Label' so it ends up after the newly added magnitude columns.
    user_id_dataframe['Label'] = user_id_dataframe.pop('Label')
    # Store the result back in the list. Rebinding the loop variable alone does
    # not update the list, so without this the magnitude columns would be lost
    # if calculate_mag returned a new DataFrame instead of mutating its input.
    user_id_dataframes[frame_index] = user_id_dataframe



In [41]:
# Preview the first five rows of the first user's frame after filtering,
# downsampling and adding the magnitude columns
user_id_dataframes[0].head(5)

Unnamed: 0,Acc.X.Chest,Acc.Y.Chest,Acc.Z.Chest,Acc.X.Left_Ankle,Acc.Y.Left_Ankle,Acc.Z.Left_Ankle,Acc.X.Right_Arm,Acc.Y.Right_Arm,Acc.Z.Right_Arm,Acc.Magnitude.Chest,Acc.Magnitude.Left_Ankle,Acc.Magnitude.Right_Arm,Label
0,-9.818275,0.0077,0.296487,2.185078,-9.695816,0.631322,-8.649182,-4.577483,0.187383,9.822753,9.959015,9.787585,0
1,-9.818275,0.0077,0.296487,2.185078,-9.695816,0.631322,-8.649182,-4.577483,0.187383,9.822753,9.959015,9.787585,0
2,-9.720378,0.198811,0.345354,2.286127,-9.527352,0.647779,-8.642478,-4.323515,0.29994,9.728543,9.819187,9.668256,0
3,-9.720378,0.198811,0.345354,2.286127,-9.527352,0.647779,-8.642478,-4.323515,0.29994,9.728543,9.819187,9.668256,0
4,-9.668968,0.248847,0.38794,2.304425,-9.460915,0.636319,-8.741284,-4.145053,0.377871,9.679946,9.758289,9.681647,0


In [42]:
# Setting the parameters for the feature extraction.
# Both durations are multiplied by the sampling frequency below, i.e. they are
# converted from time into a number of samples.
window_duration = 0.3
overlap_duration = 0.3
# round() before int(): a float product can land just below the intended whole
# number (e.g. 0.29 * 100 == 28.999999999999996) and a bare int() would then
# silently truncate it to one sample less.
win_length = int(round(window_duration * new_sampling_frequency))
overlap = int(round(overlap_duration * new_sampling_frequency))
# NOTE(review): with these values `overlap` equals `win_length`; confirm how
# feature_extraction interprets the overlap argument (overlap vs. step size).
col_extract = col_signals_chest + col_signals_left_ankle + col_signals_right_arm + ['Acc.Magnitude.Chest','Acc.Magnitude.Left_Ankle','Acc.Magnitude.Right_Arm']

In [43]:
# Build one windowed feature table per user: the features computed from the
# signal columns in col_extract, plus a 'Label' column holding the labels
# generated for the same window length and overlap.
new_user_id_dataframes = []
for source_frame in user_id_dataframes:
    window_features = feature_extraction.calculate_features(
        source_frame, col_extract, win_length, overlap
    )
    window_features['Label'] = feature_extraction.generate_labels(
        source_frame, 'Label', win_length, overlap
    )
    new_user_id_dataframes.append(window_features)

  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)


In [44]:
# Keep only the classes of interest and make their ids contiguous:
#   4 (walking) -> 3 and 5 (climbing stairs) -> 4,
#   labels 0, 6, 7, 8, 9 and 12 become NaN so their windows are dropped below.
# The remapped column is assigned back instead of calling replace(inplace=True)
# on the `frame['Label']` selection: that chained in-place form is deprecated in
# pandas 2.x and does not modify the frame under Copy-on-Write, in which case
# the unwanted labels would silently survive the dropna().
new_dict_map = {0:np.nan, 6:np.nan, 7:np.nan, 8:np.nan, 9:np.nan, 12:np.nan}
for user_id_dataframe in new_user_id_dataframes:
    user_id_dataframe['Label'] = (
        user_id_dataframe['Label']
        .replace({4: 3, 5: 4})
        .replace(new_dict_map)
    )
    # Drops every row containing a NaN in any column, which includes the
    # windows whose label was just mapped to NaN.
    user_id_dataframe.dropna(inplace=True)

In [45]:
# Verify the final label set: distinct labels over all per-user feature tables
remaining_labels = set()
for feature_frame in new_user_id_dataframes:
    remaining_labels.update(feature_frame['Label'].unique())
labels = list(remaining_labels)
print(labels)

[1.0, 2.0, 3.0, 4.0]


In [46]:
# Saving the data: one CSV per user plus one CSV with all users concatenated.
processed_path = r"../processed_data/activity_recognition_mHealth_less_classes_frequency_features/"
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(processed_path, exist_ok=True)
for user_number, user_features in enumerate(new_user_id_dataframes, start=1):
    # The user id is the 1-based position of the frame in the list.
    # insert() raises ValueError when the column already exists, so the guard
    # keeps this cell re-runnable without restarting the kernel.
    if 'User_ID' not in user_features.columns:
        user_features.insert(0, 'User_ID', user_number)
    user_features.to_csv(os.path.join(processed_path, 'user_' + str(user_number) + '.csv'), index=False)
# Concatenate once after the loop instead of growing a DataFrame per iteration
# (which copies all previously accumulated rows each time).
if new_user_id_dataframes:
    all_users_data = pd.concat(new_user_id_dataframes)
else:
    # pd.concat() rejects an empty list; fall back to an empty frame.
    all_users_data = pd.DataFrame()
all_users_data.to_csv(os.path.join(processed_path, 'all_users.csv'), index=False)