## Preprocessing of the FLAAP activity recognition dataset

- Loading the initial dataset(s)
- Dropping the columns that are not needed
- Renaming the columns
- Removing labels or concatenating (merging) them
- Changing the NaN Label values to 0 and converting the labels to int
- Cleaning the data
- Lowpass filter
- Downsampling signal
- Calculating the acceleration magnitude as feature
- Splitting the data into windows and features
- Removing unused labels (after extracting windows)
- Saving the datasets per user and concatenated

In [1]:
# Importing libraries needed
import os
import sys
from datetime import datetime

import pandas as pd
import numpy as np

sys.path.append(os.path.abspath(os.path.join('../src/utils/')))
import feature_extraction
import preprocessing
import downsampling

In [2]:
# Read the raw data: one CSV file per user, collected into a list of DataFrames.
# os.listdir() returns entries in arbitrary order, so the file names are sorted
# to keep each user's position in the list (which the saving step turns into
# the user ID) identical from run to run.
dataset_path = r'../data/activity_recognition_FLAAP/'
user_id_dataframes = []
for file_name in sorted(os.listdir(dataset_path)):
    # Anything that is not a CSV file in the dataset folder is ignored
    if file_name.endswith('.csv'):
        user_id_dataframes.append(pd.read_csv(os.path.join(dataset_path, file_name)))

In [3]:
# Checking the basic info of the data (columns, non-null counts, dtypes) per user.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line after each summary.
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380154 entries, 0 to 380153
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   TimestampAcc  380154 non-null  float64
 1   Acc_X         380154 non-null  float64
 2   Acc_Y         380154 non-null  float64
 3   Acc_Z         380154 non-null  float64
 4   Activity      380154 non-null  object 
dtypes: float64(4), object(1)
memory usage: 14.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425214 entries, 0 to 425213
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   TimestampAcc  425214 non-null  float64
 1   Acc_X         425214 non-null  float64
 2   Acc_Y         425214 non-null  float64
 3   Acc_Z         425214 non-null  float64
 4   Activity      425214 non-null  object 
dtypes: float64(4), object(1)
memory usage: 16.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex

In [4]:
# Give every user frame the column names used by the rest of the notebook.
# The names are assigned by position, so each frame must have exactly five
# columns in this order: timestamp, X, Y, Z, activity label.
renamed_columns = ['Timestamp', 'Acc.X.Center', 'Acc.Y.Center', 'Acc.Z.Center', 'Label']
for frame in user_id_dataframes:
    frame.columns = renamed_columns

In [5]:
# Preview the first five rows of the first user to confirm the new column names
user_id_dataframes[0].head(5)

Unnamed: 0,Timestamp,Acc.X.Center,Acc.Y.Center,Acc.Z.Center,Label
0,1650000000000.0,-0.80924,4.122815,8.734048,Walking
1,1650000000000.0,-0.88346,4.110844,8.714894,Walking
2,1650000000000.0,-0.905008,4.101267,8.698134,Walking
3,1650000000000.0,-0.890643,4.122815,8.693346,Walking
4,1650000000000.0,-0.837971,4.118027,8.734048,Walking


In [6]:
# Collect the distinct raw activity names that occur across all users
labels = list({label for frame in user_id_dataframes for label in frame['Label'].unique()})
print(labels)

['CrossLeg', 'CirWalk', 'Sitting', 'SitUp', 'StairDown', 'Laying', 'Walking', 'Jogging', 'StairUp', 'Standing']


In [7]:
# Merge fine-grained activities into coarser classes:
#   Laying / Sitting / Standing / CrossLeg -> Still
#   StairDown / StairUp                    -> Stairs
#   Walking / CirWalk                      -> Walking
merged_label_groups = [
    (['Laying', 'Sitting', 'Standing', 'CrossLeg'], 'Still'),
    (['StairDown', 'StairUp'], 'Stairs'),
    (['Walking', 'CirWalk'], 'Walking'),
]
for frame in user_id_dataframes:
    # Groups are applied one after another, in the order listed above
    for raw_names, merged_name in merged_label_groups:
        frame['Label'] = frame['Label'].replace(raw_names, merged_name)

In [8]:
# Collect the distinct activity names left after merging, across all users
labels = list({label for frame in user_id_dataframes for label in frame['Label'].unique()})
print(labels)

['Stairs', 'SitUp', 'Still', 'Walking', 'Jogging']


In [9]:
# Encode the merged activity names as integer class ids:
# Still -> 0, Walking -> 1, Stairs -> 2, Jogging -> 3, SitUp -> 4
map_dict = {name: class_id for class_id, name in enumerate(['Still', 'Walking', 'Stairs', 'Jogging', 'SitUp'])}
for frame in user_id_dataframes:
    frame['Label'] = frame['Label'].map(map_dict)

In [10]:
# Collect the distinct encoded label values across all users
labels = list({label for frame in user_id_dataframes for label in frame['Label'].unique()})
print(labels)

[0, 1, 2, 3, 4]


In [11]:
# Preview the first five rows of the first user to confirm the encoded labels
user_id_dataframes[0].head(5)

Unnamed: 0,Timestamp,Acc.X.Center,Acc.Y.Center,Acc.Z.Center,Label
0,1650000000000.0,-0.80924,4.122815,8.734048,1
1,1650000000000.0,-0.88346,4.110844,8.714894,1
2,1650000000000.0,-0.905008,4.101267,8.698134,1
3,1650000000000.0,-0.890643,4.122815,8.693346,1
4,1650000000000.0,-0.837971,4.118027,8.734048,1


In [12]:
# Remove, in place, every row that has a NaN in at least one column
for frame in user_id_dataframes:
    frame.dropna(axis=0, how='any', inplace=True)

In [13]:
# Parameters shared by the low-pass filtering and downsampling steps
# (frequencies are presumably in Hz — TODO confirm against the helper modules)
old_sampling_frequency, new_sampling_frequency = 100, 20  # rate before / after downsampling
cutoff_frequency, order = 5, 3  # low-pass cutoff frequency and filter order

In [14]:
# Lowpass filter and downsample each accelerometer axis, then add the magnitude column.
# NOTE(review): the signal columns are overwritten in place, so re-running this
# cell filters already-filtered data; re-run from the loading cell instead.
col_signals = ['Acc.X.Center','Acc.Y.Center','Acc.Z.Center']
for frame_idx, user_id_dataframe in enumerate(user_id_dataframes):
    for col in col_signals:
        user_id_dataframe[col] = preprocessing.lowpass_filter(user_id_dataframe[col], old_sampling_frequency, cutoff_frequency, order)
        user_id_dataframe[col] = downsampling.downsample_signal(user_id_dataframe[col], old_sampling_frequency, new_sampling_frequency, None)
    user_id_dataframe = preprocessing.calculate_mag(user_id_dataframe, col_signals)
    # Move 'Label' back to the last position, after the columns added above
    labels = user_id_dataframe['Label']
    user_id_dataframe.drop('Label', axis=1, inplace=True)
    user_id_dataframe['Label'] = labels
    # Store the result explicitly: rebinding the loop variable alone would not
    # update the list if calculate_mag returns a new frame instead of mutating its input.
    user_id_dataframes[frame_idx] = user_id_dataframe

In [15]:
# Preview the first five rows of the first user after filtering and adding the magnitude
user_id_dataframes[0].head(5)

Unnamed: 0,Timestamp,Acc.X.Center,Acc.Y.Center,Acc.Z.Center,Acc.Magnitude.Center,Label
0,1650000000000.0,-0.797611,4.115096,8.748082,9.700471,1
1,1650000000000.0,-0.797611,4.115096,8.748082,9.700471,1
2,1650000000000.0,-0.797611,4.115096,8.748082,9.700471,1
3,1650000000000.0,-0.797611,4.115096,8.748082,9.700471,1
4,1650000000000.0,-0.797611,4.115096,8.748082,9.700471,1


In [18]:
# Setting the parameters for the feature extraction
window_duration = 0.3   # window size before conversion to samples (presumably seconds — TODO confirm)
overlap_duration = 0.3  # overlap before conversion to samples, same unit as window_duration
# round() before int(): a float product such as 0.29 * 100 evaluates to
# 28.999999999999996, which a bare int() would truncate to 28 instead of 29.
win_length = int(round(window_duration * new_sampling_frequency))
overlap = int(round(overlap_duration * new_sampling_frequency))
# NOTE(review): with these values overlap equals win_length (both 6 samples);
# confirm how feature_extraction interprets an overlap as large as the window.
col_extract = ['Acc.X.Center', 'Acc.Y.Center', 'Acc.Z.Center', 'Acc.Magnitude.Center']

In [19]:
# Build one windowed feature table per user and attach the per-window label to it
new_user_id_dataframes = []
for frame in user_id_dataframes:
    window_features = feature_extraction.calculate_features(frame, col_extract, win_length, overlap)
    window_features['Label'] = feature_extraction.generate_labels(frame, 'Label', win_length, overlap)
    new_user_id_dataframes.append(window_features)

  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)


In [20]:
# Removing the windows labelled 4 (SitUp): the label is turned into NaN and the
# NaN rows are dropped. dropna() also removes rows with a NaN in any other column.
new_dict_map = {4:np.nan}
for user_id_dataframe in new_user_id_dataframes:
    # Assign the result back to the column: calling replace(..., inplace=True) on
    # user_id_dataframe['Label'] acts on a temporary object under pandas
    # Copy-on-Write and would leave the frame unchanged.
    user_id_dataframe['Label'] = user_id_dataframe['Label'].replace(new_dict_map)
    user_id_dataframe.dropna(inplace=True)

In [21]:
# Collect the distinct label values remaining in the windowed feature tables
labels = list({label for frame in new_user_id_dataframes for label in frame['Label'].unique()})
print(labels)

[0.0, 1.0, 2.0, 3.0]


In [22]:
# Saving the data: one CSV per user plus one CSV with all users concatenated.
processed_path = r"../processed_data/activity_recognition_FLAAP_less_classes_frequency_features/"
# exist_ok=True replaces the separate existence check and makes re-runs safe
os.makedirs(processed_path, exist_ok=True)
for user_number, user_features in enumerate(new_user_id_dataframes, start=1):
    # Guard the insert: without it, re-running this cell raises because the
    # 'User_ID' column was already added by the previous run.
    if 'User_ID' not in user_features.columns:
        user_features.insert(0, 'User_ID', user_number)
    user_features.to_csv(os.path.join(processed_path, 'user_' + str(user_number) + '.csv'), index=False)
# Concatenate once instead of growing a frame inside the loop; an empty list
# still yields an empty frame, as before.
all_users_data = pd.concat(new_user_id_dataframes) if new_user_id_dataframes else pd.DataFrame()
all_users_data.to_csv(os.path.join(processed_path, 'all_users.csv'), index=False)