## Preprocess of the Senior Citizens dataset

- Loading the initial dataset(s)
- Renaming the columns
- Removing labels or concatenating them
- Making the labels from strings to integers
- Cleaning the data
- Lowpass filter
- Downsampling the signals
- Calculating the acceleration magnitude as feature
- Splitting the data into windows and features
- Saving the datasets per users and concatenated

In [1]:
# Importing libraries needed
import os
import sys

import numpy as np
import pandas as pd

sys.path.append(os.path.abspath(os.path.join('../src/utils/')))
import downsampling
import feature_extraction
import preprocessing

In [2]:
# Read the raw data
user_id_dataframes = []
dataset_path = r'../data/activity_recognition_senior_citizens/'
for user_id in os.listdir(dataset_path):
    if user_id.endswith('.csv'):
        user_id_dataframes.append(pd.read_csv(dataset_path + user_id))

In [3]:
# Checking the basic info of the data
for user_id_dataframe in user_id_dataframes:
    print(user_id_dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103860 entries, 0 to 103859
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   timestamp  103860 non-null  object 
 1   back_x     103860 non-null  float64
 2   back_y     103860 non-null  float64
 3   back_z     103860 non-null  float64
 4   thigh_x    103860 non-null  float64
 5   thigh_y    103860 non-null  float64
 6   thigh_z    103860 non-null  float64
 7   label      103860 non-null  int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 6.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131367 entries, 0 to 131366
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   timestamp  131367 non-null  object 
 1   back_x     131367 non-null  float64
 2   back_y     131367 non-null  float64
 3   back_z     131367 non-null  float64
 4   thigh_x    131367 non-null  float64
 5   thigh_y    

In [4]:
# Rename the columns
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe.columns = ['Timestamp', 'Acc.X.Back', 'Acc.Y.Back', 'Acc.Z.Back', 'Acc.X.Thigh', 'Acc.Y.Thigh', 'Acc.Z.Thigh', 'Label']

In [5]:
# Checking the changes
user_id_dataframes[0].head()

Unnamed: 0,Timestamp,Acc.X.Back,Acc.Y.Back,Acc.Z.Back,Acc.X.Thigh,Acc.Y.Thigh,Acc.Z.Thigh,Label
0,2021-03-24 14:42:03.839,-0.999023,-0.063477,0.140625,-0.980469,-0.112061,-0.048096,6
1,2021-03-24 14:42:03.859,-0.980225,-0.079346,0.140625,-0.961182,-0.121582,-0.051758,6
2,2021-03-24 14:42:03.880,-0.950195,-0.076416,0.140625,-0.949463,-0.080566,-0.067139,6
3,2021-03-24 14:42:03.900,-0.954834,-0.059082,0.140381,-0.95752,-0.046143,-0.050781,6
4,2021-03-24 14:42:03.920,-0.972412,-0.042969,0.142822,-0.977051,-0.023682,-0.026611,6


In [6]:
# Checking the unique values of the labels
# 1. walking, 3.shuffling, 4. stairs (ascending), 5. stairs (descending), 6. standing, 7. sitting, 8. lying
labels = []
for user_id_dataframe in user_id_dataframes:
    labels.extend(user_id_dataframe['Label'].unique())
labels = list(set(labels))
print(labels)

[1, 3, 4, 5, 6, 7, 8]


In [7]:
# The labels 4,5(stairs (ascending), stairs (descending)) make them 4, 6,7,8(standing, sitting, lying) make them 5
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe['Label'] = user_id_dataframe['Label'].replace([5], 4)
    user_id_dataframe['Label'] = user_id_dataframe['Label'].replace([6, 7, 8], 5)

In [8]:
# Checking the unique values of the labels
labels = []
for user_id_dataframe in user_id_dataframes:
    labels.extend(user_id_dataframe['Label'].unique())
labels = list(set(labels))
print(labels)

[1, 3, 4, 5]


In [9]:
# Map the labels to numbers
map_dict = {1:1, 3:2, 4:3, 5:4}
for user_id_dataframe in user_id_dataframes:
    user_id_dataframe['Label'] = user_id_dataframe['Label'].map(map_dict)

In [10]:
# Checking the unique values of the labels
labels = []
for user_id_dataframe in user_id_dataframes:
    labels.extend(user_id_dataframe['Label'].unique())
labels = list(set(labels))
print(labels)

[1, 2, 3, 4]


In [11]:
# Checking the changes
user_id_dataframes[0].head()

Unnamed: 0,Timestamp,Acc.X.Back,Acc.Y.Back,Acc.Z.Back,Acc.X.Thigh,Acc.Y.Thigh,Acc.Z.Thigh,Label
0,2021-03-24 14:42:03.839,-0.999023,-0.063477,0.140625,-0.980469,-0.112061,-0.048096,4
1,2021-03-24 14:42:03.859,-0.980225,-0.079346,0.140625,-0.961182,-0.121582,-0.051758,4
2,2021-03-24 14:42:03.880,-0.950195,-0.076416,0.140625,-0.949463,-0.080566,-0.067139,4
3,2021-03-24 14:42:03.900,-0.954834,-0.059082,0.140381,-0.95752,-0.046143,-0.050781,4
4,2021-03-24 14:42:03.920,-0.972412,-0.042969,0.142822,-0.977051,-0.023682,-0.026611,4


In [12]:
# Setting the parameters for the preprocessing
old_sampling_frequency = 100
new_sampling_frequency = 20
cutoff_frequency = 5
order = 3

In [13]:
# Lowpass filter the sensor data, downsample the data and calculate the magnitude of the acceleration
col_signals_back = ['Acc.X.Back', 'Acc.Y.Back', 'Acc.Z.Back']
col_signals_thigh = ['Acc.X.Thigh', 'Acc.Y.Thigh', 'Acc.Z.Thigh']
for user_id_dataframe in user_id_dataframes:
    for col in col_signals_back + col_signals_thigh:
        user_id_dataframe[col] = preprocessing.lowpass_filter(user_id_dataframe[col], old_sampling_frequency, cutoff_frequency, order)
        user_id_dataframe[col] = downsampling.downsample_signal(user_id_dataframe[col], old_sampling_frequency, new_sampling_frequency, None)
    user_id_dataframe = preprocessing.calculate_mag(user_id_dataframe, col_signals_back)
    user_id_dataframe = preprocessing.calculate_mag(user_id_dataframe, col_signals_thigh)
    labels = user_id_dataframe['Label']
    user_id_dataframe.drop('Label', axis=1, inplace=True)
    user_id_dataframe['Label'] = labels

In [14]:
# Checking the changes
user_id_dataframes[0].head()

Unnamed: 0,Timestamp,Acc.X.Back,Acc.Y.Back,Acc.Z.Back,Acc.X.Thigh,Acc.Y.Thigh,Acc.Z.Thigh,Acc.Magnitude.Back,Acc.Magnitude.Thigh,Label
0,2021-03-24 14:42:03.839,-0.998972,-0.063252,0.140358,-0.980321,-0.113319,-0.047163,1.010765,0.987976,4
1,2021-03-24 14:42:03.859,-0.998972,-0.063252,0.140358,-0.980321,-0.113319,-0.047163,1.010765,0.987976,4
2,2021-03-24 14:42:03.880,-0.998972,-0.063252,0.140358,-0.980321,-0.113319,-0.047163,1.010765,0.987976,4
3,2021-03-24 14:42:03.900,-0.998972,-0.063252,0.140358,-0.980321,-0.113319,-0.047163,1.010765,0.987976,4
4,2021-03-24 14:42:03.920,-0.998972,-0.063252,0.140358,-0.980321,-0.113319,-0.047163,1.010765,0.987976,4


In [15]:
# Setting the parameters for the feature extraction
window_duration = 0.3
overlap = 0.3
win_length = int(window_duration * new_sampling_frequency)
overlap = int(overlap * new_sampling_frequency)
col_extract = col_signals_back + col_signals_thigh + ['Acc.Magnitude.Back', 'Acc.Magnitude.Thigh']

In [16]:
# Extracting the features and generating the labels for each user
new_user_id_dataframes = []
for user_id_dataframe in user_id_dataframes:
    features = feature_extraction.calculate_features(user_id_dataframe, col_extract, win_length, overlap)
    labels = feature_extraction.generate_labels(user_id_dataframe, 'Label', win_length, overlap)
    features['Label'] = labels
    new_user_id_dataframes.append(features)

  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))


  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
  skewness = stats.skew(data, axis=1).reshape((-1, 1))
  kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))

In [19]:
# Saving the data
processed_path = r"../processed_data/activity_recognition_senior_citizens_less_classes_frequency_features/"
all_users_data = pd.DataFrame()
if not os.path.exists(processed_path):
    os.makedirs(processed_path)
for i in range(len(new_user_id_dataframes)):
    new_user_id_dataframes[i].insert(0, 'User_ID', i+1)
    new_user_id_dataframes[i].to_csv(processed_path + 'user_' + str(i+1) + '.csv', index=False)
    all_users_data = pd.concat([all_users_data, new_user_id_dataframes[i]])
all_users_data.to_csv(processed_path + 'all_users.csv', index=False)