## Preprocess of the WISDM dataset

- Loading the initial dataset
- Changing the order of the columns
- Renaming the columns
- Removing labels or concatenating them
- Making the labels from strings to integers
- Divide the dataset into datasets for each user
- Cleaning the data
- Lowpass filter
- Calculating the acceleration magnitude as feature
- Splitting the data into windows and features
- Saving the datasets per users and concatenated

In [1]:
# Importing libraries needed
import os
import sys

import pandas as pd

sys.path.append(os.path.abspath(os.path.join('../src/utils/')))
import feature_extraction
import preprocessing

In [2]:
# Read in the raw data
df = pd.read_csv(r'../data/activity_recognition_wisdm/time_series_data_human_activities.csv')
df.head()

Unnamed: 0,user,activity,timestamp,x-axis,y-axis,z-axis
0,1,Walking,4991922345000,0.69,10.8,-2.03
1,1,Walking,4991972333000,6.85,7.44,-0.5
2,1,Walking,4992022351000,0.93,5.63,-0.5
3,1,Walking,4992072339000,-2.11,5.01,-0.69
4,1,Walking,4992122358000,-4.59,4.29,-1.95


In [3]:
# Checking the basic info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1073623 entries, 0 to 1073622
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   user       1073623 non-null  int64  
 1   activity   1073623 non-null  object 
 2   timestamp  1073623 non-null  int64  
 3   x-axis     1073623 non-null  float64
 4   y-axis     1073623 non-null  float64
 5   z-axis     1073623 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 49.1+ MB


In [4]:
# Change the column order
df = df[['user', 'timestamp','x-axis', 'y-axis', 'z-axis', 'activity']]
df.head()

Unnamed: 0,user,timestamp,x-axis,y-axis,z-axis,activity
0,1,4991922345000,0.69,10.8,-2.03,Walking
1,1,4991972333000,6.85,7.44,-0.5,Walking
2,1,4992022351000,0.93,5.63,-0.5,Walking
3,1,4992072339000,-2.11,5.01,-0.69,Walking
4,1,4992122358000,-4.59,4.29,-1.95,Walking


In [5]:
# Rename the columns to be universal
df.columns = ['User_ID','Timestamp', 'Acc.X.Phone', 'Acc.Y.Phone', 'Acc.Z.Phone', 'Label']

In [6]:
# Check the unique values of the label
df.Label.unique()

array(['Walking', 'Jogging', 'Upstairs', 'Downstairs', 'Sitting',
       'Standing'], dtype=object)

In [7]:
# Grouping the Upstairs and Downstairs as Stairs and Sitting and Standing as Still
df['Label'] = df['Label'].replace(['Upstairs', 'Downstairs'], 'Stairs')
df['Label'] = df['Label'].replace(['Sitting', 'Standing'], 'Still')

In [8]:
# Check the unique values of the label
df.Label.unique()

array(['Walking', 'Jogging', 'Stairs', 'Still'], dtype=object)

In [9]:
# Map the labels to numbers
map_dict = {'Walking': 1, 'Jogging': 2, 'Stairs': 3, 'Still': 4}
df['Label'] = df['Label'].map(map_dict)

In [10]:
# Checking the number of unique users
df.User_ID.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36], dtype=int64)

In [11]:
# Dividing the data by user_id
user_id_dataframes = []
for user_id in df.User_ID.unique():
    user_id_dataframes.append(df[df.User_ID == user_id])

In [12]:
# Checking if everything is correct
user_id_dataframes[0].head()

Unnamed: 0,User_ID,Timestamp,Acc.X.Phone,Acc.Y.Phone,Acc.Z.Phone,Label
0,1,4991922345000,0.69,10.8,-2.03,1
1,1,4991972333000,6.85,7.44,-0.5,1
2,1,4992022351000,0.93,5.63,-0.5,1
3,1,4992072339000,-2.11,5.01,-0.69,1
4,1,4992122358000,-4.59,4.29,-1.95,1


In [13]:
# Setting the parameters for the preprocessing
sampling_frequency = 20
cutoff_frequency = 5
order = 3

In [14]:
# Lowpass filter the sensor data and calculate the magnitude
col_signals = ['Acc.X.Phone', 'Acc.Y.Phone', 'Acc.Z.Phone']
for user_id_dataframe in user_id_dataframes:
    for col_signal in col_signals:
        user_id_dataframe[col_signal] = preprocessing.lowpass_filter(user_id_dataframe[col_signal], sampling_frequency, cutoff_frequency, order)
    user_id_dataframe = preprocessing.calculate_mag(user_id_dataframe, col_signals)
    labels = user_id_dataframe['Label']
    user_id_dataframe.drop('Label', axis=1, inplace=True)
    user_id_dataframe['Label'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_id_dataframe[col_signal] = preprocessing.lowpass_filter(user_id_dataframe[col_signal], sampling_frequency, cutoff_frequency, order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[mag_col_name] = mag
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_id_dataframe.drop('Label', axis=1, inplace=True)
A value is trying to be set on a copy of a sli

In [15]:
# Checking if everything is correct
user_id_dataframes[0].head()

Unnamed: 0,User_ID,Timestamp,Acc.X.Phone,Acc.Y.Phone,Acc.Z.Phone,Acc.Magnitude.Phone,Label
0,1,4991922345000,0.690247,10.799816,-2.029869,11.010578,1
1,1,4991972333000,4.964224,7.77195,-0.443757,9.232749,1
2,1,4992022351000,1.626367,5.242737,-1.027593,5.58456,1
3,1,4992072339000,-3.656171,4.476615,-1.484754,5.967593,1
4,1,4992122358000,-0.008137,6.326895,1.499345,6.502131,1


In [16]:
# Setting the parameters for the feature extraction
window_duration = 0.3
overlap = 0.3
win_length = int(window_duration * sampling_frequency)
overlap = int(overlap * sampling_frequency)
col_extract = ['Acc.X.Phone', 'Acc.Y.Phone', 'Acc.Z.Phone', 'Acc.Magnitude.Phone']

In [17]:
# Extracting the features and generating the labels for each user
new_user_id_dataframes = []
for user_id_dataframe in user_id_dataframes:
    features = feature_extraction.calculate_features(user_id_dataframe, col_extract, win_length, overlap)
    labels = feature_extraction.generate_labels(user_id_dataframe, 'Label', win_length, overlap)
    features['Label'] = labels
    new_user_id_dataframes.append(features)

  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, labels)
  labels = np.apply_along_axis(lambda x: stats.mode(x).mode, 1, 

In [18]:
# Checking if everything is correct
new_user_id_dataframes[0].head()

Unnamed: 0,Acc.X.Phone_mean,Acc.X.Phone_std,Acc.X.Phone_min_,Acc.X.Phone_max_,Acc.X.Phone_range_,Acc.X.Phone_iqr,Acc.X.Phone_kurtosis,Acc.X.Phone_skewness,Acc.X.Phone_rms,Acc.X.Phone_mean_freq,...,Acc.Magnitude.Phone_rms,Acc.Magnitude.Phone_mean_freq,Acc.Magnitude.Phone_std_freq,Acc.Magnitude.Phone_max_freq,Acc.Magnitude.Phone_max_freq_mag,Acc.Magnitude.Phone_freq_mean,Acc.Magnitude.Phone_freq_std,Acc.Magnitude.Phone_freq_skew,Acc.Magnitude.Phone_freq_kurtosis,Label
0,2.312424,4.364166,-3.656171,10.258013,13.914184,3.963301,-0.546337,0.582609,4.938953,2.073931,...,9.422605,1.14042,0.853193,2.051933,53.033936,23.151172,21.394073,0.62993,-1.5,1.0
1,3.272272,5.519073,-2.741494,13.414924,16.156418,6.805059,-0.649045,0.629252,6.416224,2.057145,...,13.032426,1.118191,0.899024,2.201311,76.805456,30.109651,33.019172,0.707058,-1.5,1.0
2,5.271831,2.053836,3.190634,9.309275,6.118642,2.14676,-0.244014,0.994309,5.657777,0.801428,...,10.912775,0.837113,0.658046,1.60776,63.870382,25.409226,27.343151,0.673137,-1.5,1.0
3,4.033476,3.524306,-0.501917,10.122001,10.623918,4.333129,-0.940585,0.51322,5.356273,1.242052,...,13.843702,1.683161,1.190462,2.556778,79.3891,34.005223,32.2288,0.6801,-1.5,1.0
4,3.314182,5.663636,-4.380355,11.253502,15.633857,8.561458,-1.366781,0.258396,6.562056,1.919457,...,11.560698,1.313983,1.339973,3.153506,63.569489,28.285285,25.848973,0.495858,-1.5,1.0


In [19]:
# Saving the data
processed_path = r"../processed_data/activity_recognition_wisdm_less_classes_frequency_features/"
all_users_data = pd.DataFrame()
if not os.path.exists(processed_path):
    os.makedirs(processed_path)
for i in range(len(new_user_id_dataframes)):
    new_user_id_dataframes[i].insert(0, 'User_ID', i+1)
    new_user_id_dataframes[i].to_csv(processed_path + 'user_' + str(i+1) + '.csv', index=False)
    all_users_data = pd.concat([all_users_data, new_user_id_dataframes[i]])
all_users_data.to_csv(processed_path + 'all_users.csv', index=False)