In [1]:
# data handling library
import pandas as pd
# linear algebra library
import numpy as np
# statistics
from scipy import stats
# signal processing library
from scipy import signal

# progress tracking
from tqdm import tqdm
# OS manipulation
import os

## Let's remind ourselves of the structure of the data

<img src = "https://i.imgur.com/a3NRdUH.png" style = "width:90%; margin:auto;" />

## Define the path to the data, the users whose data we want to use, the IMU locations we want to use and the types of sensors we want to use

In [2]:
path_to_segmented = "./train"

# getting all the subject names 
subject_names = ['s0', 's1', 's4', 's5', 's6', 's7', 's8', 's9']
# getting all the sensors
imu_locations = ['Wrist', 'Thigh']
sensors = ['acc', 'gyr']

sampling_frequency=50.0

## Filter the segmented training data of the selected subjects

In [3]:
filtering_savepath = "./filtered_train"

if not os.path.exists(filtering_savepath):
    os.mkdir(filtering_savepath)

### Define the different types of filters

In [4]:
# butter is a function that allows the definition of a butterworth filter
from scipy.signal import butter
# filtfilt is the function which will apply the filter to the signal twice
from scipy.signal import filtfilt

def butter_lowpass_filter(data, cutoff, fs, order):
    normal_cutoff = cutoff / (0.5*fs)
    # get the filter coefficients 
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    # apply the filter
    y = filtfilt(b, a, data)
    return y

In [5]:
def butter_highpass_filter(data, cutoff, fs, order):
    normal_cutoff = cutoff / (0.5*fs)
    # get the filter coefficients 
    b, a = butter(order, normal_cutoff, btype='highpass', analog=False)
    # apply the filter
    y = filtfilt(b, a, data)
    return y

In [6]:
def butter_bandpass_filter(data, cutoff, fs, order):
    normal_cutoff_low = cutoff[0] / (0.5*fs)
    normal_cutoff_high = cutoff[1] / (0.5*fs)
    # get the filter coefficients 
    b, a = butter(order, [normal_cutoff_low, normal_cutoff_high], btype='bandpass', analog=False)
    # apply the filter
    y = filtfilt(b, a, data)
    return y

### Apply the filtering functions

<img src = 'https://i.imgur.com/FyYD6zr.png' />

In [20]:
# iterate through subject names
for subject_name in subject_names:
    # for each subject name, iterate through imu_locations
    for imu_location in tqdm(imu_locations):
        # for each location, iterate through the different sensors
        for sensor in sensors:
            # for each sensor, iterate through the different axes
            for ax in ['x', 'y', 'z', 'mag']:
                # load the segmented UNFILTERED sensor data 
                sensor_ax = pd.read_csv(os.path.join(path_to_segmented, "{}_{}_{}_{}.csv".format(subject_name, imu_location, sensor, ax)), header = 0)
                # save that segmented UNFILTERED sensor data to the `filtered` folder since we are going to keep both filtered and unfiltered data there
                sensor_ax.to_csv(os.path.join(filtering_savepath, "{}_{}_{}_{}.csv".format(subject_name, imu_location, sensor, ax)), index=False)
                
                # apply a bandpass filter to each row in the segmented UNFILTERED sensor data
                # the lower cutoff frequency of this bandpass filter is 3Hz and the higher cutoff frequency is 15Hz
                sensor_band_ax = np.apply_along_axis(butter_bandpass_filter, 1, sensor_ax.iloc[:, 1:].values, cutoff = [3, 15], fs = sampling_frequency, order = 3)
                
                # create a dataframe for the magnitude data with columns with names: ['0', '1', '2', ..., '98', '99']
                sensor_band_ax = pd.DataFrame (sensor_band_ax, columns = [str(i) for i in range(100)])
                # add an index column at the end
                sensor_band_ax['index'] = np.arange(sensor_band_ax.shape[0])
                # rearange the columns so that the index column is the first one
                sensor_band_ax = sensor_band_ax[['index'] + [str(i) for i in range(100)]]
                # save the data to the folder 'filtered' folder
                sensor_band_ax.to_csv(os.path.join(filtering_savepath, "{}_{}_band_{}_{}.csv".format(subject_name, imu_location, sensor, ax)), index=False)

100%|██████████| 2/2 [00:25<00:00, 12.51s/it]
100%|██████████| 2/2 [00:23<00:00, 11.88s/it]
100%|██████████| 2/2 [00:23<00:00, 11.93s/it]
100%|██████████| 2/2 [00:25<00:00, 12.76s/it]
100%|██████████| 2/2 [00:23<00:00, 11.66s/it]
100%|██████████| 2/2 [00:18<00:00,  9.03s/it]
100%|██████████| 2/2 [00:20<00:00, 10.06s/it]
100%|██████████| 2/2 [00:20<00:00, 10.17s/it]


## Feature extraction

### Define the function which extracts statistical features

In [21]:
def calculate_statistical_features (data, sensor_name):
    """Extracts basic statistical features for each window in a dataframe/2D-array. It is assumed that every row of the dataframe/2D-array is a different window, thus, 
    every feature is calculated for each row separately.

    Args:
        data (pd.DataFrame or np.array): Segmented sensor data where each row contains data from a different feature.
        sensor_name (str): The name of the sensor for which we are calculating features. It is used to name the calculated features.
        
    Returns:
        [pd.DataFrame]: the extracted features for each row in the original data.
    """   

    # check if data is of the right type
    if not isinstance(data, (pd.DataFrame, np.ndarray)):
        # if it is not than return None
        print ("Data not in right format")
        return None
    # if data is a dataframe, convert it to a numpy 2D-array
    elif isinstance(data, pd.DataFrame):
        data = data.values

    # calculate each feature per row (by specifying axis=1)
    # also, reshape the values from a row-vector to a column-vector
    mean = np.mean(data, axis=1).reshape((-1, 1))
    std = np.std(data, axis=1).reshape((-1, 1))
    median = np.median(data, axis=1).reshape((-1, 1))
    q75, q25 = np.percentile(data, [75, 25], axis=1)
    iqr = q75 - q25
    iqr = iqr.reshape((-1, 1))
    kurtosis = stats.kurtosis(data, axis=1).reshape((-1, 1))
    skewness = stats.skew(data, axis=1).reshape((-1, 1))
    rms = np.sqrt(np.mean(data**2, axis=1)).reshape((-1, 1))

    # stack the calculated features as columns in a 2D-array
    # if you included more than just the mean and STD, don't forget to stack these features in the STATISTICAL_FEATURES 2D array
    statistical_features = np.hstack((mean, std, median, iqr, kurtosis, skewness, rms))
    
    # name the features by combining the feature name and the sensor name
    # if you included more than just the mean and STD, don't forget to give a unique name to these features
    feature_names = ['mean', 'std', 'median', 'iqr', 'kurtosis', 'skewness', 'rms']
    feature_names = [f"{sensor_name}_{feature}" for feature in feature_names]

    return pd.DataFrame(statistical_features, columns=feature_names)

### Define the function which extracts time-domain features

In [22]:
# define a function which calculates the mean crossing rate for a single signal/window/array
def __calc_mean_crossing_rate__ (signal):
    mean = np.mean(signal)
    return len(np.nonzero(np.diff(signal > mean))[0])/len(signal)

# define a function which calculates the peak-to-average power ratio for a single signal/window/array
def __calc_papr_db__ (signal):
    peak = max((abs(np.amax(signal)), abs(np.amin(signal))))
    rms = np.sqrt(np.mean(signal**2))

    # convert the result to dB
    return 10*np.log10((peak**2)/(rms**2))

def calculate_time_features (data, sensor_name):
    """Extracts several time-domain features for each window in a dataframe/2D-array. It is assumed that every row of the dataframe/2D-array is a different window, thus, 
    every feature is calculated for each row separately.

    Args:
        data (pd.DataFrame or np.array): Segmented sensor data where each row contains data from a different feature.
        sensor_name (str): The name of the sensor for which we are calculating features. It is used to name the calculated features.
        
    Returns:
        [pd.DataFrame]: the extracted features for each row in the original data.
    """   

    # check if data is of the right type
    if not isinstance(data, (pd.DataFrame, np.ndarray)):
        # if it is not than return None
        print ("Data not in right format")
        return None
    # if data is a dataframe, convert it to a numpy 2D-array
    elif isinstance(data, pd.DataFrame):
        data = data.values
    
    integral = np.trapz(data, axis = 1).reshape((-1, 1))
    
    # some functions don't offer the axis parameter, instead we define a function, such as __calc_mean_crossing_rate__ which works on a single row/array
    # and call it for each row in the `data` 2D-array using numpy.apply_along_axis
    mean_crossing_rate = np.apply_along_axis(__calc_mean_crossing_rate__, 1, data).reshape((-1, 1))
    
    # sometimes, if the function is fairly short, we don't even need to give it a name, we can use a lambda (nameless) function to achieve the same thing
    num_peaks = np.apply_along_axis(lambda x: len(signal.find_peaks(x)), 1, data).reshape((-1, 1))
    avg_peak_height = np.apply_along_axis(lambda x: np.average(x[signal.find_peaks(x)[0]]), 1, data).reshape((-1, 1))
    
    ssum = np.sum(data, axis = 1).reshape((-1, 1))
    squared_ssum = np.sum(data*2, axis=1).reshape((-1, 1))
    
    papr_db = np.apply_along_axis(__calc_papr_db__, 1, data).reshape(-1, 1)

    # stack the calculated features as columns in a 2D-array
    time_features = np.hstack((integral, mean_crossing_rate, num_peaks, avg_peak_height, ssum, squared_ssum, papr_db))
    
    # name the features by combining the feature name and the sensor name
    feature_names = ['integral', 'mean_crossing_rate', 'num_peaks', 'avg_peak_height', 'sum', 'squared_sum', 'papr_db']
    feature_names = [f"{sensor_name}_{feature}" for feature in feature_names]

    return pd.DataFrame(time_features, columns=feature_names)

### Define the path to the filtered data and the labels

In [23]:
# define the path to the segmented and filtered data
data_location = './filtered_train'
labels_location = './train'

# read the names of the files contained in the data dictionary
segmented_filenames = os.listdir(data_location)

# a dictionary to store the features
# this dictionary will be a triple-nested one and access to some features will be done in the following manner:
# features['s0']['Wrist']['band_acc_x']
statistical_features = {}
time_domain_features = {}

### Apply the feature extraction function on each filtered file

<img src = "https://i.imgur.com/2Rmt6JW.png" />

In [24]:
for subject_name in subject_names:
    # for each subject create a dictionary whose keys will be the IMU locations
    statistical_features[subject_name] = {}
    time_domain_features[subject_name] = {}
    
    
    
    for imu_location in tqdm(imu_locations):
        # for each subject and imu_location pair, create a dictionary whose keys will be the sensor names
        # and the values, the calculated features for that sensor location
        statistical_features[subject_name][imu_location] = {}
        time_domain_features[subject_name][imu_location] = {}
        
        
        
        # iterate through all the files we found in the segmented data location
        for filename in segmented_filenames:
            # check to see if this filename is for the appropriate user, imu_location and if it is not a label file (no need to extract features from labels)
            if (subject_name in filename) and (imu_location in filename) and ('activity_id' not in filename):
                # read the data from disk
                sensor_data = pd.read_csv(os.path.join(data_location, filename), header= 0).iloc[:, 1:]
                # get the sensor name from the filename
                sensor_name = filename.split(".")[0].split('_')[2:]
                sensor_name = "_".join(([x.lower() for x in sensor_name]))
                # calculate statistical features
                statistical_features[subject_name][imu_location][sensor_name] = calculate_statistical_features(sensor_data, sensor_name)
                # calculate time-domain features
                time_domain_features[subject_name][imu_location][sensor_name] = calculate_time_features(sensor_data, sensor_name)
                

                
    labels = pd.read_csv(os.path.join(labels_location, "{}_Wrist_activity_id.csv".format(subject_name)), header= 0).iloc[:, -1].values
    statistical_features[subject_name]['activity_ids'] = labels
    time_domain_features[subject_name]['activity_ids'] = labels
    
    

100%|██████████| 2/2 [00:09<00:00,  4.55s/it]
100%|██████████| 2/2 [00:09<00:00,  4.56s/it]
100%|██████████| 2/2 [00:09<00:00,  4.70s/it]
100%|██████████| 2/2 [00:08<00:00,  4.36s/it]
100%|██████████| 2/2 [00:09<00:00,  4.58s/it]
100%|██████████| 2/2 [00:06<00:00,  3.46s/it]
100%|██████████| 2/2 [00:08<00:00,  4.04s/it]
100%|██████████| 2/2 [00:08<00:00,  4.03s/it]


In [25]:
statistical_features['s0']['Thigh']['band_acc_mag']

Unnamed: 0,band_acc_mag_mean,band_acc_mag_std,band_acc_mag_median,band_acc_mag_iqr,band_acc_mag_kurtosis,band_acc_mag_skewness,band_acc_mag_rms
0,0.000373,0.007882,0.000346,0.010437,-0.089721,0.160913,0.007891
1,0.000572,0.011377,0.000226,0.010483,3.616179,-0.013026,0.011391
2,-0.000125,0.007892,0.000735,0.009772,-0.265100,-0.278154,0.007892
3,-0.000163,0.007592,0.000193,0.010298,-0.200395,-0.239846,0.007594
4,0.000197,0.006205,-0.000323,0.007887,0.271856,0.173096,0.006208
...,...,...,...,...,...,...,...
3973,0.001362,0.118917,0.001614,0.092894,5.493128,-0.586234,0.118925
3974,-0.000257,0.007904,0.000074,0.007999,0.828909,-0.015261,0.007908
3975,-0.001402,0.017073,0.000610,0.011398,9.550890,-2.309192,0.017131
3976,-0.001871,0.058252,-0.000692,0.048833,1.571554,0.048016,0.058282


In [26]:
time_domain_features['s0']['Thigh']['band_acc_mag']

Unnamed: 0,band_acc_mag_integral,band_acc_mag_mean_crossing_rate,band_acc_mag_num_peaks,band_acc_mag_avg_peak_height,band_acc_mag_sum,band_acc_mag_squared_sum,band_acc_mag_papr_db
0,0.037364,0.42,2.0,0.009287,0.037329,0.074658,9.163490
1,0.056857,0.35,2.0,0.009002,0.057237,0.114474,11.628840
2,-0.012573,0.36,2.0,0.008003,-0.012462,-0.024924,7.408222
3,-0.016502,0.32,2.0,0.007219,-0.016283,-0.032566,9.074976
4,0.019837,0.36,2.0,0.005896,0.019732,0.039463,9.643080
...,...,...,...,...,...,...,...
3973,0.134610,0.30,2.0,0.096271,0.136224,0.272448,12.944319
3974,-0.025360,0.43,2.0,0.007795,-0.025668,-0.051336,9.058891
3975,-0.139673,0.32,2.0,0.009860,-0.140241,-0.280481,14.095180
3976,-0.186142,0.32,2.0,0.053527,-0.187129,-0.374259,9.756717


### Define the location where the extracted features should be saved and create the directory if it doesn't exist

In [27]:
feature_savepath = "./features_train"

if not os.path.exists(feature_savepath):
    os.mkdir(feature_savepath)

### Save the statistical features

In [28]:
# iterate through the subject names and imu_locations
for subject_name in subject_names:
    for imu_location in imu_locations:
        #format the labels
        labels = pd.DataFrame(statistical_features[subject_name]['activity_ids'], columns=['activity_id'])
        labels['index'] = np.arange(labels.shape[0])

        labels = labels[['index', 'activity_id']]

        # save the labels of all windows from this user
        labels.to_csv(os.path.join(feature_savepath, "{}_activity_ids.csv".format(subject_name)), index=False)
        # create a dataframe where we will store the concatenated features from all sensor at this IMU location
        subject_location_features = pd.DataFrame()
        # concatenate the features from all sensors at this IMU location
        for sensor_name, sensor_features in statistical_features[subject_name][imu_location].items():
            subject_location_features = pd.concat([subject_location_features, sensor_features], axis = 1)

        # display a control string
        print ("The number of features in {}_{}_{} is: {}".format(subject_name, imu_location, "stat", subject_location_features.shape))
        print("The shape of the labels is:", statistical_features[subject_name]['activity_ids'].shape)
        
        # generate the path to the file where the features need to be saved
        savepath = os.path.join(feature_savepath, "{}_{}_{}.csv".format(subject_name, imu_location, "stat"))
        subject_location_features.to_csv(savepath, index=False)

The number of features in s0_Wrist_stat is: (3978, 112)
The shape of the labels is: (3978,)
The number of features in s0_Thigh_stat is: (3978, 112)
The shape of the labels is: (3978,)
The number of features in s1_Wrist_stat is: (3884, 112)
The shape of the labels is: (3884,)
The number of features in s1_Thigh_stat is: (3884, 112)
The shape of the labels is: (3884,)
The number of features in s4_Wrist_stat is: (4003, 112)
The shape of the labels is: (4003,)
The number of features in s4_Thigh_stat is: (4003, 112)
The shape of the labels is: (4003,)
The number of features in s5_Wrist_stat is: (3961, 112)
The shape of the labels is: (3961,)
The number of features in s5_Thigh_stat is: (3961, 112)
The shape of the labels is: (3961,)
The number of features in s6_Wrist_stat is: (3976, 112)
The shape of the labels is: (3976,)
The number of features in s6_Thigh_stat is: (3976, 112)
The shape of the labels is: (3976,)
The number of features in s7_Wrist_stat is: (3108, 112)
The shape of the labels 

### Save the time-domain features

In [29]:
# iterate through the subject names and imu_locations
for subject_name in subject_names:
    for imu_location in imu_locations:
        #format the labels
        labels = pd.DataFrame(time_domain_features[subject_name]['activity_ids'], columns=['activity_id'])
        labels['index'] = np.arange(labels.shape[0])

        labels = labels[['index', 'activity_id']]

        # save the labels of all windows from this user
        labels.to_csv(os.path.join(feature_savepath, "{}_activity_ids.csv".format(subject_name)), index=False)
        # create a dataframe where we will store the concatenated features from all sensor at this IMU location
        subject_location_features = pd.DataFrame()
        # concatenate the features from all sensors at this IMU location
        for sensor_name, sensor_features in time_domain_features[subject_name][imu_location].items():
            subject_location_features = pd.concat([subject_location_features, sensor_features], axis = 1)

        # display a control string
        print ("The number of features in {}_{}_{} is: {}".format(subject_name, imu_location, "time", subject_location_features.shape))
        print("The shape of the labels is:", time_domain_features[subject_name]['activity_ids'].shape)
        
        # generate the path to the file where the features need to be saved
        savepath = os.path.join(feature_savepath, "{}_{}_{}.csv".format(subject_name, imu_location, "time"))
        subject_location_features.to_csv(savepath, index=False)

The number of features in s0_Wrist_time is: (3978, 112)
The shape of the labels is: (3978,)
The number of features in s0_Thigh_time is: (3978, 112)
The shape of the labels is: (3978,)
The number of features in s1_Wrist_time is: (3884, 112)
The shape of the labels is: (3884,)
The number of features in s1_Thigh_time is: (3884, 112)
The shape of the labels is: (3884,)
The number of features in s4_Wrist_time is: (4003, 112)
The shape of the labels is: (4003,)
The number of features in s4_Thigh_time is: (4003, 112)
The shape of the labels is: (4003,)
The number of features in s5_Wrist_time is: (3961, 112)
The shape of the labels is: (3961,)
The number of features in s5_Thigh_time is: (3961, 112)
The shape of the labels is: (3961,)
The number of features in s6_Wrist_time is: (3976, 112)
The shape of the labels is: (3976,)
The number of features in s6_Thigh_time is: (3976, 112)
The shape of the labels is: (3976,)
The number of features in s7_Wrist_time is: (3108, 112)
The shape of the labels 

## Load the training features and setup a LOSO cross-validation scheme

In [30]:
# define the path to the directory where the features are saved
path_to_train_features = "./features_train"

# define the subject names for whom you've extracted features
train_subject_names = ['s0', 's1', 's4', 's5', 's6', 's7', 's8', 's9']

# define the imu_locations in the data
imu_locations = ['Wrist', 'Thigh']
# define the types of features you've extracted from the data of the subjects
types_of_features = ['stat', 'time']
# define the sensors which you want to use
sensor_names = ['acc', 'gyr']

# define a dictionary to store the data from each user
train_data = {}

In [31]:
ls ./features_train

s0_activity_ids.csv  s4_Wrist_time.csv    s7_Wrist_stat.csv
s0_Thigh_stat.csv    s5_activity_ids.csv  s7_Wrist_time.csv
s0_Thigh_time.csv    s5_Thigh_stat.csv    s8_activity_ids.csv
s0_Wrist_stat.csv    s5_Thigh_time.csv    s8_Thigh_stat.csv
s0_Wrist_time.csv    s5_Wrist_stat.csv    s8_Thigh_time.csv
s1_activity_ids.csv  s5_Wrist_time.csv    s8_Wrist_stat.csv
s1_Thigh_stat.csv    s6_activity_ids.csv  s8_Wrist_time.csv
s1_Thigh_time.csv    s6_Thigh_stat.csv    s9_activity_ids.csv
s1_Wrist_stat.csv    s6_Thigh_time.csv    s9_Thigh_stat.csv
s1_Wrist_time.csv    s6_Wrist_stat.csv    s9_Thigh_time.csv
s4_activity_ids.csv  s6_Wrist_time.csv    s9_Wrist_stat.csv
s4_Thigh_stat.csv    s7_activity_ids.csv  s9_Wrist_time.csv
s4_Thigh_time.csv    s7_Thigh_stat.csv
s4_Wrist_stat.csv    s7_Thigh_time.csv


### How are we going to combine the data?

<img src = 'https://i.imgur.com/omOODBH.png'>

In [32]:
# iterate through the different subjects
for subject_name in train_subject_names:
    # for each subject we will save one dataframe for the features and one for the labels
    train_data[subject_name] = {}
    train_data[subject_name]['features'] = pd.DataFrame()

    # both label files per user (wrist and thigh) are identical and it doesn't matter which one we choose
    label_filename = "{}_activity_ids.csv".format(subject_name)
    label_filepath = os.path.join(path_to_train_features, label_filename)
    
    # read the labels from disk
    train_data[subject_name]['labels'] = pd.read_csv(label_filepath, header = 0).iloc[:, 1].values

    # iterate through the imu locations
    for imu_location in imu_locations:
        # iterate through the types of features
        for feature_type in types_of_features:
            # construct the name of the file to read off disk based on the subject_name, imu_location and feature type
            feature_filename = "{}_{}_{}.csv".format(subject_name, imu_location, feature_type)
            # add it to the path of the dictionary containing the features
            feature_filepath = os.path.join(path_to_train_features, feature_filename)
            
            # read the features off disk
            feats = pd.read_csv(feature_filepath, header=0)
            
            # remove all features that don't come from a sensor we selected for use
            selected_sensor_features = []
            for feature_name in feats.columns:
                for sensor_name in sensor_names:
                    if sensor_name in feature_name:
                        selected_sensor_features.append(feature_name)

            selected_sensor_features = list(set(selected_sensor_features))
                
            feats = feats[selected_sensor_features]
            
            # add the imu_location to the name of the feature
            # we do this because features from different imu_location files have the same names and this will be a problem
            feats.columns = [f'{imu_location}_{x}' for x in feats.columns]
            feats = feats[sorted(feats.columns)]

            # horizontally concatenate all features together
            train_data[subject_name]['features'] = pd.concat([train_data[subject_name]['features'], feats], axis = 1)

    print ("Shape of the features and labels for '{}': {} - {}".format(subject_name, train_data[subject_name]['features'].shape, train_data[subject_name]['labels'].shape))

Shape of the features and labels for 's0': (3978, 448) - (3978,)
Shape of the features and labels for 's1': (3884, 448) - (3884,)
Shape of the features and labels for 's4': (4003, 448) - (4003,)
Shape of the features and labels for 's5': (3961, 448) - (3961,)
Shape of the features and labels for 's6': (3976, 448) - (3976,)
Shape of the features and labels for 's7': (3108, 448) - (3108,)
Shape of the features and labels for 's8': (3437, 448) - (3437,)
Shape of the features and labels for 's9': (3430, 448) - (3430,)


In [33]:
# concatenate the data from all other users in a training subset
# a dataframe to hold the features of all other subjects
train_features = pd.DataFrame()
# a numpy array for the labels
train_labels = np.array([])
# a list to hold the name of the subject to whome each window belongs
train_groups = []

for subject_name in train_subject_names:
    # concatenate the user features to the training subset
    train_features = pd.concat([train_features, train_data[subject_name]['features']], axis = 0)
    # concatenate the user labels to the training subset
    train_labels = np.concatenate((train_labels, train_data[subject_name]['labels'].flatten()))
    # concatenate the name of the user like a label for each window
    train_groups += [subject_name] * train_data[subject_name]['features'].shape[0]

### Tune a classifier using LOSO cross-validation

In [35]:
from sklearn.model_selection import GroupKFold

from xgboost import XGBClassifier

from xgboost import cv

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [None]:
num_subjects_train = len(np.unique(train_groups))

# initialize the object which is going to do the splits
group_kfold = GroupKFold(n_splits = num_subjects_train)

# a place to store the predictions and the true labels for each validation split
cv_predictions = []
cv_true = []

# each iteration of the for loop is a different iteration in the cross-validation
for train_index, val_index in group_kfold.split(train_features, train_labels, train_groups):
    # select only the data from the instances in the training set
    train_X, train_y = train_features.iloc[train_index, :], train_labels[train_index]
    val_X, val_y = train_features.iloc[val_index, :], train_labels[val_index]

    print ("Shape of train_X and train_y:", train_X.shape, train_y.shape) 

    xgb = XGBClassifier()

    # train the classifier
    xgb.fit(train_X, train_y)
    
    split_predictions = xgb.predict(val_X)

    # print the individual results as well:
    subject_name = np.unique(np.array(train_groups)[val_index])[0]
    
    print(subject_name)
    print (classification_report(val_y, split_predictions))

    # concatenate the predictions and the true labels to the global array
    cv_predictions += split_predictions.tolist()
    cv_true += val_y.tolist()

print ("Combined:")
print (classification_report(cv_true, cv_predictions))
print ("#################################################################")
print (confusion_matrix(cv_true, cv_predictions))
cf = confusion_matrix(cv_true, cv_predictions)

## Repeat THE SAME EXACT filtering and feature extraction procedures for the test set

### Apply the filtering

In [37]:
path_to_segmented = "./test"

imu_locations = ['Wrist', 'Thigh']
sensors = ['acc', 'gyr']

sampling_frequency=50.0

In [38]:
filtering_savepath = "./filtered_test"

if not os.path.exists(filtering_savepath):
    os.mkdir(filtering_savepath)

In [39]:
for imu_location in tqdm(imu_locations):
    # for each location, iterate through the different sensors
    for sensor in sensors:
        # for each sensor, iterate through the different axes
        for ax in ['x', 'y', 'z', 'mag']:
            # load the segmented UNFILTERED sensor data 
            sensor_ax = pd.read_csv(os.path.join(path_to_segmented, "{}_{}_{}.csv".format(imu_location, sensor, ax)), header = 0)
            # save that segmented UNFILTERED sensor data to the `filtered` folder since we are going to keep both filtered and unfiltered data there
            sensor_ax.to_csv(os.path.join(filtering_savepath, "{}_{}_{}.csv".format(imu_location, sensor, ax)), index=False)

            # apply a bandpass filter to each row in the segmented UNFILTERED sensor data
            # the lower cutoff frequency of this bandpass filter is 3Hz and the higher cutoff frequency is 15Hz
            sensor_band_ax = np.apply_along_axis(butter_bandpass_filter, 1, sensor_ax.iloc[:, 1:].values, cutoff = [3, 15], fs = sampling_frequency, order = 3)
            # create a dataframe for the magnitude data with columns with names: ['0', '1', '2', ..., '98', '99']
            sensor_band_ax = pd.DataFrame (sensor_band_ax, columns = [str(i) for i in range(100)])

            # add an index column at the end
            sensor_band_ax['index'] = np.arange(sensor_band_ax.shape[0])
            # rearange the columns so that the index colum is first
            sensor_band_ax = sensor_band_ax[['index'] + [str(i) for i in range(100)]]
            # save the data to the 'filtered' folder
            sensor_band_ax.to_csv(os.path.join(filtering_savepath, "{}_band_{}_{}.csv".format(imu_location, sensor, ax)), index=False)

100%|██████████| 2/2 [00:48<00:00, 24.03s/it]


In [40]:
ls ./filtered_test

Thigh_acc_mag.csv       Thigh_band_gyr_z.csv    Wrist_band_acc_y.csv
Thigh_acc_x.csv         Thigh_gyr_mag.csv       Wrist_band_acc_z.csv
Thigh_acc_y.csv         Thigh_gyr_x.csv         Wrist_band_gyr_mag.csv
Thigh_acc_z.csv         Thigh_gyr_y.csv         Wrist_band_gyr_x.csv
Thigh_band_acc_mag.csv  Thigh_gyr_z.csv         Wrist_band_gyr_y.csv
Thigh_band_acc_x.csv    Wrist_acc_mag.csv       Wrist_band_gyr_z.csv
Thigh_band_acc_y.csv    Wrist_acc_x.csv         Wrist_gyr_mag.csv
Thigh_band_acc_z.csv    Wrist_acc_y.csv         Wrist_gyr_x.csv
Thigh_band_gyr_mag.csv  Wrist_acc_z.csv         Wrist_gyr_y.csv
Thigh_band_gyr_x.csv    Wrist_band_acc_mag.csv  Wrist_gyr_z.csv
Thigh_band_gyr_y.csv    Wrist_band_acc_x.csv


### Extract the same features as the training set (the functions which extract the features should be already defined in the cells above)

In [41]:
# define the path to the segmented and filtered data
filtered_test = './filtered_test'

# read the names of the files contained in the data dictionary
segmented_filenames = os.listdir(filtered_test)

# a dictionary to store the features
# this dictionary will be a nested one and access to some features will be done in the following manner:
# features['Wrist']['band_acc_x']
statistical_features_test = {}
time_domain_features_test = {}

### If you extracted frequency-domain features on the training data, don't forget to do the same on the test data

In [42]:
for imu_location in tqdm(imu_locations):
    # for each subject and imu_location pair, create a dictionary whose keys will be the sensor names
    # and the values, the calculated features for that sensor location
    statistical_features_test[imu_location] = {}
    time_domain_features_test[imu_location] = {}
    
    # iterate through all the files we found in the segmented data location
    for filename in segmented_filenames:
        # check to see if this filename is for the appropriate user, imu_location and if it is not a label file (no need to extract features from labels)
        if imu_location in filename:
            # read the data from disk
            sensor_data = pd.read_csv(os.path.join(filtered_test, filename), header= 0).iloc[:, 1:]
            # get the sensor name from the filename
            sensor_name = filename.split(".")[0].split('_')[1:]
            sensor_name = "_".join(([x.lower() for x in sensor_name]))
            # calculate statistical features for this sensor data
            statistical_features_test[imu_location][sensor_name] = calculate_statistical_features(sensor_data, sensor_name)
            time_domain_features_test[imu_location][sensor_name] = calculate_time_features(sensor_data, sensor_name)

            
            

100%|██████████| 2/2 [00:18<00:00,  9.42s/it]


In [43]:
time_domain_features_test['Wrist']['band_acc_mag']

Unnamed: 0,band_acc_mag_integral,band_acc_mag_mean_crossing_rate,band_acc_mag_num_peaks,band_acc_mag_avg_peak_height,band_acc_mag_sum,band_acc_mag_squared_sum,band_acc_mag_papr_db
0,-0.043365,0.36,2.0,0.006521,-0.043325,-0.086651,8.283302
1,0.626768,0.24,2.0,0.036807,0.626981,1.253962,17.194024
2,0.015155,0.29,2.0,0.005993,0.015303,0.030607,7.911873
3,-0.010290,0.37,2.0,0.006199,-0.010402,-0.020804,8.581205
4,-0.015665,0.40,2.0,0.007593,-0.015580,-0.031160,7.523918
...,...,...,...,...,...,...,...
7745,0.020714,0.41,2.0,0.013118,0.021498,0.042995,8.534216
7746,1.246520,0.26,2.0,0.265623,1.249131,2.498262,9.019407
7747,-0.658865,0.26,2.0,0.134069,-0.659944,-1.319887,10.309080
7748,-0.019551,0.26,2.0,0.041741,-0.016845,-0.033690,13.268845


### Save the extracted features, but using a different directory

In [44]:
feature_savepath = "./features_test"

if not os.path.exists(feature_savepath):
    os.mkdir(feature_savepath)

### Save the statistical features from the test set

In [45]:
# iterate through the subject names and imu_locations
for imu_location in imu_locations:
    # create a dataframe where we will store the concatenated features from all sensor at this IMU location
    subject_location_features = pd.DataFrame()
    # concatenate the features from all sensors at this IMU location
    for sensor_name, sensor_features in statistical_features_test[imu_location].items():
        subject_location_features = pd.concat([subject_location_features, sensor_features], axis = 1)

    # display a control string
    print ("The number of features in {}_{} is: {}".format(imu_location, "stat", subject_location_features.shape))
    # generate the path to the file where the features need to be saved
    savepath = os.path.join(feature_savepath, "{}_{}.csv".format(imu_location, "stat"))
    subject_location_features.to_csv(savepath, index=False)

The number of features in Wrist_stat is: (7750, 112)
The number of features in Thigh_stat is: (7750, 112)


### Save the time-domain features from the test set

In [46]:
# iterate through the subject names and imu_locations
for imu_location in imu_locations:
    # create a dataframe where we will store the concatenated features from all sensor at this IMU location
    subject_location_features = pd.DataFrame()
    # concatenate the features from all sensors at this IMU location
    for sensor_name, sensor_features in time_domain_features_test[imu_location].items():
        subject_location_features = pd.concat([subject_location_features, sensor_features], axis = 1)

    # display a control string
    print ("The number of features in {}_{} is: {}".format(imu_location, "time", subject_location_features.shape))
    # generate the path to the file where the features need to be saved
    savepath = os.path.join(feature_savepath, "{}_{}.csv".format(imu_location, "time"))
    subject_location_features.to_csv(savepath, index=False)

The number of features in Wrist_time is: (7750, 112)
The number of features in Thigh_time is: (7750, 112)


## Train a classifier on the training data and predict the test set

In [47]:
# define the path to the directory where the features are saved
path_to_train_features = "./features_train"
path_to_test_features = "./features_test"

# define the subjects in the training set
train_subject_names = ['s0', 's1', 's4', 's5', 's6', 's7', 's8', 's9']

imu_locations = ['Wrist', 'Thigh']

# what sensors to include?
sensor_names = ['acc', 'gyr']

# define the types of features
types_of_features = ['stat', 'time']

### Concatenate all training data

In [48]:
# a place where we can store the concatenated training data
train_features = pd.DataFrame()
# a place to save the training labels
train_labels = np.array([])

# iterate through the different subjects
for subject_name in train_subject_names:
    subject_data = pd.DataFrame()
    
    # both label files per user (wrist and thigh) are identical and it doesn't matter which one we choose
    label_filename = "{}_activity_ids.csv".format(subject_name)
    label_filepath = os.path.join(path_to_train_features, label_filename)
    
    # read the labels from disk
    subject_labels = pd.read_csv(label_filepath, header = 0).iloc[:, 1].values
    
    for imu_location in imu_locations:
        for feature_type in types_of_features:
            feature_filename = os.path.join(path_to_train_features, f"{subject_name}_{imu_location}_{feature_type}.csv")
            # load the features from disk
            feats = pd.read_csv(feature_filename, header=0)
            
            # remove all features that don't come from a sensor we selected for use
            selected_sensor_features = []
            for feature_name in feats.columns:
                for sensor_name in sensor_names:
                    if sensor_name in feature_name:
                        selected_sensor_features.append(feature_name)

            selected_sensor_features = list(set(selected_sensor_features))
                
            feats = feats[selected_sensor_features]
            
            # add the imu_location to the name of the feature
            # we do this because features from different imu_location files have the same names and this will be a problem
            feats.columns = [f'{imu_location}_{x}' for x in feats.columns]
            feats = feats[sorted(feats.columns)]
            
            # horizontally concatenate all features together
            subject_data = pd.concat([subject_data, feats], axis = 1)
    # vertically concatenate the data from different subjects
    train_features = pd.concat([train_features, subject_data], axis=0)
    train_labels = np.concatenate((train_labels, subject_labels))

### Concatenate the test data

In [49]:
test_features = pd.DataFrame()

for imu_location in imu_locations:
    for feature_type in types_of_features:
        feature_filename = os.path.join(path_to_test_features, f"{imu_location}_{feature_type}.csv")
        
        subject_features = pd.read_csv(feature_filename, header=0)
        
        # add the imu_location to the name of the feature
        # we do this because features from different imu_location files have the same names and this will be a problem
        subject_features.columns = [f"{imu_location}_{x}" for x in subject_features.columns]
        subject_features = subject_features[sorted(subject_features.columns)]
        
        # horizontally concatenate the data f
        test_features = pd.concat([test_features, subject_features], axis = 1)

print (len(np.unique(test_features.columns)))
print (test_features.shape)

448
(7750, 448)


### Check the feature shapes

In [50]:
train_features.shape, test_features.shape

((29777, 448), (7750, 448))

### Train a classifier on the training data

In [None]:
from xgboost import XGBClassifier

In [None]:
# training the model with the training data
xgb = XGBClassifier()
xgb.fit(train_features, train_labels)

In [None]:
# doing the predictions
train_pred = xgb.predict(train_features)

In [None]:
# getting all the predictions to a csv file
submission = pd.DataFrame(columns = ['index', 'activity_id'])

submission['index'] = np.arange(train_pred.shape[0])
submission['activity_id'] = train_pred.astype(int)

submission.to_csv("pred_final.csv", index=False)

### Predict the labels of the test set

In [None]:
# to analize the data with the help of seaborn
import seaborn as sns

corr = train_features.corr()

ax = sns.heatmap(
    corr,
    annot = True,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)

ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# predicting the labels of the test set
test_predictions = xgb.predict(test_features[train_features.columns])

In [None]:
%%capture
pip install cr-features

In [None]:
# using the cr library for hmm_smooth function
from cr_features.pipeline_functions import hmm_smooth

In [None]:
# smoothening the predictions
smoothed_prediction = hmm_smooth(train_labels, cf, test_predictions, activities = list(np.unique(train_labels)))

In [None]:
# creating the submission file with the smoothed predictions
submission = pd.DataFrame(columns = ['index', 'activity_id'])

submission['index'] = np.arange(smoothed_prediction.shape[0])
submission['activity_id'] = smoothed_prediction.astype(int)

submission.to_csv("submission_final.csv", index=False)