In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [10]:
def calculate_time_domain_features(data):
    features = []
    for axis in ["X", "Y", "Z"]:
        mean = np.mean(data[f"accelerometer_{axis}"])
        std_dev = np.std(data[f"accelerometer_{axis}"])
        max_val = np.max(data[f"accelerometer_{axis}"])
        min_val = np.min(data[f"accelerometer_{axis}"])
        median = np.median(data[f"accelerometer_{axis}"])
        range_val = max_val - min_val
        interquartile_range = np.percentile(data[f"accelerometer_{axis}"], 75) - np.percentile(data[f"accelerometer_{axis}"], 25)
        rms = np.sqrt(np.mean(data[f"accelerometer_{axis}"] ** 2))
        features.extend([mean, std_dev, max_val, min_val, median, range_val, interquartile_range, rms])
    return features


def preprocess_data(folder, label):
    features = []
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            data = pd.read_csv(os.path.join(folder, file))
            time_domain_features = calculate_time_domain_features(data)
            features.append(time_domain_features + [label])
    return features


def basic_features(data):
    return data[:, :24] 

def extended_features(data):
    return data[:, :]

In [11]:
running_data = preprocess_data("data/running", "running")
idle_data = preprocess_data("data/idle", "idle")
stairs_data = preprocess_data("data/stairs", "stairs")
walking_data = preprocess_data("data/walking", "walking")

all_data = running_data + idle_data + stairs_data + walking_data

df = pd.DataFrame(all_data)



In [16]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

sc = StandardScaler()
X = sc.fit_transform(X)


def first_6_features(data):
    return data[:, :6] 

def all_features(data):
    return data[:, :] 


classifiers = {
    'SVM': SVC(kernel='linear'),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}


feature_sets = {
    'First 6 Features': first_6_features,
    'All Features': all_features
}


for feature_name, feature_func in feature_sets.items():
    for classifier_name, classifier in classifiers.items():
        pipe = Pipeline([('scaler', StandardScaler()), ('classifier', classifier)])
        scores = cross_val_score(pipe, feature_func(X), y, cv=10)
        print(f"{classifier_name} with {feature_name}: {np.mean(scores)}")
        

SVM with First 6 Features: 0.9617733191055645
Random Forest with First 6 Features: 0.9925722912609279
SVM with All Features: 0.9899402338011589
Random Forest with All Features: 0.998761849163321


We pre-processed the data to obtain a single DataFrame with calculated Time domain features.
As we expected Random Forest classifier gives us better accuracy than SVM classifier.
Moreover we gained better accuracy using all features than only first 6 features - probably the wrong set of features was used.
In our case SVM classifier gives us much better results (~96%) than we expected (according to article ~65%) - perhaps it depends on the specifics of the input data.