In [188]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.tree import plot_tree

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [189]:
# Loading data.
def load_data(folder, label, sampling_interval_ms=100):
    files = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith('.csv')]
    data = []
    for file in files:
        df = pd.read_csv(file)
        df['label'] = label
        df['timestamp'] = pd.date_range(start='2024-01-01', periods=len(df), freq=f'{sampling_interval_ms}ms')
        df['timestamp'] = df['timestamp'].dt.strftime('%H:%M:%S.%f').str[:-4]
        data.append(df)
    combined = pd.concat(data, ignore_index=True)
    return combined

idle_data = load_data('data/idle', 'idle')
running_data = load_data('data/running', 'running')
elevation_data = load_data('data/stairs', 'elevation')
walking_data = load_data('data/walking', 'walking')

all_data = pd.concat([idle_data, running_data, elevation_data, walking_data], ignore_index=True)
all_data = all_data.rename(columns={'accelerometer_X': 'acc_X', 'accelerometer_Y': 'acc_Y', 'accelerometer_Z': 'acc_Z'})

print(f'DATA SAMPLE HEAD:\n{all_data.head()}\n')
print(f'DATA SAMPLE TAIL:\n{all_data.tail()}\n')
print(f'LABEL DATA:\n{all_data['label'].value_counts()}')

DATA SAMPLE HEAD:
      acc_X     acc_Y     acc_Z label    timestamp
0  1.000776  4.616021  8.576031  idle  00:00:00.00
1  0.718261  4.209007  8.446744  idle  00:00:00.10
2 -0.909797 -0.282516  9.203311  idle  00:00:00.20
3  5.099650  0.148441  8.418014  idle  00:00:00.30
4  1.762132 -0.162806  9.251195  idle  00:00:00.40

DATA SAMPLE TAIL:
           acc_X      acc_Y     acc_Z    label    timestamp
193855  5.109226 -15.452178 -1.470040  walking  00:00:02.50
193856  6.373365 -11.741165 -8.226476  walking  00:00:02.60
193857  3.289633  -9.993398 -0.383072  walking  00:00:02.70
193858 -2.978387  -3.050213  1.273715  walking  00:00:02.80
193859 -4.347870  -9.926360 -1.642422  walking  00:00:02.90

LABEL DATA:
label
running      102240
walking       55500
idle          31170
elevation      4950
Name: count, dtype: int64


In [190]:
def feature_define(df):
    feature = pd.DataFrame()
    columns = ['acc_X', 'acc_Y', 'acc_Z']
    
    for col in columns:
        feature[col + '_mean'] = df[col].rolling(window=30).mean().reset_index(drop=True)
        feature[col + '_median'] = df[col].rolling(window=30).median().reset_index(drop=True)
        feature[col + '_std'] = df[col].rolling(window=30).std().reset_index(drop=True)
        feature[col + '_max'] = df[col].rolling(window=30).max().reset_index(drop=True)
        feature[col + '_min'] = df[col].rolling(window=30).min().reset_index(drop=True)
    return feature.dropna()

features = feature_define(all_data)
features.reset_index(drop=True, inplace=True)

labels = all_data['label'][len(all_data) - len(features):].reset_index(drop=True)
assert len(features) == len(labels), "Mismatch between features and labels length"

print(f'FEATURES:\n{features.head()}')
print(f'LABELS:\n{labels.head()}')

FEATURES:
   acc_X_mean  acc_X_median  acc_X_std  acc_X_max  acc_X_min  acc_Y_mean  acc_Y_median  acc_Y_std  \
0    0.178448     -0.102950   1.036361   5.099650  -0.909797    0.167435     -0.131681   1.157603   
1    0.141737     -0.102950   1.025678   5.099650  -0.909797    0.009417     -0.131681   0.796708   
2    0.115081     -0.102950   1.020557   5.099650  -0.909797   -0.134554     -0.131681   0.075075   
3    0.141258     -0.102950   1.003288   5.099650  -0.320823   -0.130245     -0.131681   0.069816   
4   -0.039424     -0.110133   0.363864   1.762132  -0.320823   -0.141577     -0.134075   0.046827   

   acc_Y_max  acc_Y_min  acc_Z_mean  acc_Z_median  acc_Z_std  acc_Z_max  acc_Z_min  
0   4.616021  -0.282516    9.605697      9.770737   0.406903    9.80665   8.418014  
1   4.209007  -0.282516    9.645281      9.770737   0.358120    9.80665   8.418014  
2   0.148441  -0.282516    9.688536      9.770737   0.277703    9.80665   8.418014  
3   0.148441  -0.244209    9.707530      9.

In [191]:
assert len(features) == len(labels), "Mismatch between features and labels length"

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

Training set size: (135681, 15)
Test set size: (58150, 15)


In [192]:
# Model SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Model random forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Forecasting on the test sample
y_pred_svm = svm_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

# Calculation of the confusion matrix
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)

In [193]:
# Accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("SVM Accuracy:", accuracy_svm)
print("Random Forest Accuracy:", accuracy_rf)

# Raport
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

SVM Accuracy: 0.9895270851246776
Random Forest Accuracy: 0.9998452278589853
SVM Classification Report:
              precision    recall  f1-score   support

   elevation       0.87      0.70      0.78      1515
        idle       1.00      1.00      1.00      9299
     running       1.00      1.00      1.00     30622
     walking       0.97      0.99      0.98     16714

    accuracy                           0.99     58150
   macro avg       0.96      0.92      0.94     58150
weighted avg       0.99      0.99      0.99     58150

Random Forest Classification Report:
              precision    recall  f1-score   support

   elevation       1.00      1.00      1.00      1515
        idle       1.00      1.00      1.00      9299
     running       1.00      1.00      1.00     30622
     walking       1.00      1.00      1.00     16714

    accuracy                           1.00     58150
   macro avg       1.00      1.00      1.00     58150
weighted avg       1.00      1.00      1.00  