Load data + filter walking activities

In [8]:
import numpy as np

# Training split: feature matrix, activity labels, and per-row subject IDs.
X_train, y_train, subject_train = (
    np.loadtxt(path)
    for path in (
        "../data/uci/train/X_train.txt",
        "../data/uci/train/y_train.txt",
        "../data/uci/train/subject_train.txt",
    )
)

# Test split, loaded the same way (kept for exploration only).
X_test, y_test, subject_test = (
    np.loadtxt(path)
    for path in (
        "../data/uci/test/X_test.txt",
        "../data/uci/test/y_test.txt",
        "../data/uci/test/subject_test.txt",
    )
)


In [9]:
# Report the shape of every loaded array; the "\n" prefix separates train from test.
for caption, array in (
    ("X_train shape        :", X_train),
    ("y_train shape        :", y_train),
    ("subject_train shape  :", subject_train),
    ("\nX_test shape         :", X_test),
    ("y_test shape         :", y_test),
    ("subject_test shape   :", subject_test),
):
    print(caption, array.shape)


X_train shape        : (7352, 561)
y_train shape        : (7352,)
subject_train shape  : (7352,)

X_test shape         : (2947, 561)
y_test shape         : (2947,)
subject_test shape   : (2947,)


In [10]:
# List the distinct activity labels and subject IDs found in each split.
for caption, values in (
    ("Unique activities in train:", y_train),
    ("Unique subjects in train  :", subject_train),
    ("\nUnique activities in test :", y_test),
    ("Unique subjects in test   :", subject_test),
):
    print(caption, np.unique(values))


Unique activities in train: [1. 2. 3. 4. 5. 6.]
Unique subjects in train  : [ 1.  3.  5.  6.  7.  8. 11. 14. 15. 16. 17. 19. 21. 22. 23. 25. 26. 27.
 28. 29. 30.]

Unique activities in test : [1. 2. 3. 4. 5. 6.]
Unique subjects in test   : [ 2.  4.  9. 10. 12. 13. 18. 20. 24.]


In [11]:
# Count how many training rows carry each activity label.
unique, counts = np.unique(y_train, return_counts=True)

print("Samples per activity (train):")
# Labels were loaded as floats, so cast them to int for display.
for activity_id, n_rows in zip(unique.astype(int), counts):
    print(f"Activity {activity_id} : {n_rows}")


Samples per activity (train):
Activity 1 : 1226
Activity 2 : 1073
Activity 3 : 986
Activity 4 : 1286
Activity 5 : 1374
Activity 6 : 1407


In [17]:
import numpy as np

# STEP 2 OUTPUT: re-read the training split so this cell is self-contained.
X_train, y_train, subject_train = (
    np.loadtxt(path)
    for path in (
        "../data/uci/train/X_train.txt",
        "../data/uci/train/y_train.txt",
        "../data/uci/train/subject_train.txt",
    )
)

# Keep only the rows whose activity label is one of the walking-related ones.
gait_mask = np.isin(y_train, [1, 2, 3])

# Features stay as-is; the prediction target becomes the subject ID, not the activity.
X_final, y_final = X_train[gait_mask], subject_train[gait_mask]

for caption, array in (("X_final shape:", X_final), ("y_final shape:", y_final)):
    print(caption, array.shape)


X_final shape: (3285, 561)
y_final shape: (3285,)


In [18]:
# The enrolled "employees" are the subject IDs that survived the walking-activity
# filter, i.e. the values in `y_final`. This cell previously read `y_gait_subject`,
# a name that is never assigned in this notebook, so it raised NameError on a
# fresh kernel (Restart & Run All).
enrolled_ids = np.unique(y_final)

print("Unique employee IDs used for training:")
print(enrolled_ids)

print("\nNumber of employees:", len(enrolled_ids))


Unique employee IDs used for training:
[ 1.  3.  5.  6.  7.  8. 11. 14. 15. 16. 17. 19. 21. 22. 23. 25. 26. 27.
 28. 29. 30.]

Number of employees: 21


In [19]:
# Check whether any subject in the test split also appears among the enrolled
# (training) subjects. `y_final` holds the per-row subject IDs of the filtered
# training data; the previously used `y_gait_subject` is never defined in this
# notebook and raised NameError on a fresh kernel.
print("Are test subjects present in training data?")
print(np.intersect1d(y_final, subject_test))


Are test subjects present in training data?
[]


In [21]:
# Subject IDs come out of np.loadtxt as floats; store them as integer class labels.
y_final = y_final.astype(np.int_)


In [22]:
import numpy as np
from collections import Counter

# Tally the number of rows available for each employee (subject ID).
subject_counts = Counter(y_final)

print("Samples per employee (before expansion):")
# IDs are unique keys, so sorting the (id, count) pairs orders by employee ID.
for employee_id, n_rows in sorted(subject_counts.items()):
    print(f"Employee {employee_id:2d} : {n_rows}")


Samples per employee (before expansion):
Employee  1 : 197
Employee  3 : 166
Employee  5 : 150
Employee  6 : 156
Employee  7 : 155
Employee  8 : 127
Employee 11 : 159
Employee 14 : 158
Employee 15 : 144
Employee 16 : 149
Employee 17 : 155
Employee 19 : 131
Employee 21 : 144
Employee 22 : 124
Employee 23 : 164
Employee 25 : 197
Employee 26 : 164
Employee 27 : 152
Employee 28 : 151
Employee 29 : 150
Employee 30 : 192


In [23]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [24]:
# Quick sanity check of the modelling inputs before cross-validation.
for caption, value in (
    ("X_final shape:", X_final.shape),
    ("y_final shape:", y_final.shape),
    ("Unique employees:", np.unique(y_final)),
):
    print(caption, value)


X_final shape: (3285, 561)
y_final shape: (3285,)
Unique employees: [ 1  3  5  6  7  8 11 14 15 16 17 19 21 22 23 25 26 27 28 29 30]


In [25]:
# Stratified 5-fold splitter; shuffling uses a fixed seed so folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [26]:
def _with_scaler(classifier):
    """Return a pipeline that standardizes the features, then applies `classifier`."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ])


# Candidate models to compare, keyed by the name shown in the CV report.
models = {
    "LogisticRegression": _with_scaler(LogisticRegression(max_iter=2000)),
    "KNN": _with_scaler(KNeighborsClassifier(n_neighbors=5)),
    "SVM_RBF": _with_scaler(SVC(kernel="rbf", probability=True)),
    # The random forest is evaluated on the raw features, without a scaler step.
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
}


In [27]:
# Cross-validate every candidate with the shared splitter and record the
# mean / standard deviation of the fold accuracies.
results = {}

for model_name, estimator in models.items():
    print(f"\nTraining {model_name}...")

    scores = cross_val_score(
        estimator, X_final, y_final, cv=cv, scoring="accuracy", n_jobs=-1
    )
    mean_acc, std_acc = scores.mean(), scores.std()

    results[model_name] = {"mean_accuracy": mean_acc, "std_accuracy": std_acc}

    print(f" CV Accuracy : {mean_acc:.4f} ± {std_acc:.4f}")



Training LogisticRegression...
 CV Accuracy : 0.9921 ± 0.0033

Training KNN...
 CV Accuracy : 0.9820 ± 0.0035

Training SVM_RBF...
 CV Accuracy : 0.9936 ± 0.0031

Training RandomForest...
 CV Accuracy : 0.9927 ± 0.0006


In [29]:
from sklearn.ensemble import RandomForestClassifier

# Final estimator: same settings as the "RandomForest" entry compared above.
final_model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)


In [30]:
# Train on every filtered row; no data is held back at this point.
final_model.fit(X=X_final, y=y_final)

print("Final model trained on full enrolled-employee dataset.")


Final model trained on full enrolled-employee dataset.


In [32]:
import os
import joblib

MODEL_PATH = "../models/gait_model.pkl"

# Make sure the target folder exists before writing: dumping to a path whose
# parent directory is missing fails, which breaks a run on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Persist the fitted estimator to disk.
joblib.dump(final_model, MODEL_PATH)

print(f"Model saved at: {MODEL_PATH}")


Model saved at: ../models/gait_model.pkl


In [33]:
# Round-trip check: read the artifact back from disk and confirm what it contains.
loaded_model = joblib.load(MODEL_PATH)
loaded_type = type(loaded_model)

print("Loaded model type:", loaded_type)


Loaded model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.base import clone

# Create a hold-out split from the enrolled employees, stratified by employee ID.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    stratify=y_final,
    random_state=42
)

# Evaluate on an unfitted copy with identical hyper-parameters. Calling
# `final_model.fit(X_tr, y_tr)` here would refit the already-saved, full-data
# model on only 80% of the rows, so the in-memory `final_model` would no longer
# match the artifact written to disk.
holdout_model = clone(final_model)
holdout_model.fit(X_tr, y_tr)

# Predict on the hold-out rows.
y_pred = holdout_model.predict(X_val)

holdout_acc = accuracy_score(y_val, y_pred)

# NOTE(review): this is a random row-level split; if neighbouring rows come from
# overlapping signal windows, the score may be optimistic — confirm against the
# dataset description.
print(f"Hold-out Accuracy: {holdout_acc:.4f}")


Hold-out Accuracy: 1.0000


In [35]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Same splitter configuration as before: stratified, shuffled, fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fold-wise accuracy of the final estimator on the full filtered dataset.
cv_scores = cross_val_score(
    final_model, X_final, y_final, cv=cv, scoring="accuracy", n_jobs=-1
)

print("Cross-Validation Accuracy:")
for caption, value in (("Mean :", cv_scores.mean()), ("Std  :", cv_scores.std())):
    print(caption, value)


Cross-Validation Accuracy:
Mean : 0.9926940639269406
Std  : 0.0006088280060883022


In [36]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Confusion matrix of the hold-out predictions.
cm = confusion_matrix(y_val, y_pred)

print("Confusion matrix shape:", cm.shape)
print("Diagonal (correct predictions):")
# The main diagonal holds the per-class counts where prediction equals truth.
print(cm.diagonal())


Confusion matrix shape: (21, 21)
Diagonal (correct predictions):
[39 33 30 31 31 25 32 32 29 30 31 26 29 25 33 39 33 30 30 30 39]
