Import libraries

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Train data to csv

In [2]:
X_train = pd.read_csv("X_train.txt", sep='\s+', header=None)
y_train = pd.read_csv("y_train.txt", sep='\s+', header=None)

train_data = pd.concat([X_train, y_train], axis=1)

column_names = [f"Feature_{i+1}" for i in range(X_train.shape[1])]
column_names.append("Activity")
train_data.columns = column_names

train_data.to_csv("train_data.csv", index=False)

Text data to csv

In [3]:
X_test = pd.read_csv("X_test.txt", sep='\s+', header=None)
y_test = pd.read_csv("y_test.txt", sep='\s+', header=None)

test_data = pd.concat([X_test, y_test], axis=1)
test_data.columns = column_names
test_data.to_csv("test_data.csv", index=False)

In [4]:
df_train = pd.read_csv("train_data.csv")

In [5]:
df_test = pd.read_csv("test_data.csv")

Splitting train data into features and target


In [6]:
X = df_train.drop(columns=["Activity"])
y = df_train["Activity"]

In [8]:
XT = df_test.drop(columns=["Activity"])
yT = df_test["Activity"]

In [9]:
X.shape, y.shape

((7352, 561), (7352,))

In [10]:
XT.shape, yT.shape

((2947, 561), (2947,))

Loading ML models

In [11]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=500),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME")
}

The code performs K-Fold Cross Validation (with 5 splits) on the specified models to evaluate their performance. It calculates and prints the mean accuracy, precision, recall and F1 for each model using the training data

In [14]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

print("\nModel Performance (K-Fold Cross Validation):")
for model_name, model in models.items():
    scores = cross_validate(model, X, y, cv=kfold, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (K-Fold Cross Validation):
Random Forest - Accuracy: 0.9815 ± 0.0021
Random Forest - Precision (Weighted): 0.9816 ± 0.0021
Random Forest - Recall (Weighted): 0.9815 ± 0.0021
Random Forest - F1 Score (Weighted): 0.9815 ± 0.0021
Decision Tree - Accuracy: 0.9377 ± 0.0021
Decision Tree - Precision (Weighted): 0.9379 ± 0.0021
Decision Tree - Recall (Weighted): 0.9377 ± 0.0021
Decision Tree - F1 Score (Weighted): 0.9377 ± 0.0021
Logistic Regression - Accuracy: 0.9833 ± 0.0028
Logistic Regression - Precision (Weighted): 0.9833 ± 0.0028
Logistic Regression - Recall (Weighted): 0.9833 ± 0.0028
Logistic Regression - F1 Score (Weighted): 0.9833 ± 0.0028


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AdaBoost - Accuracy: 0.3547 ± 0.0046
AdaBoost - Precision (Weighted): 0.2968 ± 0.1190
AdaBoost - Recall (Weighted): 0.3547 ± 0.0046
AdaBoost - F1 Score (Weighted): 0.2203 ± 0.0042


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The code performs Leave-One-Group-Out Cross Validation (LOGO CV) using the provided group labels (subject_train.txt) to evaluate model performance. It calculates and prints the mean accuracy and standard deviation for each model using the training data

In [15]:
group_labels = np.loadtxt('subject_train.txt')
logo = LeaveOneGroupOut()

scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

print("\nModel Performance (Leave-One-Subject-Out CV):")
for model_name, model in models.items():
    scores = cross_validate(model, X, y, groups=group_labels, cv=logo, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (Leave-One-Subject-Out CV):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest - Accuracy: 0.9113 ± 0.0791
Random Forest - Precision (Weighted): 0.9167 ± 0.0902
Random Forest - Recall (Weighted): 0.9113 ± 0.0791
Random Forest - F1 Score (Weighted): 0.9025 ± 0.0938
Decision Tree - Accuracy: 0.8516 ± 0.0902
Decision Tree - Precision (Weighted): 0.8701 ± 0.0756
Decision Tree - Recall (Weighted): 0.8516 ± 0.0902
Decision Tree - F1 Score (Weighted): 0.8463 ± 0.0944


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression - Accuracy: 0.9395 ± 0.0713
Logistic Regression - Precision (Weighted): 0.9467 ± 0.0696
Logistic Regression - Recall (Weighted): 0.9395 ± 0.0713
Logistic Regression - F1 Score (Weighted): 0.9347 ± 0.0820


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

AdaBoost - Accuracy: 0.3534 ± 0.0288
AdaBoost - Precision (Weighted): 0.1938 ± 0.0764
AdaBoost - Recall (Weighted): 0.3534 ± 0.0288
AdaBoost - F1 Score (Weighted): 0.2179 ± 0.0348


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The code performs K-Fold Cross Validation (with 5 splits) on the specified models to evaluate their performance. It calculates and prints the mean accuracy, precision, recall and F1 for each model using the testing data

In [16]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

print("\nModel Performance (K-Fold Cross Validation) on test data:")
for model_name, model in models.items():
    scores = cross_validate(model, XT, yT, cv=kfold, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (K-Fold Cross Validation) on test data:
Random Forest - Accuracy: 0.9851 ± 0.0077
Random Forest - Precision (Weighted): 0.9853 ± 0.0076
Random Forest - Recall (Weighted): 0.9851 ± 0.0077
Random Forest - F1 Score (Weighted): 0.9851 ± 0.0077
Decision Tree - Accuracy: 0.9325 ± 0.0114
Decision Tree - Precision (Weighted): 0.9338 ± 0.0121
Decision Tree - Recall (Weighted): 0.9325 ± 0.0114
Decision Tree - F1 Score (Weighted): 0.9325 ± 0.0114
Logistic Regression - Accuracy: 0.9834 ± 0.0092
Logistic Regression - Precision (Weighted): 0.9837 ± 0.0089
Logistic Regression - Recall (Weighted): 0.9834 ± 0.0092
Logistic Regression - F1 Score (Weighted): 0.9834 ± 0.0092


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AdaBoost - Accuracy: 0.4021 ± 0.0361
AdaBoost - Precision (Weighted): 0.3767 ± 0.1614
AdaBoost - Recall (Weighted): 0.4021 ± 0.0361
AdaBoost - F1 Score (Weighted): 0.2626 ± 0.0418


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The code performs Leave-One-Group-Out Cross Validation (LOGO CV) using the provided group labels (subject_train.txt) to evaluate model performance. It calculates and prints the mean accuracy and standard deviation for each model using the testing data

In [17]:
group_labels = np.loadtxt('subject_test.txt')
logo = LeaveOneGroupOut()

scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

print("\nModel Performance (Leave-One-Subject-Out CV):")
for model_name, model in models.items():
    scores = cross_validate(model, XT, yT, groups=group_labels, cv=logo, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (Leave-One-Subject-Out CV):
Random Forest - Accuracy: 0.9105 ± 0.0367
Random Forest - Precision (Weighted): 0.9260 ± 0.0245
Random Forest - Recall (Weighted): 0.9105 ± 0.0367
Random Forest - F1 Score (Weighted): 0.9053 ± 0.0463
Decision Tree - Accuracy: 0.8252 ± 0.0360
Decision Tree - Precision (Weighted): 0.8452 ± 0.0408
Decision Tree - Recall (Weighted): 0.8252 ± 0.0360
Decision Tree - F1 Score (Weighted): 0.8175 ± 0.0379
Logistic Regression - Accuracy: 0.9227 ± 0.0460
Logistic Regression - Precision (Weighted): 0.9394 ± 0.0317
Logistic Regression - Recall (Weighted): 0.9227 ± 0.0460
Logistic Regression - F1 Score (Weighted): 0.9204 ± 0.0477


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AdaBoost - Accuracy: 0.3956 ± 0.0643
AdaBoost - Precision (Weighted): 0.1997 ± 0.0708
AdaBoost - Recall (Weighted): 0.3956 ± 0.0643
AdaBoost - F1 Score (Weighted): 0.2517 ± 0.0695


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training the models using the training data and testing it using the test data

In [18]:
print("\nModel Performance on Test Data:")
for model_name, model in models.items():
    model.fit(X, y)
    y_pred = model.predict(XT)
    acc = accuracy_score(yT, y_pred)
    prec = precision_score(yT, y_pred, average='weighted')
    rec = recall_score(yT, y_pred, average='weighted')
    f1 = f1_score(yT, y_pred, average='weighted')

    print(f"{model_name} -> Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")


Model Performance on Test Data:
Random Forest -> Accuracy: 0.9250, Precision: 0.9260, Recall: 0.9250, F1 Score: 0.9249
Decision Tree -> Accuracy: 0.8578, Precision: 0.8584, Recall: 0.8578, F1 Score: 0.8573
Logistic Regression -> Accuracy: 0.9610, Precision: 0.9623, Recall: 0.9610, F1 Score: 0.9608
AdaBoost -> Accuracy: 0.3492, Precision: 0.1548, Recall: 0.3492, F1 Score: 0.2128


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
