In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, LeaveOneGroupOut
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [26]:
# Load the training features and labels. Both files are read as
# whitespace-delimited text with no header row. The separator is a regex, so it
# is written as a raw string: in a normal string literal "\s" is an invalid
# escape sequence and triggers a warning on recent Python versions.
X_train = pd.read_csv("X_train.txt", sep=r'\s+', header=None)
y_train = pd.read_csv("y_train.txt", sep=r'\s+', header=None)

# Append the label column to the right of the feature columns.
train_data = pd.concat([X_train, y_train], axis=1)

# Name the columns Feature_1 .. Feature_N, followed by the "Activity" label.
column_names = [f"Feature_{i+1}" for i in range(X_train.shape[1])]
column_names.append("Activity")
train_data.columns = column_names

# Persist the combined table so it can be reloaded as a single CSV.
train_data.to_csv("train_data.csv", index=False)

In [27]:
# Sanity check: (rows, columns) of the combined features + label table.
train_data.shape

(7352, 562)

In [28]:
# Load the test features and labels, read as whitespace-delimited text with no
# header row. The regex separator is a raw string so that "\s" is not parsed
# as an (invalid) Python string escape.
X_test = pd.read_csv("X_test.txt", sep=r'\s+', header=None)
y_test = pd.read_csv("y_test.txt", sep=r'\s+', header=None)

# Append the label column and reuse the training column names so both tables
# share the same schema.
test_data = pd.concat([X_test, y_test], axis=1)
test_data.columns = column_names
test_data.to_csv("test_data.csv", index=False)

In [29]:
# Read the combined training table (features + "Activity") back from CSV.
df_train = pd.read_csv("train_data.csv")

In [30]:
# Read the combined test table (features + "Activity") back from CSV.
df_test = pd.read_csv("test_data.csv")

In [31]:
# Training set: X holds every column except the label, y holds the "Activity" label.
X = df_train.drop(columns=["Activity"])
y = df_train["Activity"]

In [32]:
# Test set: XT holds every column except the label, yT holds the "Activity" label.
XT = df_test.drop(columns=["Activity"])
yT = df_test["Activity"]

In [33]:
# Confirm the training features and labels have the same number of rows.
X.shape, y.shape

((7352, 561), (7352,))

In [34]:
# Confirm the test features and labels have the same number of rows.
XT.shape, yT.shape

((2947, 561), (2947,))

Reducing features by removing highly correlated features


In [35]:
# Pairwise correlations between all training feature columns.
correlation_matrix = X.corr()
abs_correlation = correlation_matrix.abs()

# A feature is flagged when its absolute correlation with at least one *other*
# feature exceeds 0.85; the feature's own (self-correlation) entry is removed
# before the comparison.
# NOTE(review): both members of a correlated pair get flagged, so whole groups
# of mutually correlated features end up in this list rather than all but one
# representative of each group.
highly_correlated_features = [
    feature
    for feature in abs_correlation.columns
    if (abs_correlation[feature].drop(labels=feature) > 0.85).any()
]

print("Highly correlated features:", highly_correlated_features)


Highly correlated features: ['Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24', 'Feature_25', 'Feature_26', 'Feature_27', 'Feature_30', 'Feature_31', 'Feature_34', 'Feature_35', 'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45', 'Feature_46', 'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50', 'Feature_51', 'Feature_52', 'Feature_53', 'Feature_54', 'Feature_55', 'Feature_57', 'Feature_58', 'Feature_60', 'Feature_61', 'Feature_62', 'Feature_66', 'Feature_67', 'Feature_68', 'Feature_69', 'Feature_70', 'Feature_71', 'Feature_72', 'Feature_73', 'Feature_74', 'Feature_75', 'Feature_76', 'Feature_77', 'Feature_84', 'Feature_85', 'Feature_86', 'Feature_87', 'Feature_88', 'Feature_89', 'Feature_90', 'Feature_91', 'Feature_92', 'Feature_93

In [36]:
# Number of features flagged as highly correlated.
print(len(highly_correlated_features))

442


Dropping the highly correlated features, as they are redundant and the information they carry is analyzed while going through the other features

In [37]:
# Remove the flagged columns from both the training and test feature matrices.
# The names are re-bound to new frames instead of dropping in place, and
# errors="ignore" skips columns that are already absent, so re-running this
# cell does not raise a KeyError.
X = X.drop(columns=highly_correlated_features, errors="ignore")
XT = XT.drop(columns=highly_correlated_features, errors="ignore")

In [38]:
# Candidate classifiers, keyed by the display name used when printing results.
# Random forest, decision tree and AdaBoost use randomness while fitting, so
# each is given a fixed seed (the same value used for the K-Fold splitter) to
# make the reported scores reproducible from one run to the next.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=500),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME", random_state=42)
}

You can see that X now contains only those features that are not highly correlated with any other feature.

In [42]:
# Print the reduced training feature matrix to inspect which columns remain.
print(X)

      Feature_1  Feature_2  Feature_3  Feature_28  Feature_29  Feature_32  \
0      0.288585  -0.020294  -0.132905    0.359910   -0.058526    0.264106   
1      0.278419  -0.016411  -0.123520    0.284213    0.284595    0.294310   
2      0.279653  -0.019467  -0.113462    0.337202   -0.164739    0.342256   
3      0.279174  -0.026201  -0.123283    0.198204   -0.264307    0.323154   
4      0.276629  -0.016570  -0.115362    0.191161    0.086904    0.434728   
...         ...        ...        ...         ...         ...         ...   
7347   0.299665  -0.057193  -0.181233   -0.078255   -0.056751   -0.119821   
7348   0.273853  -0.007749  -0.147468    0.206839   -0.154722    0.034260   
7349   0.273387  -0.017011  -0.045022    0.063584   -0.017019    0.119962   
7350   0.289654  -0.018843  -0.158281    0.009588   -0.038354    0.101761   
7351   0.351503  -0.012423  -0.203867   -0.230562    0.139282   -0.156435   

      Feature_33  Feature_36  Feature_37  Feature_38  ...  Feature_533  \
0

The code performs K-Fold Cross Validation (with 5 splits) on the specified models to evaluate their performance with reduced features. It calculates and prints the mean and standard deviation of the accuracy, weighted precision, weighted recall and weighted F1 score for each model using the training data.

In [39]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment stable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Plain accuracy plus support-weighted precision, recall and F1.
weighted_metrics = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, average='weighted')
        for metric_name, metric_fn in weighted_metrics.items()
    },
}

# Report mean ± standard deviation of each metric across the folds, per model.
print("\nModel Performance (K-Fold Cross Validation):")
for model_name, model in models.items():
    scores = cross_validate(model, X, y, cv=kfold, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (K-Fold Cross Validation):
Random Forest - Accuracy: 0.9410 ± 0.0019
Random Forest - Precision (Weighted): 0.9417 ± 0.0020
Random Forest - Recall (Weighted): 0.9410 ± 0.0019
Random Forest - F1 Score (Weighted): 0.9411 ± 0.0019
Decision Tree - Accuracy: 0.8473 ± 0.0093
Decision Tree - Precision (Weighted): 0.8480 ± 0.0090
Decision Tree - Recall (Weighted): 0.8473 ± 0.0093
Decision Tree - F1 Score (Weighted): 0.8473 ± 0.0093
Logistic Regression - Accuracy: 0.9286 ± 0.0022
Logistic Regression - Precision (Weighted): 0.9289 ± 0.0026
Logistic Regression - Recall (Weighted): 0.9286 ± 0.0022
Logistic Regression - F1 Score (Weighted): 0.9286 ± 0.0024
AdaBoost - Accuracy: 0.6027 ± 0.0085
AdaBoost - Precision (Weighted): 0.6087 ± 0.0254
AdaBoost - Recall (Weighted): 0.6027 ± 0.0085
AdaBoost - F1 Score (Weighted): 0.5888 ± 0.0203


In [44]:
# Print the reduced test feature matrix to inspect which columns remain.
print(XT)

      Feature_1  Feature_2  Feature_3  Feature_28  Feature_29  Feature_32  \
0      0.257178  -0.023285  -0.014654    0.482280   -0.045462    0.130858   
1      0.286027  -0.013163  -0.119083    0.040674    0.272991    0.411411   
2      0.275485  -0.026050  -0.118152    0.032703    0.192385    0.470819   
3      0.270298  -0.032614  -0.117520    0.034200    0.153639    0.446100   
4      0.274833  -0.027848  -0.129527   -0.032804    0.294340    0.168419   
...         ...        ...        ...         ...         ...         ...   
2942   0.310155  -0.053391  -0.099109    0.114556    0.069925    0.013961   
2943   0.363385  -0.039214  -0.105915    0.124019    0.133963    0.070093   
2944   0.349966   0.030077  -0.115788    0.400424   -0.123618    0.170446   
2945   0.237594   0.018467  -0.096499    0.541771   -0.204716    0.233641   
2946   0.153627  -0.018437  -0.137018    0.406554   -0.151102    0.338025   

      Feature_33  Feature_36  Feature_37  Feature_38  ...  Feature_533  \
0

The code performs Leave-One-Group-Out Cross Validation (LOGO CV) using the provided group labels (subject_train.txt) to evaluate model performance using reduced features. It calculates and prints the mean and standard deviation of the accuracy, weighted precision, weighted recall and weighted F1 score for each model using the training data.

In [45]:
# Group label for each training row (presumably the subject id, one per row,
# in the same order as X -- confirm against the dataset description).
group_labels = np.loadtxt('subject_train.txt')
# Each split holds out all rows belonging to one group.
logo = LeaveOneGroupOut()

# Plain accuracy plus support-weighted precision, recall and F1.
weighted_metrics = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, average='weighted')
        for metric_name, metric_fn in weighted_metrics.items()
    },
}

# Report mean ± standard deviation of each metric across held-out groups, per model.
print("\nModel Performance (Leave-One-Subject-Out CV):")
for model_name, model in models.items():
    scores = cross_validate(model, X, y, groups=group_labels, cv=logo, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (Leave-One-Subject-Out CV):
Random Forest - Accuracy: 0.8676 ± 0.0928
Random Forest - Precision (Weighted): 0.8934 ± 0.0695
Random Forest - Recall (Weighted): 0.8676 ± 0.0928
Random Forest - F1 Score (Weighted): 0.8592 ± 0.1032
Decision Tree - Accuracy: 0.7509 ± 0.0960
Decision Tree - Precision (Weighted): 0.7688 ± 0.0850
Decision Tree - Recall (Weighted): 0.7509 ± 0.0960
Decision Tree - F1 Score (Weighted): 0.7431 ± 0.1012


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression - Accuracy: 0.8734 ± 0.0806
Logistic Regression - Precision (Weighted): 0.8869 ± 0.0769
Logistic Regression - Recall (Weighted): 0.8734 ± 0.0806
Logistic Regression - F1 Score (Weighted): 0.8674 ± 0.0896
AdaBoost - Accuracy: 0.5876 ± 0.0936
AdaBoost - Precision (Weighted): 0.6020 ± 0.0873
AdaBoost - Recall (Weighted): 0.5876 ± 0.0936
AdaBoost - F1 Score (Weighted): 0.5656 ± 0.0960


For Test data

In [47]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment stable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Plain accuracy plus support-weighted precision, recall and F1.
weighted_metrics = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, average='weighted')
        for metric_name, metric_fn in weighted_metrics.items()
    },
}

# Same K-Fold report as for the training data, but cross-validating within the
# test set (XT, yT) itself.
print("\nModel Performance (K-Fold Cross Validation):")
for model_name, model in models.items():
    scores = cross_validate(model, XT, yT, cv=kfold, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (K-Fold Cross Validation):
Random Forest - Accuracy: 0.9515 ± 0.0079
Random Forest - Precision (Weighted): 0.9521 ± 0.0079
Random Forest - Recall (Weighted): 0.9515 ± 0.0079
Random Forest - F1 Score (Weighted): 0.9513 ± 0.0080
Decision Tree - Accuracy: 0.8442 ± 0.0151
Decision Tree - Precision (Weighted): 0.8452 ± 0.0150
Decision Tree - Recall (Weighted): 0.8442 ± 0.0151
Decision Tree - F1 Score (Weighted): 0.8440 ± 0.0153
Logistic Regression - Accuracy: 0.9379 ± 0.0099
Logistic Regression - Precision (Weighted): 0.9388 ± 0.0098
Logistic Regression - Recall (Weighted): 0.9379 ± 0.0099
Logistic Regression - F1 Score (Weighted): 0.9380 ± 0.0099
AdaBoost - Accuracy: 0.5277 ± 0.0102
AdaBoost - Precision (Weighted): 0.5637 ± 0.0141
AdaBoost - Recall (Weighted): 0.5277 ± 0.0102
AdaBoost - F1 Score (Weighted): 0.5142 ± 0.0087


In [49]:
# Group label for each test row (presumably the subject id, one per row, in
# the same order as XT -- confirm against the dataset description).
# This re-binds group_labels, replacing any value assigned to it earlier.
group_labels = np.loadtxt('subject_test.txt')
# Each split holds out all rows belonging to one group.
logo = LeaveOneGroupOut()

# Plain accuracy plus support-weighted precision, recall and F1.
weighted_metrics = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
scoring = {
    'accuracy': 'accuracy',
    **{
        metric_name: make_scorer(metric_fn, average='weighted')
        for metric_name, metric_fn in weighted_metrics.items()
    },
}

# Report mean ± standard deviation of each metric across held-out groups,
# cross-validating within the test set (XT, yT) itself.
print("\nModel Performance (Leave-One-Subject-Out CV):")
for model_name, model in models.items():
    scores = cross_validate(model, XT, yT, groups=group_labels, cv=logo, scoring=scoring)
    print(f"{model_name} - Accuracy: {scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}")
    print(f"{model_name} - Precision (Weighted): {scores['test_precision'].mean():.4f} ± {scores['test_precision'].std():.4f}")
    print(f"{model_name} - Recall (Weighted): {scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}")
    print(f"{model_name} - F1 Score (Weighted): {scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}")



Model Performance (Leave-One-Subject-Out CV):
Random Forest - Accuracy: 0.8661 ± 0.0437
Random Forest - Precision (Weighted): 0.8809 ± 0.0352
Random Forest - Recall (Weighted): 0.8661 ± 0.0437
Random Forest - F1 Score (Weighted): 0.8618 ± 0.0479
Decision Tree - Accuracy: 0.6983 ± 0.0642
Decision Tree - Precision (Weighted): 0.7106 ± 0.0645
Decision Tree - Recall (Weighted): 0.6983 ± 0.0642
Decision Tree - F1 Score (Weighted): 0.6938 ± 0.0635
Logistic Regression - Accuracy: 0.8338 ± 0.0896
Logistic Regression - Precision (Weighted): 0.8566 ± 0.0816
Logistic Regression - Recall (Weighted): 0.8338 ± 0.0896
Logistic Regression - F1 Score (Weighted): 0.8255 ± 0.0995
AdaBoost - Accuracy: 0.5044 ± 0.0445
AdaBoost - Precision (Weighted): 0.5374 ± 0.0416
AdaBoost - Recall (Weighted): 0.5044 ± 0.0445
AdaBoost - F1 Score (Weighted): 0.4916 ± 0.0451


In [50]:
# Hold-out evaluation: fit each model on the full training set (X, y) and
# score its predictions on the test set (XT, yT).
print("\nModel Performance on Test Data:")
for model_name, model in models.items():
    # fit() returns the fitted estimator, so training and prediction can be chained.
    y_pred = model.fit(X, y).predict(XT)

    acc = accuracy_score(yT, y_pred)
    # Precision, recall and F1 are all averaged with class-support weighting.
    prec, rec, f1 = (
        metric(yT, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"{model_name} -> Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1 Score: {f1:.4f}")


Model Performance on Test Data:
Random Forest -> Accuracy: 0.8962, Precision: 0.8996, Recall: 0.8962, F1 Score: 0.8961
Decision Tree -> Accuracy: 0.7615, Precision: 0.7692, Recall: 0.7615, F1 Score: 0.7613
Logistic Regression -> Accuracy: 0.8812, Precision: 0.8843, Recall: 0.8812, F1 Score: 0.8817
AdaBoost -> Accuracy: 0.5385, Precision: 0.6002, Recall: 0.5385, F1 Score: 0.5102


Conclusion: The accuracies of all the models across train and test data have decreased, except for the AdaBoost classifier, which showed a significant increase in model accuracy and achieved a maximum accuracy score of 60.27% using K-fold evaluation on the train data. The reduction in accuracy may indicate that feature selection can be done better: the features that remain after dropping the highly correlated features do not entirely represent the dataset, hence the drop in accuracy score. Additionally, we can see that some models, like the AdaBoost classifier, are sensitive to the dimensionality of the data, which highlights the importance of dimensionality reduction.