In [None]:
## Students:
## Tamim Dostyar
## Brook Peterson

# Activity Classification - KNeighborsClassifier Training

This notebook trains a KNeighborsClassifier on the physical activity dataset using GridSearchCV for hyperparameter tuning.

## Load Data and Prepare Training Set


In [1]:
%reset -f

import importlib

import activity_functions
importlib.reload(activity_functions)
from activity_functions import *

In [2]:
activtity = load_data()

Loaded from Kaggle: /home/thuy/.cache/kagglehub/datasets/diegosilvadefrana/fisical-activity-dataset/versions/4/dataset2.csv


In [3]:
df_train, df_test = create_train_test(activtity, test_ratio=0.2)
print(df_train.shape)
print(df_test.shape)

(2291244, 33)
(572812, 33)


In [4]:
X_train, y_train, X_test, y_test = prepare_for_train(df_train, df_test)

## Hyperparameter Tuning with GridSearchCV

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")


def grid_searchCV(X, y):
    model = KNeighborsClassifier()
    param = {
        "n_neighbors": [5, 6, 7],
        "p": [1, 2],
        "weights": ["uniform"],
        "metric": ["minkowski"]
    }

    grid = GridSearchCV(
        model,
        param,
        verbose=1,
        refit=True,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        return_train_score=True
    )

    grid.fit(X, y)
    return grid

best_model = grid_searchCV(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


In [7]:
import pandas as pd
cv_result = pd.DataFrame(best_model.cv_results_)
columns = ['params', 'rank_test_score', 'mean_train_score', 'mean_test_score']
cv_result = cv_result[columns]
cv_result.sort_values(by='rank_test_score')

Unnamed: 0,params,rank_test_score,mean_train_score,mean_test_score
1,"{'metric': 'minkowski', 'n_neighbors': 5, 'p':...",1,0.996677,0.992771
3,"{'metric': 'minkowski', 'n_neighbors': 6, 'p':...",2,0.995809,0.991856
5,"{'metric': 'minkowski', 'n_neighbors': 7, 'p':...",3,0.994627,0.990363
0,"{'metric': 'minkowski', 'n_neighbors': 5, 'p':...",4,,
2,"{'metric': 'minkowski', 'n_neighbors': 6, 'p':...",4,,
4,"{'metric': 'minkowski', 'n_neighbors': 7, 'p':...",4,,


## Best Hyperparameters Found

Display the best hyperparameters found by GridSearchCV:


In [8]:
print("Best Hyperparameters:")
print(best_model.best_params_)
print(f"\nBest Cross-Validation Accuracy: {best_model.best_score_:.4f}")


Best Hyperparameters:
{'metric': 'minkowski', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}

Best Cross-Validation Accuracy: 0.9928


## Model Evaluation

Evaluate the best model on the test set:


In [9]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

# Predict on test set
y_test_hat = best_model.predict(X_test)

# Calculate metrics
compute_scores(y_test, y_test_hat, verbose=True)


Accuracy:  0.9962
F1-Score:  0.9963
Recall:    0.9964
Precision: 0.9962


Unnamed: 0,Accuracy,F1_Score,Recall,Precision
0,0.996173,0.996299,0.996388,0.996213


In [10]:
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_test_hat))



Detailed Classification Report:
                      precision    recall  f1-score   support

      Nordic walking       1.00      1.00      1.00     37621
    ascending stairs       0.99      0.99      0.99     23443
             cycling       1.00      1.00      1.00     32920
   descending stairs       0.99      0.98      0.99     20989
             ironing       1.00      1.00      1.00     47738
               lying       1.00      1.00      1.00     38505
        rope jumping       1.00      1.00      1.00      8594
             running       1.00      1.00      1.00     19640
             sitting       1.00      1.00      1.00     37038
            standing       1.00      1.00      1.00     37986
transient activities       1.00      0.99      0.99    185515
     vacuum cleaning       1.00      1.00      1.00     35071
             walking       0.99      1.00      0.99     47752

            accuracy                           1.00    572812
           macro avg       1.00    

## Summary

The KNeighborsClassifier was tuned using GridSearchCV with the following hyperparameter grid:
- **n_neighbors**: [5, 6, 7]
- **weights**: ["uniform"]
- **metric**: ["minkowski"]

This resulted in **6 candidate models** evaluated with **3-fold cross-validation** (18 total fits).

The best model was selected based on accuracy and evaluated on the held-out test set above.
