<a href="https://colab.research.google.com/github/Noor-Z1/PAMAP2-DataAnalysis-ML/blob/main/PreProcesssing%26FeatureExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNG 514 - Term Project**

### Notebook # 2




In this notebook we experiment with three different feature extraction techniques and compare their results using two different models (Random Forest and KNN) on two of the subjects.


In [None]:
# Mount Google Drive so the dataset files can be read from /content/drive
# (the dataset paths used later in this notebook live under /content/drive/MyDrive).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
import matplotlib as plt

# PreProcessor class

This class contains the basic pre-processing that is applied to the data of any subject before feature extraction. Columns are dropped based on the dataset authors' comments or on domain knowledge. For example, all orientation columns are dropped since they are not needed.

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from scipy.signal import ellip, filtfilt, welch
import numpy as np
from scipy.stats import skew, kurtosis, entropy
from sklearn.metrics import pairwise_distances


class PreProcessor:
    """Load whitespace-separated PAMAP2 ``.dat`` recordings and apply basic cleaning.

    Typical use: ``initializeDataFrame`` -> ``dataCleaning`` ->
    ``applyPreProcessing`` -> ``getSubjectDf``. All loaded recordings are kept
    in ``self.dataFrame``.
    """

    # Body locations of the three IMUs, in file column order.
    _IMU_PREFIXES = ('hand', 'chest', 'ankle')

    def __init__(self):
        self.dataFrame = pd.DataFrame()

    @staticmethod
    def _imu_columns(prefix):
        """Return the 17 column names of one IMU block, in file order."""
        names = [f'{prefix}Temperature']
        for sensor, count in (('Acc16_', 3), ('Acc6_', 3), ('Gyro', 3),
                              ('Magne', 3), ('Orientation', 4)):
            names.extend(f'{prefix}{sensor}{axis}' for axis in range(1, count + 1))
        return names

    @staticmethod
    def _to_numeric_or_keep(column):
        """Convert a column to a numeric dtype, returning it unchanged if that fails."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def initializeDataFrame(self, filepath):
        """Read one recording and add its rows to ``self.dataFrame``.

        Parameters
        ----------
        filepath : str or path-like
            Path to a header-less, whitespace-separated file with 54 columns.
            The last character of the file name (before the extension) must be
            a digit; it is stored in the ``subject_id`` column
            (``subject104.dat`` -> ``4``).

        Raises
        ------
        ValueError
            If the column count does not match or the subject digit is missing.
        """
        import os  # local import: only needed for parsing the file name

        columns = ["timestamp", "activityID", "heartrate"]
        for prefix in self._IMU_PREFIXES:
            columns.extend(self._imu_columns(prefix))

        # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
        procData = pd.read_table(filepath, header=None, sep=r'\s+')
        procData.columns = columns

        # Equivalent to filepath[-5] for "*.dat" files, but independent of the
        # extension length and usable with path-like objects.
        stem = os.path.splitext(os.path.basename(str(filepath)))[0]
        procData['subject_id'] = int(stem[-1])

        if self.dataFrame.empty:
            self.dataFrame = procData.reset_index(drop=True)
        else:
            # Public replacement for the private DataFrame._append.
            self.dataFrame = pd.concat([self.dataFrame, procData], ignore_index=True)

    def dataCleaning(self):
        """Drop unused columns and unlabeled rows, then fill missing values.

        Removes every orientation and ``Acc6`` column, removes rows whose
        ``activityID`` is 0, and linearly interpolates the remaining gaps.
        Gaps at the very start of a column are filled with the first valid
        value so that no leading NaNs survive.
        """
        unused = [f'{prefix}Orientation{i}' for prefix in self._IMU_PREFIXES for i in range(1, 5)]
        unused += [f'{prefix}Acc6_{i}' for prefix in self._IMU_PREFIXES for i in range(1, 4)]

        frame = self.dataFrame.drop(columns=unused)
        frame = frame.loc[frame['activityID'] != 0]
        # Same effect as to_numeric(errors='ignore'), without the deprecated option.
        frame = frame.apply(self._to_numeric_or_keep)
        # limit_direction='both' also fills NaNs that precede the first valid sample.
        self.dataFrame = frame.interpolate(limit_direction='both')

    def applyPreProcessing(self):
        """Reset the index to 0..n-1, overwrite the first four heart-rate values
        with 100 and report if any NaN is still present."""
        self.dataFrame = self.dataFrame.reset_index(drop=True)
        # .loc slicing is label-based and inclusive: rows 0, 1, 2 and 3.
        self.dataFrame.loc[:3, "heartrate"] = 100

        if self.dataFrame.isnull().values.any():
            print("DataFrame still contains some NAN values")

    def getSubjectDf(self, subject_id):
        """Return the rows whose ``subject_id`` equals ``subject_id`` (index labels are kept)."""
        return self.dataFrame[self.dataFrame['subject_id'] == subject_id]



---



# **Feature Extraction 1**

This is our own way of doing feature extraction where we:
> perform windowing with a window size of 150 and no overlap, extract time- and frequency-domain features from the raw gyroscope and accelerometer columns of each window, and concatenate all features.



In [None]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from scipy.signal import ellip, filtfilt, welch
import numpy as np
from scipy.stats import skew, kurtosis, entropy
from sklearn.metrics import pairwise_distances


class FeatureExtraction1:
    """Sliding-window feature extraction on raw gyroscope and Acc16 channels.

    For every window that contains exactly one activity label the feature
    vector is the concatenation of:

    * mean, std, skewness and kurtosis of each of the 18 motion channels,
    * spectral entropy and peak frequency of each of the 18 motion channels,
    * the sum of absolute values of every windowed column
      (18 motion channels followed by ``heartrate``).

    That is 72 + 36 + 19 = 127 values per window.
    """

    def __init__(self, subjectDf, subjectID):
        self.dataFrame = subjectDf
        self.subjectID = subjectID

    @staticmethod
    def compute_time_domain_features(data, isheartrate=False):
        """Return ``(mean, std, skewness, kurtosis)`` of ``data``.

        With ``isheartrate=True`` only ``(mean, std)`` is returned.
        NaNs are omitted when computing skewness and kurtosis.
        """
        mean = np.mean(data)
        std_dev = np.std(data)

        if isheartrate:
            return mean, std_dev

        skewness = skew(data, nan_policy='omit')
        kurt = kurtosis(data, nan_policy='omit')
        return mean, std_dev, skewness, kurt

    @staticmethod
    def compute_frequency_domain_features(data, fs):
        """Return ``(entropy of the Welch PSD, frequency of the PSD maximum)``.

        The PSD is estimated with a single segment spanning the whole input.
        """
        f, Pxx = welch(data, fs=fs, nperseg=len(data))
        entropy_power = entropy(Pxx)
        peak_power_freq = f[np.argmax(Pxx)]
        return entropy_power, peak_power_freq

    @staticmethod
    def compute_signal_magnitude_area(data):
        """Return the sum of absolute values of ``data``.

        A DataFrame is reduced column by column (one value per column); any
        other input is reduced with ``np.sum``.
        """
        magnitudes = np.abs(data)
        if isinstance(magnitudes, pd.DataFrame):
            # Explicit axis: np.sum(DataFrame) goes through DataFrame.sum(axis=None),
            # whose per-column result is deprecated in favour of a scalar.
            return magnitudes.sum(axis=0)
        return np.sum(magnitudes)

    @staticmethod
    def compute_pairwise_correlations(data):
        """Return the pairwise correlation-distance matrix between the rows of ``data``.

        NOTE(review): not used by the extraction below; rows are compared here,
        so pass ``data.T`` if channel-to-channel distances are wanted - confirm intent.
        """
        correlations = pairwise_distances(data, metric='correlation')
        return correlations

    def sliding_window_feature_extraction(self, window_size=150, overlap=0, fs=1/0.01):
        """Extract one feature vector per single-activity window.

        Parameters
        ----------
        window_size : int
            Number of consecutive rows per window.
        overlap : float
            Fraction of a window shared with the next one; must be < 1.
        fs : float
            Sampling frequency passed to the spectral estimate.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Feature matrix (one row per kept window) and the matching labels.

        Raises
        ------
        ValueError
            If ``window_size`` is smaller than 1 or ``overlap`` is 1 or more.
        """
        angular_velocity_columns = ['handGyro1', 'handGyro2', 'handGyro3',
                                    'chestGyro1', 'chestGyro2', 'chestGyro3',
                                    'ankleGyro1', 'ankleGyro2', 'ankleGyro3']
        acceleration_columns = ['handAcc16_1', 'handAcc16_2', 'handAcc16_3',
                                'chestAcc16_1', 'chestAcc16_2', 'chestAcc16_3',
                                'ankleAcc16_1', 'ankleAcc16_2', 'ankleAcc16_3']
        heart_rate_col = ['heartrate']

        combined_columns = angular_velocity_columns + acceleration_columns

        if window_size < 1:
            raise ValueError("window_size must be at least 1")
        if overlap >= 1:
            raise ValueError("overlap must be smaller than 1")
        # round() avoids float truncation (150 * (1 - 0.9) is 14.999...); at least one row per step.
        stride = max(1, int(round(window_size * (1 - overlap))))

        signals = self.dataFrame[combined_columns + heart_rate_col]
        activity = self.dataFrame['activityID']

        all_features = []
        all_labels = []

        for start in range(0, len(self.dataFrame) - window_size + 1, stride):
            stop = start + window_size
            # Positional slicing: works whatever index labels the subject frame carries.
            labels = activity.iloc[start:stop]

            # Keep only windows that contain a single activity.
            if labels.nunique() != 1:
                continue

            window = signals.iloc[start:stop]

            time_domain_features = np.array(
                [self.compute_time_domain_features(window[column]) for column in combined_columns]
            ).flatten()
            freq_domain_features = np.array(
                [self.compute_frequency_domain_features(window[column], fs) for column in combined_columns]
            ).flatten()
            sma = np.atleast_1d(np.asarray(self.compute_signal_magnitude_area(window), dtype=float))

            all_features.append(np.concatenate([time_domain_features, freq_domain_features, sma]))
            all_labels.append(labels.iloc[0])

        return np.array(all_features), np.array(all_labels)

    def applyFeatureExtraction(self, window_size, overlap, fs):
        """Run ``sliding_window_feature_extraction`` and return ``(features, labels)``."""
        features, labels = self.sliding_window_feature_extraction(window_size=window_size, overlap=overlap, fs=fs)
        return features, labels



# Usage example: build the Feature Extraction 1 features for subject 1.
# The model-evaluation cells below read the `features` and `labels` globals
# assigned here, so this code has to run (it was previously commented out).
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject101.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()

subject_id = 1
subject_df = processor.getSubjectDf(subject_id)


feature_extractor = FeatureExtraction1(subject_df, subject_id)

window_size = 150
overlap = 0

features, labels = feature_extractor.applyFeatureExtraction(window_size, overlap, fs=100)

print(features.shape)
print(labels.shape)

(1653, 127)
(1653,)


In [None]:
# now based on these features, train, test and cross validate a model

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split



def nested_cross_validation(features, labels, test_size=0.3, n_splits=5):
    """Tune and evaluate a Random Forest with nested cross-validation and a hold-out test.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and standardisation are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local import: Pipeline is not among the notebook's existing imports.
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        # Seeded so that repeated runs give the same scores.
        ('model', RandomForestClassifier(random_state=42)),
    ])
    param_grid = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


# Run the Random Forest evaluation on the `features` / `labels` arrays currently in the kernel namespace.
nested_cross_validation(features, labels)

Fold accuracy: 0.9181034482758621
              precision    recall  f1-score   support

           1       1.00      0.85      0.92        27
           2       1.00      0.81      0.89        21
           3       0.70      1.00      0.83        19
           4       1.00      0.86      0.92        21
           5       1.00      0.95      0.98        21
           6       1.00      0.82      0.90        22
           7       0.89      0.94      0.92        18
          12       0.83      0.94      0.88        16
          13       1.00      0.93      0.96        14
          16       0.92      1.00      0.96        22
          17       0.83      1.00      0.91        20
          24       1.00      1.00      1.00        11

    accuracy                           0.92       232
   macro avg       0.93      0.92      0.92       232
weighted avg       0.93      0.92      0.92       232

Fold accuracy: 0.9310344827586207
              precision    recall  f1-score   support

          

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def nested_cross_validation(features, labels, test_size=0.3, n_splits=10):
    """Tune and evaluate a k-nearest-neighbours classifier with nested cross-validation.

    NOTE: this definition reuses the name of the Random Forest function from
    the previous cell and therefore replaces it once this cell has run.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and robust scaling are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.
    The imputation step keeps KNN from failing on NaN feature values.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local imports: not guaranteed to be imported by this cell.
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
        ('model', KNeighborsClassifier()),
    ])
    param_grid = {
        'model__n_neighbors': [3, 5, 7, 9, 11, 12],
        'model__weights': ['uniform', 'distance'],
        # 'minkowski' with the default p=2 is the Euclidean distance, so listing
        # it only repeated the 'euclidean' candidates.
        'model__metric': ['euclidean', 'manhattan'],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        # zero_division=0 matches the final report and silences undefined-metric warnings.
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


# Run the KNN evaluation (the definition in this cell) on the `features` / `labels` arrays in the kernel namespace.
nested_cross_validation(features, labels)

Fold accuracy: 0.9310344827586207
              precision    recall  f1-score   support

           1       1.00      0.86      0.92        14
           2       0.91      1.00      0.95        10
           3       0.73      0.89      0.80         9
           4       1.00      0.82      0.90        11
           5       0.91      0.91      0.91        11
           6       1.00      0.91      0.95        11
           7       0.90      1.00      0.95         9
          12       0.88      0.88      0.88         8
          13       1.00      1.00      1.00         7
          16       1.00      1.00      1.00        11
          17       1.00      1.00      1.00        10
          24       0.83      1.00      0.91         5

    accuracy                           0.93       116
   macro avg       0.93      0.94      0.93       116
weighted avg       0.94      0.93      0.93       116

Fold accuracy: 0.8620689655172413
              precision    recall  f1-score   support

          

## Evaluating the first Feature Extraction on another subject (subject 4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split



def nested_cross_validationRF(features, labels, test_size=0.3, n_splits=5):
    """Tune and evaluate a Random Forest with nested cross-validation and a hold-out test.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and standardisation are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local import: Pipeline is not among the notebook's existing imports.
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        # Seeded so that repeated runs give the same scores.
        ('model', RandomForestClassifier(random_state=42)),
    ])
    param_grid = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


def nested_cross_validationKNN(features, labels, test_size=0.3, n_splits=10):
    """Tune and evaluate a k-nearest-neighbours classifier with nested cross-validation.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and robust scaling are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.
    The imputation step keeps KNN from failing on NaN feature values.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local import: Pipeline is not among the notebook's existing imports.
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
        ('model', KNeighborsClassifier()),
    ])
    param_grid = {
        'model__n_neighbors': [3, 5, 7, 9, 11, 12],
        'model__weights': ['uniform', 'distance'],
        # 'minkowski' with the default p=2 is the Euclidean distance, so listing
        # it only repeated the 'euclidean' candidates.
        'model__metric': ['euclidean', 'manhattan'],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        # zero_division=0 matches the final report and silences undefined-metric warnings.
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


# Load and clean the recording of subject 104 from the mounted drive.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject104.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()

# Must equal the id PreProcessor derives from the file name (the digit just before ".dat").
subject_id = 4
subject_df = processor.getSubjectDf(subject_id)


feature_extractor = FeatureExtraction1(subject_df, subject_id)

# Same windowing as for the first subject: 150 samples, no overlap.
window_size = 150
overlap = 0

features, labels = feature_extractor.applyFeatureExtraction(window_size, overlap, fs=100)

print(features.shape)
print(labels.shape)


# Evaluate both classifiers on this subject's features (KNN first, then Random Forest).
nested_cross_validationKNN(features, labels)
nested_cross_validationRF(features,labels)

DataFrame still contains some NAN values
(1531, 127)
(1531,)
Fold accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.86      1.00      0.92        12
           3       0.73      0.80      0.76        10
           4       0.93      1.00      0.96        13
           6       1.00      0.91      0.95        11
           7       1.00      1.00      1.00        13
          12       0.88      0.88      0.88         8
          13       1.00      0.86      0.92         7
          16       1.00      0.80      0.89        10
          17       0.85      0.92      0.88        12

    accuracy                           0.92       108
   macro avg       0.92      0.91      0.91       108
weighted avg       0.92      0.92      0.92       108

Fold accuracy: 0.9345794392523364
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
   

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split



def nested_cross_validationRF(features, labels, test_size=0.3, n_splits=5):
    """Tune and evaluate a Random Forest with nested cross-validation and a hold-out test.

    NOTE(review): a function with this name is also defined in an earlier
    cell; consider defining it once and reusing it.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and standardisation are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local import: Pipeline is not among the notebook's existing imports.
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        # Seeded so that repeated runs give the same scores.
        ('model', RandomForestClassifier(random_state=42)),
    ])
    param_grid = {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


def nested_cross_validationKNN(features, labels, test_size=0.3, n_splits=10):
    """Tune and evaluate a k-nearest-neighbours classifier with nested cross-validation.

    NOTE(review): a function with this name is also defined in an earlier
    cell; consider defining it once and reusing it.

    The data is split into a training part and a test part. On the training
    part, each outer stratified fold runs a grid search (inner CV with
    ``n_splits`` folds, weighted F1) and scores the refitted best model on the
    outer validation fold. Finally the grid search is fitted on the whole
    training part and scored on the test part.

    Mean imputation and robust scaling are steps of the tuned pipeline, so
    they are re-fitted inside every inner fold and never see validation data.
    The imputation step keeps KNN from failing on NaN feature values.

    Parameters
    ----------
    features : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer arrays).
    labels : numpy.ndarray
        Class label of each row.
    test_size : float
        Fraction of the samples held out for the final test.
    n_splits : int
        Number of folds of both the outer and the inner cross-validation.

    Prints the per-fold accuracy and classification report, the average outer
    accuracy, and the test accuracy and report. Returns nothing.
    """
    # Local import: Pipeline is not among the notebook's existing imports.
    from sklearn.pipeline import Pipeline

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=42)

    outer_cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
        ('model', KNeighborsClassifier()),
    ])
    param_grid = {
        'model__n_neighbors': [3, 5, 7, 9, 11, 12],
        'model__weights': ['uniform', 'distance'],
        # 'minkowski' with the default p=2 is the Euclidean distance, so listing
        # it only repeated the 'euclidean' candidates.
        'model__metric': ['euclidean', 'manhattan'],
    }
    clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=n_splits, scoring='f1_weighted')

    outer_scores = []

    for train_index, val_index in outer_cv.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index])

        y_val = y_train[val_index]
        # GridSearchCV.predict uses the best pipeline refitted on the outer training fold.
        y_pred = clf.predict(X_train[val_index])
        accuracy = accuracy_score(y_val, y_pred)
        outer_scores.append(accuracy)

        print(f"Fold accuracy: {accuracy}")
        # zero_division=0 matches the final report and silences undefined-metric warnings.
        print(classification_report(y_val, y_pred, zero_division=0))

    print("Average accuracy:", np.mean(outer_scores))

    # Final evaluation on the test set
    clf.fit(X_train, y_train)
    y_pred_test = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    print("Test set accuracy:", test_accuracy)
    print(classification_report(y_test, y_pred_test, zero_division=0))


# Load and clean the recording of subject 106 from the mounted drive.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject106.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()

# Must equal the id PreProcessor derives from the file name (the digit just before ".dat").
subject_id = 6
subject_df = processor.getSubjectDf(subject_id)


feature_extractor = FeatureExtraction1(subject_df, subject_id)

# Same windowing as for the other subjects: 150 samples, no overlap.
window_size = 150
overlap = 0

features, labels = feature_extractor.applyFeatureExtraction(window_size, overlap, fs=100)

print(features.shape)
print(labels.shape)


# Evaluate both classifiers on this subject's features (KNN first, then Random Forest).
nested_cross_validationKNN(features, labels)
print("\n Now evaluating Subject 6 on RF")
nested_cross_validationRF(features,labels)

# **Feature Extraction 2**

We also include the heart-rate information: for every window we extract the mean heart rate and its standard deviation.

Further, in this approach we separate all accelerometer data into body and gravity components and apply feature extraction to both components. We also change the window size to 300 and the overlap to 0.5, the same as in one of the reference papers.

In [None]:
from scipy.signal import ellip, filtfilt
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, entropy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from scipy.signal import welch


class FeatureExtraction2:
    """Sliding-window feature extractor with heart rate and a body/gravity split.

    For every window that contains a single activity the extractor computes:

    * time-domain statistics (mean, std, skewness, kurtosis) of each gyroscope
      channel and of both the body and the gravity component of each
      accelerometer channel;
    * mean and std of the heart rate;
    * frequency-domain features (spectral entropy, peak-power frequency) and the
      signal magnitude area of each gyroscope channel and of each body
      acceleration component.
    """

    def __init__(self, subjectDf, subjectID):
        """Keep a reference to the subject's DataFrame and its identifier.

        Args:
            subjectDf: DataFrame holding the `*Gyro*`, `*Acc16_*`, `heartrate`
                and `activityID` columns read by the extraction methods.
            subjectID: identifier of the subject; stored but not otherwise used
                by this class.
        """
        self.dataFrame = subjectDf
        self.subjectID = subjectID

    @staticmethod
    def compute_time_domain_features(data, isheartrate= False):
        """Return time-domain statistics of one signal.

        Args:
            data: 1-D signal (pandas Series or NumPy array).
            isheartrate: when True only mean and standard deviation are returned.

        Returns:
            `(mean, std)` if `isheartrate` is True, otherwise
            `(mean, std, skewness, kurtosis)`. Skewness and kurtosis are computed
            with `nan_policy='omit'`, i.e. NaN samples are ignored for them.
        """
        mean = np.mean(data)
        std_dev = np.std(data)

        if isheartrate:
            return mean, std_dev

        skewness = skew(data, nan_policy='omit')
        kurt = kurtosis(data, nan_policy='omit')
        return mean, std_dev, skewness, kurt

    @staticmethod
    def compute_frequency_domain_features(data, fs):
        """Return `(spectral entropy, peak-power frequency)` of one signal.

        The power spectral density is estimated with Welch's method using a
        single segment that spans the whole input (`nperseg=len(data)`).

        Args:
            data: 1-D signal.
            fs: sampling frequency passed to `welch`; the returned peak
                frequency is expressed in the same unit.
        """
        f, Pxx = welch(data, fs=fs, nperseg=len(data))
        entropy_power = entropy(Pxx)
        peak_power_freq = f[np.argmax(Pxx)]
        return entropy_power, peak_power_freq

    @staticmethod
    def compute_signal_magnitude_area(data):
        """Return the sum of absolute sample values (not divided by the length)."""
        return np.sum(np.abs(data))

    @staticmethod
    def compute_pairwise_correlations(data):
        """Return the correlation-distance matrix between the columns of `data`.

        Not called by the extraction pipeline of this class.
        """
        correlations = pairwise_distances(data.T, metric='correlation')
        return correlations

    def apply_iir_filter(self, data, fs, cutoff=0.3, order=4):
        """Split an acceleration signal into body and gravity components.

        A zero-phase (`filtfilt`) elliptic low-pass filter isolates the slowly
        varying part of the signal, which is treated as gravity; the remainder
        is treated as body acceleration.

        Args:
            data: array of samples, filtered along axis 0.
            fs: sampling frequency, used to normalise `cutoff` by the Nyquist rate.
            cutoff: low-pass cut-off frequency, in the same unit as `fs`.
            order: order of the elliptic filter.

        Returns:
            `(body_component, gravity_component)`.
        """
        # 0.01 dB pass-band ripple, 100 dB stop-band attenuation.
        b, a = ellip(order, 0.01, 100, cutoff / (0.5 * fs), btype='low')
        filtered_data = filtfilt(b, a, data, axis=0)
        body_component = data - filtered_data
        return body_component, filtered_data

    def generate_feature_names(self, angular_velocity_columns, acceleration_columns):
        """Build the feature names in the order the feature vector is assembled.

        The layout is: all time-domain names, then all frequency-domain names,
        then all signal-magnitude-area names.

        Args:
            angular_velocity_columns: gyroscope column names.
            acceleration_columns: accelerometer column names.

        Returns:
            List of feature-name strings.
        """
        time_domain = []
        freq_domain = []
        sma  = []

        # Gyroscope channels: every feature family is computed on the raw signal.
        for col in angular_velocity_columns:
            time_domain.extend([f'{col}_mean', f'{col}_std', f'{col}_skew', f'{col}_kurt'])
            freq_domain.extend([f'{col}_entropy_power', f'{col}_peak_power_freq'])
            sma.append(f'{col}_sma')

        # Heart rate only contributes mean and std.
        col = 'heartrate'
        time_domain.extend([f'{col}_mean', f'{col}_std'])

        # Accelerometer channels: time-domain features for body AND gravity,
        # frequency-domain features and SMA for the body component only.
        for col in acceleration_columns:
            body_prefix = f'body_{col}'
            gravity_prefix = f'gravity_{col}'
            time_domain.extend([f'{body_prefix}_mean', f'{body_prefix}_std', f'{body_prefix}_skew', f'{body_prefix}_kurt'])
            time_domain.extend([f'{gravity_prefix}_mean', f'{gravity_prefix}_std', f'{gravity_prefix}_skew', f'{gravity_prefix}_kurt'])
            freq_domain.extend([f'{body_prefix}_entropy_power', f'{body_prefix}_peak_power_freq'])
            sma.append(f'{body_prefix}_sma')

        return time_domain + freq_domain + sma

    def sliding_window_feature_extraction(self, window_size=300, overlap=0.5, fs=100):
        """Extract one feature vector per single-activity window.

        Windows are taken by row position, so the result does not depend on the
        DataFrame's index labels. Windows that span more than one `activityID`
        are skipped.

        Args:
            window_size: number of rows per window.
            overlap: fraction of a window shared with the next one; the step
                between window starts is `int(window_size * (1 - overlap))`.
            fs: sampling frequency forwarded to the filter and to Welch's method.

        Returns:
            `(features, labels, feature_names)` where `features` has one row per
            kept window, `labels` holds the activity of each kept window and
            `feature_names` names the columns of `features`.

        Raises:
            ValueError: if `window_size` and `overlap` yield a step below one row.
        """
        angular_velocity_columns = ['handGyro1', 'handGyro2', 'handGyro3',
                                    'chestGyro1', 'chestGyro2', 'chestGyro3',
                                    'ankleGyro1', 'ankleGyro2', 'ankleGyro3']
        acceleration_columns = ['handAcc16_1', 'handAcc16_2', 'handAcc16_3',
                                'chestAcc16_1', 'chestAcc16_2', 'chestAcc16_3',
                                'ankleAcc16_1', 'ankleAcc16_2', 'ankleAcc16_3']

        heart_rate_column = ['heartrate']

        stride = int(window_size * (1 - overlap))
        if stride < 1:
            raise ValueError(
                f"window_size={window_size} and overlap={overlap} give a stride of "
                f"{stride}; the stride must be at least 1 row"
            )

        feature_names = self.generate_feature_names(angular_velocity_columns, acceleration_columns)

        # Select the needed columns once instead of on every iteration.
        signals = self.dataFrame[angular_velocity_columns + acceleration_columns + heart_rate_column]
        activity = self.dataFrame['activityID']

        all_features = []
        all_labels = []

        for start in range(0, len(self.dataFrame) - window_size + 1, stride):
            stop = start + window_size

            # Positional slicing: `start` is a row offset, not an index label.
            labels = activity.iloc[start:stop]

            # Ensure the window contains only one activity
            if labels.nunique() != 1:
                continue

            window_data = signals.iloc[start:stop]

            time_domain_features = []
            freq_domain_features = []
            sma_values = []

            for col in angular_velocity_columns:
                signal = window_data[col]
                time_domain_features.extend(self.compute_time_domain_features(signal))
                freq_domain_features.extend(self.compute_frequency_domain_features(signal, fs))
                sma_values.append(self.compute_signal_magnitude_area(signal))

            time_domain_features.extend(
                self.compute_time_domain_features(window_data[heart_rate_column[0]], isheartrate= True)
            )

            for col in acceleration_columns:
                # Separate body and gravity components of this channel.
                body, gravity = self.apply_iir_filter(window_data[col].values, fs)

                time_domain_features.extend(self.compute_time_domain_features(body))
                time_domain_features.extend(self.compute_time_domain_features(gravity))
                freq_domain_features.extend(self.compute_frequency_domain_features(body, fs))
                sma_values.append(self.compute_signal_magnitude_area(body))

            # Same family order as generate_feature_names().
            all_features.append(np.concatenate([time_domain_features, freq_domain_features, sma_values]))
            all_labels.append(labels.iloc[0])

        return np.array(all_features), np.array(all_labels), feature_names

    def applyFeatureExtraction(self, window_size, overlap, fs):
        """Run the sliding-window extraction and return `(features, labels, feature_names)`."""
        features, labels, feature_names = self.sliding_window_feature_extraction(window_size=window_size, overlap=overlap, fs=fs)
        return features, labels, feature_names

    def features_to_dataframe(self, features, labels, feature_names):
        """Combine features and labels into one DataFrame.

        Columns are `feature_names` followed by a final `label` column.
        """
        features_df = pd.DataFrame(features, columns=feature_names)
        labels_df = pd.DataFrame(labels, columns=['label'])
        combined_df = pd.concat([features_df, labels_df], axis=1)
        return combined_df


# Usage example: run Feature Extraction 2 on subject 101.
# Load the raw recording and apply the PreProcessor pipeline to it.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject101.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()


# Number of rows per activityID left after pre-processing.
print(processor.dataFrame['activityID'].value_counts())

# NOTE(review): getSubjectDf is defined outside this cell; presumably it returns
# the pre-processed frame for this subject -- confirm against PreProcessor.
subject_id = 1
subject_df = processor.getSubjectDf(subject_id)

# Positional arguments are (window_size=300, overlap=0.5, fs=100).
feature_extractor = FeatureExtraction2(subject_df, subject_id)
features, labels, feature_names = feature_extractor.applyFeatureExtraction(300, 0.5, 100)
features_df = feature_extractor.features_to_dataframe(features, labels, feature_names)


# Last expression of the cell: display the feature table.
# `features` and `labels` stay in the kernel namespace after this cell.
features_df

activityID
1     27187
6     23575
17    23573
2     23480
16    22941
4     22253
3     21717
5     21265
7     20265
12    15890
13    14899
24    12912
Name: count, dtype: int64


Unnamed: 0,handGyro1_mean,handGyro1_std,handGyro1_skew,handGyro1_kurt,handGyro2_mean,handGyro2_std,handGyro2_skew,handGyro2_kurt,handGyro3_mean,handGyro3_std,...,body_handAcc16_1_sma,body_handAcc16_2_sma,body_handAcc16_3_sma,body_chestAcc16_1_sma,body_chestAcc16_2_sma,body_chestAcc16_3_sma,body_ankleAcc16_1_sma,body_ankleAcc16_2_sma,body_ankleAcc16_3_sma,label
0,0.137226,0.527463,0.015774,0.100983,0.350521,0.464791,0.310585,-1.051760,0.061615,0.301724,...,210.619957,262.908913,241.664229,74.184756,32.721972,44.976022,21.336608,52.368111,27.090982,1
1,-0.036560,0.824434,0.080164,0.224009,0.198049,0.361774,0.103602,-0.928197,0.111461,0.423236,...,195.620804,356.556044,355.552099,63.122289,33.805038,52.297379,20.483788,30.655766,22.201224,1
2,-0.128513,0.913687,0.570222,0.352048,0.086507,0.295838,-0.078255,-0.068920,0.002302,0.410864,...,225.167825,358.287608,350.745403,50.018347,29.469813,40.986112,20.432232,29.401250,20.448833,1
3,0.079574,0.605663,0.732990,2.954588,-0.077228,0.433403,-2.852270,10.781492,-0.087603,0.244531,...,157.873195,250.200390,183.683916,56.243605,30.779978,50.668413,21.818145,21.905372,19.762246,1
4,0.247295,0.524092,0.922840,5.402961,-0.659975,0.690729,-0.948733,0.277324,0.051882,0.269721,...,270.410285,185.765805,394.944860,201.979049,106.711644,194.831842,29.755081,94.431961,118.782399,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1634,0.748777,2.057120,0.526309,-0.419738,0.338619,1.812967,0.099541,-0.973624,-0.130424,2.523959,...,2859.453314,1236.549837,1422.965678,369.933178,4585.979909,1351.515500,2549.008752,3132.212002,1241.533197,24
1635,0.268074,1.899629,0.497572,0.219167,1.053477,1.887499,0.027861,-0.335820,0.161941,2.129133,...,1538.320008,1011.772043,1102.774870,368.554569,2987.846173,974.767462,1912.966380,2338.698121,1080.917490,24
1636,-0.269102,1.441967,-0.252770,-0.154110,0.902049,1.589032,0.989387,0.310906,0.689659,1.305696,...,771.825967,715.288114,648.810005,248.041180,1111.657012,478.109804,778.886489,1000.314711,501.791245,24
1637,-0.292349,1.160550,-0.117071,-0.300492,-0.126981,0.832508,-0.200080,-0.852752,0.107850,1.029377,...,296.071671,577.943817,337.169359,176.166228,110.614638,145.109934,39.662501,60.545780,37.678900,24


In [None]:
# KNN evaluation of whatever `features`/`labels` are currently in the kernel --
# assumes the previous cell (FeatureExtraction2, subject 101) was the last one run.
nested_cross_validationKNN(features, labels)

Fold accuracy: 0.8521739130434782
              precision    recall  f1-score   support

           1       0.92      0.85      0.88        13
           2       1.00      0.70      0.82        10
           3       0.57      0.80      0.67        10
           4       1.00      0.80      0.89        10
           5       0.92      1.00      0.96        11
           6       1.00      0.80      0.89        10
           7       0.80      0.89      0.84         9
          12       0.86      0.75      0.80         8
          13       0.88      0.88      0.88         8
          16       0.90      0.90      0.90        10
          17       0.69      0.90      0.78        10
          24       1.00      1.00      1.00         6

    accuracy                           0.85       115
   macro avg       0.88      0.86      0.86       115
weighted avg       0.88      0.85      0.86       115

Fold accuracy: 0.8695652173913043
              precision    recall  f1-score   support

          

In [None]:
# Random Forest evaluation on the same `features`/`labels` still held in the kernel
# (assumes they come from the FeatureExtraction2 run on subject 101).
nested_cross_validationRF(features, labels)

Fold accuracy: 0.908695652173913
              precision    recall  f1-score   support

           1       1.00      0.78      0.88        27
           2       1.00      1.00      1.00        21
           3       0.83      1.00      0.91        20
           4       1.00      0.90      0.95        21
           5       1.00      0.91      0.95        22
           6       1.00      0.90      0.95        20
           7       1.00      0.88      0.94        17
          12       0.86      0.80      0.83        15
          13       0.92      0.80      0.86        15
          16       0.84      1.00      0.91        21
          17       0.66      1.00      0.79        19
          24       1.00      0.92      0.96        12

    accuracy                           0.91       230
   macro avg       0.93      0.91      0.91       230
weighted avg       0.93      0.91      0.91       230

Fold accuracy: 0.908695652173913
              precision    recall  f1-score   support

           1

## Evaluating the second Feature Extraction on another subject (4)

In [None]:
# Usage example: run Feature Extraction 2 on a second subject (104).
# Load the raw recording and apply the PreProcessor pipeline to it.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject104.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()

# NOTE(review): getSubjectDf is defined outside this cell; presumably it returns
# the pre-processed frame for this subject -- confirm against PreProcessor.
subject_id = 4
subject_df = processor.getSubjectDf(subject_id)


# Positional arguments are (window_size=300, overlap=0.5, fs=100).
# This overwrites the `features`/`labels` left in the kernel by earlier cells.
feature_extractor = FeatureExtraction2(subject_df, subject_id)
features, labels, feature_names = feature_extractor.applyFeatureExtraction(300, 0.5, 100)

DataFrame still contains some NAN values


In [None]:
# KNN evaluation of whatever `features`/`labels` are currently in the kernel --
# assumes the previous cell (FeatureExtraction2, subject 104) was the last one run.
nested_cross_validationKNN(features, labels)

Fold accuracy: 0.9439252336448598
              precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       0.80      1.00      0.89        12
           3       0.90      0.82      0.86        11
           4       1.00      1.00      1.00        14
           6       0.91      1.00      0.95        10
           7       1.00      1.00      1.00        13
          12       1.00      0.88      0.93         8
          13       1.00      1.00      1.00         7
          16       0.89      0.89      0.89         9
          17       1.00      0.92      0.96        12

    accuracy                           0.94       107
   macro avg       0.95      0.94      0.94       107
weighted avg       0.95      0.94      0.94       107

Fold accuracy: 0.9158878504672897
              precision    recall  f1-score   support

           1       1.00      0.82      0.90        11
           2       0.92      0.92      0.92        12
          

In [None]:
# Random Forest evaluation on the same `features`/`labels` still held in the kernel
# (assumes they come from the FeatureExtraction2 run on subject 104).
nested_cross_validationRF(features, labels)

Fold accuracy: 0.9483568075117371
              precision    recall  f1-score   support

           1       1.00      0.87      0.93        23
           2       1.00      0.96      0.98        24
           3       0.85      1.00      0.92        22
           4       1.00      0.96      0.98        28
           6       1.00      1.00      1.00        20
           7       1.00      0.96      0.98        25
          12       1.00      0.94      0.97        16
          13       1.00      0.85      0.92        13
          16       0.75      0.95      0.84        19
          17       0.96      0.96      0.96        23

    accuracy                           0.95       213
   macro avg       0.96      0.94      0.95       213
weighted avg       0.96      0.95      0.95       213

Fold accuracy: 0.9389671361502347
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        22
           2       0.88      0.92      0.90        24
          

# **Feature Extraction 3**

Everything is the same as in Feature Extraction 2, except that the gravity component is discarded: no features are extracted from it.

In [None]:
from scipy.signal import ellip, filtfilt
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis, entropy
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from scipy.signal import welch


class FeatureExtraction3:
    """Sliding-window feature extractor using body acceleration only.

    For every window that contains a single activity the extractor computes:

    * time-domain statistics (mean, std, skewness, kurtosis), frequency-domain
      features (spectral entropy, peak-power frequency) and the signal magnitude
      area of each gyroscope channel and of the body component of each
      accelerometer channel;
    * mean and std of the heart rate.

    The low-frequency (gravity) part of the accelerometer signals is removed and
    no features are extracted from it.
    """

    def __init__(self, subjectDf, subjectID):
        """Keep a reference to the subject's DataFrame and its identifier.

        Args:
            subjectDf: DataFrame holding the `*Gyro*`, `*Acc16_*`, `heartrate`
                and `activityID` columns read by the extraction methods.
            subjectID: identifier of the subject; stored but not otherwise used
                by this class.
        """
        self.dataFrame = subjectDf
        self.subjectID = subjectID

    @staticmethod
    def compute_time_domain_features(data, isheartrate= False):
        """Return time-domain statistics of one signal.

        Args:
            data: 1-D signal (pandas Series or NumPy array).
            isheartrate: when True only mean and standard deviation are returned.

        Returns:
            `(mean, std)` if `isheartrate` is True, otherwise
            `(mean, std, skewness, kurtosis)`. Skewness and kurtosis are computed
            with `nan_policy='omit'`, i.e. NaN samples are ignored for them.
        """
        mean = np.mean(data)
        std_dev = np.std(data)

        if isheartrate:
            return mean, std_dev

        skewness = skew(data, nan_policy='omit')
        kurt = kurtosis(data, nan_policy='omit')
        return mean, std_dev, skewness, kurt

    @staticmethod
    def compute_frequency_domain_features(data, fs):
        """Return `(spectral entropy, peak-power frequency)` of one signal.

        The power spectral density is estimated with Welch's method using a
        single segment that spans the whole input (`nperseg=len(data)`).

        Args:
            data: 1-D signal.
            fs: sampling frequency passed to `welch`; the returned peak
                frequency is expressed in the same unit.
        """
        f, Pxx = welch(data, fs=fs, nperseg=len(data))
        entropy_power = entropy(Pxx)
        peak_power_freq = f[np.argmax(Pxx)]
        return entropy_power, peak_power_freq

    @staticmethod
    def compute_signal_magnitude_area(data):
        """Return the sum of absolute sample values (not divided by the length)."""
        return np.sum(np.abs(data))

    @staticmethod
    def compute_pairwise_correlations(data):
        """Return the correlation-distance matrix between the columns of `data`.

        Not called by the extraction pipeline of this class.
        """
        correlations = pairwise_distances(data.T, metric='correlation')
        return correlations

    def apply_iir_filter(self, data, fs, cutoff=0.3, order=4):
        """Return the body component of an acceleration signal.

        A zero-phase (`filtfilt`) elliptic low-pass filter isolates the slowly
        varying part of the signal; subtracting it from the input leaves the
        body acceleration, which is the only part returned.

        Args:
            data: array of samples, filtered along axis 0.
            fs: sampling frequency, used to normalise `cutoff` by the Nyquist rate.
            cutoff: low-pass cut-off frequency, in the same unit as `fs`.
            order: order of the elliptic filter.

        Returns:
            The body component, same shape as `data`.
        """
        # 0.01 dB pass-band ripple, 100 dB stop-band attenuation.
        b, a = ellip(order, 0.01, 100, cutoff / (0.5 * fs), btype='low')
        filtered_data = filtfilt(b, a, data, axis=0)
        body_component = data - filtered_data
        return body_component

    def generate_feature_names(self, angular_velocity_columns, acceleration_columns):
        """Build the feature names in the order the feature vector is assembled.

        The layout is: all time-domain names, then all frequency-domain names,
        then all signal-magnitude-area names.

        Args:
            angular_velocity_columns: gyroscope column names.
            acceleration_columns: accelerometer column names.

        Returns:
            List of feature-name strings.
        """
        time_domain = []
        freq_domain = []
        sma  = []

        # Gyroscope channels: every feature family is computed on the raw signal.
        for col in angular_velocity_columns:
            time_domain.extend([f'{col}_mean', f'{col}_std', f'{col}_skew', f'{col}_kurt'])
            freq_domain.extend([f'{col}_entropy_power', f'{col}_peak_power_freq'])
            sma.append(f'{col}_sma')

        # Heart rate only contributes mean and std.
        col = 'heartrate'
        time_domain.extend([f'{col}_mean', f'{col}_std'])

        # Accelerometer channels: body component only.
        for col in acceleration_columns:
            body_prefix = f'body_{col}'
            time_domain.extend([f'{body_prefix}_mean', f'{body_prefix}_std', f'{body_prefix}_skew', f'{body_prefix}_kurt'])
            freq_domain.extend([f'{body_prefix}_entropy_power', f'{body_prefix}_peak_power_freq'])
            sma.append(f'{body_prefix}_sma')

        return time_domain + freq_domain + sma

    def sliding_window_feature_extraction(self, window_size=300, overlap=0.5, fs=100):
        """Extract one feature vector per single-activity window.

        Windows are taken by row position, so the result does not depend on the
        DataFrame's index labels. Windows that span more than one `activityID`
        are skipped.

        Args:
            window_size: number of rows per window.
            overlap: fraction of a window shared with the next one; the step
                between window starts is `int(window_size * (1 - overlap))`.
            fs: sampling frequency forwarded to the filter and to Welch's method.

        Returns:
            `(features, labels, feature_names)` where `features` has one row per
            kept window, `labels` holds the activity of each kept window and
            `feature_names` names the columns of `features`.

        Raises:
            ValueError: if `window_size` and `overlap` yield a step below one row.
        """
        angular_velocity_columns = ['handGyro1', 'handGyro2', 'handGyro3',
                                    'chestGyro1', 'chestGyro2', 'chestGyro3',
                                    'ankleGyro1', 'ankleGyro2', 'ankleGyro3']
        acceleration_columns = ['handAcc16_1', 'handAcc16_2', 'handAcc16_3',
                                'chestAcc16_1', 'chestAcc16_2', 'chestAcc16_3',
                                'ankleAcc16_1', 'ankleAcc16_2', 'ankleAcc16_3']

        heart_rate_column = ['heartrate']

        stride = int(window_size * (1 - overlap))
        if stride < 1:
            raise ValueError(
                f"window_size={window_size} and overlap={overlap} give a stride of "
                f"{stride}; the stride must be at least 1 row"
            )

        feature_names = self.generate_feature_names(angular_velocity_columns, acceleration_columns)

        # Select the needed columns once instead of on every iteration.
        signals = self.dataFrame[angular_velocity_columns + acceleration_columns + heart_rate_column]
        activity = self.dataFrame['activityID']

        all_features = []
        all_labels = []

        for start in range(0, len(self.dataFrame) - window_size + 1, stride):
            stop = start + window_size

            # Positional slicing: `start` is a row offset, not an index label.
            labels = activity.iloc[start:stop]

            # Ensure the window contains only one activity
            if labels.nunique() != 1:
                continue

            window_data = signals.iloc[start:stop]

            time_domain_features = []
            freq_domain_features = []
            sma_values = []

            for col in angular_velocity_columns:
                signal = window_data[col]
                time_domain_features.extend(self.compute_time_domain_features(signal))
                freq_domain_features.extend(self.compute_frequency_domain_features(signal, fs))
                sma_values.append(self.compute_signal_magnitude_area(signal))

            time_domain_features.extend(
                self.compute_time_domain_features(window_data[heart_rate_column[0]], isheartrate= True)
            )

            for col in acceleration_columns:
                # Keep only the body component of this channel.
                body = self.apply_iir_filter(window_data[col].values, fs)

                time_domain_features.extend(self.compute_time_domain_features(body))
                freq_domain_features.extend(self.compute_frequency_domain_features(body, fs))
                sma_values.append(self.compute_signal_magnitude_area(body))

            # Same family order as generate_feature_names().
            all_features.append(np.concatenate([time_domain_features, freq_domain_features, sma_values]))
            all_labels.append(labels.iloc[0])

        return np.array(all_features), np.array(all_labels), feature_names

    def applyFeatureExtraction(self, window_size, overlap, fs):
        """Run the sliding-window extraction and return `(features, labels, feature_names)`."""
        features, labels, feature_names = self.sliding_window_feature_extraction(window_size=window_size, overlap=overlap, fs=fs)
        return features, labels, feature_names

    def features_to_dataframe(self, features, labels, feature_names):
        """Combine features and labels into one DataFrame.

        Columns are `feature_names` followed by a final `label` column.
        """
        features_df = pd.DataFrame(features, columns=feature_names)
        labels_df = pd.DataFrame(labels, columns=['label'])
        combined_df = pd.concat([features_df, labels_df], axis=1)
        return combined_df


# Usage example: run Feature Extraction 3 on subject 101.
# Load the raw recording and apply the PreProcessor pipeline to it.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject101.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()


# Number of rows per activityID left after pre-processing.
print(processor.dataFrame['activityID'].value_counts())

# NOTE(review): getSubjectDf is defined outside this cell; presumably it returns
# the pre-processed frame for this subject -- confirm against PreProcessor.
subject_id = 1
subject_df = processor.getSubjectDf(subject_id)

# Positional arguments are (window_size=300, overlap=0.5, fs=100).
# This overwrites the `features`/`labels` left in the kernel by earlier cells.
feature_extractor = FeatureExtraction3(subject_df, subject_id)
features, labels, feature_names = feature_extractor.applyFeatureExtraction(300, 0.5, 100)
features_df = feature_extractor.features_to_dataframe(features, labels, feature_names)

activityID
1     27187
6     23575
17    23573
2     23480
16    22941
4     22253
3     21717
5     21265
7     20265
12    15890
13    14899
24    12912
Name: count, dtype: int64


In [None]:
# KNN evaluation of whatever `features`/`labels` are currently in the kernel --
# assumes the previous cell (FeatureExtraction3, subject 101) was the last one run.
nested_cross_validationKNN(features, labels)

Fold accuracy: 0.8260869565217391
              precision    recall  f1-score   support

           1       1.00      0.77      0.87        13
           2       1.00      0.60      0.75        10
           3       0.50      0.70      0.58        10
           4       1.00      0.80      0.89        10
           5       0.92      1.00      0.96        11
           6       1.00      0.80      0.89        10
           7       1.00      0.89      0.94         9
          12       0.75      0.75      0.75         8
          13       0.78      0.88      0.82         8
          16       0.90      0.90      0.90        10
          17       0.56      0.90      0.69        10
          24       1.00      1.00      1.00         6

    accuracy                           0.83       115
   macro avg       0.87      0.83      0.84       115
weighted avg       0.87      0.83      0.83       115

Fold accuracy: 0.8521739130434782
              precision    recall  f1-score   support

          

In [None]:
# Random Forest evaluation on the same `features`/`labels` still held in the kernel
# (assumes they come from the FeatureExtraction3 run on subject 101).
nested_cross_validationRF(features, labels)

Fold accuracy: 0.8956521739130435
              precision    recall  f1-score   support

           1       1.00      0.74      0.85        27
           2       0.91      1.00      0.95        21
           3       0.79      0.95      0.86        20
           4       1.00      0.90      0.95        21
           5       1.00      0.91      0.95        22
           6       1.00      0.85      0.92        20
           7       1.00      0.88      0.94        17
          12       0.92      0.80      0.86        15
          13       0.92      0.80      0.86        15
          16       0.75      1.00      0.86        21
          17       0.70      1.00      0.83        19
          24       1.00      0.92      0.96        12

    accuracy                           0.90       230
   macro avg       0.92      0.90      0.90       230
weighted avg       0.92      0.90      0.90       230

Fold accuracy: 0.908695652173913
              precision    recall  f1-score   support

           

In [None]:
# Run Feature Extraction 3 on a second subject (104).
# Load the raw recording and apply the PreProcessor pipeline to it.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject104.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)
processor.dataCleaning()
processor.applyPreProcessing()


# Number of rows per activityID left after pre-processing.
print(processor.dataFrame['activityID'].value_counts())

# NOTE(review): getSubjectDf is defined outside this cell; presumably it returns
# the pre-processed frame for this subject -- confirm against PreProcessor.
subject_id = 4
subject_df = processor.getSubjectDf(subject_id)

# Positional arguments are (window_size=300, overlap=0.5, fs=100).
# This overwrites the `features`/`labels` left in the kernel by earlier cells.
feature_extractor = FeatureExtraction3(subject_df, subject_id)
features, labels, feature_names = feature_extractor.applyFeatureExtraction(300, 0.5, 100)
features_df = feature_extractor.features_to_dataframe(features, labels, feature_names)

DataFrame still contains some NAN values
activityID
4     31932
7     27533
2     25492
17    24995
3     24706
1     23047
6     22699
16    20037
12    16694
13    14285
5         1
Name: count, dtype: int64


In [None]:
# KNN evaluation of whatever `features`/`labels` are currently in the kernel --
# assumes the previous cell (FeatureExtraction3, subject 104) was the last one run.
nested_cross_validationKNN(features, labels)

Fold accuracy: 0.9158878504672897
              precision    recall  f1-score   support

           1       0.82      0.82      0.82        11
           2       0.71      0.83      0.77        12
           3       0.90      0.82      0.86        11
           4       1.00      1.00      1.00        14
           6       1.00      1.00      1.00        10
           7       1.00      1.00      1.00        13
          12       1.00      0.88      0.93         8
          13       1.00      1.00      1.00         7
          16       0.89      0.89      0.89         9
          17       0.92      0.92      0.92        12

    accuracy                           0.92       107
   macro avg       0.92      0.92      0.92       107
weighted avg       0.92      0.92      0.92       107

Fold accuracy: 0.8411214953271028
              precision    recall  f1-score   support

           1       0.67      0.73      0.70        11
           2       0.86      0.50      0.63        12
          

In [None]:
# Random Forest evaluation on the same `features`/`labels` still held in the kernel
# (assumes they come from the FeatureExtraction3 run on subject 104).
nested_cross_validationRF(features, labels)

Fold accuracy: 0.9342723004694836
              precision    recall  f1-score   support

           1       1.00      0.83      0.90        23
           2       0.96      0.92      0.94        24
           3       0.88      1.00      0.94        22
           4       1.00      0.96      0.98        28
           6       0.87      1.00      0.93        20
           7       1.00      0.96      0.98        25
          12       1.00      0.94      0.97        16
          13       1.00      0.85      0.92        13
          16       0.81      0.89      0.85        19
          17       0.88      0.96      0.92        23

    accuracy                           0.93       213
   macro avg       0.94      0.93      0.93       213
weighted avg       0.94      0.93      0.93       213

Fold accuracy: 0.8826291079812206
              precision    recall  f1-score   support

           1       0.95      0.91      0.93        22
           2       0.95      0.88      0.91        24
          