<a href="https://colab.research.google.com/github/Noor-Z1/PAMAP2-DataAnalysis-ML/blob/main/Subject-Specific-Classifiers/Subject1-8_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNG 514 - Term Project**

### Notebook # 4

### Author: Noor Ul Zain


In this notebook, we apply our own selected feature-extraction technique (see Notebook #2) to 8 subjects. For each subject we then run 10-fold cross-validation with grid search on a random forest model, training one subject-specific classifier per subject.


In [None]:
# Mount Google Drive so the dataset files can be read from paths under
# /content/drive (this cell only works inside a Google Colab runtime).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import the necessary libraries

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
import matplotlib as plt

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from scipy.signal import ellip, filtfilt, welch
import numpy as np
from scipy.stats import skew, kurtosis, entropy
from sklearn.metrics import pairwise_distances


class PreProcessor:
    """Load, clean and gap-fill raw PAMAP2 protocol recordings.

    Every file passed to ``initializeDataFrame`` is appended to
    ``self.dataFrame``; the other methods operate on that accumulated frame and
    store their result back into ``self.dataFrame``.
    """

    # Placeholder used for ``heartrate`` only when the column holds no valid
    # sample at all, so there is nothing to back-fill from.
    DEFAULT_HEARTRATE = 100

    def __init__(self):
        self.dataFrame = pd.DataFrame()

    @staticmethod
    def _imu_columns(location):
        """Return the 17 column names of one IMU; ``location`` is the name prefix."""
        axes3 = (1, 2, 3)
        names = [f'{location}Temperature']
        names += [f'{location}Acc16_{axis}' for axis in axes3]
        names += [f'{location}Acc6_{axis}' for axis in axes3]
        names += [f'{location}Gyro{axis}' for axis in axes3]
        names += [f'{location}Magne{axis}' for axis in axes3]
        names += [f'{location}Orientation{axis}' for axis in (1, 2, 3, 4)]
        return names

    @staticmethod
    def _to_numeric_if_possible(column):
        """Convert ``column`` to a numeric dtype, returning it unchanged on failure."""
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    def initializeDataFrame(self, filepath):
        """Read one whitespace-separated recording and append it to ``self.dataFrame``.

        Parameters
        ----------
        filepath : str
            Path of the file to read. The file must contain 54 columns
            (timestamp, activityID, heartrate and 17 columns for each of the
            hand, chest and ankle IMUs). The subject id is taken from the
            character five positions from the end of the path, i.e. the last
            digit before a four-character extension such as ``.dat``.

        Raises
        ------
        ValueError
            If the file does not have exactly 54 columns, or the character used
            as subject id is not a digit.
        """
        columns = ["timestamp", "activityID", "heartrate"]
        for location in ('hand', 'chest', 'ankle'):
            columns += self._imu_columns(location)

        # Raw string: '\s' is not a valid escape sequence in a normal literal.
        procData = pd.read_table(filepath, header=None, sep=r'\s+')
        procData.columns = columns
        procData['subject_id'] = int(filepath[-5])

        if self.dataFrame.empty:
            # Nothing loaded yet: skip concatenating with an empty frame.
            self.dataFrame = procData
        else:
            # Public replacement for the private DataFrame._append.
            self.dataFrame = pd.concat([self.dataFrame, procData], ignore_index=True)

    def dataCleaning(self):
        """Drop unused columns and activity-0 rows, then interpolate gaps.

        Removes the orientation and ``Acc6`` columns of all three IMUs, drops
        the rows whose ``activityID`` is 0, converts columns to numeric dtypes
        where possible and fills missing values with pandas' default
        interpolation. The index is not reset here.
        """
        locations = ('hand', 'chest', 'ankle')
        unused = [f'{loc}Orientation{i}' for loc in locations for i in (1, 2, 3, 4)]
        unused += [f'{loc}Acc6_{i}' for loc in locations for i in (1, 2, 3)]

        frame = self.dataFrame.drop(columns=unused)
        frame = frame[frame['activityID'] != 0]
        # Same effect as the deprecated pd.to_numeric(..., errors='ignore').
        frame = frame.apply(self._to_numeric_if_possible)
        self.dataFrame = frame.interpolate()

    def applyPreProcessing(self):
        """Reset the index and fill the gaps that interpolation left behind.

        Forward interpolation cannot fill values that are missing before the
        first valid sample of a column, so those leading gaps are back-filled
        from the first valid sample. A ``heartrate`` column with no valid sample
        at all falls back to ``DEFAULT_HEARTRATE``. A message is printed if any
        missing value still remains afterwards.
        """
        frame = self.dataFrame.reset_index(drop=True)
        frame = frame.bfill()
        if 'heartrate' in frame.columns:
            frame['heartrate'] = frame['heartrate'].fillna(self.DEFAULT_HEARTRATE)
        self.dataFrame = frame

        if self.dataFrame.isnull().values.any():
            print("DataFrame still contains some NAN values")

    def getSubjectDf(self, subject_id):
        """Return the rows of ``self.dataFrame`` whose ``subject_id`` equals ``subject_id``.

        The returned frame keeps the index labels of ``self.dataFrame``.
        """
        return self.dataFrame[self.dataFrame['subject_id'] == subject_id]


class FeatureExtraction1:
    """Sliding-window feature extractor for one subject's recording.

    ``subjectDf`` must contain the nine gyroscope columns, the nine ``Acc16``
    columns, ``heartrate`` and ``activityID``. ``subjectID`` is stored but not
    used by the extraction itself.
    """

    def __init__(self, subjectDf, subjectID):
        self.dataFrame = subjectDf
        self.subjectID = subjectID

    @staticmethod
    def compute_time_domain_features(data, isheartrate=False):
        """Return time-domain statistics of one signal window.

        Returns ``(mean, std, skewness, kurtosis)``, or only ``(mean, std)``
        when ``isheartrate`` is true. Skewness and kurtosis omit NaN values.
        """
        mean = np.mean(data)
        std_dev = np.std(data)

        if isheartrate:
            return mean, std_dev

        skewness = skew(data, nan_policy='omit')
        kurt = kurtosis(data, nan_policy='omit')
        return mean, std_dev, skewness, kurt

    @staticmethod
    def compute_frequency_domain_features(data, fs):
        """Return ``(spectral entropy, peak frequency)`` of one signal window.

        The power spectral density is estimated with Welch's method using a
        single segment spanning the whole window; ``fs`` is the sampling
        frequency handed to ``welch``.
        """
        f, Pxx = welch(data, fs=fs, nperseg=len(data))
        entropy_power = entropy(Pxx)
        peak_power_freq = f[np.argmax(Pxx)]
        return entropy_power, peak_power_freq

    @staticmethod
    def compute_signal_magnitude_area(data):
        """Return the sum of the absolute values of ``data``."""
        return np.sum(np.abs(data))

    @staticmethod
    def compute_pairwise_correlations(data):
        """Return ``pairwise_distances(data, metric='correlation')``.

        NOTE(review): despite the name this is the matrix of correlation
        *distances* between the rows of ``data``; it is not called by the
        sliding-window extraction in this class.
        """
        correlations = pairwise_distances(data, metric='correlation')
        return correlations

    def sliding_window_feature_extraction(self, window_size=150, overlap=0, fs=100):
        """Cut the recording into windows and compute one feature vector per window.

        Windows are taken by row position, so the result does not depend on the
        index labels of the frame. A window is kept only when all of its rows
        share a single ``activityID``, which becomes the window's label.

        Each feature vector has 128 values, in this order: four time-domain
        statistics for each of the 18 sensor columns, mean and standard
        deviation of ``heartrate``, two frequency-domain features per sensor
        column, and one signal magnitude area per sensor column.

        Parameters
        ----------
        window_size : int
            Number of consecutive rows per window.
        overlap : float
            Fraction of a window shared with the next one; the step between
            window starts is ``int(window_size * (1 - overlap))``.
        fs : float
            Sampling frequency passed to the frequency-domain features.

        Returns
        -------
        tuple of numpy.ndarray
            ``(features, labels)``; both are empty when no window qualifies.

        Raises
        ------
        ValueError
            If the step between windows would be smaller than one row.
        """
        angular_velocity_columns = ['handGyro1', 'handGyro2', 'handGyro3',
                                    'chestGyro1', 'chestGyro2', 'chestGyro3',
                                    'ankleGyro1', 'ankleGyro2', 'ankleGyro3']
        acceleration_columns = ['handAcc16_1', 'handAcc16_2', 'handAcc16_3',
                                'chestAcc16_1', 'chestAcc16_2', 'chestAcc16_3',
                                'ankleAcc16_1', 'ankleAcc16_2', 'ankleAcc16_3']
        heart_rate_col = ['heartrate']
        combined_columns = angular_velocity_columns + acceleration_columns

        stride = int(window_size * (1 - overlap))
        if stride < 1:
            raise ValueError(
                f"overlap={overlap} with window_size={window_size} gives a stride of "
                f"{stride}; the stride must be at least 1 row"
            )

        all_features = []
        all_labels = []
        n_rows = len(self.dataFrame)

        for start in range(0, n_rows - window_size + 1, stride):
            # Positional slice: `start` counts rows, it is not an index label.
            window = self.dataFrame.iloc[start:start + window_size]
            labels = window['activityID']

            # Skip windows that span more than one activity.
            if labels.nunique() != 1:
                continue

            time_domain_features = []
            for column in combined_columns:
                time_domain_features.extend(self.compute_time_domain_features(window[column]))
            for column in heart_rate_col:
                time_domain_features.extend(self.compute_time_domain_features(window[column], True))

            freq_domain_features = []
            for column in combined_columns:
                freq_domain_features.extend(self.compute_frequency_domain_features(window[column], fs))

            sma = [self.compute_signal_magnitude_area(window[column]) for column in combined_columns]

            all_features.append(np.concatenate([time_domain_features, freq_domain_features, sma]))
            all_labels.append(labels.iloc[0])

        return np.array(all_features), np.array(all_labels)

    def applyFeatureExtraction(self, window_size, overlap, fs):
        """Run ``sliding_window_feature_extraction`` and return ``(features, labels)``."""
        features, labels = self.sliding_window_feature_extraction(window_size=window_size, overlap=overlap, fs=fs)
        return features, labels

# Usage example: run the whole pipeline once for subject 101 and keep the
# resulting feature matrix and label vector in `features` / `labels`.
file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject101.dat'
processor = PreProcessor()
processor.initializeDataFrame(file_path)   # load the raw recording
processor.dataCleaning()                   # drop unused columns / activity 0, interpolate
processor.applyPreProcessing()             # reset index, patch leading heart-rate rows

# initializeDataFrame derives the id from the file name (int(filepath[-5])),
# which is 1 for subject101.dat, so this hard-coded value matches it.
subject_id = 1
subject_df = processor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)

# Window length in rows and fraction of overlap between consecutive windows.
window_size = 150
overlap = 0

# fs is the sampling frequency handed to the frequency-domain features.
features, labels = feature_extractor.applyFeatureExtraction(window_size, overlap, fs=100)

(1653, 128)
(1653,)


In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, classification_report, make_scorer
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


class ModelEval:
    """Evaluate registered models per subject with cross-validation and a hold-out test.

    Models are registered with ``add_model``; ``evaluate_subject`` /
    ``evaluate_all_subjects`` then fill ``self.subjectScores`` with the averaged
    validation scores and the hold-out test scores of every model.

    Parameters
    ----------
    cv : int
        Number of folds used by the inner ``GridSearchCV``.
    n_splits : int
        Number of folds of the outer ``StratifiedKFold``.
    """

    # Keys of every score dictionary produced by this class.
    _SCORE_NAMES = ('accuracy', 'f1', 'precision', 'recall')

    def __init__(self, cv=5, n_splits=10):
        self.models = []
        self.metrics = []
        self.subjectScores = {}
        self.cv = cv
        self.n_splits = n_splits

    def add_model(self, model, name, param_grid=None):
        """Register ``model`` under ``name``; a non-empty ``param_grid`` enables grid search."""
        self.models.append((name, model, param_grid))

    def add_metric(self, metric, name):
        """Store ``metric`` wrapped by ``make_scorer`` under ``name``.

        The stored scorers are not consulted by the evaluation methods of this
        class, which always report accuracy and weighted F1/precision/recall.
        """
        self.metrics.append((name, make_scorer(metric)))

    def evaluate_subject(self, features, labels, subject_id):
        """Evaluate every registered model on one subject and store the scores."""
        subject_results = {}
        for name, model, param_grid in self.models:
            print(f"Evaluating model: {name} for subject {subject_id}")
            outer_scores, test_scores = self.nested_cross_validation(features, labels, model, param_grid)
            subject_results[name] = {'validation_scores': outer_scores, 'test_scores': test_scores}
        self.subjectScores[subject_id] = subject_results

    @staticmethod
    def _impute_and_scale(X_fit, X_apply):
        """Fit mean imputation and robust scaling on ``X_fit``; transform both arrays."""
        imputer = SimpleImputer(strategy='mean')
        X_fit = imputer.fit_transform(X_fit)
        X_apply = imputer.transform(X_apply)

        scaler = RobustScaler().fit(X_fit)
        return scaler.transform(X_fit), scaler.transform(X_apply)

    @staticmethod
    def _weighted_scores(y_true, y_pred):
        """Return accuracy and weighted F1/precision/recall as a dictionary."""
        # zero_division=0 yields the same values as the default but without an
        # UndefinedMetricWarning when a class is never predicted in a fold.
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
            'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0)
        }

    @staticmethod
    def _fit(estimator, X, y, tuned):
        """Fit ``estimator``; return its best estimator when it is a grid search."""
        estimator.fit(X, y)
        return estimator.best_estimator_ if tuned else estimator

    def nested_cross_validation(self, features, labels, model, param_grid, test_size=0.3):
        """Cross-validate ``model`` on a training split, then score it on a hold-out split.

        The data is split once into train and test parts (``random_state=42``).
        The training part is evaluated with a shuffled ``StratifiedKFold``; when
        ``param_grid`` is non-empty each fold is fitted through an accuracy-scored
        ``GridSearchCV`` with ``self.cv`` inner folds. Imputation and scaling are
        fitted on the training side of each split, outside the grid search.
        Finally the model is fitted on the whole training part and scored on
        the test part.

        Parameters
        ----------
        features, labels : array-like
            Feature matrix and label vector; converted with ``np.asarray``.
        model : estimator
            Estimator to evaluate; it is fitted in place when no grid is given.
        param_grid : dict or None
            Grid handed to ``GridSearchCV``; falsy values disable the search.
        test_size : float
            Share of the data held out for the final test.

        Returns
        -------
        tuple of dict
            ``(average_scores, test_scores)`` keyed by ``accuracy``, ``f1``,
            ``precision`` and ``recall``.
        """
        # Fold indices are positional, so lists/DataFrames are converted first.
        features = np.asarray(features)
        labels = np.asarray(labels)

        X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_size, random_state=42)

        outer_cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        tuned = bool(param_grid)
        if tuned:
            clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=self.cv, scoring='accuracy')
        else:
            clf = model

        outer_scores = []
        for train_index, val_index in outer_cv.split(X_train, y_train):
            X_fold_train, X_val = self._impute_and_scale(X_train[train_index], X_train[val_index])
            y_fold_train, y_val = y_train[train_index], y_train[val_index]

            best_model = self._fit(clf, X_fold_train, y_fold_train, tuned)
            y_pred = best_model.predict(X_val)

            scores = self._weighted_scores(y_val, y_pred)
            outer_scores.append(scores)

            print(f"Fold scores: {scores}")
            print(classification_report(y_val, y_pred, zero_division=0))

        average_scores = {
            name: np.mean([score[name] for score in outer_scores])
            for name in self._SCORE_NAMES
        }
        print("Average validation scores:", average_scores)

        # Final evaluation on the test set
        X_train, X_test = self._impute_and_scale(X_train, X_test)
        best_model = self._fit(clf, X_train, y_train, tuned)
        y_pred_test = best_model.predict(X_test)

        test_scores = self._weighted_scores(y_test, y_pred_test)

        print("Test set scores:", test_scores)
        print(classification_report(y_test, y_pred_test, zero_division=0))

        return average_scores, test_scores

    def evaluate_all_subjects(self, subject_data):
        """Evaluate all models for every ``subject_id -> (features, labels)`` entry."""
        for subject_id, (features, labels) in subject_data.items():
            self.evaluate_subject(features, labels, subject_id)

    def get_results_df(self):
        """Return one row per (subject, model) with the validation and test scores."""
        results = []
        for subject_id, models in self.subjectScores.items():
            for model_name, scores in models.items():
                result = {'subject_id': subject_id, 'model': model_name}
                for name in self._SCORE_NAMES:
                    result[f'val_{name}'] = scores['validation_scores'][name]
                for name in self._SCORE_NAMES:
                    result[f'test_{name}'] = scores['test_scores'][name]
                results.append(result)
        return pd.DataFrame(results)

    def report_results(self):
        """Print the results table and return it."""
        results_df = self.get_results_df()
        print(results_df)
        return results_df



# Initialize the evaluator: 5 inner grid-search folds, 10 outer validation folds.
# The same `evaluator` object is reused by the per-subject cells that follow.
evaluator = ModelEval(cv=5, n_splits=10)

# Seed the forest so repeated runs give the same scores; the data split and the
# outer folds inside ModelEval already use random_state=42.
evaluator.add_model(RandomForestClassifier(random_state=42), "RForest", param_grid= {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10] } )


# Add metrics
# NOTE(review): ModelEval stores these scorers but its evaluation methods never
# read them; the reported scores are computed inside nested_cross_validation.
evaluator.add_metric(accuracy_score, "Accuracy")
evaluator.add_metric(f1_score, "F1 Score")
evaluator.add_metric(precision_score, "Precision")
evaluator.add_metric(recall_score, "Recall")

# Generate features and labels for subject 101 using PreProcessor and FeatureExtraction1
preprocessor = PreProcessor()
subject_data = {}

file_path =  '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject101.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
results_df = evaluator.report_results()

# Save results_df with subject id
# NOTE(review): the later per-subject cells write "<id>_results_RF.csv"; this one
# omits the "_RF" suffix. Left unchanged in case another notebook reads this name.
results_df.to_csv(f"{subject_id}_results.csv", index=False)

Evaluating model: RForest for subject 1
Fold scores: {'accuracy': 0.9396551724137931, 'f1': 0.9404950503348672, 'precision': 0.9496202073788281, 'recall': 0.9396551724137931}
              precision    recall  f1-score   support

           1       1.00      0.86      0.92        14
           2       1.00      1.00      1.00        10
           3       0.82      1.00      0.90         9
           4       1.00      0.82      0.90        11
           5       1.00      0.91      0.95        11
           6       1.00      0.91      0.95        11
           7       0.90      1.00      0.95         9
          12       0.88      0.88      0.88         8
          13       1.00      1.00      1.00         7
          16       1.00      1.00      1.00        11
          17       0.77      1.00      0.87        10
          24       1.00      1.00      1.00         5

    accuracy                           0.94       116
   macro avg       0.95      0.95      0.94       116
weighted avg 

In [None]:
# Subject 102: load and clean the recording, extract windowed features, then
# evaluate with the `evaluator` (ModelEval) created in an earlier cell.
preprocessor = PreProcessor()
subject_data = {}  # holds only this subject, so only this subject is (re)evaluated

file_path =  '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject102.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
# NOTE(review): evaluator.subjectScores keeps the entries written by earlier
# cells, so this table (and the CSV below) also lists previously evaluated subjects.
results_df = evaluator.report_results()

# Save results_df to a file named after this subject's id
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 2
Fold scores: {'accuracy': 0.9180327868852459, 'f1': 0.9175064997072865, 'precision': 0.9319577508102098, 'recall': 0.9180327868852459}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       1.00      0.70      0.82        10
           3       0.86      1.00      0.92        12
           4       0.93      0.88      0.90        16
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00        11
           7       1.00      1.00      1.00        13
          12       0.86      0.86      0.86         7
          13       1.00      0.88      0.93         8
          16       0.91      1.00      0.95        10
          17       0.72      1.00      0.84        13
          24       1.00      0.67      0.80         6

    accuracy                           0.92       122
   macro avg       0.94   

In [None]:
# Subject 103: load and clean the recording, extract windowed features, then
# evaluate with the `evaluator` (ModelEval) created in an earlier cell.
preprocessor = PreProcessor()
subject_data = {}  # holds only this subject, so only this subject is (re)evaluated

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject103.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
# NOTE(review): evaluator.subjectScores keeps the entries written by earlier
# cells, so this table (and the CSV below) also lists previously evaluated subjects.
results_df = evaluator.report_results()

# Save results_df to a file named after this subject's id
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 3
Fold scores: {'accuracy': 0.9382716049382716, 'f1': 0.9365980937870432, 'precision': 0.9450788751714678, 'recall': 0.9382716049382716}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        14
           3       0.90      1.00      0.95         9
           4       1.00      1.00      1.00        15
          12       1.00      0.75      0.86         4
          13       1.00      0.71      0.83         7
          16       0.89      0.80      0.84        10
          17       0.81      1.00      0.90        13

    accuracy                           0.94        81
   macro avg       0.95      0.91      0.92        81
weighted avg       0.95      0.94      0.94        81

Fold scores: {'accuracy': 0.9135802469135802, 'f1': 0.9144573010557415, 'precision': 0.9276688453159041, 'recall': 0.9135802469135802

In [None]:
# Subject 104: load and clean the recording, extract windowed features, then
# evaluate with the `evaluator` (ModelEval) created in an earlier cell.
preprocessor = PreProcessor()
subject_data = {}  # holds only this subject, so only this subject is (re)evaluated

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject104.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
# NOTE(review): evaluator.subjectScores keeps the entries written by earlier
# cells, so this table (and the CSV below) also lists previously evaluated subjects.
results_df = evaluator.report_results()

# Save results_df to a file named after this subject's id
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 4
Fold scores: {'accuracy': 0.9537037037037037, 'f1': 0.953834763673733, 'precision': 0.9590055006721673, 'recall': 0.9537037037037037}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.92      1.00      0.96        12
           3       0.83      1.00      0.91        10
           4       0.93      1.00      0.96        13
           6       1.00      0.91      0.95        11
           7       1.00      1.00      1.00        13
          12       1.00      0.88      0.93         8
          13       1.00      0.86      0.92         7
          16       0.91      1.00      0.95        10
          17       1.00      0.92      0.96        12

    accuracy                           0.95       108
   macro avg       0.96      0.95      0.95       108
weighted avg       0.96      0.95      0.95       108

Fold scores: {'accuracy': 

In [None]:
# Subject 105: load and clean the recording, extract windowed features, then
# evaluate with the `evaluator` (ModelEval) created in an earlier cell.
preprocessor = PreProcessor()
subject_data = {}  # holds only this subject, so only this subject is (re)evaluated

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject105.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
# NOTE(review): evaluator.subjectScores keeps the entries written by earlier
# cells, so this table (and the CSV below) also lists previously evaluated subjects.
results_df = evaluator.report_results()

# Save results_df to a file named after this subject's id
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 5
Fold scores: {'accuracy': 0.905511811023622, 'f1': 0.9068624452333991, 'precision': 0.9156269692687015, 'recall': 0.905511811023622}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       0.92      1.00      0.96        12
           3       0.67      0.80      0.73        10
           4       0.94      1.00      0.97        15
           5       1.00      1.00      1.00        11
           6       1.00      0.91      0.95        11
           7       1.00      0.83      0.91        12
          12       0.88      1.00      0.93         7
          13       1.00      0.83      0.91         6
          16       0.91      0.83      0.87        12
          17       0.78      0.88      0.82        16
          24       1.00      0.67      0.80         3

    accuracy                           0.91       127
   macro avg       0.92     

In [None]:
# Subject 106: load and clean the recording, extract windowed features, then
# evaluate with the `evaluator` (ModelEval) created in an earlier cell.
preprocessor = PreProcessor()
subject_data = {}  # holds only this subject, so only this subject is (re)evaluated

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject106.dat'

preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# Same rule PreProcessor uses for its subject_id column: last digit of the file name.
subject_id = int(file_path[-5])
subject_df = preprocessor.getSubjectDf(subject_id)

feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

subject_data[subject_id] = (features, labels)

# Evaluate every registered model for the subjects in subject_data
evaluator.evaluate_all_subjects(subject_data)

# Report results
# NOTE(review): evaluator.subjectScores keeps the entries written by earlier
# cells, so this table (and the CSV below) also lists previously evaluated subjects.
results_df = evaluator.report_results()

# Save results_df to a file named after this subject's id
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 6




Fold scores: {'accuracy': 0.9310344827586207, 'f1': 0.9327641270472486, 'precision': 0.9488146551724138, 'recall': 0.9310344827586207}
              precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       1.00      0.90      0.95        10
           3       0.69      1.00      0.81        11
           4       1.00      1.00      1.00        12
           5       1.00      0.67      0.80         9
           6       1.00      1.00      1.00        11
           7       1.00      1.00      1.00        13
          12       1.00      0.86      0.92         7
          13       1.00      0.80      0.89         5
          16       0.75      0.90      0.82        10
          17       1.00      1.00      1.00        17

    accuracy                           0.93       116
   macro avg       0.95      0.91      0.92       116
weighted avg       0.95      0.93      0.93       116





Fold scores: {'accuracy': 0.9568965517241379, 'f1': 0.9571335666753088, 'precision': 0.9608116978806635, 'recall': 0.9568965517241379}
              precision    recall  f1-score   support

           1       1.00      0.91      0.95        11
           2       1.00      0.90      0.95        10
           3       0.85      1.00      0.92        11
           4       1.00      1.00      1.00        12
           5       1.00      1.00      1.00        10
           6       1.00      0.91      0.95        11
           7       1.00      0.92      0.96        13
          12       1.00      1.00      1.00         6
          13       0.80      0.80      0.80         5
          16       0.91      1.00      0.95        10
          17       0.94      1.00      0.97        17

    accuracy                           0.96       116
   macro avg       0.95      0.95      0.95       116
weighted avg       0.96      0.96      0.96       116





Fold scores: {'accuracy': 0.9482758620689655, 'f1': 0.9484398778054581, 'precision': 0.9558114299493611, 'recall': 0.9482758620689655}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       0.82      0.90      0.86        10
           3       0.91      0.91      0.91        11
           4       1.00      1.00      1.00        12
           5       1.00      0.90      0.95        10
           6       1.00      1.00      1.00        11
           7       1.00      0.92      0.96        12
          12       1.00      0.67      0.80         6
          13       1.00      1.00      1.00         5
          16       0.77      1.00      0.87        10
          17       1.00      1.00      1.00        17

    accuracy                           0.95       116
   macro avg       0.95      0.94      0.94       116
weighted avg       0.96      0.95      0.95       116





Fold scores: {'accuracy': 0.9482758620689655, 'f1': 0.9476716025104331, 'precision': 0.9566912972085385, 'recall': 0.9482758620689655}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       1.00      1.00      1.00        11
           3       0.79      1.00      0.88        11
           4       0.91      0.91      0.91        11
           5       1.00      1.00      1.00        10
           6       1.00      0.91      0.95        11
           7       1.00      1.00      1.00        12
          12       1.00      0.83      0.91         6
          13       1.00      0.60      0.75         5
          16       0.83      1.00      0.91        10
          17       1.00      1.00      1.00        17

    accuracy                           0.95       116
   macro avg       0.96      0.92      0.93       116
weighted avg       0.96      0.95      0.95       116





Fold scores: {'accuracy': 0.9396551724137931, 'f1': 0.9411906600812149, 'precision': 0.9474715393499422, 'recall': 0.9396551724137931}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       1.00      0.91      0.95        11
           3       1.00      0.91      0.95        11
           4       1.00      1.00      1.00        11
           5       0.91      1.00      0.95        10
           6       1.00      0.91      0.95        11
           7       1.00      1.00      1.00        12
          12       1.00      0.83      0.91         6
          13       1.00      1.00      1.00         5
          16       0.75      0.90      0.82        10
          17       0.84      0.94      0.89        17

    accuracy                           0.94       116
   macro avg       0.95      0.94      0.94       116
weighted avg       0.95      0.94      0.94       116





Fold scores: {'accuracy': 0.9310344827586207, 'f1': 0.9328820614755278, 'precision': 0.9420124894262826, 'recall': 0.9310344827586207}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      0.91      0.95        11
           3       0.64      0.82      0.72        11
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        10
           6       1.00      0.90      0.95        10
           7       1.00      1.00      1.00        12
          12       1.00      0.71      0.83         7
          13       1.00      0.80      0.89         5
          16       0.91      1.00      0.95        10
          17       0.89      0.94      0.91        17

    accuracy                           0.93       116
   macro avg       0.95      0.92      0.93       116
weighted avg       0.94      0.93      0.93       116



  _warn_prf(average, modifier, msg_start, len(result))


Fold scores: {'accuracy': 0.9051724137931034, 'f1': 0.9040621142968557, 'precision': 0.9095202398800599, 'recall': 0.9051724137931034}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        12
           2       1.00      0.90      0.95        10
           3       0.73      0.73      0.73        11
           4       1.00      1.00      1.00        11
           5       1.00      0.90      0.95        10
           6       1.00      1.00      1.00        10
           7       1.00      0.92      0.96        12
          12       1.00      0.86      0.92         7
          13       1.00      1.00      1.00         5
          16       0.80      0.89      0.84         9
          17       0.74      0.94      0.83        18
          24       0.00      0.00      0.00         1

    accuracy                           0.91       116
   macro avg       0.86      0.84      0.84       116
weighted avg       0.91      0.91      0.90       116



Fold scores: {'accuracy': 0.9652173913043478, 'f1': 0.9642257818459191, 'precision': 0.9669565217391305, 'recall': 0.9652173913043478}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        10
           3       0.82      0.82      0.82        11
           4       1.00      1.00      1.00        11
           5       1.00      1.00      1.00        10
           6       1.00      1.00      1.00        10
           7       1.00      1.00      1.00        12
          12       1.00      0.71      0.83         7
          13       1.00      1.00      1.00         5
          16       1.00      1.00      1.00         9
          17       0.90      1.00      0.95        18

    accuracy                           0.97       115
   macro avg       0.97      0.96      0.96       115
weighted avg       0.97      0.97      0.96       115





Fold scores: {'accuracy': 0.9043478260869565, 'f1': 0.9048758282784749, 'precision': 0.9241895499618612, 'recall': 0.9043478260869565}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      0.80      0.89        10
           3       0.69      1.00      0.81        11
           4       1.00      0.64      0.78        11
           5       0.89      0.89      0.89         9
           6       0.91      0.91      0.91        11
           7       1.00      0.92      0.96        12
          12       1.00      0.86      0.92         7
          13       0.67      1.00      0.80         4
          16       0.90      0.90      0.90        10
          17       0.95      1.00      0.97        18

    accuracy                           0.90       115
   macro avg       0.91      0.90      0.89       115
weighted avg       0.92      0.90      0.90       115

Average validation scores: {'accuracy': 0.9369040479



Test set scores: {'accuracy': 0.9517102615694165, 'f1': 0.9514726733966044, 'precision': 0.9538858017488421, 'recall': 0.9517102615694165}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        37
           2       0.98      0.98      0.98        50
           3       0.90      0.88      0.89        51
           4       0.98      0.98      0.98        57
           5       1.00      0.94      0.97        53
           6       1.00      1.00      1.00        30
           7       1.00      0.96      0.98        55
          12       0.85      0.81      0.83        21
          13       1.00      0.79      0.88        24
          16       0.91      0.95      0.93        42
          17       0.89      1.00      0.94        77

    accuracy                           0.95       497
   macro avg       0.96      0.94      0.94       497
weighted avg       0.95      0.95      0.95       497

   subject_id    model  val_accuracy    val_f1  

In [None]:
# Train and evaluate the subject-specific model for the PAMAP2 recording
# "subject107.dat": pre-process the file, extract features, run the evaluator,
# and save the reported results to a per-subject CSV.
# NOTE: this cell relies on `PreProcessor`, `FeatureExtraction1` and
# `evaluator` having been created by earlier cells (it does not define them).
preprocessor = PreProcessor()
subject_data = {}

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject107.dat'

# Load the recording, then run the cleaning and pre-processing steps on it.
preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# The subject id is the last digit of the file stem ("subject107" -> 7).
# It is taken from the stem rather than from the fixed offset file_path[-5],
# which would silently pick the wrong character if the extension length changed.
subject_stem = file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
subject_id = int(subject_stem[-1])
subject_df = preprocessor.getSubjectDf(subject_id)

# Extract features and labels for this subject.
# NOTE(review): window_size=150 / fs=100 presumably mean 150 samples at 100 Hz
# with non-overlapping windows -- confirm against FeatureExtraction1.
feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

# The dict holds only this one subject, so the "all subjects" call below
# evaluates a single subject-specific classifier.
subject_data[subject_id] = (features, labels)

evaluator.evaluate_all_subjects(subject_data)

# NOTE(review): if `evaluator` accumulates results across cells, this frame
# (and the CSV below) may also contain earlier subjects -- confirm.
results_df = evaluator.report_results()

# Persist the results under a file name prefixed with the subject id.
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

DataFrame still contains some NAN values
Evaluating model: RForest for subject 7


  _warn_prf(average, modifier, msg_start, len(result))


Fold scores: {'accuracy': 0.9166666666666666, 'f1': 0.912323426519333, 'precision': 0.9132069999717058, 'recall': 0.9166666666666666}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        13
           2       0.80      0.80      0.80         5
           3       0.86      1.00      0.92        12
           4       0.94      1.00      0.97        16
           5       0.00      0.00      0.00         1
           6       1.00      1.00      1.00        11
           7       1.00      0.85      0.92        13
          12       0.90      1.00      0.95         9
          13       1.00      0.80      0.89         5
          16       0.82      0.90      0.86        10
          17       0.85      0.85      0.85        13

    accuracy                           0.92       108
   macro avg       0.83      0.83      0.83       108
weighted avg       0.91      0.92      0.91       108

Fold scores: {'accuracy': 0.9629629629629629, 'f1': 0

  _warn_prf(average, modifier, msg_start, len(result))


Fold scores: {'accuracy': 0.9065420560747663, 'f1': 0.904523091045811, 'precision': 0.9158878504672897, 'recall': 0.9065420560747663}
              precision    recall  f1-score   support

           1       1.00      0.92      0.96        13
           2       1.00      0.83      0.91         6
           3       0.67      1.00      0.80        12
           4       1.00      0.93      0.97        15
           5       0.00      0.00      0.00         1
           6       1.00      0.82      0.90        11
           7       1.00      1.00      1.00        12
          12       1.00      0.88      0.93         8
          13       0.75      0.60      0.67         5
          16       0.90      0.90      0.90        10
          17       0.88      1.00      0.93        14

    accuracy                           0.91       107
   macro avg       0.84      0.81      0.82       107
weighted avg       0.92      0.91      0.90       107

Fold scores: {'accuracy': 0.9345794392523364, 'f1': 0

In [None]:
# Train and evaluate the subject-specific model for the PAMAP2 recording
# "subject108.dat": pre-process the file, extract features, run the evaluator,
# and save the reported results to a per-subject CSV.
# NOTE: this cell relies on `PreProcessor`, `FeatureExtraction1` and
# `evaluator` having been created by earlier cells (it does not define them).
preprocessor = PreProcessor()
subject_data = {}

file_path = '/content/drive/MyDrive/PAMAP2_Dataset/Protocol/subject108.dat'

# Load the recording, then run the cleaning and pre-processing steps on it.
preprocessor.initializeDataFrame(file_path)
preprocessor.dataCleaning()
preprocessor.applyPreProcessing()

# The subject id is the last digit of the file stem ("subject108" -> 8).
# It is taken from the stem rather than from the fixed offset file_path[-5],
# which would silently pick the wrong character if the extension length changed.
subject_stem = file_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]
subject_id = int(subject_stem[-1])
subject_df = preprocessor.getSubjectDf(subject_id)

# Extract features and labels for this subject.
# NOTE(review): window_size=150 / fs=100 presumably mean 150 samples at 100 Hz
# with non-overlapping windows -- confirm against FeatureExtraction1.
feature_extractor = FeatureExtraction1(subject_df, subject_id)
features, labels = feature_extractor.applyFeatureExtraction(window_size=150, overlap=0, fs=100)

# The dict holds only this one subject, so the "all subjects" call below
# evaluates a single subject-specific classifier.
subject_data[subject_id] = (features, labels)

evaluator.evaluate_all_subjects(subject_data)

# NOTE(review): if `evaluator` accumulates results across cells, this frame
# (and the CSV below) may also contain earlier subjects -- confirm.
results_df = evaluator.report_results()

# Persist the results under a file name prefixed with the subject id.
results_df.to_csv(f"{subject_id}_results_RF.csv", index=False)

Evaluating model: RForest for subject 8
Fold scores: {'accuracy': 0.9590163934426229, 'f1': 0.958693225218873, 'precision': 0.9602738450074516, 'recall': 0.9590163934426229}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        13
           2       0.91      1.00      0.95        10
           3       1.00      1.00      1.00        12
           4       0.94      1.00      0.97        15
           5       1.00      1.00      1.00         9
           6       1.00      0.91      0.95        11
           7       1.00      1.00      1.00        13
          12       1.00      0.80      0.89         5
          13       0.75      0.75      0.75         4
          16       0.91      0.91      0.91        11
          17       0.93      0.93      0.93        15
          24       1.00      1.00      1.00         4

    accuracy                           0.96       122
   macro avg       0.95      0.94      0.95       122
weighted avg  