<a href="https://colab.research.google.com/github/Paevjc/Human-Activity-Recognition/blob/main/MHealth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DSA Internal Project: Group 4
### Notebook Author: Jeriel, Elly, Shaniah, Harshni, Ziyan
### Dataset: MHealth Dataset
### Source: Kaggle https://www.kaggle.com/datasets/nirmalsankalana/mhealth-dataset-data-set-csv/data


Problem Statement: Individuals in healthcare settings, such as physiotherapists or elderly care providers, would benefit from an automated system that accurately classifies human activities using wearable sensor data. The system should provide real-time insights into activities, aiding in the monitoring of patient mobility and overall health.
The goal is to create a machine learning pipeline that can process accelerometer and gyroscope data and classify activities accurately, enabling healthcare providers to make informed decisions about patient care.


# Data Summary (In words)


#### Activities: 12
L1: Standing still (1 min)\
L2: Sitting and relaxing (1 min)\
L3: Lying down (1 min)\
L4: Walking (1 min)\
L5: Climbing stairs (1 min)\
L6: Waist bends forward (20x)\
L7: Frontal elevation of arms (20x)\
L8: Knees bending (crouching) (20x)\
L9: Cycling (1 min)\
L10: Jogging (1 min)\
L11: Running (1 min)\
L12: Jump front & back (20x)

#### Sensor Devices: 2
alx: acceleration from the left-ankle sensor (X axis)\
aly: acceleration from the left-ankle sensor (Y axis)\
alz: acceleration from the left-ankle sensor (Z axis)\
glx: gyro from the left-ankle sensor (X axis)\
gly: gyro from the left-ankle sensor (Y axis)\
glz: gyro from the left-ankle sensor (Z axis)\
arx: acceleration from the right-lower-arm sensor (X axis)\
ary: acceleration from the right-lower-arm sensor (Y axis)\
arz: acceleration from the right-lower-arm sensor (Z axis)\
grx: gyro from the right-lower-arm sensor (X axis)\
gry: gyro from the right-lower-arm sensor (Y axis)\
grz: gyro from the right-lower-arm sensor (Z axis)\
subject: volunteer number\
Activity: corresponding activity

#### Subjects: 10


# Upload Necessary Files

*   Mhealth zip
*   requirement.txt
*   Mhealth.db
*   template.env
*   Set GPU: T4



In [None]:
# Import necessary libraries
import os
import random
import json
import uuid
import hashlib
import zipfile
import sqlite3
import joblib
from datetime import datetime
from typing import List, Sequence, TypedDict
from typing_extensions import Annotated
from PIL import Image  # Image is imported from PIL

# Data Processing and Analysis
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Machine Learning Libraries (Deep Learning)
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, LSTM, SeparableConv1D, MaxPooling1D, Input
from tensorflow.keras.optimizers import Adam
import keras
from keras.models import Sequential
from keras.layers import Dense

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Table
from prettytable import PrettyTable

# Others
from tqdm import tqdm
import cv2

# 2. Loading of Dataset

## From Zip File

In [None]:

# Define file paths
zip_file_path = 'Mhealth.zip'
extraction_path = 'Mhealth/'

# Extract files from the ZIP archive
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(extraction_path)

# Read every CSV found under the extraction folder into its own DataFrame
dfs = []
for root, _dirs, files in os.walk(extraction_path):
    for file in files:
        if file.endswith('.csv'):
            file_path = os.path.join(root, file)
            print(f"Reading {file_path}")
            dfs.append(pd.read_csv(file_path))

# Combine all DataFrames into a single DataFrame with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame (first few rows)
print(df.head())

# Persist the combined data. This previously referenced `combined_df`, which is
# never defined in this cell; the combined frame is `df`.
df.to_csv('mhealth_data.csv', index=False)

# 3. SQL

In [None]:
# Open a connection to the SQLite database file "mhealth.db"
conn = sqlite3.connect("mhealth.db")

# Write the combined DataFrame to the "mobile_health" table, replacing any
# existing table of that name; the DataFrame index is not stored as a column
df.to_sql("mobile_health", conn, if_exists="replace", index=False)

In [None]:
# Read the whole "mobile_health" table back from SQLite into `df`
# (this rebinds `df` to the query result)
mhealth_data = """
SELECT *
FROM mobile_health;
"""

df = pd.read_sql_query(mhealth_data, conn)
display(df)
df.info()

In [None]:
# List the distinct values stored in the Activity column
query_act = """
SELECT DISTINCT Activity
FROM mobile_health;
"""

df_act = pd.read_sql_query(query_act, conn)
print("Distinct Activity Values with Data Types:")
print(df_act.dtypes)  # Displays data types
print(df_act.to_string(index=False))  # One distinct value per line, without the index

### 3.1 Querying the *database*

In [None]:
# Activity Duration Analysis
#
# Purpose:
# 1. Estimate the time (in seconds) each subject spends on each activity as
#    row count * 0.02, i.e. assuming one row per 50 Hz sample.
# 2. Count the total samples for each subject/activity pair.

duration_query = """
SELECT
    subject,
    Activity,
    COUNT(*) * 0.02 as duration_seconds, -- 50Hz sampling rate
    COUNT(*) as sample_count
FROM mobile_health
GROUP BY subject, Activity
ORDER BY subject, duration_seconds DESC;
"""

duration_df = pd.read_sql_query(duration_query, conn)
display(duration_df.head())  # Display the first few rows

From the result: Identifies dominant activities for each subject, helping understand behavioral patterns.
Useful in activity recognition or health monitoring systems to track activity balance, e.g., time spent sitting vs. walking.

In [None]:
# Sensor Pattern Analysis
#
# Purpose:
# 1. Compute the mean and standard deviation of the left-ankle accelerometer
#    readings (alx, aly, alz) for each activity.
# 2. Summarise sensor data patterns for different activities.
#
# The standard deviation is the population form, computed as
# sqrt(E[x^2] - E[x]^2).
# NOTE(review): SQRT requires an SQLite build with the math functions
# enabled - confirm on the target environment.

sensor_pattern_query = """
SELECT
    Activity,
    AVG(alx) as avg_left_accel_x,
    AVG(aly) as avg_left_accel_y,
    AVG(alz) as avg_left_accel_z,
    SQRT(AVG(alx * alx) - AVG(alx) * AVG(alx)) as std_left_accel_x,
    SQRT(AVG(aly * aly) - AVG(aly) * AVG(aly)) as std_left_accel_y,
    SQRT(AVG(alz * alz) - AVG(alz) * AVG(alz)) as std_left_accel_z
FROM mobile_health
GROUP BY Activity;
"""

sensor_pattern_df = pd.read_sql_query(sensor_pattern_query, conn)
display(sensor_pattern_df.head())  # Only the first few activities are shown

With result: \
Machine Learning Models: These statistics can be used as features in models to classify activities.
Activity Characterization: Helps differentiate activities based on unique motion patterns (e.g., walking vs. running).

In [None]:
# Peak Acceleration Analysis
#
# Purpose:
# Identify, for each activity, the 5 rows with the largest left-ankle
# acceleration magnitude sqrt(alx^2 + aly^2 + alz^2).

peak_accel_query = """
WITH ranked_accel AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY Activity ORDER BY SQRT(alx*alx + aly*aly + alz*alz) DESC) as rn
    FROM mobile_health
)
SELECT * FROM ranked_accel
WHERE rn <= 5
ORDER BY Activity, rn;
"""

peak_accel_df = pd.read_sql_query(peak_accel_query, conn)
# head() shows only the first rows of the result, which is ordered by Activity,
# so only the lowest activity label's peaks are displayed here
display(peak_accel_df.head())

Anomaly Detection: High accelerations might indicate sudden movements (e.g., falls, jumps).\
Feature Engineering: Peak values can act as distinguishing features in activity recognition systems.\
Biomechanics Analysis: Useful in sports or rehabilitation settings to study motion extremes.

Behavioral Insights: Reveals common transitions (e.g., sitting to walking), providing a better understanding of activity sequences.\
Healthcare Applications: Useful for monitoring recovery progress, e.g., increasing transitions from resting to walking post-surgery.\
Workflow Optimization: Can be applied in workplace studies to improve ergonomic efficiency by reducing unnecessary transitions.

In [None]:
# Activity Transitions
#
# Purpose:
# Count how often a row's Activity differs from the Activity of the previous
# row of the same subject (rows ordered by ROWID), grouped by the
# (previous, current) activity pair.

activity_transitions_query = """
WITH activity_changes AS (
    SELECT
        *,
        LAG(Activity) OVER (PARTITION BY subject ORDER BY ROWID) as prev_activity
    FROM mobile_health
)
SELECT
    prev_activity,
    Activity as current_activity,
    COUNT(*) as transition_count
FROM activity_changes
WHERE prev_activity != Activity
GROUP BY prev_activity, Activity
ORDER BY transition_count DESC;
"""

activity_transitions_df = pd.read_sql_query(activity_transitions_query, conn)
display(activity_transitions_df.head())  # Most frequent transitions first

# 4. Exploratory Data Analysis

### 4.1 Cleaning of dataset

In [None]:
# Summary of the DataFrame: rendered table, then column dtypes and non-null counts
display(df)
df.info()

In [None]:
# Check for null values: prints a {column: null count} mapping
print(df.isnull().sum().to_dict()) # no null values

In [None]:
# Check the number of observations per subject to see if resampling is necessary
df.subject.value_counts()

In [None]:
# Distinct activity labels present in the data
df['Activity'].unique()

In [None]:
# Check if the data is imbalanced: number of rows per activity label
df['Activity'].value_counts()

In [None]:
# Bar plot visualising the imbalanced activity distribution;
# bars are ordered from the most to the least frequent activity
sns.set_style("whitegrid")
sns.countplot(x='Activity', data = df, order = df['Activity'].value_counts().index)
plt.title('Number of samples by Activity')
plt.xlabel('Activity')
plt.ylabel('Count of Samples')

plt.show()

As seen in the bar plot, there is a significant imbalance: the majority of the samples have the class label 'Activity 0', while 'Activity 12' has the least representation in the dataset.

In [None]:
# Resampling (downsampling): reduce Activity 0 to at most 30720 observations
NULL_CLASS_TARGET = 30720

df_activity_0 = df[df['Activity'] == 0]
df_activity_else = df[df['Activity'] != 0]

# Cap the sample size at the rows actually available so the cell does not fail
# with a ValueError when fewer than NULL_CLASS_TARGET Activity-0 rows exist
df_activity_0 = df_activity_0.sample(
    n=min(NULL_CLASS_TARGET, len(df_activity_0)),
    random_state=1,
)

# Rows keep their original index labels; the index is no longer contiguous
df = pd.concat([df_activity_0, df_activity_else])

In [None]:
# Rows per activity label after downsampling Activity 0
df['Activity'].value_counts()

In [None]:
# Check for null values again after resampling: {column: null count}
print(df.isnull().sum().to_dict())

In [None]:
# Drop fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# Lookup used by the visualisations of how the x, y, z signal values vary with time

# Map numeric activity labels to human-readable names
activity_map = {
    0: 'Doing nothing',
    1: 'Standing still (1 min)',
    2: 'Sitting and relaxing (1 min)',
    3: 'Lying down (1 min)',
    4: 'Walking (1 min)',
    5: 'Climbing stairs (1 min)',
    6: 'Waist bends forward (20x)',
    7: 'Frontal elevation of arms (20x)',
    8: 'Knees bending (crouching) (20x)',
    9: 'Cycling (1 min)',
    10: 'Jogging (1 min)',
    11: 'Running (1 min)',
    12: 'Jump front & back (20x)'
}


In [None]:
# Sensor groups: display name -> the three axis columns (x, y, z) of that sensor
sensors = {
    'Left Ankle - Accelerometer': ['alx', 'aly', 'alz'],
    'Left Ankle - Gyroscope': ['glx', 'gly', 'glz'],
    'Right Lower Arm - Accelerometer': ['arx', 'ary', 'arz'],
    'Right Lower Arm - Gyroscope': ['grx', 'gry', 'grz']
}

In [None]:
# Pretty-print both lookup tables for reference
print("Activity Map:")
pprint(activity_map)

print("\nSensors:")
pprint(sensors)

### 4.2 Visualisations

In [None]:
# Bar plot visualising the activity distribution after downsampling;
# bars are ordered from the most to the least frequent activity
sns.set_style("whitegrid")
sns.countplot(x='Activity', data = df, order = df['Activity'].value_counts().index)
plt.title('Number of Readings by Activity')
plt.xlabel('Activity')
plt.ylabel('Count of Readings')

plt.show()

In [None]:
# Grouped bar plot: number of readings per subject, one bar per activity
plt.figure(figsize = (30,10))
sns.countplot(x = 'subject',  hue = 'Activity', data = df)
plt.title('Activity Readings by Subjects')
plt.xlabel('Activities by Subject')
plt.ylabel('Count of Readings')
plt.tight_layout()
plt.show()

In [None]:
# Quick sanity check of the cleaned data
print(df.head())  # Check the data
print(f"Unique Activities: {df['Activity'].unique()}")  # Verify activity labels


In [None]:
# Filter data where Activity = 0
activity_0_df = df[df['Activity'] == 0]

# Count the non-null readings of each sensor column per subject
sensor_counts = activity_0_df.groupby('subject')[['alx', 'aly', 'alz', 'glx', 'gly', 'glz',
                                                  'arx', 'ary', 'arz', 'grx', 'gry', 'grz']].count()

# Print the full table without truncation
print("Sensor Reading Counts for Activity 0 per Subject:\n")
print(sensor_counts.to_string())


In [None]:
# Preview the data and the activity labels as rendered tables
print("Data Preview:")
from IPython.display import display  # Enables rich output for tables in Colab
display(df.head())  # This will show the data in a table format

# Display unique activities in table format
unique_activities = pd.DataFrame(df['Activity'].unique(), columns=['Unique Activities'])
print("Unique Activities:")
display(unique_activities)


Plotting

In [None]:
def plot_comparison_part1(df):
    """Plot the first half of the activities (labels 0 to 5).

    For every activity label in 0..5 that has rows in ``df``, a 2x2 Plotly
    figure is shown with one panel per sensor group in the module-level
    ``sensors`` mapping and one line per axis column. The x axis is the row
    position within the filtered activity data.

    Args:
        df: DataFrame with an 'Activity' column and the sensor columns
            listed in ``sensors``.
    """
    for i in range(0, 6):  # Loop through activities 0 to 5
        # Filter once and renumber rows 0..n-1 so the index can serve as the x axis
        activity_data = df[df['Activity'] == i].reset_index(drop=True)

        # Nothing to draw for activities that are absent from the data
        if activity_data.empty:
            print(f"No data available for Activity {i}. Skipping.")
            continue

        activity_label = activity_map.get(i, f"Activity {i}")

        # One panel per sensor group
        com_fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=[
                f'{activity_label} - {sensor_name}' for sensor_name in sensors
            ],
            shared_yaxes=True
        )

        for idx, (sensor_name, columns) in enumerate(sensors.items()):
            # Fill the 2x2 grid row by row (Plotly rows/cols are 1-based)
            row = (idx // 2) + 1
            col = (idx % 2) + 1

            for column in columns:
                com_fig.add_trace(go.Scatter(
                    x=activity_data.index,
                    y=activity_data[column],
                    mode='lines',
                    name=column,
                    line=dict(width=1.5)
                ), row=row, col=col)

        com_fig.update_layout(
            title=f'Activity: {activity_label}',
            title_x=0.5,
            height=800,
            showlegend=True
        )
        # Layout-level xaxis_title/yaxis_title only label the first subplot;
        # update_xaxes/update_yaxes label every panel (y only on the left
        # column because the y axes are shared across each row)
        com_fig.update_xaxes(title_text='Time Index')
        com_fig.update_yaxes(title_text='Sensor Value', col=1)

        # Show the plot
        com_fig.show()

plot_comparison_part1(df)


In [None]:
def plot_comparison_part2(df):
    """Plot the second half of the activities (labels 6 to 12).

    For every activity label in 6..12 that has rows in ``df``, a 2x2 Plotly
    figure is shown with one panel per sensor group in the module-level
    ``sensors`` mapping and one line per axis column. The x axis is the row
    position within the filtered activity data.

    Args:
        df: DataFrame with an 'Activity' column and the sensor columns
            listed in ``sensors``.
    """
    for i in range(6, 13):  # Loop through activities 6 to 12
        # Filter once and renumber rows 0..n-1 so the index can serve as the x axis
        activity_data = df[df['Activity'] == i].reset_index(drop=True)

        # Nothing to draw for activities that are absent from the data
        if activity_data.empty:
            print(f"No data available for Activity {i}. Skipping.")
            continue

        activity_label = activity_map.get(i, f"Activity {i}")

        # One panel per sensor group
        com_fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=[
                f'{activity_label} - {sensor_name}' for sensor_name in sensors
            ],
            shared_yaxes=True
        )

        for idx, (sensor_name, columns) in enumerate(sensors.items()):
            # Fill the 2x2 grid row by row (Plotly rows/cols are 1-based)
            row = (idx // 2) + 1
            col = (idx % 2) + 1

            for column in columns:
                com_fig.add_trace(go.Scatter(
                    x=activity_data.index,
                    y=activity_data[column],
                    mode='lines',
                    name=column,
                    line=dict(width=1.5)
                ), row=row, col=col)

        com_fig.update_layout(
            title=f'Activity: {activity_label}',
            title_x=0.5,
            height=800,
            showlegend=True
        )
        # Layout-level xaxis_title/yaxis_title only label the first subplot;
        # update_xaxes/update_yaxes label every panel (y only on the left
        # column because the y axes are shared across each row)
        com_fig.update_xaxes(title_text='Time Index')
        com_fig.update_yaxes(title_text='Sensor Value', col=1)

        # Show the plot
        com_fig.show()

plot_comparison_part2(df)


In [None]:
# plot_category: horizontal bar plot of the percentage distribution of a
# categorical variable (e.g. 'Activity'). It gives a quick overview of how the
# categories are distributed and highlights whether some categories dominate
# the dataset, which matters for model building and balancing.

def plot_category(df, cat):
    """Show a horizontal bar chart of the percentage share of each category.

    Args:
        df: DataFrame containing the column ``cat``.
        cat: Name of the categorical column to summarise.
    """
    # Share of rows per category, as a percentage of all rows, largest first
    array = (df[cat].value_counts().sort_values(ascending=False) / len(df)) * 100

    # Bar annotations such as '12.3%'
    text_values = [f'{x:.1f}%' for x in array.values]

    # Cast category labels to strings: numeric labels (e.g. integer activity
    # ids) would otherwise be drawn on a numeric axis and coloured on a
    # continuous scale, so the discrete palette below would be ignored
    categories = array.index.astype(str)

    cat_fig = px.bar(
        x=array.values,  # Percentage values
        y=categories,    # Category names
        labels={'x': 'Percentage', 'y': cat},  # Axis labels
        title=f'{cat} Distribution',  # Plot title
        text=text_values,  # Add percentage text on bars
        orientation='h',  # Horizontal bar chart
        color=categories,  # Color bars by category
        color_discrete_sequence=px.colors.qualitative.Set3  # Set color palette
    )

    # Customize layout
    cat_fig.update_layout(
        title_x=0.5,  # Center the title
        height=500,  # Set figure height
        showlegend=False,  # Hide the legend
        xaxis_title='Percentage',  # Label for x-axis
        yaxis_title=cat,  # Label for y-axis
        yaxis={'categoryorder': 'total ascending'},  # Largest share at the top
    )

    # Display the chart
    cat_fig.show()

plot_category(df, 'Activity')

In [None]:
# Correlation matrix of the sensor features only (Activity and subject are
# excluded by name rather than by column position)
corr_matrix = df.drop(columns=['Activity', 'subject']).corr()

# px.imshow accepts the square correlation DataFrame directly. Its `labels`
# mapping is keyed by 'x', 'y' and 'color'; other keys are ignored.
# zmin/zmax pin the diverging colour scale so that 0 correlation is neutral.
cm_fig = px.imshow(corr_matrix,
                color_continuous_scale='RdBu_r',
                zmin=-1, zmax=1,
                labels={'x': 'Features', 'y': 'Features', 'color': 'Correlation'},
                title="Correlation Heatmap")

cm_fig.update_layout(
    xaxis_title='Features',
    yaxis_title='Features',
    title_x=0.5,  # Center the title
)

cm_fig.show()

In [None]:
# PCA
# Used here to reduce the dimensionality of the sensor data (accelerometer and
# gyroscope measurements) to two components, making activity patterns easier
# to visualise while preserving as much variance as possible.

X = df.drop(columns=['Activity', 'subject'])  # Dropping Activity and subject columns for PCA

# Perform PCA (reduce to 2 components)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])

# Add the 'Activity' column back positionally. `pca_df` has a fresh 0..n-1
# index while `df` keeps its original (non-contiguous) labels after
# downsampling and de-duplication, so assigning the Series directly would
# align on index and attach wrong or missing labels.
pca_df['Activity'] = df['Activity'].to_numpy()

# Plotly Express scatter plot for PCA
pca_fig = px.scatter(pca_df, x='PC1', y='PC2', color='Activity',
                 title='PCA Scatter Plot',
                 labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'},
                 color_continuous_scale='viridis')

# Show the plot
pca_fig.show()

In [None]:
# Left-ankle accelerometer: one box plot per axis column ('alx', 'aly', 'alz'),
# grouped and coloured by activity
fig1, fig2, fig3 = (
    px.box(
        df, x='Activity', y=f'al{axis}', color='Activity',
        title=f"Left Ankle Accelerometer {axis.upper()}-axis Readings Across Activities",
        labels={'Activity': 'Activity', f'al{axis}': f'Acceleration {axis.upper()}-axis'},
    )
    for axis in 'xyz'
)

# Render the figures in x, y, z order
fig1.show()
fig2.show()
fig3.show()


In [None]:
# Left-ankle gyroscope: one box plot per axis column ('glx', 'gly', 'glz'),
# grouped and coloured by activity
fig4, fig5, fig6 = (
    px.box(
        df, x='Activity', y=f'gl{axis}', color='Activity',
        title=f"Left Ankle Gyroscope {axis.upper()}-axis Readings Across Activities",
        labels={'Activity': 'Activity', f'gl{axis}': f'Rotation {axis.upper()}-axis'},
    )
    for axis in 'xyz'
)

# Render the figures in x, y, z order
fig4.show()
fig5.show()
fig6.show()


In [None]:
# Right-lower-arm accelerometer: one box plot per axis column
# ('arx', 'ary', 'arz'), grouped and coloured by activity
fig7, fig8, fig9 = (
    px.box(
        df, x='Activity', y=f'ar{axis}', color='Activity',
        title=f"Right Lower Arm Accelerometer {axis.upper()}-axis Readings Across Activities",
        labels={'Activity': 'Activity', f'ar{axis}': f'Acceleration {axis.upper()}-axis'},
    )
    for axis in 'xyz'
)

# Render the figures in x, y, z order
fig7.show()
fig8.show()
fig9.show()



In [None]:
# Right-lower-arm gyroscope: one box plot per axis column
# ('grx', 'gry', 'grz'), grouped and coloured by activity
fig10, fig11, fig12 = (
    px.box(
        df, x='Activity', y=f'gr{axis}', color='Activity',
        title=f"Right Lower Arm Gyroscope {axis.upper()}-axis Readings Across Activities",
        labels={'Activity': 'Activity', f'gr{axis}': f'Rotation {axis.upper()}-axis'},
    )
    for axis in 'xyz'
)

# Render the figures in x, y, z order
fig10.show()
fig11.show()
fig12.show()


# 5. Data Preprocessing and Transformation

# Standardisation and Splitting of Dataset

In [None]:
# Features: every column except the target ('Activity') and the subject id,
# copied so later modifications do not touch `df`
X = df.drop(['Activity','subject'], axis=1).copy()
# Target: the numeric activity label
y = df['Activity']

In [None]:
# Train-test splitting: 80% train / 20% test with a fixed seed for reproducibility.
# NOTE(review): rows are split at random, so samples from the same subject and
# recording can land in both sets - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Segment features into numerical and categorical
num_features = ['alx', 'aly', 'alz', 'glx', 'gly', 'glz', 'arx', 'ary', 'arz', 'grx', 'gry', 'grz']
# cat_features = ['subject']

# Numerical transformer (Impute missing values and scale numerical features)
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Scale the features
])

# Categorical transformer (Impute missing values and apply one-hot encoding)
# cat_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent category
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode the categorical feature
# ])

# Column-wise preprocessing: only the numerical sensor columns are used
data_transformer = ColumnTransformer(
    transformers=[
        ('numerical', num_transformer, num_features)
        # ('categorical', cat_transformer, cat_features)
    ])

# Fit the preprocessing on the training split up front. The classification
# report cells call data_transformer.transform(X_test) directly; without this,
# the transformer is only fitted as a side effect of running a training cell
# and transform() fails when models are merely loaded from .pkl files.
data_transformer.fit(X_train)


In [None]:
# Label encoding for target variable
label_encoder = LabelEncoder()

# Apply label encoding separately to the target variable (as it's not part of the pipeline):
# the encoder is fitted on the training labels only and then reused for the test labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

### 5.1 Pipeline and Modelling

In [None]:
# Build a model pipeline: shared preprocessing followed by the given estimator
def create_pipeline(model):
    """Return a Pipeline that applies ``data_transformer`` and then ``model``.

    The module-level ``data_transformer`` instance itself (not a copy) is
    used as the first step; ``model`` becomes the step named 'classifier'.
    """
    steps = [
        ('data_transformer', data_transformer),  # preprocessing
        ('classifier', model),                   # final estimator
    ]
    return Pipeline(steps=steps)

In [None]:
# Function to plot confusion matrix
def plot_confusion_matrix(y_test_encoded, y_pred, model_name, activity_map):
    """Draw a labelled confusion-matrix heatmap for one model.

    Args:
        y_test_encoded: True class labels of the test set.
        y_pred: Predicted class labels, same length as ``y_test_encoded``.
        model_name: Name shown in the plot title.
        activity_map: Mapping of class label -> display name. Its keys fix
            the rows/columns of the matrix and its values the tick labels.
            Assumes the encoded labels are the same integers as the keys
            (true when every activity is present in the training labels) -
            TODO confirm.
    """
    class_ids = list(activity_map.keys())
    class_names = list(activity_map.values())

    # Pin the label set so the matrix always has one row/column per entry of
    # activity_map; otherwise a class missing from both y_test_encoded and
    # y_pred shrinks the matrix and the tick labels no longer line up.
    cm = confusion_matrix(y_test_encoded, y_pred, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted Activity')
    plt.ylabel('Actual Activity')
    plt.show()

# An unused Plotly Express variant of this function was previously kept here
# inside a bare string literal; it was dead code and was echoed as the cell's
# output, so it has been removed.

In [None]:
def train_and_evaluate_model(model, X_train, X_test, y_train_encoded, y_test_encoded, model_name, activity_map):
    """Fit ``model`` behind the preprocessing pipeline and report test metrics.

    Prints the accuracy and the classification report for the test split,
    then draws the confusion matrix. Nothing is returned.
    """
    # Preprocessing + estimator, trained on the training split
    fitted = create_pipeline(model)
    fitted.fit(X_train, y_train_encoded)
    predictions = fitted.predict(X_test)

    test_accuracy = accuracy_score(y_test_encoded, predictions)
    print(f"{model_name} Accuracy: {test_accuracy}")

    print(f"\n{model_name} Classification Report:")
    report_text = classification_report(y_test_encoded, predictions)
    print(report_text)

    plot_confusion_matrix(y_test_encoded, predictions, model_name, activity_map)
    plt.show()

In [None]:
# NOTE: this redefines train_and_evaluate_model and therefore replaces the
# definition from the previous cell; it additionally supports LinearRegression.

def train_and_evaluate_model(model, X_train, X_test, y_train_encoded, y_test_encoded, model_name, activity_map):
    """Fit ``model`` behind the preprocessing pipeline and report test metrics.

    Regressors (currently only ``LinearRegression``) are scored with mean
    squared error and R-squared; every other model is treated as a classifier
    and gets accuracy, a classification report and a confusion-matrix plot.

    Args:
        model: Unfitted scikit-learn estimator; it is fitted in place.
        X_train, X_test: Feature frames for the train and test splits.
        y_train_encoded, y_test_encoded: Encoded targets for the two splits.
        model_name: Label used in the printed output and the plot title.
        activity_map: Mapping of class label -> display name for the plot.

    Returns:
        The fitted Pipeline (preprocessing + model), so callers can predict
        on raw features or persist the complete pipeline. Callers that
        ignore the return value behave as before.
    """
    # Pipelining
    pipeline = create_pipeline(model)
    pipeline.fit(X_train, y_train_encoded)
    y_pred = pipeline.predict(X_test)

    # Check if the model is a regressor or classifier
    if isinstance(model, LinearRegression):  # Extend the check for other regressors if needed
        # Use regression metrics
        from sklearn.metrics import mean_squared_error, r2_score
        print(f"{model_name} Mean Squared Error: {mean_squared_error(y_test_encoded, y_pred)}")
        print(f"{model_name} R-squared: {r2_score(y_test_encoded, y_pred)}")
    else:
        # Use classification metrics
        print(f"{model_name} Accuracy: {accuracy_score(y_test_encoded, y_pred)}")
        print(f"\n{model_name} Classification Report:")
        print(classification_report(y_test_encoded, y_pred))
        # plot_confusion_matrix already calls plt.show(), so no second call is needed
        plot_confusion_matrix(y_test_encoded, y_pred, model_name, activity_map)

    return pipeline

In [None]:
def _qualify_param_grid(param_grid, step_names, default_step='classifier'):
    """Route bare estimator parameter names to a pipeline step.

    A grid search over a Pipeline addresses step parameters as
    ``<step>__<param>``. Keys that already start with ``<step>__`` for one of
    ``step_names`` are kept; every other key is prefixed with
    ``<default_step>__``. A list of grids is handled element-wise.
    """
    if isinstance(param_grid, dict):
        known_prefixes = tuple(f'{name}__' for name in step_names)
        return {
            (key if key.startswith(known_prefixes) else f'{default_step}__{key}'): values
            for key, values in param_grid.items()
        }
    return [_qualify_param_grid(grid, step_names, default_step) for grid in param_grid]


def tune_hyperparameters(model, param_grid, X_train, y_train, X_test, y_test, scoring='accuracy', cv=5):
    """Grid-search ``model`` inside the preprocessing pipeline.

    Args:
        model: Estimator to tune; it is wrapped by ``create_pipeline``.
        param_grid: Dict (or list of dicts) of candidate values. Keys may be
            plain estimator parameter names (e.g. ``'C'``) or already
            qualified pipeline names (e.g. ``'classifier__C'``).
        X_train, y_train: Data used for the cross-validated search.
        X_test, y_test: Hold-out data used for the final accuracy score.
        scoring: Metric used by the cross-validated search.
        cv: Number of cross-validation folds (or a CV splitter).

    Returns:
        One-row DataFrame with the best fitted pipeline, its parameters, the
        best cross-validation score and the hold-out accuracy. The hold-out
        score is always accuracy, independent of ``scoring``.
    """
    pipeline = create_pipeline(model)

    # The estimator being searched is the Pipeline, so plain keys such as 'C'
    # are rejected as invalid parameters; qualify them with the step name.
    step_names = [name for name, _ in pipeline.steps]
    qualified_grid = _qualify_param_grid(param_grid, step_names)

    grid_search = GridSearchCV(estimator=pipeline, param_grid=qualified_grid, scoring=scoring, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    cv_score = grid_search.best_score_
    test_score = accuracy_score(y_test, best_model.predict(X_test))

    results_df = pd.DataFrame({
        "Best Model": [best_model],
        "Best Parameters": [best_params],
        "CV Score": [cv_score],
        "Test Score": [test_score]
    })

    return results_df

Since we have cleaned our MHealth data (by dropping irrelevant columns), encoded our variables, performed the train-test split and standardised our data, we proceed to the classification models. We will be using Logistic Regression, K-Nearest Neighbours, Support Vector Machines and Random Forest. Afterwards, we will compare the accuracy scores of all the models.

# Models

In [None]:
# Load the previously saved Logistic Regression model.
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_lr = joblib.load('LogReg.pkl')
print("LogReg loaded successfully!")

In [None]:
# Load the previously saved KNN model.
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_knn = joblib.load('KNN.pkl')
print("KNN loaded successfully!")

In [None]:
# Unlike the other models in this section, the Random Forest is trained here
# rather than loaded from disk
model_rf = RandomForestClassifier(random_state=42)
train_and_evaluate_model(model_rf, X_train, X_test, y_train_encoded, y_test_encoded, "Random Forest", activity_map)

In [None]:
# Persist the Random Forest estimator object (the preprocessing step is not
# part of the saved object)
joblib.dump(model_rf, 'RF.pkl')
print("Model saved as 'RF.pkl'")

In [None]:
# Reload the Random Forest from disk and print its type as a sanity check.
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_rf = joblib.load('RF.pkl')
print(type(model_rf))

In [None]:
# Load the previously saved SVM model.
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_svm = joblib.load('SVM.pkl')
print("SVM loaded successfully!")

In [None]:
# Load the previously saved Decision Tree model (bound to `model_DT`).
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_DT = joblib.load('DT.pkl')
print("DT loaded successfully!")

In [None]:
# Load the previously saved Linear Regression model.
# NOTE: joblib.load unpickles the file; only load .pkl files from a trusted source.
model_linr = joblib.load('LinR.pkl')
print("LinR loaded successfully!")

# Classification Reports

Run the cells below if you want to view the model accuracy and classification reports.

In [None]:
# Logistic Regression: score the model on the preprocessed test split
X_test_transformed = data_transformer.transform(X_test)
y_pred = model_lr.predict(X_test_transformed)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=activity_map.values(),
)

print(f"Logistic Regression Accuracy: {accuracy:.10f}\n")
print("Logistic Regression Classification Report:")
print(report)

In [None]:
# KNN: score the model on the preprocessed test split
X_test_transformed = data_transformer.transform(X_test)
y_pred = model_knn.predict(X_test_transformed)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=activity_map.values(),
)

print(f"KNN Accuracy: {accuracy:.10f}\n")
print("KNN Classification Report:")
print(report)

In [None]:
# SVM: score the model on the preprocessed test split
X_test_transformed = data_transformer.transform(X_test)
y_pred = model_svm.predict(X_test_transformed)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=activity_map.values(),
)

print(f"SVM Accuracy: {accuracy:.10f}\n")
print("SVM Classification Report:")
print(report)

In [None]:
# Random Forest: score the model on the preprocessed test split
X_test_transformed = data_transformer.transform(X_test)
y_pred = model_rf.predict(X_test_transformed)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=activity_map.values(),
)

print(f"Random Forest Accuracy: {accuracy:.10f}\n")
print("Random Forest Classification Report:")
print(report)

In [None]:
# Decision Tree: score the model on the preprocessed test split
X_test_transformed = data_transformer.transform(X_test)
y_pred = model_DT.predict(X_test_transformed)
accuracy = accuracy_score(y_test_encoded, y_pred)
report = classification_report(
    y_test_encoded,
    y_pred,
    target_names=activity_map.values(),
)

print(f"Decision Tree Accuracy: {accuracy:.10f}\n")
print("Decision Tree Classification Report:")
print(report)

# Model Training (ONLY RUN IF NOT USING PKL)

In [None]:
# Train and evaluate Logistic Regression (max_iter raised to 1000)
model_lr = LogisticRegression(random_state=42, max_iter=1000)
train_and_evaluate_model(model_lr, X_train, X_test, y_train_encoded, y_test_encoded, "Logistic Regression", activity_map)

# Save the model using joblib. Only the estimator object is saved, not the
# preprocessing step, so inputs must be transformed before predicting with it.
joblib.dump(model_lr, 'LogReg.pkl')
print("Model saved as 'LogReg.pkl'")

In [None]:
# Train and evaluate K-Nearest Neighbours with default settings
model_knn = KNeighborsClassifier()
train_and_evaluate_model(model_knn, X_train, X_test, y_train_encoded, y_test_encoded, "KNN", activity_map)

# Save the model using joblib. Only the estimator object is saved, not the
# preprocessing step, so inputs must be transformed before predicting with it.
joblib.dump(model_knn, 'KNN.pkl')
print("Model saved as 'KNN.pkl'")

In [None]:
# Train and evaluate a Random Forest with a fixed seed
model_rf = RandomForestClassifier(random_state=42)
train_and_evaluate_model(model_rf, X_train, X_test, y_train_encoded, y_test_encoded, "Random Forest", activity_map)

# Save the model using joblib. Only the estimator object is saved, not the
# preprocessing step, so inputs must be transformed before predicting with it.
joblib.dump(model_rf, 'RF.pkl')
print("Model saved as 'RF.pkl'")

In [None]:
# RBF-kernel support vector classifier with gamma fixed at 0.1, passed to
# train_and_evaluate_model with the train/test features, the encoded labels,
# a display name and activity_map.
# NOTE(review): random_state is 1 here while the other model cells use 42;
# per the scikit-learn docs SVC only uses it for probability estimates, so
# it likely has no effect with these settings - confirm before relying on it.
# NOTE(review): train_and_evaluate_model is defined elsewhere; presumably it
# fits `model_svm` in place - confirm, since the object is pickled right after.
model_svm = SVC(kernel='rbf', random_state=1, gamma=0.1)
train_and_evaluate_model(model_svm, X_train, X_test, y_train_encoded, y_test_encoded, "SVM", activity_map)

# Persist the model object to 'SVM.pkl' (path relative to the working directory).
joblib.dump(model_svm, 'SVM.pkl')
print("Model saved as 'SVM.pkl'")

In [None]:
# Decision tree with a fixed seed, passed to train_and_evaluate_model with
# the train/test features, the encoded labels, a display name and
# activity_map.
# NOTE(review): train_and_evaluate_model is defined elsewhere; presumably it
# fits `model_dt` in place - confirm, since the object is pickled right after.
model_dt = DecisionTreeClassifier(random_state=42)
train_and_evaluate_model(model_dt, X_train, X_test, y_train_encoded, y_test_encoded, "Decision Tree", activity_map)

# Persist the model object to 'DT.pkl' (path relative to the working directory).
joblib.dump(model_dt, 'DT.pkl')
print("Model saved as 'DT.pkl'")

In [None]:
# Linear-regression model, passed to train_and_evaluate_model with the
# train/test features, the encoded labels, a display name and activity_map.
# NOTE(review): LinearRegression is a regressor and predicts continuous
# values; confirm train_and_evaluate_model (defined elsewhere) copes with
# that when scoring against the encoded class labels.
model_linr = LinearRegression()
train_and_evaluate_model(model_linr, X_train, X_test, y_train_encoded, y_test_encoded, "Linear Regression", activity_map)

# Persist the model object to 'LinR.pkl' (path relative to the working directory).
joblib.dump(model_linr, 'LinR.pkl')
# Report the same file name that joblib.dump wrote to.
print("Model saved as 'LinR.pkl'")

# Hyperparameter Tuning

In [None]:
# Search grid for logistic regression: 4 values of C x 2 solvers = 8 combinations.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],  # Inverse of regularization strength (smaller = stronger regularization)
    'solver': ['liblinear', 'lbfgs'],  # Optimization algorithm
}

# tune_hyperparameters is defined elsewhere in the notebook; its return value
# is printed as-is.
# NOTE(review): the test split is passed in as well - confirm it is only used
# for final scoring and not for choosing the parameters.
results_lr = tune_hyperparameters(model_lr, param_grid_lr, X_train, y_train_encoded, X_test, y_test_encoded)
print(f"Logistic Regression Results: {results_lr}")

In [None]:
# Search grid for the random forest: 3 forest sizes x 3 depth limits = 9 combinations.
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum depth of each tree
}

# tune_hyperparameters is defined elsewhere in the notebook; its return value
# is printed as-is.
# NOTE(review): the test split is passed in as well - confirm it is only used
# for final scoring and not for choosing the parameters.
results_rf = tune_hyperparameters(model_rf, param_grid_rf, X_train, y_train_encoded, X_test, y_test_encoded)
print(f"Random Forest Results: {results_rf}")

In [None]:
# Search grid for KNN: 4 neighbour counts x 2 weighting schemes = 8 combinations.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 10],  # Number of neighbors to use
    'weights': ['uniform', 'distance'],  # Weight function
}

# tune_hyperparameters is defined elsewhere in the notebook; its return value
# is printed as-is.
# NOTE(review): the test split is passed in as well - confirm it is only used
# for final scoring and not for choosing the parameters.
results_knn = tune_hyperparameters(model_knn, param_grid_knn, X_train, y_train_encoded, X_test, y_test_encoded)
print(f"KNN Results: {results_knn}")

In [None]:
# Search grid for the SVM: 4 values of C x 4 values of gamma x 2 kernels = 32 combinations.
# NOTE(review): per the scikit-learn SVC docs gamma applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so the 'linear' entries that differ only in
# gamma are likely redundant fits - consider splitting the grid per kernel
# if tune_hyperparameters accepts a list of grids.
param_grid_svm = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter (regularization strength is inversely proportional to C)
    'gamma': [0.01, 0.1, 1, 10],  # Kernel coefficient
    'kernel': ['rbf', 'linear'],  # Kernel type (RBF and Linear)
}

# tune_hyperparameters is defined elsewhere in the notebook; its return value
# is printed as-is.
# NOTE(review): the test split is passed in as well - confirm it is only used
# for final scoring and not for choosing the parameters.
results_svm = tune_hyperparameters(model_svm, param_grid_svm, X_train, y_train_encoded, X_test, y_test_encoded)
print(f"SVM Hyperparameter Tuning Results: {results_svm}")

In [None]:
# Search grid for the decision tree: 2 x 4 x 3 x 3 = 72 combinations.
param_grid_dt = {
    'criterion': ['gini', 'entropy'],  # Split criterion
    'max_depth': [5, 10, 15, None],   # Maximum tree depth (None = no limit)
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]     # Minimum samples at leaf node
}

# A fresh, unfitted tree is bound to `model_dt` here, replacing any earlier
# binding of that name, so this cell does not depend on the training cell.
model_dt = DecisionTreeClassifier(random_state=42)

# tune_hyperparameters is defined elsewhere in the notebook; its return value
# is printed as-is.
# NOTE(review): the test split is passed in as well - confirm it is only used
# for final scoring and not for choosing the parameters.
results_dt = tune_hyperparameters(model_dt, param_grid_dt, X_train, y_train_encoded, X_test, y_test_encoded)

print(f"Decision Tree Results: {results_dt}")

# Human Activity Recognition

In [None]:
# Draw a handful of random rows to try predictions on
def random_sample(df):
  """Return ten randomly ordered rows of the `mobile_health` table.

  The rows are read through the notebook-level `conn` connection and come
  back as a DataFrame. The `df` argument is accepted but not used.
  """
  # Shuffle the table in SQL and keep the first ten rows of that ordering.
  query = """
  SELECT *
  FROM mobile_health
  ORDER BY RANDOM()
  LIMIT 10;
  """
  return pd.read_sql_query(query, conn)

In [None]:
def table_test(df, model):
  """Predict on a random sample and print predicted vs. true activities.

  Draws rows with random_sample(df), drops the 'Activity' and 'subject'
  columns to form the model input, calls model.predict on what remains and
  prints a PrettyTable with one column per sampled row: the first table row
  holds the prediction, the second the row's 'Activity' value.

  Args:
    df: Forwarded unchanged to random_sample.
    model: Object exposing predict(); its class name is printed as a header.

  Returns:
    None. The header line and the table are printed.
  """
  # Get Random Sample
  testing = random_sample(df)

  # Features: every column except 'Activity' and 'subject'.
  # NOTE(review): the columns are handed to predict() as-is; confirm the
  # model does its own preprocessing if it was trained on transformed data.
  X_features = testing.drop(['Activity', 'subject'], axis=1)
  # Label
  y_label = testing['Activity']

  predict = model.predict(X_features)

  # Table layout:
  #   | Test   # | 1 | 2 | ...
  #   | Pred Act |
  #   | True Act |
  table = PrettyTable()
  print(f"Model : {type(model).__name__}")
  table.add_column("Test   #", ["Pred Act", "True Act"])

  # Pair predictions with labels by position, so the pairing does not depend
  # on the sample's index labels being 0..n-1.
  pairs = zip(predict, y_label.to_numpy())
  for test_no, (predicted, actual) in enumerate(pairs, start=1):
    table.add_column(str(test_no), [predicted, actual])

  print(table)

In [None]:
# Load the model stored in 'tuned_RF.pkl' (path relative to the working directory).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load files from a trusted source.
rf_model = joblib.load('tuned_RF.pkl')

# Check model: as the last expression of the cell, this shows the class of
# the loaded object.
type(rf_model)

In [None]:
# Print a predicted-vs-true activity table for the loaded model on a random
# sample of rows.
table_test(df, rf_model)