> # 🌈 Step 1: Data Processing

* * * * * * * ***   

<h4>✨ Read accelerometer and gyroscope data<br><br>
✨ Create the required dataframe columns: participant, label and category<br><br>
✨ Merge the accelerometer data and gyroscope data<br><br>
✨ Resample the merged data<br><br>
✨ Export the preprocessed data</h4>

In [None]:
import pandas as pd
from glob import glob
from pathlib import Path


# glob() returns matches in arbitrary, filesystem-dependent order. The "set"
# number of each recording is assigned from the position of its file in this
# list, so the list is sorted to keep set numbers reproducible between runs.
sensor_data_files = sorted(glob("/kaggle/input/sensor/*.csv"))

def merge_sensor_file_data(sensor_data_files):
    """Read sensor CSV files and merge accelerometer and gyroscope readings.

    Metadata is parsed from each file *name* (directory components are
    ignored), which is split on "-":
    ``<participant>-<label>-<category>[digits][-...|_MetaWear_...]``.

    Args:
        sensor_data_files: Iterable of CSV paths. A file is treated as
            accelerometer and/or gyroscope data when its name contains
            "Accelerometer" / "Gyroscope". Each CSV must contain the columns
            "epoch (ms)", "time (01:00)" and "elapsed (s)" followed by the
            three axis readings.

    Returns:
        A DataFrame indexed by the epoch timestamp with the columns
        acc_x, acc_y, acc_z, gyr_x, gyr_y, gyr_z, participant, label,
        category and set. The metadata columns come from the gyroscope files.

    Raises:
        ValueError: If no accelerometer or no gyroscope file was supplied.
    """
    accelerometer_frames = []
    gyroscope_frames = []

    set_accelerometer = 0
    set_gyroscope = 0

    for file in sensor_data_files:
        # Use the file name only, so a "-" in a directory name (or a different
        # input directory) cannot shift or pollute the parsed fields.
        file_name = Path(file).name
        name_parts = file_name.split("-")
        participant = name_parts[0]
        label = name_parts[1]
        # Remove the "_MetaWear_..." suffix as a substring and then the trailing
        # repetition digits. str.rstrip() with a multi-character argument
        # strips a *set of characters*, which would also eat legitimate
        # trailing letters of the category (e.g. "moderate" -> "mod").
        category = name_parts[2].split("_MetaWear_")[0].rstrip("0123456789")

        df = pd.read_csv(file)
        df["participant"] = participant
        df["label"] = label
        df["category"] = category

        # assign() returns a new frame, so the two sensor lists never share
        # (and mutate) the same object.
        if "Accelerometer" in file_name:
            set_accelerometer += 1
            accelerometer_frames.append(df.assign(set=set_accelerometer))

        if "Gyroscope" in file_name:
            set_gyroscope += 1
            gyroscope_frames.append(df.assign(set=set_gyroscope))

    if not accelerometer_frames or not gyroscope_frames:
        raise ValueError(
            "merge_sensor_file_data needs at least one Accelerometer and one "
            "Gyroscope CSV file"
        )

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_accelerometer = pd.concat(accelerometer_frames)
    df_gyroscope = pd.concat(gyroscope_frames)

    # Use epoch (unix time) as the time axis: it is not affected by
    # summer-time changes, unlike the local-time column.
    df_accelerometer.index = pd.to_datetime(df_accelerometer["epoch (ms)"], unit="ms")
    df_gyroscope.index = pd.to_datetime(df_gyroscope["epoch (ms)"], unit="ms")

    columns = ["epoch (ms)", "time (01:00)", "elapsed (s)"]
    df_accelerometer = df_accelerometer.drop(columns=columns)
    df_gyroscope = df_gyroscope.drop(columns=columns)

    # Only the three axis columns are taken from the accelerometer frame; the
    # metadata columns are supplied by the gyroscope frame.
    df_merged = pd.concat([df_accelerometer.iloc[:, 0:3], df_gyroscope], axis=1)

    df_merged.columns = [
        "acc_x",
        "acc_y",
        "acc_z",
        "gyr_x",
        "gyr_y",
        "gyr_z",
        "participant",
        "label",
        "category",
        "set",
    ]

    return df_merged


def Sample(df_merged):
    """Resample the merged sensor frame into 200 ms bins.

    Sensor axes are averaged per bin; the metadata columns keep the last
    value seen in the bin. Bins containing any missing value are dropped and
    the "set" column is cast back to int.

    Args:
        df_merged: DataFrame with a datetime index and the columns acc_x..gyr_z,
            participant, label, category and set.

    Returns:
        The resampled DataFrame.
    """
    sensor_axes = ["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]
    metadata = ["participant", "label", "category", "set"]

    aggregation = {axis: "mean" for axis in sensor_axes}
    aggregation.update({field: "last" for field in metadata})

    # Resample one calendar day at a time so the 200 ms grid is only built
    # within each day rather than across the whole recording period.
    resampled_days = []
    for _, day_frame in df_merged.groupby(pd.Grouper(freq='D')):
        resampled_days.append(
            day_frame.resample(rule='200ms').apply(aggregation).dropna()
        )

    data_resampled = pd.concat(resampled_days)
    data_resampled["set"] = data_resampled["set"].astype("int")
    return data_resampled


# Build the merged frame from the raw CSV files, then resample it.
df_merged = merge_sensor_file_data(sensor_data_files)
df_resampled = Sample(df_merged)

# Persist the processed data for the later steps and list the labels found.
export_dir = Path('/kaggle/working/interim1')
export_dir.mkdir(parents=True, exist_ok=True)
df_resampled.to_pickle("/kaggle/working/interim1/01_data_processed.pkl")
print(df_resampled["label"].unique())

* * * * ***   

> # 🌈 Step 2: Visualize

 <h4> ✨ Plot a particular sensor column (gyr_x) for each label (bench, ohp, squat, dead, row, rest) </h4>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import display
from pathlib import Path


# Global plot appearance for the visualisation cells.
mpl.style.use("seaborn-v0_8-deep")
mpl.rcParams.update({"figure.figsize": (15, 5), "figure.dpi": 100})


df_resampled = pd.read_pickle("/kaggle/input/interim1/01_data_processed.pkl")


# One figure per label, showing the first 50 resampled values of the column.
col = "gyr_x"
for label in df_resampled["label"].unique():
    subset = df_resampled.loc[df_resampled["label"] == label]
    fig, ax = plt.subplots()
    ax.plot(subset[col].iloc[:50].reset_index(drop=True), label=label, color="orange")
    ax.legend()
    # To export the figure, uncomment the two lines below (they must run
    # before plt.show(), otherwise an empty image is written).
    #Path('/kaggle/working/visualization-images/folder1').mkdir(parents=True, exist_ok=True)
    #plt.savefig(f"/kaggle/working/visualization-images/folder1/{label.title()} ({col}).png")
    plt.show()

* * * * * * * * * * ***   

<h4> ✨ Compare participants for a specific label (ohp) and a particular sensor column (acc_z) </h4>

In [None]:
# Compare participants

from pathlib import Path

compare_column = "participant"
col = "acc_z"
label = "ohp"

# Rows of the chosen label, ordered by participant and renumbered so that
# each participant occupies a contiguous range on the x axis.
subset = (
    df_resampled.query("label == @label")
    .sort_values(compare_column)
    .reset_index()
)

# One line per participant, drawn on the current axes.
subset.groupby([compare_column])[col].plot()
plt.xlabel("Samples")
plt.ylabel(f"{col} reading")
plt.legend()

#Path('/kaggle/working/visualization-images/Compare-participants').mkdir(parents=True, exist_ok=True)
#plt.savefig(f"/kaggle/working/visualization-images/Compare-participants/{label.title()} ({col}).png")

 ** * * * * * **   

<h4> ✨ Compare categories for a specific label (ohp), a particular sensor column (gyr_z) and participant (A) </h4> 

In [None]:
# Compare category - Heavy vs Medium

from pathlib import Path

participant1 = "A"
col = "gyr_z"
label = "ohp"
compare_column = "category"

# Rows for one participant and one label; only samples 300-599 are plotted.
selection = (
    df_resampled.sort_values(by=['participant'])
    .query("label == @label")
    .query("participant == @participant1")
    .reset_index()
)
subset = selection.iloc[300:600]

fig, ax = plt.subplots()
# One line per category, drawn on the axes created above.
subset.groupby([compare_column])[col].plot()
ax.set_xlabel("Samples")
ax.set_ylabel(f"{col} reading")
ax.set_title(f"Participant {participant1}")
ax.legend()

#Path('/kaggle/working/visualization-images/Compare_category(Heavy vs Medium)').mkdir(parents=True, exist_ok=True)
#plt.savefig(f"/kaggle/working/visualization-images/Compare_category(Heavy vs Medium)/{label.title()} ({col}).png")

* * * * * ***   

<h4> ✨ Compare gyroscope data (gyr_x, gyr_y, gyr_z) for a particular label (squat) and participant (A) </h4>

In [None]:

label = "squat"
participant = "A"
# First 50 samples of the chosen label/participant, renumbered from 0.
subset = df_resampled.query(f"label == '{label}'").query(f"participant == '{participant}'").reset_index()[:50]

# DataFrame.plot() creates its own figure and returns the Axes it drew on.
# Capture it so the labels go on this figure rather than on an `ax` left
# over from a previously executed cell.
ax = subset[["gyr_x", "gyr_y", "gyr_z"]].plot()
# The x axis is the sample number and the y axis the sensor value (the
# original labels were swapped).
ax.set_xlabel("samples")
ax.set_ylabel("values")
ax.set_title(f"{label} {participant}")
ax.legend()

* * * * * ***   

<h4>  ✨ Plot all sensor data for every label and participant </h4>

In [None]:
# Create a loop to plot all combinations per sensor

from pathlib import Path


labels = df_resampled["label"].unique()
participants = df_resampled["participant"].unique()

# (name used in the axis labels, columns to plot). All gyroscope figures are
# produced first, then all accelerometer figures.
sensor_groups = [
    ("gyroscope", ["gyr_x", "gyr_y", "gyr_z"]),
    ("accelerometer", ["acc_x", "acc_y", "acc_z"]),
]

for sensor_name, sensor_columns in sensor_groups:
    for label in labels:
        for participant in participants:
            subset = (
                df_resampled.query(f"label == '{label}'")
                .query(f"participant == '{participant}'")
                .reset_index()
            )

            # Not every participant performed every exercise.
            if subset.empty:
                continue

            fig, ax = plt.subplots()
            subset[sensor_columns].plot(ax=ax)
            ax.set_xlabel(f"{sensor_name} samples")
            ax.set_ylabel(f"{sensor_name} reading")
            plt.title(f"{label} {participant}".title())
            plt.legend()
            #Path(f'/kaggle/working/visualization-images/All_Labels_&_Participant/{sensor_name.title()}').mkdir(parents=True, exist_ok=True)
            #plt.savefig(f"/kaggle/working/visualization-images/All_Labels_&_Participant/{sensor_name.title()}/{label.title()} ({participant}).png")

* * * * * * * * * ***   

In [None]:
# Combine plots in one figure
label = "row"
participant = "D"
subset = (
    df_resampled.query(f"label == '{label}'")
    .query(f"participant == '{participant}'")
    .reset_index()
)

# Accelerometer on the top axes, gyroscope on the bottom; both share the x axis.
fig , ax = plt.subplots(nrows=2, sharex=True, figsize=(20,10))
subset[["acc_x", "acc_y", "acc_z"]].plot(ax=ax[0])
subset[["gyr_x", "gyr_y", "gyr_z"]].plot(ax=ax[1])
plt.legend()

# Axis titles and a horizontal legend placed just above each panel.
for axis, prefix in zip(ax, ("acc", "gyr")):
    axis.set_xlabel(f"{prefix} samples")
    axis.set_ylabel(f"{prefix} reading")
    axis.legend(loc="upper left",bbox_to_anchor=(0.1,1.15), ncol=3, fancybox=True, shadow=True)

* * * * * * * * ***   

<h4> ✨ Plot accelerometer and gyroscope data together for every label and participant </h4>

In [None]:
from pathlib import Path
      
# Loop over all combinations and export for both sensors
labels = df_resampled["label"].unique()
participants = df_resampled["participant"].unique()

for label in labels:
    for participant in participants:
        subset = (
            df_resampled.query(f"label == '{label}'")
            .query(f"participant == '{participant}'")
            .reset_index()
        )

        # Skip label/participant combinations without any recording.
        if subset.empty:
            continue

        # Top panel: accelerometer, bottom panel: gyroscope (shared x axis).
        fig , ax = plt.subplots(nrows=2, sharex=True, figsize=(20,10))
        subset[["acc_x", "acc_y", "acc_z"]].plot(ax=ax[0])
        subset[["gyr_x", "gyr_y", "gyr_z"]].plot(ax=ax[1])

        for axis in ax:
            axis.legend(loc="upper center",bbox_to_anchor=(0.1,1.15), ncol=3, fancybox=True, shadow=True)

        ax[0].set_ylabel("accelerometer readings")
        ax[0].set_title(f"Participant : {participant}\nExercise : {label}")
        ax[1].set_ylabel("gyroscope readings")
        ax[1].set_xlabel("samples")
        #Path('/kaggle/working/visualization-images/All_Labels_&_Participant/Accelerometer_Gyroscope').mkdir(parents=True, exist_ok=True)
        #plt.savefig(f"/kaggle/working/visualization-images/All_Labels_&_Participant/Accelerometer_Gyroscope/{label.title()} ({participant}).png")

# You can get the visualization images under (Datasets -> Visualization-images)

* * * * * * * * * ***   

> # 🌈 Step 3: Outlier detection 

<h3> ✨ Defining three different methods for outlier detection </h3>

<h4> 1.Interquartile range</h4>
<h4> 2.Chauvenet's criterion </h4>
<h4> 3.Local Outlier Factor </h4>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy
from sklearn.neighbors import LocalOutlierFactor  # pip install scikit-learn
from pathlib import Path

# Plot appearance used by the outlier-detection figures.
plt.style.use("fivethirtyeight")
plt.rcParams.update({"figure.figsize": (20, 5), "figure.dpi": 100})


# Method 1 : Interquartile range (distribution-based)
def iqr(df, column):
    """Flag outliers in one column with the interquartile-range rule.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile.

    Args:
        df: Input DataFrame (not modified).
        column: Name of the column to test.

    Returns:
        A copy of ``df`` with an extra boolean column ``<column>_outlier``.
    """
    flagged = df.copy()
    values = flagged[column]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)

    too_low = values < first_quartile - whisker
    too_high = values > third_quartile + whisker
    flagged[column + "_outlier"] = too_low | too_high

    return flagged


# Method 2  : Insert Chauvenet's function
def chauvenet(df, col, C=2):
    """Flag outliers in one column using Chauvenet's criterion.

    A normal distribution with the column's mean and standard deviation is
    assumed. A value is flagged when the probability of observing a deviation
    at least as large as its own is below ``1 / (C * N)``, where N is the
    number of rows.

    Args:
        df: Input DataFrame (not modified). Any index type is accepted.
        col: Name of the column to test.
        C: Constant used in the criterion ``1 / (C * N)`` and to scale the
            deviation by ``sqrt(C)``.

    Returns:
        A copy of ``df`` with an extra boolean column ``<col>_outlier``.
        An empty frame is returned with an empty outlier column.
    """
    # Imported here so the function does not depend on scipy.special having
    # been loaded as a side effect of another import.
    from scipy.special import erf

    df = df.copy()
    N = len(df.index)
    if N == 0:
        # Nothing to test; also avoids dividing by zero in the criterion.
        df[col + "_outlier"] = pd.Series(False, index=df.index, dtype=bool)
        return df

    mean = df[col].mean()
    std = df[col].std()
    criterion = 1.0 / (C * N)

    # Absolute deviation in units of standard deviation, taken as a plain
    # array: values are then addressed by position, so the result does not
    # depend on the frame's index (integer labels, timestamps, duplicates).
    deviation = ((df[col] - mean).abs() / std).to_numpy(dtype=float)

    # Bounds of the symmetric interval around the mean.
    high = deviation / math.sqrt(C)
    low = -high

    # Probability of observing a point outside [low, high]; NaN compares as
    # False, so missing values are never marked as outliers.
    prob = 1.0 - 0.5 * (erf(high) - erf(low))
    df[col + "_outlier"] = prob < criterion
    return df




# Method 3 : Local outlier factor (distance based)
def local_outlier_factor(df, columns, n=20):
    """Flag outliers with scikit-learn's LocalOutlierFactor.

    Args:
        df: Input DataFrame (not modified).
        columns: Columns used as features for the detector.
        n: Number of neighbours passed to ``LocalOutlierFactor``.

    Returns:
        A tuple ``(annotated, predictions, scores)``: a copy of ``df`` with a
        boolean ``outlier_lof`` column (True where the prediction is -1), the
        raw predictions from ``fit_predict`` and the detector's
        ``negative_outlier_factor_`` values.
    """
    annotated = df.copy()

    detector = LocalOutlierFactor(n_neighbors=n)
    predictions = detector.fit_predict(annotated[columns])

    annotated["outlier_lof"] = predictions == -1
    return annotated, predictions, detector.negative_outlier_factor_



# Plotting outliers
def visualize_outliers(df, col, outlier_col,method, save_plot):
    """Scatter-plot one column with its outliers highlighted in red.

    Args:
        df: DataFrame holding the values and the outlier flags; its index is
            used as the x axis.
        col: Name of the value column.
        outlier_col: Name of the column holding the outlier flags.
        method: Name of the detection method; used as the output sub-folder.
        save_plot: When truthy, the figure is written to
            ``/kaggle/working/Outlier_detection/<method>/<col>.png``.
    """
    # Rows without a value or without a flag cannot be classified.
    df = df.dropna(axis=0, subset=[col, outlier_col])
    df[outlier_col] = df[outlier_col].astype("bool")
    is_outlier = df[outlier_col]

    fig, ax = plt.subplots()
    ax.set_xlabel("samples")
    ax.set_ylabel(col + "  reading")

    # Regular points in the default colour, outliers in red.
    ax.plot(df.index[~is_outlier], df[col][~is_outlier], "+")
    ax.plot(df.index[is_outlier], df[col][is_outlier], "r+")

    ax.legend(
        ["no outlier " + col, "outlier " + col],
        loc="upper center",
        ncol=2,
        fancybox=True,
        shadow=True,
    )

    if save_plot:
        target_dir = Path(f'/kaggle/working/Outlier_detection/{method}')
        target_dir.mkdir(parents=True, exist_ok=True)
        fig.savefig(f"/kaggle/working/Outlier_detection/{method}/{col}.png")





* * * * * * ***   

<h4> ✨ Utilizing the interquartile range </h4>

In [None]:

# Apply every method to detect outliers
# Apply Method 1 : Interquartile range 



merged_df = pd.read_pickle("/kaggle/input/interim1/01_data_processed.pkl")
# The first six columns are the sensor axes.
columns = list(merged_df.columns[:6])

save_plot = True # Assign true if you want to save plot 

for c in columns:
    df_with_outlier_column  = iqr(merged_df, c)
    # reset_index() returns a new frame; its result has to be kept, otherwise
    # the plot is drawn against timestamps instead of the sample number that
    # the "samples" axis label promises.
    df_with_outlier_column = df_with_outlier_column.reset_index()
    visualize_outliers(df = df_with_outlier_column, col=c, outlier_col = c +"_outlier",method="IQR",save_plot=save_plot)  
    

* * * * * * ***   

<h4> ✨ Utilizing Chauvenet's criterion </h4>

In [None]:
# Apply Method 2 : Insert Chauvenet's criterion

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

save_plot = True # Assign true if you want to save plot 
merged_df = pd.read_pickle("/kaggle/input/interim1/01_data_processed.pkl")
# The first six columns are the sensor axes.
columns = list(merged_df.columns[:6])
for c in columns:
    df_with_outlier_column = chauvenet(merged_df, c)
    # reset_index() returns a new frame; its result has to be kept, otherwise
    # the plot is drawn against timestamps instead of the sample number that
    # the "samples" axis label promises.
    df_with_outlier_column = df_with_outlier_column.reset_index()
    visualize_outliers(df = df_with_outlier_column, col=c, outlier_col = c +"_outlier",method="Chauvenet",save_plot=save_plot)   
    

* * * * * * ***   

<h4> ✨ Utilizing the local outlier factor </h4>

In [None]:
# Apply Method 3 : Local outlier factor (distance based)

from pathlib import Path
save_plot = True # Assign true if you want to save plot 
merged_df = pd.read_pickle("/kaggle/input/interim1/01_data_processed.pkl")
# The first six columns are the sensor axes; LOF is fitted on all of them at once.
columns = merged_df.columns[:6].tolist()

df_with_outlier_column, outliers, scores = local_outlier_factor(df=merged_df, columns=columns)

# The same "outlier_lof" flag is shown against each sensor column in turn.
for c in columns:
    visualize_outliers(
        df=df_with_outlier_column,
        col=c,
        outlier_col="outlier_lof",
        method="Local_outlier_factor",
        save_plot=save_plot,
    )
 

* * * * * * ***   

<h4> ✨ Among the three approaches, Chauvenet's criterion is selected </h4>

 <font size="3"> Replace outliers with NaN and export the dataset </font>

In [None]:
# Choose the best method : Among the three methods such as IQR, chauvenet's criteria and local outlier factor: Chauvenet's criteria identifies few outliers
import warnings
warnings.filterwarnings('ignore')

# Relies on `merged_df` and `columns` defined by the previous outlier cells.
outliers_removed_df = merged_df.copy()
for col in columns:
    for label in merged_df["label"].unique():
        # Chauvenet's criterion is applied per label, so the mean and standard
        # deviation are those of that label only.
        df_with_outlier_column = chauvenet(merged_df[merged_df["label"]==label], col)
        # Outliers become NaN.
        df_with_outlier_column.loc[df_with_outlier_column[col + "_outlier"],col] = np.nan

        outliers_removed_df.loc[(outliers_removed_df["label"]==label),col] = df_with_outlier_column[col]
            

# Make sure the output folder exists before writing, as the other export
# cells do; to_pickle() does not create missing directories.
from pathlib import Path
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
outliers_removed_df.to_pickle("/kaggle/working/interim1/02_outliers_removed_chauvenets.pkl") 


* * * * * * ***   

> #  **🌈 _Step 4: Feature Engineering_**

<font size="3">
📚 <font color="black"> 4.1 : Handling outliers - <font color="royalblue"> <span style="font-weight:bold"> Interpolation </span> <br>
📚 <font color="black"> 4.2 : Reduce noise/smoothing data - <font color="royalblue"> <span style="font-weight:bold"> Kalman filter <span style="font-weight:normal">vs </span> Butterworth low-pass filter </span> <br>
📚 <font color="black"> 4.3 : Reduce dimensionality of datasets  - <font color="royalblue"> <span style="font-weight:bold"> Principal Component Analysis </span> <br>
📚  <font color="black"> 4.4 : Quantify model variability/Model optimization - <font color="royalblue"> <span style="font-weight:bold"> Sum of squares attribute </span> <br>
📚 <font color="black">  4.5 : Temporal Abstraction - <font color="royalblue"> <span style="font-weight:bold"> Rolling mean </span> <br>
📚 <font color="black">  4.6 : Frequency Abstraction  - <font color="royalblue"> <span style="font-weight:bold"> Variational Mode Decomposition <span style="font-weight:normal">vs </span> Fourier transformation </span> <br>
📚  <font color="black"> 4.7 : Resolve overlapping windows
</font>

In [None]:
!pip install vmdpy 
!pip install pykalman
import pywt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from IPython.display import display
from datatransformation1 import LowPassFilter, PrincipalComponentAnalysis
from frequencyabstractionvmd import Variational_Mode_Decomposition
from frequencyabstraction1 import FourierTransformation
from temporalabstraction1 import NumericalAbstraction
from sklearn.cluster import KMeans
from multiprocessing import Pool
from scipy.signal import savgol_filter
from pykalman import KalmanFilter
from scipy.signal import hilbert
from vmdpy import VMD  #  pip install vmdpy
import warnings
# Silence only the FutureWarnings raised from within vmdpy.
warnings.filterwarnings("ignore", category=FutureWarning, module="vmdpy")


# Plot appearance for the feature-engineering figures.
plt.style.use("fivethirtyeight")
plt.rcParams.update(
    {
        "figure.figsize": (20, 5),
        "figure.dpi": 100,
        "lines.linewidth": 2,
    }
)


# Start from the data set in which outliers were replaced by NaN.
df = pd.read_pickle("/kaggle/input/interim1/02_outliers_removed_chauvenets.pkl")
display(df)
# The first six columns are the sensor axes.
predictor_columns = df.columns[:6].tolist()



* * * * * * ***   

<h3> 📚 4.1 Dealing with outliers using interpolation (imputation)</h3>

In [None]:
# Dealing with missing values
# Fill the NaN values left by the outlier removal, one sensor column at a time,
# using pandas' default interpolation.
df[predictor_columns] = df[predictor_columns].apply(lambda series: series.interpolate())

print("After dealing missing values with interpolation in each column  \n")
print(df.isnull().sum())


* * * * * * ***   

<h4> ✨ Finding the average duration of every label/exercise, as indicated by each set</h4>

In [None]:
# average duration of every set
# Duration of every set: time between its first and last sample.
for s in df["set"].unique():
    in_set = df["set"] == s
    timestamps = df.index[in_set]
    duration = timestamps.max() - timestamps.min()

    # total_seconds() is the full length of the interval. Timedelta.seconds
    # is only the seconds *component* (days and fractions of a second are
    # dropped), which under-reports the duration.
    df.loc[in_set, "duration"] = duration.total_seconds()


# find the average duration based on category
duration_df = df.groupby(["category"])["duration"].mean() 
print(duration_df)

* * * * * * ***   

<h3>📚 4.2: Reduce noise/smoothing data </h3>

<font size="3" color="black"> Let's look at two approaches for reducing the noise in the accelerometer and gyroscope readings <br>

<font size="3" color="black">  1. Butter worth low pass filter - <font color="royalblue">Existing approach <br>
    
<font size="3" color="black">  2. Kalman filter - <font color="royalblue"> Proposed Approach 

</font>

* * * * * * ***   

<h4> 
✨ Let's look at reducing noise using the Butterworth low-pass filter 
<font color="royalblue">(Existing approach) </font>
</h4>

In [None]:
# Butterworth lowpass filter

from pathlib import Path

df_lowpass = df.copy()
LowPass = LowPassFilter()

# Sampling frequency: one sample every 200 ms.
fs = 1000/200

# A lower cutoff value gives a smoother signal.
cut_off = 1.2

# Both figures are written to the same folder.
butterworth_dir = Path('/kaggle/working/Build_features/Butterworth_lowpass_filter')
butterworth_dir.mkdir(parents=True, exist_ok=True)

# --- gyr_z, set 55 ---------------------------------------------------------
column = "gyr_z"
butterworth_column = "gyr_z_lowpass"
set1 = 55
df_lowpass = LowPass.low_pass_filter(df_lowpass,column,fs,cut_off, order=5)
subset = df_lowpass[df_lowpass["set"]==set1] 
fig , ax = plt.subplots(nrows=1, sharex=True, figsize=(20,5))
ax.plot(subset[column].reset_index(drop=True), label = "raw data/ Original data")
ax.plot(subset[butterworth_column].reset_index(drop=True), label = "butter worth filter")
ax.legend(loc="upper right",bbox_to_anchor=(1.0,1.15), fancybox=True, shadow=True)
ax.set_xlabel("Samples")
ax.set_ylabel(f"{column}")
ax.set_title(f"{column}\nset : {set1}")
fig.savefig(f"/kaggle/working/Build_features/Butterworth_lowpass_filter/{column}(set{set1}).png")

# --- acc_y, set 25 ---------------------------------------------------------
column = "acc_y"
butterworth_column = "acc_y_lowpass"
set2 = 25
df_lowpass = LowPass.low_pass_filter(df_lowpass,column,fs,cut_off, order=5)
subset = df_lowpass[df_lowpass["set"]==set2] 
fig , ax = plt.subplots(nrows=1, sharex=True, figsize=(20,10))
ax.plot(subset[column].reset_index(drop=True), label = "raw data/Original reading",color = "blue")
ax.plot(subset[butterworth_column].reset_index(drop=True), label = "butter worth filter", color = "orange")
ax.legend(loc="upper right",bbox_to_anchor=(1.0,1.15), fancybox=True, shadow=True)
ax.set_xlabel("Samples")
ax.set_ylabel(f"{column}")
ax.set_title(f"{column}\nset : {set2}")
fig.savefig(f"/kaggle/working/Build_features/Butterworth_lowpass_filter/{column}(set{set2}).png")

#ax[0].plot(subset["acc_y"].reset_index(drop=True), label = "raw data/Original reading")
#ax[1].plot(subset["acc_y_lowpass"].reset_index(drop=True), label = "butter worth filter")
#ax[0].legend(loc="upper center",bbox_to_anchor=(0.5,1.15), fancybox=True, shadow=True)
#ax[1].legend(loc="upper center",bbox_to_anchor=(0.5,1.15), fancybox=True, shadow=True)  

* * * * * * ***   

<h4> ✨ Let's look at reducing noise using the Kalman filter 
<font color="royalblue">(Proposed approach) </font> 
</h4>

In [None]:
# Kalman filter
# avoids sharp peaks and valleys
# reduces noise when there's no data in the raw data
#---------------------------------------------------------------

from pathlib import Path


df_kalman = df.copy()
column = "gyr_z"
set1 = 55

# Readings of one set, renumbered from 0 so they line up with the filter output.
subset = df_kalman.loc[df_kalman["set"] == set1, column].reset_index(drop=True)

# One-dimensional filter initialised at the first observed value.
kf = KalmanFilter(
    transition_matrices=[1],
    observation_matrices=[1],
    initial_state_mean=subset.iloc[0],
    initial_state_covariance=1,
    observation_covariance=1,
    transition_covariance=0.8,
)

# Only the filtered state means are needed; the covariances are discarded.
state_means = kf.filter(subset)[0].flatten()

# Original readings against the filtered estimate.
plt.plot(subset, label='raw data/ Original data',color="blue")
plt.plot(state_means, label='Kalman Filter Predictions',color="red")
plt.xlabel("Values")
plt.ylabel(f"{column}")
plt.title(f"{column}\nset : {set1}")
plt.legend()
#Path('/kaggle/working/Build_features/Kalman_filter').mkdir(parents=True, exist_ok=True)
#plt.savefig(f"/kaggle/working/Build_features/Kalman_filter/{column}(set{set1}).png")
#plt.show()

* * * * * * ***   

<h4> ✨ Comparing the Butterworth low-pass filter and the Kalman filter</h4>
<font size="3"> 
* The Butterworth low-pass filter output is noisier than the Kalman filter output<br>
* The Kalman filter smooths the signal without changing the original data
</font>

In [None]:
from pathlib import Path

# Butterworth lowpass filter
#-----------------------------------------------------------------------------
column = "gyr_z"
butterworth_column = "gyr_z_lowpass"
set1 = 55

# Butterworth low-pass filter
#-----------------------------------------------------------------------------
df_lowpass = df.copy()
LowPass = LowPassFilter()

# Sampling frequency: one sample every 200 ms.
fs = 1000/200

# A lower cutoff value gives a smoother signal.
cut_off = 1.2

df_lowpass = LowPass.low_pass_filter(df_lowpass,column,fs,cut_off, order=5)
subset_Butterworth = df_lowpass[df_lowpass["set"]==set1] 

# Kalman filter
#-------------------------------------------------------------------------------
df_kalman = df.copy()
subset_kalman = df_kalman.loc[df_kalman["set"] == set1, column].reset_index(drop=True)
kf = KalmanFilter(
    transition_matrices=[1],
    observation_matrices=[1],
    initial_state_mean=subset_kalman.iloc[0],
    initial_state_covariance=1,
    observation_covariance=1,
    transition_covariance=0.8,
)

# Only the filtered state means are needed; the covariances are discarded.
state_means = kf.filter(subset_kalman)[0].flatten()

# Raw signal, Butterworth output and Kalman output on the same axes.
fig , ax = plt.subplots(nrows=1, sharex=True, figsize=(20,5))
ax.plot(subset_Butterworth[column].reset_index(drop=True), label = "raw data/ Original data")
ax.plot(subset_Butterworth[butterworth_column].reset_index(drop=True), label = "butter worth filter",color = "red")
ax.plot(state_means, label='Kalman Filter Predictions',color="black")
ax.legend(loc="upper right",bbox_to_anchor=(1.0,1.15), fancybox=True, shadow=True)
ax.set_xlabel("Samples")
ax.set_ylabel(f"{column}")
ax.set_title(f"{column}\nset : {set1}")

#Path('/kaggle/working/Build_features/Compare_Butterworth_&_Kalman').mkdir(parents=True, exist_ok=True)
#plt.savefig(f"/kaggle/working/Build_features/Compare_Butterworth_&_Kalman/{column}(set{set1}).png") 


* * * * * * ***   

<h4> ✨ Apply the Kalman filter to all accelerometer and gyroscope readings and export the dataset</h4>

In [None]:

# Replace accelermometer and gyrosocpe column with the Kalman filter values
#----------------------------------------------------------------------------

from pathlib import Path

df_kalman = df.copy()

# To try a single column/set, override these:
#predictor_columns = ["acc_x"]  
#sets = [55]

save_plot = True # assign True if you want to save plot
sets = df["set"].unique()

# Create the output folder once instead of on every iteration.
if save_plot:
    Path('/kaggle/working/Build_features/Kalman_filter_All(Set_&_Column)').mkdir(parents=True, exist_ok=True)

for s in sets:
    # The "set" column is never modified below, so the row mask can be shared
    # by all sensor columns of this set.
    in_set = df_kalman["set"] == s

    for col in predictor_columns:
        # Readings of this set, renumbered from 0 to line up with the filter output.
        subset = df_kalman.loc[in_set, col].reset_index(drop=True)

        # One-dimensional Kalman filter initialised at the first observed value.
        kf = KalmanFilter(transition_matrices=[1],
                          observation_matrices=[1],
                          initial_state_mean=subset.iloc[0],
                          initial_state_covariance=1,
                          observation_covariance=1,
                          transition_covariance=0.8)

        # Keep the filtered state means; the covariances are not used.
        state_means, _ = kf.filter(subset)
        state_means = state_means.flatten()

        # Replace the raw readings of this set with the filtered values.
        df_kalman.loc[in_set, col] = state_means

        # Plot original data and Kalman filter predictions
        fig, ax = plt.subplots()
        ax.plot(subset, label='raw data',color="blue")
        ax.plot(state_means, label='Kalman Filter Predictions',color="red")
        ax.legend()
        ax.set_xlabel("Values")
        ax.set_ylabel(f"{col}")
        ax.set_title(f"{col}\nset : {s}")

        # Save BEFORE showing: once plt.show() returns the figure may already
        # have been released, and saving afterwards writes an empty image.
        if save_plot:
            fig.savefig(f"/kaggle/working/Build_features/Kalman_filter_All(Set_&_Column)/{col}(set{s}).png") 

        plt.show()
        # One figure is created per set and column; release it explicitly.
        plt.close(fig)

#export the dataset
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_kalman.to_pickle("/kaggle/working/interim1/01_kalman_filter_applied.pkl")

# compare the original data with the kalman filtered data
#df["acc_y"][:100].reset_index(drop=True).plot()
#df_kalman["acc_y"][:100].reset_index(drop=True).plot()

* * * * * * ***   

<h4> ✨ Apply the Butterworth low-pass filter to the accelerometer and gyroscope readings and export the dataset</h4>

In [None]:
from pathlib import Path

df_lowpass = df.copy()

# Filter object, sampling frequency (one sample every 200 ms) and cutoff.
LowPass = LowPassFilter()
fs = 1000/200
cut_off = 1.2

# The first six columns are the sensor axes.
predictor_columns = df.columns[:6].tolist()

for col in predictor_columns:
    filtered_name = col + "_lowpass"
    df_lowpass = LowPass.low_pass_filter(df_lowpass,col,fs,cut_off, order=5)
    # Overwrite the raw column with the filtered one and drop the helper column.
    df_lowpass[col] = df_lowpass.pop(filtered_name)

# export the dataset
export_dir = Path('/kaggle/working/interim1')
export_dir.mkdir(parents=True, exist_ok=True)
df_lowpass.to_pickle("/kaggle/working/interim1/01_butter_worth_lowpass_applied.pkl")

* * * * * * ***   

<h3> 📚 4.3: Apply Principal Component Analysis (PCA) to reduce the dimensionality of the datasets</h3>
<font size="3">
1. Kalman dataset<br>
2. Butter worth low pass dataset
</font>           
             

* * * * * * ***   

<h4> ✨ Let's apply PCA to the Kalman dataset</h4>

In [None]:
from sklearn.decomposition import PCA
# Data set that has already been smoothed with the Kalman filter.
df_kalman = pd.read_pickle("/kaggle/input/interim1/01_kalman_filter_applied.pkl")
predictor_columns = df_kalman.columns[:6].tolist()
df_pca_kalman = df_kalman.copy()

# Helper class instance used for the explained-variance curve (and later for apply_pca).
pca = PrincipalComponentAnalysis()

# Explained variance per principal component, used to choose how many to keep.
pc_values = pca.determine_pc_explained_variance(df_pca_kalman, predictor_columns)  

# Elbow curve.
print("PCA values are : ",pc_values)
plt.figure(figsize=(10,10))
component_numbers = range(1, len(predictor_columns)+1)
plt.plot(component_numbers, pc_values)
plt.xlabel("principal component number")
plt.ylabel("explained vaiance")
plt.show()  

# --- Fit scikit-learn's PCA on the same columns to inspect the loadings ---
pca1 = PCA()
pca1.fit(df_pca_kalman[predictor_columns])

# Rows: original features, columns: principal components.
pc_names = [f'PC{i+1}' for i in range(len(predictor_columns))]
loadings = pd.DataFrame(pca1.components_.T, columns=pc_names, index=predictor_columns)
print("\nPCA Loadings (feature contributions to each PC):")
print(loadings)

# --- Keep the leading PCs (adjust the number based on the elbow curve) ---
top_n_pcs = 3
selected_loadings = loadings.iloc[:, :top_n_pcs]

# Unweighted ranking: mean absolute loading across the selected PCs.
feature_importance = selected_loadings.abs().mean(axis=1)
top_features = feature_importance.sort_values(ascending=False)

# Weighted ranking: absolute loadings scaled by each PC's explained-variance ratio.
explained_variance = pca1.explained_variance_ratio_
weights = explained_variance[:top_n_pcs]
weighted_importance = (selected_loadings.abs() * weights).sum(axis=1)
top_weighted_features = weighted_importance.sort_values(ascending=False)
print(f"\nTop features weighted by explained variance (top {top_n_pcs} PCs):")
print(top_weighted_features)

# --- Visualization: largest contribution at the top ---
plt.figure(figsize=(8,5))
importance_ax = top_weighted_features.plot(kind='barh')
importance_ax.set_title(f"Feature Contributions to Top {top_n_pcs} Principal Components")
importance_ax.set_xlabel("Weighted Contribution")
importance_ax.set_ylabel("Feature")
importance_ax.invert_yaxis()
plt.show()

In [None]:
from pathlib import Path

# from the above graph, we'll use the elbow technique, which is  the point at which the rate of change in variance diminishes. It is is 3, 
# Perform apply_pca function the dataframe has normalized values for the predictor columns [acc_x, acc_y, acc_z. gyr_x, gyr_y, gyr_z] 
# New columns added are "pca_1","pca_2", "pca_3"
# Three components are kept (the elbow of the curve above); apply_pca adds
# the columns "pca_1", "pca_2" and "pca_3".
df_pca_kalman = pca.apply_pca(df_pca_kalman, predictor_columns, 3)


# Inspect the new PCA columns for a single set.
pca_columns = ["pca_1", "pca_2", "pca_3"]
subset = df_pca_kalman[df_pca_kalman["set"]==35]
print(subset)
subset[pca_columns].plot()

# Export the data set.
export_dir = Path('/kaggle/working/interim1')
export_dir.mkdir(parents=True, exist_ok=True)
df_pca_kalman.to_pickle("/kaggle/working/interim1/01_pca_kalman_applied.pkl")

* * * * * * ***   

<h4> ✨ Let's apply PCA to the Butterworth dataset</h4>

In [None]:
# Load the dataset that was exported after the Butterworth low-pass filter step
df_butterworth = pd.read_pickle("/kaggle/input/interim1/01_butter_worth_lowpass_applied.pkl")

# Principal component analysis (PCA)
# This dataframe has undergone the Butterworth low-pass filter

# The first six columns are taken as predictors
# (presumably acc_x, acc_y, acc_z, gyr_x, gyr_y, gyr_z - verify column order).
# NOTE: this rebinds the notebook-wide `predictor_columns` list.
predictor_columns = list(df_butterworth.columns[:6])
# Work on a copy so the loaded frame stays untouched
df_pca_butterworth = df_butterworth.copy()

# Create the PrincipalComponentAnalysis helper object
pca1 = PrincipalComponentAnalysis()

# First decide the optimal number of principal components:
# one value per component is returned and plotted below
pc_values1 = pca1.determine_pc_explained_variance(df_pca_butterworth, predictor_columns)  


# Visualize the elbow curve (component number on x, returned value on y)
print("PCA values are : ",pc_values1)
plt.figure(figsize=(10,10))
plt.plot(range(1, len(predictor_columns)+1),pc_values1)
plt.xlabel("principal component number")
plt.ylabel("explained vaiance")
plt.show()  


In [None]:
from pathlib import Path

# The elbow of the explained-variance curve above (the point where the gain
# in explained variance levels off) is at 3 components.
# apply_pca is called with the predictor columns and 3 components; the
# resulting columns plotted below are "pca_1", "pca_2", "pca_3".
df_pca_butterworth = pca1.apply_pca(df_pca_butterworth, predictor_columns, 3)


# Visualize the new PCA columns for a single set (set 35)

subset1 = df_pca_butterworth[df_pca_butterworth["set"]==35]
print(subset1)
subset1[["pca_1","pca_2", "pca_3" ]].plot()

# Export the dataset (the output folder is created if it does not exist yet)
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_pca_butterworth.to_pickle("/kaggle/working/interim1/01_pca_butterworth_applied.pkl")

* * * * * * ***   

<h2> 📚 4.4: Sum of squares attribute</h2> 
<font size="3"> <br>  
* Quantify model variability / model optimization<br>
* Apply to <br>
    1. Kalman dataset<br>
    2. Butterworth low-pass dataset
</font>

* * * * * * * * * * * ***   

<h4>✨ Let's apply the sum of squares attribute to the Kalman dataset</h4>

In [None]:
# Sum of squares attributes (Kalman dataset)
# --------------------------------------------------------------
# The vector magnitude r = sqrt(x^2 + y^2 + z^2) is a scalar that does not
# depend on how the device is oriented, which helps the model generalize
# across participants.

from IPython.display import display
import pandas as pd
import numpy as np

df_kalman_new = pd.read_pickle("/kaggle/input/interim1/01_pca_kalman_applied.pkl")
df_ss_kalman = df_kalman_new.copy()

# Square every axis once, then add the three axes per sensor
squared_axes = df_ss_kalman[["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]] ** 2
acc_r = squared_axes["acc_x"] + squared_axes["acc_y"] + squared_axes["acc_z"]
gyr_r = squared_axes["gyr_x"] + squared_axes["gyr_y"] + squared_axes["gyr_z"]

# The stored attribute is the square root of the summed squares
df_ss_kalman["acc_r"] = np.sqrt(acc_r)
df_ss_kalman["gyr_r"] = np.sqrt(gyr_r)

print("Sum of sqaured attributes are".title())
display(df_ss_kalman[["acc_r","gyr_r"]])

# Plot both magnitudes for one example set, each in its own subplot
set1 = 50
print(f"Sum of sqaured attributes set : {set1}")
subset = df_ss_kalman.loc[df_ss_kalman["set"] == set1].reset_index()
subset[["acc_r","gyr_r"]].plot(subplots=True)

* * * * * * ***   

<h4>✨ Let's apply the sum of squares attribute to the Butterworth dataset</h4>

In [None]:
# Sum of squares attributes (Butterworth dataset)
# --------------------------------------------------------------
# The vector magnitude r = sqrt(x^2 + y^2 + z^2) is a scalar that does not
# depend on how the device is oriented, which helps the model generalize
# across participants.

from IPython.display import display
import pandas as pd
import numpy as np

df_butterworth_new = pd.read_pickle("/kaggle/input/interim1/01_pca_butterworth_applied.pkl")
df_ss_butterworth = df_butterworth_new.copy()

# Square every axis once, then add the three axes per sensor
squared_axes = df_ss_butterworth[["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]] ** 2
acc_r = squared_axes["acc_x"] + squared_axes["acc_y"] + squared_axes["acc_z"]
gyr_r = squared_axes["gyr_x"] + squared_axes["gyr_y"] + squared_axes["gyr_z"]

# The stored attribute is the square root of the summed squares
df_ss_butterworth["acc_r"] = np.sqrt(acc_r)
df_ss_butterworth["gyr_r"] = np.sqrt(gyr_r)

print("Sum of sqaured attributes are".title())
display(df_ss_butterworth[["acc_r","gyr_r"]])

# Plot both magnitudes for one example set, each in its own subplot
set1 = 50
print(f"Sum of sqaured attributes set : {set1}")
subset = df_ss_butterworth.loc[df_ss_butterworth["set"] == set1].reset_index()
subset[["acc_r","gyr_r"]].plot(subplots=True)

* * * * * * ***   

<h3> 📚 4.5: Temporal Abstraction </h3>
<font size="3"> <br>
* Smooth out time series or sequential data by aggregating (mean and standard deviation) over a fixed-size sliding window <br>
* Apply to <br>               
    1. Kalman dataset<br>
    2. Butterworth low-pass dataset
</font>

* * * * * * * ***   

<h4> ✨ Let's apply temporal abstraction to the Kalman dataset</h4>

In [None]:
# Temporal abstraction (Kalman dataset)
# --------------------------------------------------------------
# Adds a rolling mean and a rolling standard deviation for every predictor
# column, computed set by set so a window never spans two different sets.
from pathlib import Path

df_temp_kalman = df_ss_kalman.copy()
NumAbs = NumericalAbstraction()

# predictor_columns already holds the six sensor axes; add the magnitude
# columns "acc_r" and "gyr_r".  dict.fromkeys drops repeats while keeping
# order, so re-running this cell (or running the Butterworth cell as well)
# does not append the same names again and process them twice.
predictor_columns = list(dict.fromkeys(predictor_columns + ["acc_r", "gyr_r"]))

# Choosing the window size is trial and error.
# Each row covers 200 ms and we want roughly 1 s (1000 ms) of history per
# value, so the window is 1000 / 200 = 5 rows (the current row plus the
# previous 4); the first 4 rows of every set therefore have no full window.
ws = int(1000/200)

df_temp_list = []

for s in df_temp_kalman["set"].unique():
    subset = df_temp_kalman[df_temp_kalman["set"]==s].copy()
    for col in predictor_columns:
        subset = NumAbs.abstract_numerical(subset, [col], ws, "mean")
        subset = NumAbs.abstract_numerical(subset, [col], ws, "std")

    df_temp_list.append(subset)

df_temporal_kalman = pd.concat(df_temp_list)


# Plot the raw signal next to its rolling mean/std for one example set.
# The result is no longer bound to the name `set`, which shadowed the builtin.
set1 = 71
print(f"Plot for set {set1} ")
plot_subset = df_temporal_kalman[df_temporal_kalman["set"] == set1]
plot_subset[["acc_y", f"acc_y_temp_mean_ws_{ws}", f"acc_y_temp_std_ws_{ws}"]].plot()


# export dataset
#--------------------------------------------------------------------------------
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_temporal_kalman.to_pickle("/kaggle/working/interim1/01_temporal_abstraction_kalman_applied.pkl")

* * * * ***   

<h4> ✨ Display the Kalman dataset with temporal abstraction </h4>

In [None]:
# Show the columns added by the temporal abstraction step (Kalman dataset).
# These are the rolling-window ("temp") columns, so they are reported as
# temporal features rather than frequency features.
df_new = pd.read_pickle("/kaggle/input/interim1/01_temporal_abstraction_kalman_applied.pkl")
temporal_features = [f for f in df_new.columns if "temp" in f]
print("\nNumber of temporal features are : ", len(temporal_features))
print("\nTemporal features are :\n\n ", temporal_features)
print("\n\nDisplaying dataframe :\n", df_new[temporal_features])

* * * * * * * ***   

<h3> ✨ Let's apply temporal abstraction to the Butterworth dataset</h3>

In [None]:
# Temporal abstraction (Butterworth dataset)
# --------------------------------------------------------------
# Adds a rolling mean and a rolling standard deviation for every predictor
# column, computed set by set so a window never spans two different sets.
from pathlib import Path

df_temp_butterworth = df_ss_butterworth.copy()
NumAbs = NumericalAbstraction()

# predictor_columns already holds the six sensor axes; add the magnitude
# columns "acc_r" and "gyr_r".  dict.fromkeys drops repeats while keeping
# order, so the names are not appended a second time when the Kalman cell
# (or this cell) has already extended the list.
predictor_columns = list(dict.fromkeys(predictor_columns + ["acc_r", "gyr_r"]))

# Choosing the window size is trial and error.
# Each row covers 200 ms and we want roughly 1 s (1000 ms) of history per
# value, so the window is 1000 / 200 = 5 rows (the current row plus the
# previous 4); the first 4 rows of every set therefore have no full window.
ws = int(1000/200)

df_temp_list = []

for s in df_temp_butterworth["set"].unique():
    subset = df_temp_butterworth[df_temp_butterworth["set"]==s].copy()
    for col in predictor_columns:
        subset = NumAbs.abstract_numerical(subset, [col], ws, "mean")
        subset = NumAbs.abstract_numerical(subset, [col], ws, "std")

    df_temp_list.append(subset)

df_temporal_butterworth = pd.concat(df_temp_list)


# Plot the raw signal next to its rolling mean/std for one example set.
# The result is no longer bound to the name `set`, which shadowed the builtin.
set1 = 71
print(f"Plot for set {set1} ")
plot_subset = df_temporal_butterworth[df_temporal_butterworth["set"] == set1]
plot_subset[["acc_y", f"acc_y_temp_mean_ws_{ws}", f"acc_y_temp_std_ws_{ws}"]].plot()


# export dataset
#--------------------------------------------------------------------------------
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_temporal_butterworth.to_pickle("/kaggle/working/interim1/01_temporal_abstraction_butter_worth_applied.pkl")

* * * * * * ***   

<h4>✨ Display the Butterworth dataset with temporal abstraction </h4>

In [None]:
# Show the columns added by the temporal abstraction step (Butterworth
# dataset).  These are the rolling-window ("temp") columns, so they are
# reported as temporal features rather than frequency features.
df_new = pd.read_pickle("/kaggle/input/interim1/01_temporal_abstraction_butter_worth_applied.pkl")
temporal_features = [f for f in df_new.columns if "temp" in f]
print("\nNumber of temporal features are : ", len(temporal_features))
print("\nTemporal features are :\n\n ", temporal_features)
print("\n\nDisplaying dataframe :\n", df_new[temporal_features])

* * * * ***   

<h3>📚 4.6: Frequency abstraction</h3>
<font size="3"> <br>
Let's analyse two different approaches to frequency abstraction<br>
    1. Fourier transformation<br>
    2. Variational mode decomposition
</font>

* * * * * * * ***   

<h4> ✨ Let's look at frequency abstraction through Fourier transformation</h4>

In [None]:
# Fourier transformation
#------------------------------------------------
# Adds frequency-domain columns for every predictor column, computed set by
# set on the temporally abstracted Butterworth dataset.
from pathlib import Path

df_butterworth_new = pd.read_pickle("/kaggle/input/interim1/01_temporal_abstraction_butter_worth_applied.pkl")
# reset_index moves the "epoch (ms)" index into a column; it is restored as
# the index after the per-set frames are concatenated below
df_freq = df_butterworth_new.copy().reset_index()
freqAbs = FourierTransformation()

# sampling rate: one row every 200 ms -> 5 samples per second
fs = int(1000/200)

# window size: 2800 ms / 200 ms per row = 14 rows
ws = int(2800/200)


# Loop over each set and perform the Fourier transformation.
df_freq_list = []
for s in df_freq["set"].unique():
    subset = df_freq[df_freq["set"]==s].reset_index(drop=True).copy()
    # dict.fromkeys skips repeated column names so no column is transformed twice
    for col in dict.fromkeys(predictor_columns):
        subset = freqAbs.abstract_frequency(subset, [col], ws, fs)
    # Append once per set, after ALL columns were processed.  Appending inside
    # the column loop added every set once per predictor column and therefore
    # duplicated its rows in the concatenated frame.
    df_freq_list.append(subset)

df_freq = pd.concat(df_freq_list).set_index("epoch (ms)",drop=True)

freq_features = [f for f in df_freq.columns if ("freq" in f)]
print(df_freq[freq_features])

# export dataset
#--------------------------------------------------------------------------------
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_freq.to_pickle("/kaggle/working/interim1/01_butter_worth_fourier_transformation_applied.pkl")

* * * * * * * ***   

<h4> ✨ Display the dataframe with Fourier transformation</h4>

In [None]:
from IPython.display import display
import warnings
# NOTE: this silences ALL warnings for the rest of the session, not only
# for this cell
warnings.filterwarnings('ignore')

# Load the exported Fourier dataset and show only the columns whose name
# contains "freq"
df_fourier = pd.read_pickle("/kaggle/input/interim1/01_butter_worth_fourier_transformation_applied.pkl")
freq_features = [f for f in df_fourier.columns if ("freq" in f)]
print("\nlength of newly created columns : ",len(freq_features),"\n\n")
display(df_fourier[freq_features])


* * * * * * ***   

<h4> ✨ Let's look at frequency abstraction through variational mode decomposition</h4>

In [None]:
# Frequency features
# --------------------------------------------------------------
# Applies variational mode decomposition (VMD) to every predictor column,
# set by set, and writes the resulting columns back into df_freq.
import pandas as pd
from pathlib import Path

df_new = pd.read_pickle("/kaggle/input/interim1/01_temporal_abstraction_kalman_applied.pkl")
# reset_index moves the "epoch (ms)" index into a column; it is restored as
# the index once all sets have been processed
df_freq = df_new.copy().reset_index()
# NOTE(review): this binds the name itself, it is not called; that is only
# correct if `vmd` is usable without an instance (module function or
# static/class method) - confirm against the definition.
freqAbs1 = Variational_Mode_Decomposition

# Loop over each unique "set"
for s in df_freq["set"].unique():
    mask = df_freq['set'] == s
    subset = df_freq[mask].reset_index(drop=True)  # 0..n-1 index for VMD
    n_target = int(mask.sum())  # number of rows to fill in df_freq

    # dict.fromkeys skips repeated column names so no column is decomposed twice
    for col in dict.fromkeys(predictor_columns):
        # Run VMD on the column (8 is passed through to vmd - presumably the
        # number of modes; verify).  The result gets a fresh 0..n-1 index so
        # the padding below lines up by position whatever index vmd returns.
        subset_vmd = freqAbs1.vmd(subset[col], col, 8).reset_index(drop=True)

        n_vmd = len(subset_vmd)  # number of rows from VMD output

        # Ensure lengths match the rows selected by `mask`
        if n_vmd < n_target:
            # pad the missing trailing rows with NaN
            subset_vmd = subset_vmd.reindex(range(n_target))
        elif n_vmd > n_target:
            # truncate extra rows
            subset_vmd = subset_vmd.iloc[:n_target]

        # Assign by position (.values) so index labels cannot misalign rows
        for column in subset_vmd.columns:
            df_freq.loc[mask, column] = subset_vmd[column].values

df_freq = df_freq.set_index("epoch (ms)",drop=True)
# export dataset
#--------------------------------------------------------------------------------
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_freq.to_pickle("/kaggle/working/interim1/01_Kalman_filter_Variational_mode_decomposition_applied.pkl")

* * * * * * ***   

<h4> ✨ Display the dataframe with variational mode decomposition (VMD)</h4>

In [None]:
from IPython.display import display

# Load the exported Kalman + VMD dataset.
# (An earlier export was named 01_VMD_applied.pkl.)
df_VMD = pd.read_pickle("/kaggle/input/interim1/01_Kalman_filter_Variational_mode_decomposition_applied.pkl")
# Columns whose name contains "VMD"; only their count is reported
freq_features = [f for f in df_VMD.columns if ("VMD" in f)]
print("\n\nlength of newly created VMD columns : ",len(freq_features),"\n\n")
# The complete dataframe is displayed, not just the VMD columns
display(df_VMD)






* * * * * * ***   

<h3> 4.7: 📚 Dealing with overlapping windows</h3>
<font size="3">  <br>
Dealing with overlapping windows for the datasets below<br>
    1. Dataset with variational mode decomposition (VMD) <br>
    2. Dataset with fast Fourier transformation (FFT)<br>
</font>

In [None]:
# --------------------------------------------------------------
# Dealing with overlapping windows
# --------------------------------------------------------------

from IPython.display import display
import pandas as pd
from pathlib import Path

# Load both frequency-abstracted datasets
df_fourier = pd.read_pickle("/kaggle/input/interim1/01_butter_worth_fourier_transformation_applied.pkl")
df_VMD = pd.read_pickle("/kaggle/input/interim1/01_Kalman_filter_Variational_mode_decomposition_applied.pkl")

# Drop every row that contains at least one NaN
# (presumably the incomplete leading windows of each set - verify)
df_fourier = df_fourier.dropna()
df_VMD = df_VMD.dropna()

# Neighbouring rows are correlated because the rolling windows overlap.
# Keeping all of them can lead to overfitting when building the model,
# so only part of the overlap is kept and the rest of the data is removed.
# Dropping 50% of the data is recommended; this is done by keeping every
# other row.
# NOTE(review): the slice runs over the whole frame, not per set.

df_fourier = df_fourier.iloc[::2] 
df_VMD = df_VMD.iloc[::2]

display(df_fourier)
display(df_VMD)

# Export the datasets used for model training
#--------------------------------------------------------------------------------------
Path('/kaggle/working/interim1').mkdir(parents=True, exist_ok=True)
df_fourier.to_pickle("/kaggle/working/interim1/01_Train_model_fourier.pkl")
df_VMD.to_pickle("/kaggle/working/interim1/01_Train_model_VMD.pkl")

* * * ***   


> # 🌈 Step 5: Predictive modelling

<h4> ✨ Split features into different subsets</h4>
<h4> ✨ Carry out forward feature selection using a decision tree and identify the best 10 features and their scores</h4>
<h4> ✨ Grid search across different machine learning approaches </h4>

<h3> 📚 Apply predictive modelling to the dataset with Fourier transformation </h3>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from learningalgorithms1 import ClassificationAlgorithms
import seaborn as sns
import itertools
from sklearn.metrics import accuracy_score, confusion_matrix

# Plot settings (applied globally to all following matplotlib figures)
plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (20, 5)
plt.rcParams["figure.dpi"] = 100
plt.rcParams["lines.linewidth"] = 2



# Apply to the dataframe with Fourier transformation
df = pd.read_pickle("/kaggle/input/interim1/01_Train_model_fourier.pkl")


# Create a training and test set
# Drop the metadata columns that are not used as model inputs
df_train  = df.drop(["participant", "category", "set","duration"],axis=1)

# Split the dataframe into features (X) and target (y = "label")

X = df_train.drop("label",axis=1)
y = df_train["label"]

# 80/20 split; stratify=y keeps the label proportions equal in both parts,
# random_state makes the split reproducible
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)


<h4> ✨ Split features into different subsets</h4>

In [None]:
# Building blocks: raw sensor axes, vector magnitudes, PCA components, and the
# columns produced by the temporal / frequency abstraction steps.
basic_features = ["acc_x", "acc_y", "acc_z", "gyr_x", "gyr_y", "gyr_z"]
sqaure_features = ["acc_r", "gyr_r"]
pca_features = ["pca_1", "pca_2", "pca_3"]
time_features = [name for name in df_train.columns if "_temp_" in name]
freq_features = [name for name in df_train.columns if "freq" in name or "_pse" in name]

# Each feature set extends the previous one with one more group of columns.
feature_set_1 = basic_features.copy()
feature_set_2 = [*basic_features, *sqaure_features, *pca_features]
feature_set_3 = [*feature_set_2, *time_features]
feature_set_4 = [*feature_set_3, *freq_features]

<h4> ✨ Carry out forward feature selection using a decision tree and identify the best 10 features and their scores</h4>

In [None]:
# Forward feature selection with a simple decision tree.
# The 10 best-performing features form a further subset (used as the
# "Selected Features" set in the grid search).
# The selection is skipped while `features_exist` is True; the results of a
# previous run are written down in the next cell.

features_exist = True
learner = ClassificationAlgorithms()  # shared by the grid-search cell as well
max_features  = 10

if not features_exist:
    # picks the 10 best-scoring features of the training frame
    selected_features, ordered_features, ordered_scores = learner.forward_selection(max_features, X_train, y_train)
    print("\nselected_features : \n", selected_features)
    print("\nordered_scores : \n", ordered_scores)
else:
    print("Forward feature selection completed")

<h4> ✨ Note down the best 10 features and their scores </h4>

In [None]:
# Result of a previous forward-selection run, hard-coded so the selection
# does not have to be repeated: the ten chosen features, in selection order.
selected_features = [
    "acc_z_freq_0.0_Hz_ws_14",
    "acc_x_freq_0.0_Hz_ws_14",
    "gyr_r_freq_0.0_Hz_ws_14",
    "acc_z",
    "acc_y_freq_0.0_Hz_ws_14",
    "gyr_r_freq_1.429_Hz_ws_14",
    "gyr_x_freq_0.714_Hz_ws_14",
    "acc_y_max_freq",
    "acc_z_freq_1.071_Hz_ws_14",
    "acc_r_max_freq",
]

# Score recorded after each feature was added (same order as above);
# the score no longer changes from the sixth feature onwards.
ordered_scores = [
    0.885556704584626,
    0.9886246122026887,
    0.9955187866253016,
    0.9975870389520854,
    0.9982764563943468,
    0.9993105825577387,
    0.9993105825577387,
    0.9993105825577387,
    0.9993105825577387,
    0.9993105825577387,
]

<h4> ✨ Grid search across different machine learning approaches </h4>
<font size="3"> <br>
* <font color="black">Grid search through approaches such as</font> <font color="Blue">
   Neural Network, Random Forest, K Nearest Neighbor, Naive Bayes and Decision Tree</font> <br>
* <font color="black">Export the dataset of the different approaches and their corresponding scores</font>
</font>

In [None]:

# Train five classifiers (NN, RF, KNN, DT, NB) on every feature subset and
# record their test accuracy.  Random forest, KNN and decision tree are
# called with gridsearch=True; the neural network with gridsearch=False.
# The whole search is skipped when a score file from a previous run exists.

from pathlib import Path


possible_feature_sets = [
    feature_set_1,
    feature_set_2,
    feature_set_3,
    feature_set_4,
    selected_features,
]

feature_names = [
    "Feature Set 1",
    "Feature Set 2",
    "Feature Set 3",
    "Feature Set 4",
    "Selected Features",
]


if (Path("/kaggle/input/interim1/01_score_df.pkl")).is_file():
    print("Grid search completed")

else:
    print("Performing Grid search.............")
    iterations = 1

    def _test_accuracy(train_model, train_X, test_X, **kwargs):
        """Fit one learner and return its accuracy on the test labels.

        `train_model` is one of the `learner` methods; each returns
        (train predictions, test predictions, train probabilities,
        test probabilities) and only the test predictions are scored.
        """
        _, class_test_y, _, _ = train_model(train_X, y_train, test_X, **kwargs)
        return accuracy_score(y_test, class_test_y)

    # One small frame per feature set, concatenated once at the end
    # (avoids repeatedly concatenating onto an initially empty DataFrame).
    score_frames = []

    for i, (feature_set, f) in enumerate(zip(possible_feature_sets, feature_names)):
        print("Feature set:", i)
        selected_train_X = X_train[feature_set]
        selected_test_X = X_test[feature_set]

        # First run non deterministic classifiers to average their score.
        performance_test_nn = 0
        performance_test_rf = 0

        for it in range(iterations):
            print("\tTraining neural network,", it)
            performance_test_nn += _test_accuracy(
                learner.feedforward_neural_network,
                selected_train_X,
                selected_test_X,
                gridsearch=False,
            )

            print("\tTraining random forest,", it)
            performance_test_rf += _test_accuracy(
                learner.random_forest,
                selected_train_X,
                selected_test_X,
                gridsearch=True,
            )

        performance_test_nn = performance_test_nn / iterations
        performance_test_rf = performance_test_rf / iterations

        print("\tTraining KNN")
        performance_test_knn = _test_accuracy(
            learner.k_nearest_neighbor,
            selected_train_X,
            selected_test_X,
            gridsearch=True,
        )

        print("\tTraining decision tree")
        performance_test_dt = _test_accuracy(
            learner.decision_tree,
            selected_train_X,
            selected_test_X,
            gridsearch=True,
        )

        print("\tTraining naive bayes")
        performance_test_nb = _test_accuracy(
            learner.naive_bayes, selected_train_X, selected_test_X
        )

        # Save results: one row per model for this feature set
        models = ["NN", "RF", "KNN", "DT", "NB"]
        score_frames.append(
            pd.DataFrame(
                {
                    "model": models,
                    "feature_set": f,
                    "accuracy": [
                        performance_test_nn,
                        performance_test_rf,
                        performance_test_knn,
                        performance_test_dt,
                        performance_test_nb,
                    ],
                }
            )
        )

    score_df = pd.concat(score_frames)

    # Export the scores.
    # NOTE(review): this writes to /kaggle/working, while the guard above
    # looks in /kaggle/input/interim1 - the file has to be copied into that
    # dataset before the guard will skip the search.
    score_df.to_pickle("/kaggle/working/01_score_df.pkl")


<h4> ✨ Visualize the models and their scores </h4>

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Prefer the score file shipped with the input dataset.  When the grid search
# has just been run in this session the file only exists in the working
# directory (that is where the grid-search cell exports it), so fall back to it.
score_path = Path("/kaggle/input/interim1/01_score_df.pkl")
if not score_path.is_file():
    score_path = Path("/kaggle/working/01_score_df.pkl")
score_df = pd.read_pickle(score_path)

# Best-scoring model / feature-set combinations first
print(score_df.sort_values(by="accuracy", ascending=False))


# Grouped bar chart: one group per model, one bar per feature set
plt.figure(figsize=(10,10))
sns.barplot(x='model',y='accuracy',hue='feature_set',data=score_df)
plt.xlabel("Model")
plt.ylabel("Accuracy")
# Zoom in on the 0.7-1.0 range so the differences between bars are visible
plt.ylim(0.7,1)
plt.legend(loc="lower right")
plt.show()

> # 🌈 Step 6: Model Evaluation and Validation

<h4> ✨ Evaluating the model based on the existing best approach (Random Forest) </h4>

In [None]:
# Import required libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools


# Random forest used as the baseline ("existing best approach").
# It is trained on X_train / y_train as they stand at this point of the
# notebook (presumably the full Fourier feature frame - verify run order).
rf = RandomForestClassifier(
    n_estimators=100,   # number of trees
    max_depth=3,     # cap every tree at depth 3 (trees are NOT grown until leaves are pure)
    random_state=42,    # for reproducibility
    n_jobs=-1           # use all CPU cores for faster training
)

# Train the model
rf.fit(X_train, y_train)

# Predict labels and class probabilities on the test set
y_pred = rf.predict(X_test)
pred_prob_test_y = rf.predict_proba(X_test)
# One probability column per class, named after rf.classes_
class_test_prob_y = pd.DataFrame(pred_prob_test_y, columns=rf.classes_)

# Class order used for the confusion-matrix rows/columns and tick labels
classes = class_test_prob_y.columns

cm = confusion_matrix(y_test, y_pred, labels=classes)

# Draw the confusion matrix as a heat map
plt.figure(figsize=(10, 10))
plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.title("Confusion matrix")
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Write the count into every cell; white text on dark cells, black on light
thresh = cm.max() / 2.0
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(
        j,
        i,
        format(cm[i, j]),
        horizontalalignment="center",
        color="white" if cm[i, j] > thresh else "black",
    )
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.grid(False)
plt.show() 

from sklearn.metrics import f1_score, classification_report

# Evaluate performance
print("âœ… Accuracy:", accuracy_score(y_test, y_pred))

# --- Compute F1 scores ---
# macro: unweighted mean over classes; micro: computed over all samples;
# weighted: mean over classes weighted by class support
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print("\nF1 Scores:")
print(f"Macro-F1:    {macro_f1:.4f}")
print(f"Micro-F1:    {micro_f1:.4f}")
print(f"Weighted-F1: {weighted_f1:.4f}")


# Per-class precision / recall / F1 report
print("\nDetailed classification report:")
print("\nðŸ“Š Classification Report:\n", classification_report(y_test, y_pred))






  >  # 🌈 Proposed Approach: CNN + BiLSTM + Attention with Augmentation

<h4> ✨ Install the required dependencies for the BiLSTM approach </h4>

In [None]:
!pip install --upgrade --force-reinstall \
    numpy==1.26.4 \
    pandas==2.2.2 \
    matplotlib==3.8.4 \
    scipy==1.13.1 \
    scikit-learn==1.5.2 \
    tensorflow==2.18.0 \
    jax==0.4.34 \
    jaxlib==0.4.34 \
    google-cloud-bigquery==3.31.0 \
    google-cloud-bigquery-storage==2.30.0 \
    protobuf==4.25.3 \
    --no-cache-dir


>  # 🌈 Train model

<h4> ✨ CNN + BiLSTM + Attention with Augmentation </h4>

In [None]:
# =============================================
# CNN + BiLSTM + Attention+Augmentation with Sliding Windows
# =============================================
import os, random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report
from scipy.stats import mode
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from kerastuner import HyperParameters
from kerastuner.tuners import RandomSearch
from pathlib import Path


# --------------------------
# Fix seeds for reproducibility
# --------------------------
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --------------------------
# Load Data
# --------------------------
df = pd.read_pickle("/kaggle/input/interim1/01_Train_model_VMD.pkl")

# Every column that is neither metadata nor the target is a model input
non_feature_cols = {"participant", "category", "set", "duration", "epoch (ms)", "label"}
imu_cols = [c for c in df.columns if c not in non_feature_cols]

# No derived features are added at the moment; the inputs are the IMU columns
feature_cols = imu_cols

X_all = df[feature_cols].values.astype('float32')
y_all = df['label'].values

# Encode labels as integers
label_encoder = LabelEncoder()
y_all_encoded = label_encoder.fit_transform(y_all)
# classes_[i] is encoded as i, so the mapping is class -> position.
# (Zipping classes_ with y_all_encoded paired each class with the encoded
# label of the first few data rows, which is not the mapping.)
label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
print("Label mapping:", label_mapping)

# --------------------------
# Create sequences using sliding windows with majority label
# --------------------------
# NOTE(review): the windows slide over the whole frame, so a window can span
# the boundary between two sets; the majority vote decides its label.
window_size = 50  # timesteps per sequence
step = 25         # stride; consecutive windows overlap by window_size - step rows
X_sequences = []
y_sequences = []

for start in range(0, len(X_all) - window_size + 1, step):
    end = start + window_size
    X_sequences.append(X_all[start:end])
    # majority (most frequent) label in the window
    y_sequences.append(int(mode(y_all_encoded[start:end], keepdims=True).mode[0]))

X_sequences = np.array(X_sequences)
y_sequences = np.array(y_sequences)

print("Total sequences:", X_sequences.shape[0], "Sequence shape:", X_sequences.shape[1:])

# --------------------------
# Train/Test split (80/20, stratified on the window label)
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_sequences,
    y_sequences,
    test_size=0.2,
    random_state=SEED,
    stratify=y_sequences,
)

# --------------------------
# Class weights ('balanced') computed from the training labels
# --------------------------
classes = np.unique(y_train)
cw = class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
# plain {int: float} dictionary keyed by class id
class_weights = dict(zip(map(int, classes), map(float, cw)))

# --------------------------
# Normalize features
# --------------------------
# StandardScaler works on 2-D input, so the sequences are flattened to
# (rows, n_features), scaled, and reshaped back.  The scaler is fitted on the
# training data only and then applied to the test data.
n_features = X_train.shape[2]
scaler = StandardScaler()
train_rows = X_train.reshape(-1, n_features)
X_train_scaled = scaler.fit_transform(train_rows).reshape(X_train.shape)
test_rows = X_test.reshape(-1, n_features)
X_test_scaled = scaler.transform(test_rows).reshape(X_test.shape)

# --------------------------
# On-the-Fly Augmentation
# --------------------------
def augment_sequence_tf(seq):
    """Return a randomly perturbed float32 copy of one sequence.

    Gaussian noise (mean 0.0, stddev 0.01) is added element-wise, then the
    whole sequence is multiplied by a single factor drawn uniformly from
    [0.95, 1.05).
    """
    signal = tf.cast(seq, tf.float32)
    # The random ops are created in the same order as before (noise first,
    # then the scale factor) so seeded runs are not disturbed.
    jitter = tf.random.normal(shape=tf.shape(signal), mean=0.0, stddev=0.01)
    noisy = signal + jitter
    gain = tf.random.uniform(shape=[], minval=0.95, maxval=1.05)
    return noisy * gain

def build_dataset(X, y, batch_size=32, training=True):
    """Wrap arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        X: Input sequences; sliced along the first axis.
        y: Labels aligned with ``X`` along the first axis.
        batch_size: Number of examples per batch.
        training: When true, examples are shuffled (buffer covering all of
            ``X``, seeded with ``SEED``) and passed through
            ``augment_sequence_tf``; when false, the original order and
            values are kept.

    Returns:
        A ``tf.data.Dataset`` yielding ``(sequence_batch, label_batch)``.
    """
    def _with_augmentation(seq, label):
        # Only the features are perturbed; the label passes through untouched.
        return augment_sequence_tf(seq), label

    pipeline = tf.data.Dataset.from_tensor_slices((X, y))
    if training:
        pipeline = pipeline.shuffle(buffer_size=len(X), seed=SEED).map(
            _with_augmentation,
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Training pipeline: shuffled + augmented. Validation pipeline: built from the
# held-out test split, in its original order and without augmentation.
train_dataset, val_dataset = (
    build_dataset(X_train_scaled, y_train, batch_size=32, training=True),
    build_dataset(X_test_scaled, y_test, batch_size=32, training=False),
)



# Re-use a previously trained model when one is available as a Kaggle input;
# otherwise run the hyperparameter search below and save the best model.
pretrained_model_path = Path("/kaggle/input/modelset/deep_learning_model_Final.h5")

if pretrained_model_path.is_file():
    print("There is an existing model...")
    best_model = tf.keras.models.load_model(str(pretrained_model_path))

else:
    # --------------------------
    # Model Builder (CNN + BiLSTM + Attention)
    # --------------------------

    def build_model_imu(hp):
        """Build and compile one candidate classifier for the tuner.

        Architecture: Masking -> LayerNormalization -> two residual Conv1D
        blocks (each followed by MaxPooling1D(2)) -> bidirectional LSTM ->
        optional multi-head self-attention with a residual connection ->
        global average pooling -> dense + dropout -> softmax.

        Args:
            hp: Keras Tuner hyperparameter container; every ``hp.*`` call
                below registers one tunable value.

        Returns:
            A compiled Keras model that takes inputs of shape
            ``(window_size, n_features)`` and outputs one softmax
            probability per encoded class.
        """
        inp = layers.Input(shape=(window_size, n_features))
        # NOTE(review): the inputs are standardized (and noise-augmented
        # during training) before reaching the model, so all-zero timesteps
        # look unlikely - confirm this Masking layer is still needed.
        x = layers.Masking(mask_value=0.0)(inp)
        x = layers.LayerNormalization()(x)

        # CNN residual blocks
        for i in range(2):
            filters = hp.Int(f'conv{i}_filters', 128, 256, step=64)
            kernel = hp.Choice(f'conv{i}_kernel', [3, 5])
            x_prev = x
            x = layers.Conv1D(filters, kernel, padding='same', activation='relu')(x)
            x = layers.BatchNormalization()(x)
            # The skip path needs the same channel count as the conv output;
            # project it with a 1x1 convolution when the widths differ.
            if x_prev.shape[-1] != filters:
                x_res = layers.Conv1D(filters, 1, padding='same')(x_prev)
            else:
                x_res = x_prev
            x = layers.Add()([x, x_res])
            # Halve the time axis after each block
            x = layers.MaxPooling1D(2)(x)

        # BiLSTM (keeps the full sequence so attention / pooling can follow)
        lstm_units = hp.Int('lstm_units', 128, 256, step=64)
        x = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True, dropout=0.2))(x)

        # Optional MultiHead self-attention with residual + layer norm
        use_attn = hp.Choice('use_attention', [True, False])
        if use_attn:
            attn_heads = hp.Choice('attn_heads', [2, 4])
            key_dim = hp.Choice('attn_key_dim', [16, 32])
            attn = layers.MultiHeadAttention(num_heads=attn_heads, key_dim=key_dim)(x, x)
            x = layers.Add()([x, attn])
            x = layers.LayerNormalization()(x)

        # Collapse the time axis to a single feature vector per window
        x = layers.GlobalAveragePooling1D()(x)

        # Dense classifier
        dense_units = hp.Int('dense_units', 128, 256, step=64)
        x = layers.Dense(dense_units, activation='relu', kernel_regularizer=regularizers.l2(1e-4))(x)
        x = layers.Dropout(hp.Float('dropout', 0.2, 0.4, step=0.1))(x)
        # One output unit per distinct encoded label
        out = layers.Dense(len(np.unique(y_all_encoded)), activation='softmax')(x)

        lr = hp.Choice('learning_rate', [1e-3, 5e-4, 1e-4])
        model = models.Model(inputs=inp, outputs=out)
        # Labels are integer codes, hence the sparse loss
        model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                      loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    # --------------------------
    # Hyperparameter Tuning
    # --------------------------
    tuner = RandomSearch(
        build_model_imu,
        objective='val_accuracy',
        max_trials=20,
        executions_per_trial=1,
        seed=SEED,  # sample hyperparameters reproducibly, like the rest of the pipeline
        directory='tuner_vmd',
        project_name='imu_vmd_tuned'
    )

    # NOTE(review): the callbacks use their default monitored quantity while
    # the tuner objective is val_accuracy - confirm that is intended.
    tuner.search(train_dataset, validation_data=val_dataset, epochs=50,
                 class_weight=class_weights,
                 callbacks=[EarlyStopping(patience=8, restore_best_weights=True),
                            ReduceLROnPlateau(factor=0.5, patience=4, min_lr=1e-6)],
                 verbose=1)

    # --------------------------
    # Keep and persist the best model
    # --------------------------
    # No evaluation here: the model is evaluated once, right after this
    # if/else, for both the loaded and the freshly tuned case.
    best_model = tuner.get_best_models(num_models=1)[0]
    best_model.save("/kaggle/working/deep_learning_model.h5")


# --------------------------
# Predictions & Confusion Matrix
# --------------------------
loss, acc = best_model.evaluate(val_dataset)
print(f"\nTest Accuracy: {acc*100:.2f}%")

# Class probabilities per window -> index of the most probable class.
# val_dataset is built without shuffling, so predictions line up with y_test.
y_pred_probs = best_model.predict(val_dataset)
y_pred = y_pred_probs.argmax(axis=1)

# Translate integer codes back to the original label names for display
y_true_labels, y_pred_labels = (
    label_encoder.inverse_transform(codes) for codes in (y_test, y_pred)
)

class_names = label_encoder.classes_
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=class_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues', xticks_rotation=45)
plt.title("Test Set Confusion Matrix")
plt.show()

# --------------------------
# F1 Scores & Classification Report
# --------------------------
# These averages are computed over the classes that occur in y_test / y_pred.
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print("\nF1 Scores:")
print(f"Macro-F1:    {macro_f1:.4f}")
print(f"Micro-F1:    {micro_f1:.4f}")
print(f"Weighted-F1: {weighted_f1:.4f}")

# Pass the full list of encoded classes explicitly. Without `labels`, the
# report infers the classes from y_test / y_pred and raises a ValueError when
# that count differs from len(target_names), e.g. when a class has no window
# in the test split and is never predicted.
all_class_codes = label_encoder.transform(label_encoder.classes_)

print("\nDetailed classification report:")
print(classification_report(
    y_test,
    y_pred,
    labels=all_class_codes,
    target_names=label_encoder.classes_,
))
