📌 Pre-processing for Sensor data

Here we present, step by step, how we pre-process the sensor data for the following human activities:

🔹 Walking

🔹 Running

🔹 Standing up

🔹 Sitting down

🔹 Climbing stairs



The whole procedure includes the following steps:

🔹 Segmentation with overlap (20%)

🔹 Trimming 

🔹 Normalization

🔹 Calculate magnitude



STEP 1 - SPLIT & OVERLAP

For the following activities (running and walking) we split our data into segments of 10 seconds, where consecutive segments overlap each other by 20%.

Note: The data we collected for these two activities (running and walking) was recorded continuously in parts of 1-1.5 minutes, so we need to split it into smaller segments.

In [None]:
import pandas as pd
import os

def segment_session(session_folder, output_base, window_size=10, step_size=8):
    """Split a session's sensor signals into fixed-length, overlapping windows.

    Reads Accelerometer.csv, Gravity.csv and Gyroscope.csv from
    ``session_folder`` and, for every window, writes the matching rows of each
    signal to ``<output_base>/segment_NNN/<signal file name>``. With the
    defaults (10-second windows shifted by 8 seconds) consecutive windows
    share 2 seconds, i.e. a 20% overlap.

    Windows start at time 0 and a trailing part shorter than ``window_size``
    is dropped. Signal files that are missing, empty, or lack a
    ``seconds_elapsed`` column are skipped; if no usable signal remains,
    nothing is written.

    Args:
        session_folder: Folder holding the raw signal CSV files of one session.
        output_base: Folder under which the ``segment_NNN`` folders are created.
        window_size: Window length, in the units of ``seconds_elapsed``.
        step_size: Shift between the starts of consecutive windows.

    Raises:
        ValueError: If ``window_size`` or ``step_size`` is not positive.
    """
    # A non-positive step would never advance the window (endless loop).
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    signal_files = ['Accelerometer.csv', 'Gravity.csv', 'Gyroscope.csv']
    dfs = {}

    # Load every signal file that exists and is usable
    for signal in signal_files:
        path = os.path.join(session_folder, signal)
        if not os.path.exists(path):
            continue
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it as empty
            continue
        if df.empty or 'seconds_elapsed' not in df.columns:
            continue
        dfs[signal] = df

    # Nothing usable in this session: min() below would fail on an empty input
    if not dfs:
        return

    # Use the smallest end time across all signals so every window is
    # fully covered by each of them
    max_time = min(df['seconds_elapsed'].max() for df in dfs.values())
    segment_id = 0
    start = 0

    # Repeat until a full window no longer fits into the session
    while start + window_size <= max_time:
        end = start + window_size

        # Create the folder for this segment (numbered from 001)
        segment_folder = os.path.join(output_base, f"segment_{segment_id+1:03d}")
        os.makedirs(segment_folder, exist_ok=True)

        # Save each signal's rows in [start, end) into the segment folder
        for signal, df in dfs.items():
            in_window = (df['seconds_elapsed'] >= start) & (df['seconds_elapsed'] < end)
            df[in_window].to_csv(os.path.join(segment_folder, signal), index=False)

        segment_id += 1
        # Derive the start from the index instead of accumulating, so
        # fractional step sizes do not drift
        start = segment_id * step_size


In [None]:
# Root of the raw 'run' recordings; each sub-directory is treated as one session
base_folder = r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\run"
# Root under which the segmented sessions are written
output_base = r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\run_segments"

# Segment every session directory found directly under the 'run' folder
for session_name in os.listdir(base_folder):
    session_path = os.path.join(base_folder, session_name)
    # Anything that is not a directory is ignored
    if not os.path.isdir(session_path):
        continue
    # Each session gets its own output sub-folder holding its segments
    session_output = os.path.join(output_base, session_name)
    segment_session(session_path, session_output, window_size=10, step_size=8)


STEP 2 - TRIMMING

For the following human activities (standing up, sitting down, climbing stairs) we trim the first 1.5 and the last 1.5 seconds of each measurement, to ensure that the movement of the phone inside our pocket is not included.

Note: The data we collected for these activities (standing up, sitting down, climbing stairs) was recorded in parts of at most 8 seconds, 30 files in total, so we did not need to split it.

In [None]:
def trim_signals_in_folder(input_folder, output_folder, trim_seconds=1.5):
    """Cut ``trim_seconds`` off the start and the end of each signal file.

    Looks for Accelerometer.csv, Gravity.csv and Gyroscope.csv in
    ``input_folder`` and writes the trimmed version of each one, under the
    same file name, to ``output_folder`` (created if needed).

    A file is skipped, and nothing is written for it, when it is missing,
    empty, lacks a ``seconds_elapsed`` column, or spans no more than
    ``2 * trim_seconds``.
    """
    os.makedirs(output_folder, exist_ok=True)

    for signal_file in ('Accelerometer.csv', 'Gravity.csv', 'Gyroscope.csv'):
        source_path = os.path.join(input_folder, signal_file)
        if not os.path.exists(source_path):
            continue

        frame = pd.read_csv(source_path)
        if frame.empty or 'seconds_elapsed' not in frame.columns:
            continue

        # Time range covered by this recording
        elapsed = frame['seconds_elapsed']
        first_time = elapsed.min()
        last_time = elapsed.max()

        # Too short: trimming both ends would leave nothing useful
        if last_time - first_time <= 2 * trim_seconds:
            continue

        # Keep the rows inside the trimmed window (both bounds inclusive)
        after_head = elapsed >= first_time + trim_seconds
        before_tail = elapsed <= last_time - trim_seconds
        trimmed = frame[after_head & before_tail]

        trimmed.to_csv(os.path.join(output_folder, signal_file), index=False)


In [None]:
# Root of the raw 'sit' recordings
input_root = r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\sit"

# Root under which the trimmed recordings are written
output_root = r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\sit_trimmed"

# Trim every recording directory found directly under the 'sit' folder
for folder_name in os.listdir(input_root):
    input_path = os.path.join(input_root, folder_name)
    output_path = os.path.join(output_root, folder_name)

    # Anything that is not a directory is ignored
    if not os.path.isdir(input_path):
        continue

    # Drop the first and last 1.5 seconds of each signal file
    trim_signals_in_folder(input_path, output_path, trim_seconds=1.5)


STEP 3 - NORMALIZATION & MAGNITUDE

We normalize all the new files that we have created so far and calculate their magnitude.


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def normalize_and_add_magnitude(input_folder, output_folder):
    """Scale the x, y, z columns of each signal file and add their magnitude.

    For Accelerometer.csv, Gravity.csv and Gyroscope.csv in ``input_folder``:
    the ``x``, ``y`` and ``z`` columns are replaced by the output of a
    ``MinMaxScaler`` (default settings) fitted on that file alone, and a
    ``magnitude`` column holding sqrt(x^2 + y^2 + z^2) of the scaled values
    is added. The result is written under the same file name to
    ``output_folder`` (created if needed).

    A file is skipped when it is missing, empty, or lacks any of the
    ``x``, ``y``, ``z`` columns.
    """
    axes = ['x', 'y', 'z']
    os.makedirs(output_folder, exist_ok=True)
    # One scaler object is reused; fit_transform refits it for every file
    scaler = MinMaxScaler()

    for signal_file in ('Accelerometer.csv', 'Gravity.csv', 'Gyroscope.csv'):
        source_path = os.path.join(input_folder, signal_file)
        if not os.path.exists(source_path):
            continue

        frame = pd.read_csv(source_path)
        # Nothing to do for empty files or files without all three axes
        if frame.empty or any(axis not in frame.columns for axis in axes):
            continue

        # Scale the three axes, then take the vector length of the scaled values
        frame[axes] = scaler.fit_transform(frame[axes])
        frame['magnitude'] = np.sqrt(frame['x']**2 + frame['y']**2 + frame['z']**2)

        # Store the processed signal (scaled x, y, z plus magnitude)
        frame.to_csv(os.path.join(output_folder, signal_file), index=False)


In [None]:
# Input root -> output root for the normalization / magnitude step
folders_to_process = {
    # Segmented 'run' data: session folders that hold segment sub-folders
    r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\run_segments":
        r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\run_processed",

    # Trimmed 'sit' data: sensor files sit directly inside each folder
    r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\sit_trimmed":
        r"C:\Users\Vasil\DataAnalysisLearningMethodsAss1\Data-SA\sit_processed"
}

for input_root, output_root in folders_to_process.items():
    # Roots whose path contains "run_segments" have one extra folder level
    # (segment_001, segment_002, ...) below each session folder
    has_segment_subfolders = "run_segments" in input_root

    for folder_name in os.listdir(input_root):
        input_path = os.path.join(input_root, folder_name)
        output_path = os.path.join(output_root, folder_name)

        # Anything that is not a directory is ignored
        if not os.path.isdir(input_path):
            continue

        if not has_segment_subfolders:
            # Flat layout: process the sensor files in this folder directly
            normalize_and_add_magnitude(input_path, output_path)
            continue

        # Nested layout: process every segment folder of this session
        for segment_name in os.listdir(input_path):
            segment_path = os.path.join(input_path, segment_name)
            output_segment = os.path.join(output_path, segment_name)

            # Only directories are treated as segments
            if os.path.isdir(segment_path):
                normalize_and_add_magnitude(segment_path, output_segment)

