In [25]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Build one labelled DataFrame (`combined_data`) from every recording folder
# under `data_folder`. Each recording folder is expected to contain
# 'Accelerometer.csv', 'Gyroscope.csv' and 'Linear Accelerometer.csv'; the
# three files are placed side by side row-by-row and tagged with an integer
# label derived from the folder name.

# Define the path to the filtered data folder
data_folder = "Data_Filtered"  # Update this to the path where your filtered data is stored

# Define a mapping from folder names to integer labels.
# A folder gets the label of the first key (in this order) that occurs as a
# substring of its name.
label_mapping = {
    'auto': 0,
    'bus': 1,
    'fiets': 2,
    'lopen': 3,
    'metro': 4,
    'paard': 5,
    'tram': 6,
    'trein': 7
}

# Initialize a list to hold the per-recording frames
combined_data_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and therefore any seeded split of it) reproducible.
for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # Determine the label first: a recording without a label must not end up in
    # the dataset, otherwise its rows would carry NaN in the 'label' column.
    label = next((label_mapping[key] for key in label_mapping if key in subfolder), None)
    if label is None:
        print(f"Skipping '{subfolder}': folder name matches no key in label_mapping")
        continue

    accelerometer_file = os.path.join(subfolder_path, 'Accelerometer.csv')
    gyroscope_file = os.path.join(subfolder_path, 'Gyroscope.csv')
    linear_accelerometer_file = os.path.join(subfolder_path, 'Linear Accelerometer.csv')

    # All three sensor files are required; report incomplete recordings
    # instead of dropping them silently.
    missing_files = [
        path
        for path in (accelerometer_file, gyroscope_file, linear_accelerometer_file)
        if not os.path.exists(path)
    ]
    if missing_files:
        print(f"Skipping '{subfolder}': missing {missing_files}")
        continue

    acc_data = pd.read_csv(accelerometer_file)
    gyro_data = pd.read_csv(gyroscope_file)
    lin_acc_data = pd.read_csv(linear_accelerometer_file)

    # Rename columns to include the type of data for clarity.
    # Assigning a list raises ValueError if a file does not have exactly four columns.
    # NOTE(review): all three frames keep the same 'Time (s)' name, so the merged
    # frame has three columns called 'Time (s)'. Left unchanged to preserve the
    # saved CSV schema; consider giving each sensor its own time column name.
    acc_data.columns = ['Time (s)', 'acc_mean_X (m/s^2)', 'acc_mean_Y (m/s^2)', 'acc_mean_Z (m/s^2)']
    gyro_data.columns = ['Time (s)', 'gyro_mean_X (rad/s)', 'gyro_mean_Y (rad/s)', 'gyro_mean_Z (rad/s)']
    lin_acc_data.columns = ['Time (s)', 'lin_acc_mean_X (m/s^2)', 'lin_acc_mean_Y (m/s^2)', 'lin_acc_mean_Z (m/s^2)']

    # The frames are joined by row position (read_csv gives each a fresh
    # 0..n-1 index). If the lengths differ, concat pads the shorter ones with NaN.
    row_counts = {len(acc_data), len(gyro_data), len(lin_acc_data)}
    if len(row_counts) > 1:
        print(
            f"Warning: '{subfolder}' sensor files differ in length "
            f"({len(acc_data)}, {len(gyro_data)}, {len(lin_acc_data)}); "
            "unmatched rows will contain NaN"
        )

    merged_data = pd.concat([acc_data, gyro_data, lin_acc_data], axis=1)

    # Add the label column based on the folder name
    merged_data['label'] = label

    # Append the merged data to the list
    combined_data_list.append(merged_data)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not combined_data_list:
    raise ValueError(
        f"No labelled recordings with all three sensor files found under '{data_folder}'"
    )

# Concatenate all the combined data into a single DataFrame
combined_data = pd.concat(combined_data_list, ignore_index=True)

# Check if data has been loaded and merged correctly
print(combined_data.head())


    Time (s)  acc_mean_X (m/s^2)  acc_mean_Y (m/s^2)  acc_mean_Z (m/s^2)  \
0  10.250332            0.012065           -0.550455            9.800024   
1  10.746557            0.020518           -0.545073            9.797954   
2  11.247795            0.035690           -0.493657            9.779238   
3  11.749034            0.132072           -0.061541            9.886638   
4  12.250274            0.720517            0.683281            9.873849   

    Time (s)  gyro_mean_X (rad/s)  gyro_mean_Y (rad/s)  gyro_mean_Z (rad/s)  \
0  10.247826            -0.000817            -0.000605            -0.001003   
1  10.749063            -0.000022             0.000382            -0.001239   
2  11.250301             0.001961            -0.000264            -0.000147   
3  11.751541             0.005399            -0.000180            -0.006632   
4  12.252781             0.008621            -0.002708            -0.079796   

    Time (s)  lin_acc_mean_X (m/s^2)  lin_acc_mean_Y (m/s^2)  \
0  1

In [26]:
# Show the combined frame as this cell's output to inspect its columns and labels.
combined_data

Unnamed: 0,Time (s),acc_mean_X (m/s^2),acc_mean_Y (m/s^2),acc_mean_Z (m/s^2),Time (s).1,gyro_mean_X (rad/s),gyro_mean_Y (rad/s),gyro_mean_Z (rad/s),Time (s).2,lin_acc_mean_X (m/s^2),lin_acc_mean_Y (m/s^2),lin_acc_mean_Z (m/s^2),label
0,10.250332,0.012065,-0.550455,9.800024,10.247826,-0.000817,-0.000605,-0.001003,10.247826,-0.007027,0.010127,0.004482,0
1,10.746557,0.020518,-0.545073,9.797954,10.749063,-0.000022,0.000382,-0.001239,10.749063,0.000415,0.015643,0.005915,0
2,11.247795,0.035690,-0.493657,9.779238,11.250301,0.001961,-0.000264,-0.000147,11.250301,0.008570,0.063228,-0.020871,0
3,11.749034,0.132072,-0.061541,9.886638,11.751541,0.005399,-0.000180,-0.006632,11.751541,0.109061,0.475696,0.093381,0
4,12.250274,0.720517,0.683281,9.873849,12.252781,0.008621,-0.002708,-0.079796,12.252781,0.650630,1.088782,0.073953,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25357,584.251912,-1.015524,1.287457,9.791637,584.250657,0.009569,-0.001612,-0.028967,584.250657,-0.029563,0.021925,0.116329,7
25358,584.753296,-1.022664,1.293318,9.727238,584.747027,0.000948,-0.004969,-0.014095,584.747027,-0.033096,-0.001747,0.053025,7
25359,585.249665,-1.060643,1.271176,9.746895,585.248410,-0.003231,0.001832,0.011691,585.248410,-0.031055,0.007185,0.074944,7
25360,585.746035,-0.990698,1.324092,9.750749,585.749794,0.004313,-0.006254,0.003432,585.749794,-0.020011,0.033301,0.073540,7


In [27]:
# Split the data into training (65%), validation (10%), and test sets (25%).
# The proportions are named once here; the second-stage ratio is derived from
# them instead of being a hand-rounded constant.
TRAIN_FRACTION = 0.65
VAL_FRACTION = 0.10
TEST_FRACTION = 0.25
SPLIT_SEED = 42

assert abs(TRAIN_FRACTION + VAL_FRACTION + TEST_FRACTION - 1.0) < 1e-9, \
    "split fractions must sum to 1"

# NOTE(review): this is a row-wise random split. Rows from the same recording
# can land in different sets, which may inflate validation/test scores for
# time-series data - consider splitting per recording instead.

# Stage 1: carve off the part that is not used for training (validation + test).
holdout_fraction = VAL_FRACTION + TEST_FRACTION
train_data, holdout_data = train_test_split(
    combined_data, test_size=holdout_fraction, random_state=SPLIT_SEED
)

# Stage 2: divide the holdout into validation and test. The test share of the
# holdout is TEST_FRACTION / holdout_fraction (0.25 / 0.35).
val_data, test_data = train_test_split(
    holdout_data, test_size=TEST_FRACTION / holdout_fraction, random_state=SPLIT_SEED
)

# Every row must end up in exactly one of the three sets.
assert len(train_data) + len(val_data) + len(test_data) == len(combined_data)

# Save the datasets to CSV files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Verify the split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 16485
Validation set size: 2536
Test set size: 6341
