In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Root folder that holds one subfolder per recording; each subfolder is expected to
# contain the three per-sensor PCA feature CSV files listed in `sensor_columns`.
data_folder = "Data_Features"  # Update this to the path where your filtered data is stored

# Mapping from the keyword looked up in a subfolder name to its integer class label
label_mapping = {
    'auto': 0,
    'bus': 1,
    'fiets': 2,
    'lopen': 3,
    'metro': 4,
    'paard': 5,
    'tram': 6,
    'trein': 7
}

# Per-sensor CSV file name -> column names assigned to that file after loading.
# NOTE(review): 'Time (s)' and 'pca1'..'pca3' repeat across the three sensors, so the
# merged frame carries duplicate column names. They are kept unchanged here because the
# saved CSV schema may be relied on elsewhere -- confirm before renaming them.
sensor_columns = {
    'Accelerometer_pca.csv': [
        'Time (s)', 'acc_mean_X (m/s^2)', 'acc_mean_Y (m/s^2)', 'acc_mean_Z (m/s^2)',
        'pca1', 'pca2', 'pca3',
    ],
    'Gyroscope_pca.csv': [
        'Time (s)', 'gyro_mean_X (rad/s)', 'gyro_mean_Y (rad/s)', 'gyro_mean_Z (rad/s)',
        'pca1', 'pca2', 'pca3',
    ],
    'Linear Accelerometer_pca.csv': [
        'Time (s)', 'lin_acc_mean_X (m/s^2)', 'lin_acc_mean_Y (m/s^2)', 'lin_acc_mean_Z (m/s^2)',
        'pca1', 'pca2', 'pca3',
    ],
}

# One merged (acc + gyro + lin_acc + label) frame per usable subfolder
combined_data_list = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the row order
# of the combined frame (and any seeded shuffle of it) differ from one machine to the next.
for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # A subfolder is only usable when all three sensor files are present
    sensor_paths = [os.path.join(subfolder_path, file_name) for file_name in sensor_columns]
    if not all(os.path.exists(path) for path in sensor_paths):
        continue

    # The label is the first mapping key that occurs as a substring of the folder name
    label = next((label_mapping[key] for key in label_mapping if key in subfolder), None)
    if label is None:
        # Without a label the rows would end up with NaN in the 'label' column of the
        # combined frame, so the folder is skipped (and reported) instead of appended.
        print(f"Skipping '{subfolder}': folder name matches no key in label_mapping")
        continue

    sensor_frames = []
    for path, columns in zip(sensor_paths, sensor_columns.values()):
        frame = pd.read_csv(path)
        frame.columns = columns
        # Positional index so the three sensors line up row-by-row in the concat below
        sensor_frames.append(frame.reset_index(drop=True))

    # Side-by-side merge: rows are paired by position, not by timestamp
    merged_data = pd.concat(sensor_frames, axis=1)
    merged_data['label'] = label
    combined_data_list.append(merged_data)

if not combined_data_list:
    # pd.concat would raise an opaque "No objects to concatenate" here; say what is wrong
    raise ValueError(
        f"No usable subfolders found in '{data_folder}': each one needs "
        f"{', '.join(sensor_columns)} and a name containing one of {list(label_mapping)}"
    )

# Stack all recordings into a single DataFrame with a fresh 0..n-1 index
combined_data = pd.concat(combined_data_list, ignore_index=True)

# Check if data has been loaded and merged correctly
print(combined_data.head())


    Time (s)  acc_mean_X (m/s^2)  acc_mean_Y (m/s^2)  acc_mean_Z (m/s^2)  \
0  10.250332            0.012065           -0.550455            9.800024   
1  10.746557            0.020518           -0.545073            9.797954   
2  11.247795            0.035690           -0.493657            9.779238   
3  11.749034            0.132072           -0.061541            9.886638   
4  12.250274            0.720517            0.683281            9.873849   

       pca1      pca2      pca3   Time (s)  gyro_mean_X (rad/s)  \
0  0.197689  0.025060  0.004187  10.247826            -0.000817   
1  0.206563  0.029507  0.001701  10.749063            -0.000022   
2  0.226069  0.078197 -0.020044  11.250301             0.001961   
3  0.364178  0.504528  0.061908  11.751541             0.005399   
4  1.018785  1.189206 -0.000517  12.252781             0.008621   

   gyro_mean_Y (rad/s)  ...      pca2      pca3   Time (s)  \
0            -0.000605  ...  0.000219 -0.000629  10.247826   
1             0.

In [6]:
# Show the combined frame as this cell's output to inspect the merged columns and labels
combined_data

Unnamed: 0,Time (s),acc_mean_X (m/s^2),acc_mean_Y (m/s^2),acc_mean_Z (m/s^2),pca1,pca2,pca3,Time (s).1,gyro_mean_X (rad/s),gyro_mean_Y (rad/s),...,pca2.1,pca3.1,Time (s).2,lin_acc_mean_X (m/s^2),lin_acc_mean_Y (m/s^2),lin_acc_mean_Z (m/s^2),pca1.1,pca2.2,pca3.2,label
0,10.250332,0.012065,-0.550455,9.800024,0.197689,0.025060,0.004187,10.247826,-0.000817,-0.000605,...,0.000219,-0.000629,10.247826,-0.007027,0.010127,0.004482,0.059783,0.163531,-0.024303,0
1,10.746557,0.020518,-0.545073,9.797954,0.206563,0.029507,0.001701,10.749063,-0.000022,0.000382,...,0.001036,0.000318,10.749063,0.000415,0.015643,0.005915,0.067140,0.169246,-0.023269,0
2,11.247795,0.035690,-0.493657,9.779238,0.226069,0.078197,-0.020044,11.250301,0.001961,-0.000264,...,0.003017,-0.000311,11.250301,0.008570,0.063228,-0.020871,0.073942,0.215601,-0.052480,0
3,11.749034,0.132072,-0.061541,9.886638,0.364178,0.504528,0.061908,11.751541,0.005399,-0.000180,...,0.006362,-0.000737,11.751541,0.109061,0.475696,0.093381,0.168371,0.634951,0.039775,0
4,12.250274,0.720517,0.683281,9.873849,1.018785,1.189206,-0.000517,12.252781,0.008621,-0.002708,...,0.008466,-0.008022,12.252781,0.650630,1.088782,0.073953,0.697701,1.256174,-0.018890,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25357,584.251912,-1.015524,1.287457,9.791637,-0.064398,0.907601,-0.035740,584.250657,0.009569,-0.001612,...,-0.028105,0.002307,584.250657,-0.029563,0.021925,0.116329,-0.006877,0.003131,0.067062,7
25358,584.753296,-1.022664,1.293318,9.727238,-0.068099,0.910779,-0.100614,584.747027,0.000948,-0.004969,...,-0.011029,-0.001486,584.747027,-0.033096,-0.001747,0.053025,-0.009745,-0.026775,0.006418,7
25359,585.249665,-1.060643,1.271176,9.746895,-0.106768,0.888803,-0.082151,585.248410,-0.003231,0.001832,...,0.011728,-0.000245,585.248410,-0.031055,0.007185,0.074944,-0.008096,-0.015545,0.027288,7
25360,585.746035,-0.990698,1.324092,9.750749,-0.037913,0.942992,-0.076682,585.749794,0.004313,-0.006254,...,0.004280,0.006369,585.749794,-0.020011,0.033301,0.073540,-0.002541,0.011939,0.022844,7


In [9]:
# Split the data into training (65%), validation (10%), and test sets (25%).
# The split is done in two steps: first carve off the 35% holdout, then divide the
# holdout into validation and test so that test ends up at 25% of the full data.
val_fraction = 0.10
test_fraction = 0.25
holdout_fraction = val_fraction + test_fraction

train_data, holdout_data = train_test_split(
    combined_data, test_size=holdout_fraction, random_state=42
)
# test_fraction / holdout_fraction (= 0.25 / 0.35 ≈ 0.7143) is the share of the holdout
# that becomes the test set; the remainder is the validation set.
val_data, test_data = train_test_split(
    holdout_data, test_size=test_fraction / holdout_fraction, random_state=42
)

# Make sure the output folder exists; to_csv does not create missing directories
os.makedirs('Sets/pca', exist_ok=True)

# Save the datasets to CSV files
train_data.to_csv('Sets/pca/train_data_pca.csv', index=False)
# File name fixed from 'val_data.csv_pca' so it matches the train/test naming and keeps
# the .csv extension -- update any reader that still uses the old name.
val_data.to_csv('Sets/pca/val_data_pca.csv', index=False)
test_data.to_csv('Sets/pca/test_data_pca.csv', index=False)

# Verify the split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 16485
Validation set size: 2536
Test set size: 6341
