In [19]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Root directory that holds one sub-directory of feature CSVs per recording
data_folder = "Data_Features"  # Update this to the path where your filtered data is stored

# Keywords searched for in recording folder names, listed in label order:
# the position of each keyword is the integer class label assigned to it.
label_mapping = {
    keyword: class_id
    for class_id, keyword in enumerate(
        ('auto', 'bus', 'fiets', 'lopen', 'metro', 'paard', 'tram', 'trein')
    )
}

# Accumulator for the per-recording DataFrames; concatenated once after loading
combined_data_list = list()

# Load every recording folder under data_folder and append one labelled frame per
# recording to combined_data_list.
#
# Folders are visited in sorted order: os.listdir() returns entries in arbitrary
# order, and the row order of the combined frame decides which rows a seeded
# random split picks, so an unsorted walk makes the split non-reproducible.
sensor_file_names = (
    'Accelerometer_pca.csv',
    'Gyroscope_pca.csv',
    'Linear Accelerometer_pca.csv',
)

for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # A recording is only usable when all three sensor exports are present.
    sensor_paths = [os.path.join(subfolder_path, file_name) for file_name in sensor_file_names]
    missing_files = [os.path.basename(path) for path in sensor_paths if not os.path.exists(path)]
    if missing_files:
        print(f"Skipping '{subfolder}': missing {', '.join(missing_files)}")
        continue

    # Determine the label from the folder name: the first keyword of label_mapping
    # (in insertion order) that occurs as a substring of the folder name wins.
    label = next((label_mapping[key] for key in label_mapping if key in subfolder), None)
    if label is None:
        # Appending an unlabelled frame would leave NaN labels in the combined
        # data, so the recording is skipped and reported instead.
        print(f"Skipping '{subfolder}': folder name matches no key in label_mapping")
        continue

    # Read the accelerometer, gyroscope, and linear accelerometer CSV files.
    # reset_index(drop=True) guarantees a 0..n-1 index so the column-wise concat
    # below pairs rows by position.
    acc_data, gyro_data, lin_acc_data = (
        pd.read_csv(path).reset_index(drop=True) for path in sensor_paths
    )

    # Side-by-side merge of the three sensors.
    # NOTE(review): the head() output of this notebook shows the three files
    # sharing column names ('Time (s)', 'X', 'Y', 'Z', 'pca1', 'pca2'), so the
    # merged frame carries duplicate column labels. Left unchanged because the
    # headers of the saved CSVs depend on it - confirm before renaming.
    # NOTE(review): if the files differ in row count, the shorter ones are padded
    # with NaN by the concat - verify the exports have equal length.
    merged_data = pd.concat([acc_data, gyro_data, lin_acc_data], axis=1)

    # Add the label column and collect the recording
    merged_data['label'] = label
    combined_data_list.append(merged_data)

# Concatenate all the per-recording frames into a single DataFrame.
# When nothing was loaded, fail with an actionable message instead of pandas'
# generic "No objects to concatenate" (same exception type, ValueError).
if not combined_data_list:
    raise ValueError(
        f"No recordings were loaded from '{data_folder}'. Expected sub-folders containing "
        "'Accelerometer_pca.csv', 'Gyroscope_pca.csv' and 'Linear Accelerometer_pca.csv'."
    )
combined_data = pd.concat(combined_data_list, ignore_index=True)

# Check if data has been loaded and merged correctly
print(combined_data.head())


    Time (s)         X         Y         Z      pca1      pca2   Time (s)  \
0  10.250332  0.012065 -0.550455  9.800024  0.197689  0.025060  10.247826   
1  10.746557  0.020518 -0.545073  9.797954  0.206563  0.029507  10.749063   
2  11.247795  0.035690 -0.493657  9.779238  0.226069  0.078197  11.250301   
3  11.749034  0.132072 -0.061541  9.886638  0.364178  0.504528  11.751541   
4  12.250274  0.720517  0.683281  9.873849  1.018785  1.189206  12.252781   

          X         Y         Z      pca1      pca2   Time (s)         X  \
0 -0.000817 -0.000605 -0.001003 -0.005252  0.000219  10.247826 -0.007027   
1 -0.000022  0.000382 -0.001239 -0.005561  0.001036  10.749063  0.000415   
2  0.001961 -0.000264 -0.000147 -0.004455  0.003017  11.250301  0.008570   
3  0.005399 -0.000180 -0.006632 -0.010975  0.006362  11.751541  0.109061   
4  0.008621 -0.002708 -0.079796 -0.083859  0.008466  12.252781  0.650630   

          Y         Z      pca1      pca2  label  
0  0.010127  0.004482  0.0597

In [20]:
# Class balance check: number of rows per integer label.
# dropna=False also counts rows whose label is missing (if any); the default
# value_counts() would silently leave them out of the tally.
combined_data['label'].value_counts(dropna=False)

5    5463
2    5161
6    3742
0    3404
4    2634
3    2621
7    2560
1    2137
Name: label, dtype: int64

In [18]:
# Split the data into training (65%), validation (10%), and test (25%) sets and
# save each set as a CSV file.
holdout_fraction = 0.35                            # validation + test share of all rows
test_share_of_holdout = 0.25 / holdout_fraction    # test share inside the hold-out (~0.7143)
split_seed = 42
output_dir = 'Sets/pca'

# Stratifying on the label needs a label on every row; unlabelled rows are also
# unusable for supervised training, so stop early with a clear message.
if combined_data['label'].isna().any():
    raise ValueError("combined_data contains rows without a label; fix the loading step before splitting.")

# Stage 1: separate the training rows from the hold-out (validation + test) rows.
# stratify keeps the class proportions of the full data in every set, which
# matters because the classes are imbalanced.
# NOTE(review): rows are split individually, so rows from the same recording can
# end up in both train and test. If rows are consecutive time windows this may
# leak information - consider a group-wise split per recording; confirm.
train_data, holdout_data = train_test_split(
    combined_data,
    test_size=holdout_fraction,
    random_state=split_seed,
    stratify=combined_data['label'],
)

# Stage 2: divide the hold-out rows into validation and test sets.
val_data, test_data = train_test_split(
    holdout_data,
    test_size=test_share_of_holdout,
    random_state=split_seed,
    stratify=holdout_data['label'],
)

# Save the datasets to CSV files (create the target folder on first run)
os.makedirs(output_dir, exist_ok=True)
train_data.to_csv('Sets/pca/train_data_pca.csv', index=False)
val_data.to_csv('Sets/pca/val_data_pca.csv', index=False)
test_data.to_csv('Sets/pca/test_data_pca.csv', index=False)

# Verify the split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 18019
Validation set size: 2772
Test set size: 6931
