In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Define the path to the per-recording feature folders
data_folder = "Data_Features"  # Update this to the path where your filtered data is stored

# Define a mapping from folder names to integer labels.
# A folder is assigned the label of the first key that occurs as a substring of its name.
label_mapping = {
    'auto': 0,
    'bus': 1,
    'fiets': 2,
    'lopen': 3,
    'metro': 4,
    'paard': 5,
    'tram': 6,
    'trein': 7
}

# The three sensor CSVs expected in every recording folder, in the order
# their columns are placed side by side in the merged frame.
sensor_files = [
    'Accelerometer_pca.csv',
    'Gyroscope_pca.csv',
    'Linear Accelerometer_pca.csv',
]

# Initialize a list to hold one merged DataFrame per recording folder
combined_data_list = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined frame stable across machines and runs, so any seeded
# shuffle/split applied to it later is reproducible.
for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    # Determine the label based on the folder name
    label = next((value for key, value in label_mapping.items() if key in subfolder), None)
    if label is None:
        # Appending unlabelled rows would leave NaN in the 'label' column after
        # the final concat, so skip the folder and say so instead.
        print(f"Skipping '{subfolder}': folder name matches no key in label_mapping")
        continue

    sensor_paths = [os.path.join(subfolder_path, name) for name in sensor_files]
    missing = [path for path in sensor_paths if not os.path.exists(path)]
    if missing:
        print(f"Skipping '{subfolder}': missing {missing}")
        continue

    # Reset each index so the three frames are aligned purely by row position
    frames = [pd.read_csv(path).reset_index(drop=True) for path in sensor_paths]

    # concat(axis=1) pads the shorter frames with NaN when lengths differ;
    # surface that instead of letting it pass silently.
    row_counts = [len(frame) for frame in frames]
    if len(set(row_counts)) > 1:
        print(f"Warning: '{subfolder}' sensor files have different lengths {row_counts}; "
              "shorter ones are padded with NaN")

    # NOTE: the sensor files share column names (see the printed head below),
    # so the merged frame has duplicate column labels. This is kept as-is
    # so the column layout of the result does not change.
    merged_data = pd.concat(frames, axis=1)

    # Add the label column based on the folder name
    merged_data['label'] = label

    # Append the merged data to the list
    combined_data_list.append(merged_data)

if not combined_data_list:
    raise ValueError(
        f"No labelled recording folders with all sensor files found under '{data_folder}'"
    )

# Concatenate all the combined data into a single DataFrame
combined_data = pd.concat(combined_data_list, ignore_index=True)

# Check if data has been loaded and merged correctly
print(combined_data.head())


    Time (s)         X         Y         Z      pca1      pca2   Time (s)  \
0  10.250332  0.012065 -0.550455  9.800024  0.197689  0.025060  10.247826   
1  10.746557  0.020518 -0.545073  9.797954  0.206563  0.029507  10.749063   
2  11.247795  0.035690 -0.493657  9.779238  0.226069  0.078197  11.250301   
3  11.749034  0.132072 -0.061541  9.886638  0.364178  0.504528  11.751541   
4  12.250274  0.720517  0.683281  9.873849  1.018785  1.189206  12.252781   

          X         Y         Z      pca1      pca2   Time (s)         X  \
0 -0.000817 -0.000605 -0.001003 -0.005252  0.000219  10.247826 -0.007027   
1 -0.000022  0.000382 -0.001239 -0.005561  0.001036  10.749063  0.000415   
2  0.001961 -0.000264 -0.000147 -0.004455  0.003017  11.250301  0.008570   
3  0.005399 -0.000180 -0.006632 -0.010975  0.006362  11.751541  0.109061   
4  0.008621 -0.002708 -0.079796 -0.083859  0.008466  12.252781  0.650630   

          Y         Z      pca1      pca2  label  
0  0.010127  0.004482  0.0597

In [11]:
# Display the combined DataFrame as the cell's output.
# NOTE(review): the output recorded below belongs to execution In [11], which
# predates the In [17] run of the loading cell, and it shows different columns
# (e.g. pca3, temp_X_mean) - presumably stale; re-run after the loading cell to confirm.
combined_data

Unnamed: 0,Time (s),X,Y,Z,pca1,pca2,pca3,temp_X_mean,temp_X_max,temp_X_min,...,X_spectral_energy,Y_fft,Y_highest_freq,Y_power_spectrum_entropy,Y_spectral_energy,Z_fft,Z_highest_freq,Z_power_spectrum_entropy,Z_spectral_energy,label
0,15.247708,2.494864,0.263151,9.758139,2.743937,0.599160,-0.118427,1.557725,3.100055,0.020518,...,41108.195285,39.872792,87.362427,1.857633,20556.979697,0.316000,26.334736,1.280020,1119.503967,0
1,15.748955,2.860747,0.109276,9.817755,3.095071,0.415121,-0.055684,1.841748,3.100055,0.035690,...,52870.138478,46.710232,63.890691,2.043158,15106.631747,6.951303,13.071928,1.782434,474.306258,0
2,16.250203,3.165679,-1.349452,9.623139,3.259581,-1.074087,-0.173764,2.154747,3.165679,0.132072,...,75160.975457,194.815729,194.815729,1.072146,51371.200053,8.740888,9.665143,1.911626,379.834090,0
3,16.751452,3.731824,-0.821574,9.727975,3.874168,-0.596682,-0.106445,2.514722,3.731824,0.720517,...,74785.044691,106.335301,194.815729,1.337469,62533.340658,10.144990,10.144990,1.925830,461.223776,0
4,17.252702,2.968281,0.206020,9.766919,3.210098,0.498500,-0.113336,2.739498,3.731824,1.290322,...,104643.341996,169.352712,194.815729,1.509854,90613.638529,19.132215,19.132215,1.723994,819.712523,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,584.251912,-1.015524,1.287457,9.791637,-0.064398,0.907601,-0.035740,-0.896511,-0.842324,-1.015524,...,8252.880864,17.057331,27.701665,2.001584,2839.573213,4.826826,8.702672,1.988325,349.481772,7
27528,584.753296,-1.022664,1.293318,9.727238,-0.068099,0.910779,-0.100614,-0.914545,-0.845503,-1.022664,...,6741.452200,11.950764,27.701665,1.989785,2801.016567,0.802026,8.702672,1.908035,336.549025,7
27529,585.249665,-1.060643,1.271176,9.746895,-0.106768,0.888803,-0.082151,-0.935499,-0.845503,-1.060643,...,6209.506596,16.866691,27.701665,2.030599,2970.914734,1.509922,8.702672,1.922666,337.963464,7
27530,585.746035,-0.990698,1.324092,9.750749,-0.037913,0.942992,-0.076682,-0.950018,-0.847723,-1.060643,...,6213.126147,24.438290,27.701665,2.076171,3476.098376,2.769441,8.702672,1.925967,338.550653,7


In [18]:
# Split the data into training (65%), validation (10%), and test sets (25%).
# Step 1: hold out 35% of the rows; the remaining 65% is the training set.
train_data, holdout_data = train_test_split(combined_data, test_size=0.35, random_state=42)
# Step 2: split the hold-out into validation and test.
# 0.7143 * 0.35 ≈ 0.25 of all rows go to test, the remaining ≈ 0.10 to validation.
val_data, test_data = train_test_split(holdout_data, test_size=0.7143, random_state=42)

# NOTE(review): rows are shuffled individually, so neighbouring samples of the
# same recording can end up in different sets - confirm this is intended.

# Every row must land in exactly one of the three sets
assert len(train_data) + len(val_data) + len(test_data) == len(combined_data)

# Save the datasets to CSV files.
# to_csv() does not create missing parent directories, so make sure the
# output folder exists first.
os.makedirs('Sets/pca', exist_ok=True)
train_data.to_csv('Sets/pca/train_data_pca.csv', index=False)
val_data.to_csv('Sets/pca/val_data_pca.csv', index=False)
test_data.to_csv('Sets/pca/test_data_pca.csv', index=False)

# Verify the split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 18019
Validation set size: 2772
Test set size: 6931
