In [19]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Folder that contains one subfolder per recording; every usable subfolder
# holds the three sensor CSV files listed in `sensor_filenames` below.
data_folder = "Data_Filtered"  # Update this to the path where your filtered data is stored

# Keyword -> integer class label. A recording gets the label of the first
# keyword (in the insertion order below) that occurs in its folder name.
label_mapping = {
    'auto': 0,
    'bus': 1,
    'fiets': 2,
    'lopen': 3,
    'metro': 4,
    'paard': 5,
    'tram': 6,
    'trein': 7
}

# Files a recording folder must contain; order fixes the column order of the merge.
sensor_filenames = ['Accelerometer.csv', 'Gyroscope.csv', 'Linear Accelerometer.csv']

# One merged, labelled DataFrame per recording folder.
combined_data_list = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# combined_data (and anything derived from it with a fixed seed) reproducible.
for subfolder in sorted(os.listdir(data_folder)):
    subfolder_path = os.path.join(data_folder, subfolder)
    if not os.path.isdir(subfolder_path):
        continue

    sensor_paths = [os.path.join(subfolder_path, name) for name in sensor_filenames]
    if not all(os.path.exists(path) for path in sensor_paths):
        # Incomplete recording: skipped, as before.
        continue

    # Determine the label based on the folder name.
    label = next((label_mapping[key] for key in label_mapping if key in subfolder), None)
    if label is None:
        # Previously such folders were appended without a 'label' column, which
        # left NaN labels in combined_data. Skip them and say so instead.
        print(f"Skipping '{subfolder}': folder name matches no key in label_mapping")
        continue

    # read_csv already gives each frame a fresh 0..n-1 index, so the axis=1
    # concat lines the three sensors up row by row without a reset_index.
    acc_data, gyro_data, lin_acc_data = (pd.read_csv(path) for path in sensor_paths)

    # NOTE(review): judging by the printed head, the three files share the
    # column names 'Time (s)', 'X', 'Y', 'Z', so the merged frame carries
    # duplicate column names. Left unchanged to keep the saved CSV schema.
    merged_data = pd.concat([acc_data, gyro_data, lin_acc_data], axis=1)
    merged_data['label'] = label
    combined_data_list.append(merged_data)

# pd.concat raises an opaque error on an empty list; fail with a clear message.
if not combined_data_list:
    raise ValueError(
        f"No labelled recordings with all sensor files found in '{data_folder}'"
    )

# Concatenate all recordings into a single DataFrame with a fresh index.
combined_data = pd.concat(combined_data_list, ignore_index=True)

# Check if data has been loaded and merged correctly
print(combined_data.head())


    Time (s)         X         Y         Z   Time (s)         X         Y  \
0  10.250332  0.012065 -0.550455  9.800024  10.247826 -0.000817 -0.000605   
1  10.746557  0.020518 -0.545073  9.797954  10.749063 -0.000022  0.000382   
2  11.247795  0.035690 -0.493657  9.779238  11.250301  0.001961 -0.000264   
3  11.749034  0.132072 -0.061541  9.886638  11.751541  0.005399 -0.000180   
4  12.250274  0.720517  0.683281  9.873849  12.252781  0.008621 -0.002708   

          Z   Time (s)         X         Y         Z  label  
0 -0.001003  10.247826 -0.007027  0.010127  0.004482      0  
1 -0.001239  10.749063  0.000415  0.015643  0.005915      0  
2 -0.000147  11.250301  0.008570  0.063228 -0.020871      0  
3 -0.006632  11.751541  0.109061  0.475696  0.093381      0  
4 -0.079796  12.252781  0.650630  1.088782  0.073953      0  


In [13]:
# Show the combined DataFrame as this cell's output.
# NOTE(review): the saved output under this cell carries an older execution
# count (In [13]) and lists feature columns (pca1, temp_*, Z_lag*) that the
# loading cell of this notebook does not create — presumably stale output from
# a different kernel state; re-run the notebook top to bottom to confirm.
combined_data

Unnamed: 0,Time (s),X,Y,Z,pca1,pca2,pca3,temp_X_mean,temp_X_max,temp_X_min,...,temp_Z_max,temp_Z_min,temp_Z_std,temp_Z_median,temp_Z_sum,temp_Z_skew,temp_Z_kurt,Z_lag1,Z_lag2,label
0,15.247708,2.494864,0.263151,9.758139,2.743937,0.599160,-0.118427,1.557725,3.100055,0.020518,...,0.127293,-0.135981,0.079461,-0.007478,0.010748,0.048280,-0.330709,-0.030613,-0.135981,0
1,15.748955,2.860747,0.109276,9.817755,3.095071,0.415121,-0.055684,1.841748,3.100055,0.035690,...,0.127293,-0.135981,0.079867,-0.005254,0.031387,-0.044906,-0.410053,-0.049673,-0.030613,0
2,16.250203,3.165679,-1.349452,9.623139,3.259581,-1.074087,-0.173764,2.154747,3.165679,0.132072,...,0.127293,-0.175428,0.097940,-0.010124,-0.123170,-0.266835,-0.750303,0.026554,-0.049673,0
3,16.751452,3.731824,-0.821574,9.727975,3.874168,-0.596682,-0.106445,2.514722,3.731824,0.720517,...,0.127293,-0.175428,0.091243,-0.040143,-0.274135,0.063869,-0.212551,-0.175428,0.026554,0
4,17.252702,2.968281,0.206020,9.766919,3.210098,0.498500,-0.113336,2.739498,3.731824,1.290322,...,0.127293,-0.175428,0.084224,-0.040143,-0.367548,0.257009,0.920773,-0.057584,-0.175428,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,584.251912,-1.015524,1.287457,9.791637,-0.064398,0.907601,-0.035740,-0.896511,-0.842324,-1.015524,...,0.190208,-0.011882,0.054805,0.077716,0.816298,0.287568,1.189687,0.024659,0.106400,7
25188,584.753296,-1.022664,1.293318,9.727238,-0.068099,0.910779,-0.100614,-0.914545,-0.845503,-1.022664,...,0.190208,-0.011882,0.055527,0.072851,0.795252,0.400066,1.001402,0.116329,0.024659,7
25189,585.249665,-1.060643,1.271176,9.746895,-0.106768,0.888803,-0.082151,-0.935499,-0.845503,-1.060643,...,0.190208,-0.011882,0.055306,0.078153,0.805854,0.335785,1.041396,0.053025,0.116329,7
25190,585.746035,-0.990698,1.324092,9.750749,-0.037913,0.942992,-0.076682,-0.950018,-0.847723,-1.060643,...,0.190208,-0.011882,0.054454,0.074242,0.771063,0.573064,1.553297,0.074944,0.053025,7


In [20]:
# Split the rows into training (65%), validation (10%), and test (25%) sets.
# Step 1 holds out 35% of all rows; step 2 divides that holdout so that
# 0.7143 of it (0.7143 * 0.35 ≈ 0.25 of all rows) becomes the test set.
train_data, holdout_data = train_test_split(combined_data, test_size=0.35, random_state=42)
val_data, test_data = train_test_split(holdout_data, test_size=0.7143, random_state=42)

# The three subsets must partition combined_data exactly.
assert len(train_data) + len(val_data) + len(test_data) == len(combined_data)

# Save the datasets to CSV files. The output folder is created first so that
# to_csv does not fail when 'Sets/org' does not exist yet.
os.makedirs('Sets/org', exist_ok=True)
train_data.to_csv('Sets/org/train_data.csv', index=False)
val_data.to_csv('Sets/org/val_data.csv', index=False)
test_data.to_csv('Sets/org/test.csv', index=False)

# Verify the split
print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 18019
Validation set size: 2772
Test set size: 6931
