In [1]:
import os
import pandas as pd

In [8]:
# Directory layout: one base folder containing one folder per sensor type.
base_dir = "IEEE Sensor Xlsx Data"
folders = ["ECG_handled_fake", "EEG_handled_fake", "EMG_handled_fake"]
# Build the output path with os.path.join rather than a hard-coded backslash,
# so it resolves to a "Merge_data" folder inside base_dir on every platform
# (a literal "\" is an ordinary filename character on POSIX systems).
merge_output_dir = os.path.join(base_dir, "Merge_data")

# Create the output directory if it doesn't exist (also creates base_dir if missing)
os.makedirs(merge_output_dir, exist_ok=True)

# Function to merge data for a specific subfolder name
def merge_data_for_subfolder(subfolder_name):
    """Merge all sensor workbooks of one subfolder and save them as one Excel file.

    For every entry in the module-level ``folders`` list, the directory
    ``<base_dir>/<folder>/<subfolder_name>`` is scanned for ``.xlsx`` files.
    Each workbook is read, its ``time`` column is rounded to 0 decimals, and
    all frames are inner-joined on ``['time', 'label']``. The ``time`` column
    is then dropped, NaN values are replaced by 0, and the result is written
    to ``<merge_output_dir>/<subfolder_name>_merged_data.xlsx``.

    Progress (row counts before/after, shapes, output path) is printed.
    Workbooks that cannot be read, or that lack a ``time`` column, are
    reported and skipped. If nothing usable is found, a message is printed
    and no output file is written.

    Args:
        subfolder_name: Name of the subfolder to look for inside each sensor
            folder; it is also used as the prefix of the output file name.

    Returns:
        None.

    Raises:
        KeyError: If more than one workbook was loaded and one of them has no
            ``label`` column (raised by ``pd.merge``).
    """
    frames = []
    total_rows_before_merge = 0  # Rows summed over every workbook that was read
    eeg_row_count = None  # Stays None until at least one EEG workbook is read

    # Collect the workbooks of this subfolder from each sensor folder
    for folder in folders:
        subfolder_path = os.path.join(base_dir, folder, subfolder_name)
        # isdir (not exists): a plain file with this name must not reach listdir
        if not os.path.isdir(subfolder_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the merge
        # order, and therefore the resulting column order, reproducible.
        for file in sorted(os.listdir(subfolder_path)):
            # "~$name.xlsx" are lock files Excel creates while a workbook is
            # open; they are not readable workbooks.
            if not file.endswith(".xlsx") or file.startswith("~$"):
                continue

            file_path = os.path.join(subfolder_path, file)
            try:
                df = pd.read_excel(file_path, engine='openpyxl')
                df['time'] = df['time'].round(0)  # Round 'time' to 0 decimal places
            except Exception as e:
                # Best effort: report the unreadable workbook and carry on
                print(f"Error reading {file_path}: {e}")
                continue

            total_rows_before_merge += len(df)
            frames.append(df)

            # Track the EEG row count separately for the log
            if folder == "EEG_handled_fake":
                eeg_row_count = (eeg_row_count or 0) + len(df)

    print(f"Total rows before merging for {subfolder_name}: {total_rows_before_merge}")

    # Print EEG row count before merging (only if an EEG workbook was read)
    if eeg_row_count is not None:
        print(f"Rows in EEG_handled subfolder {subfolder_name}: {eeg_row_count}")

    # Nothing was loaded: concatenating an empty list would raise ValueError
    # and abort the whole batch, so report it and skip this subfolder instead.
    if not frames:
        print(f"No readable .xlsx data found for {subfolder_name}; skipping merge.")
        return

    # Inner-join all frames using 'time' and 'label' as keys
    print(f"Merging data for folder: {subfolder_name}")
    merged_df = frames[0]
    for df in frames[1:]:
        merged_df = pd.merge(merged_df, df, on=['time', 'label'], how='inner')
    print(f"Data shape after merging {subfolder_name}: {merged_df.shape}")

    # Drop the join-only 'time' column and fill NaN values with 0
    final_df = merged_df.drop(columns=['time']).fillna(0)

    # Log the final shape
    print(f"Final data shape for {subfolder_name}: {final_df.shape}")
    total_rows_after_merge = len(final_df)
    print(f"Total rows after merging for {subfolder_name}: {total_rows_after_merge}")

    # Save the final DataFrame to a new Excel file
    final_output_path = os.path.join(merge_output_dir, f"{subfolder_name}_merged_data.xlsx")
    final_df.to_excel(final_output_path, index=False)

    # Output the path to the final merged file
    print(f"Final merged file saved to: {final_output_path}")

# Discover the subfolders to process: every directory directly inside EEG_handled_fake
eeg_handled_path = os.path.join(base_dir, "EEG_handled_fake")
subfolders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(eeg_handled_path, entry)),
        os.listdir(eeg_handled_path),
    )
)

# Run the merge once for each discovered subfolder
for subfolder_name in subfolders:
    merge_data_for_subfolder(subfolder_name)


Total rows before merging for 20105_handled: 9
Rows in EEG_handled subfolder 20105_handled: 3
Merging data for folder: 20105_handled
Data shape after merging 20105_handled: (3, 36)
Final data shape for 20105_handled: (3, 35)
Total rows after merging for 20105_handled: 3
Final merged file saved to: IEEE Sensor Xlsx Data\Merged_data\20105_handled_merged_data.xlsx
Total rows before merging for 20112A_handled: 9
Rows in EEG_handled subfolder 20112A_handled: 3
Merging data for folder: 20112A_handled
Data shape after merging 20112A_handled: (3, 36)
Final data shape for 20112A_handled: (3, 35)
Total rows after merging for 20112A_handled: 3
Final merged file saved to: IEEE Sensor Xlsx Data\Merged_data\20112A_handled_merged_data.xlsx
Total rows before merging for 20244_handled: 9
Rows in EEG_handled subfolder 20244_handled: 3
Merging data for folder: 20244_handled
Data shape after merging 20244_handled: (3, 36)
Final data shape for 20244_handled: (3, 35)
Total rows after merging for 20244_handl