In [4]:
import pickle

# Load the .pkl file for Subject 3 (S3.pkl)
with open('S2//S2.pkl', 'rb') as file:
    data = pickle.load(file, encoding='latin1')

# Check the keys in the loaded dictionary
print(data.keys())  # This will show the keys like 'signal', 'label', etc.


dict_keys(['signal', 'label', 'subject'])


In [7]:
import pickle

# Load the .pkl file for Subject 3 (S3.pkl) with encoding
with open('S2/S2.pkl', 'rb') as file:
    data = pickle.load(file, encoding='latin1')

# Check the available keys to see what's inside
print(data.keys())  # Should show 'signal', 'label', etc.

# Access sensor data
chest_data = data['signal']['chest']  # Chest (RespiBAN) data
wrist_data = data['signal']['wrist']  # Wrist (Empatica E4) data
labels = data['label']  # Protocol condition labels

# Check the keys of chest and wrist data dictionaries
print('Chest data keys:', chest_data.keys())  # Check what sensors are available
print('Wrist data keys:', wrist_data.keys())  # Check wrist sensors

dict_keys(['signal', 'label', 'subject'])
Chest data keys: dict_keys(['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'])
Wrist data keys: dict_keys(['ACC', 'BVP', 'EDA', 'TEMP'])


In [8]:
# Access a specific sensor, for example, EDA (electrodermal activity) from chest and wrist
chest_eda = chest_data['EDA']  # Replace 'EDA' with the correct key from the chest data keys
wrist_eda = wrist_data['EDA']  # Replace 'EDA' with the correct key from the wrist data keys

# Check the shape of the EDA data (you can print the length or first few rows)
print('Chest EDA shape:', len(chest_eda))  # For a dictionary, we check length
print('First few values of Chest EDA:', chest_eda[:5])

print('Wrist EDA shape:', len(wrist_eda))
print('First few values of Wrist EDA:', wrist_eda[:5])

Chest EDA shape: 4255300
First few values of Chest EDA: [[5.25054932]
 [5.26733398]
 [5.24330139]
 [5.24940491]
 [5.28640747]]
Wrist EDA shape: 24316
First few values of Wrist EDA: [[1.138257]
 [1.125444]
 [1.011405]
 [1.033188]
 [0.935807]]


In [26]:
import pandas as pd
import os
import zipfile

# Function to resample data
def resample_data(df, original_freq, target_freq=4):
    """
    Resample the dataframe based on the target frequency (4Hz by default).
    original_freq: the original sampling rate of the data
    target_freq: the new sampling rate (default: 4 Hz)
    """
    factor = original_freq // target_freq
    return df.iloc[::factor, :].reset_index(drop=True)

# Define the base directory where subject folders are stored
base_dir = 'BeforeProcessing'  # Change to your actual path
output_dir = 'CorrectedFrequency'  # Directory to save processed data
os.makedirs(output_dir, exist_ok=True)  # Create output folder if it doesn't exist

# List of subject folders (e.g., S2, S3, S4...)
subject_folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

# Iterate through each subject folder
for subject in subject_folders:
    subject_path = os.path.join(base_dir, subject)
    zip_file_path = os.path.join(subject_path, f'{subject}_E4_Data.zip')  # Path to the zipped data
    extract_dir = os.path.join(subject_path, 'E4_Data')  # Directory to extract the zip content
    output_path = os.path.join(output_dir, f'{subject}_processed.csv')

    try:
        # Unzip the Empatica E4 data
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f'Unzipped data for {subject}')

        # Load wrist-worn data (ACC, BVP, EDA, TEMP) from the extracted folder
        acc_file = os.path.join(extract_dir, 'ACC.csv')
        bvp_file = os.path.join(extract_dir, 'BVP.csv')
        eda_file = os.path.join(extract_dir, 'EDA.csv')
        temp_file = os.path.join(extract_dir, 'TEMP.csv')
        hr_file = os.path.join(extract_dir, 'HR.csv')  # Load HR from the correct file

        # Load each sensor data (skip the first two lines as they contain metadata)
        acc_data = pd.read_csv(acc_file, skiprows=2, header=None, names=['ACC_X', 'ACC_Y', 'ACC_Z'])
        bvp_data = pd.read_csv(bvp_file, skiprows=2, header=None, names=['BVP'])
        eda_data = pd.read_csv(eda_file, skiprows=2, header=None, names=['EDA'])
        temp_data = pd.read_csv(temp_file, skiprows=2, header=None, names=['TEMP'])
        hr_data = pd.read_csv(hr_file, skiprows=2, header=None, names=['HR'])  # Correctly load HR data

        # Resample ACC (32 Hz to 4 Hz) and BVP (64 Hz to 4 Hz)
        acc_resampled = resample_data(acc_data, original_freq=32)
        bvp_resampled = resample_data(bvp_data, original_freq=64)

        # HR data is generally at 1 Hz, resampling it is not necessary, but if needed, you could:
        # hr_resampled = resample_data(hr_data, original_freq=1)  # Uncomment if needed

        # EDA and TEMP are already at 4 Hz, so we can use them directly
        # Concatenate all data into a single dataframe
        processed_data = pd.concat([acc_resampled, bvp_resampled, eda_data, temp_data, hr_data], axis=1)

        # Save the processed data to a new CSV file
        processed_data.to_csv(output_path, index=False)
        print(f'Successfully processed and saved data for {subject}')

    except FileNotFoundError as fnf_error:
        print(f"File not found for {subject}: {fnf_error}")
    except Exception as e:
        print(f"Error processing {subject}: {e}")


Unzipped data for S10
Successfully processed and saved data for S10
Unzipped data for S11
Successfully processed and saved data for S11
Unzipped data for S13
Successfully processed and saved data for S13
Unzipped data for S14
Successfully processed and saved data for S14
Unzipped data for S15
Successfully processed and saved data for S15
Unzipped data for S16
Successfully processed and saved data for S16
Unzipped data for S17
Successfully processed and saved data for S17
Unzipped data for S2
Successfully processed and saved data for S2
Unzipped data for S3
Successfully processed and saved data for S3
Unzipped data for S4
Successfully processed and saved data for S4
Unzipped data for S5
Successfully processed and saved data for S5
Unzipped data for S6
Successfully processed and saved data for S6
Unzipped data for S7
Successfully processed and saved data for S7
Unzipped data for S8
Successfully processed and saved data for S8
Unzipped data for S9
Successfully processed and saved data for

In [18]:
import pandas as pd
import os
import numpy as np

# Function to calculate score based on sensor values, without considering BVP
def map_labels(sensor_score, stress, frustration, anxiety, arousal, condition):
    """Map sensor scores and questionnaire data to labels with a more balanced distribution."""
    # Combine sensor score and psychological questionnaire data
    overall_score = (sensor_score * 0.7) + ((stress + frustration + anxiety + arousal) / 4 * 0.3)

    # Debugging output
    print(f"Sensor Score: {sensor_score:.2f}, Stress: {stress:.2f}, Frustration: {frustration:.2f}, "
          f"Anxiety: {anxiety:.2f}, Arousal: {arousal:.2f}, Overall Score: {overall_score:.2f}")

    # Adjust thresholds for more balanced distribution
    if overall_score > 2.5 or (sensor_score >= 2.0 and (stress > 2.0 or frustration > 2.0 or anxiety > 2.0)):
        return "Meltdown"
    elif 1.5 < overall_score <= 2.5 or (sensor_score >= 1.5 and (stress > 1.5 or frustration > 1.5 or anxiety > 1.5)):
        return "Pre-Meltdown"
    elif overall_score <= 1.0 and condition == "Base":
        return "Calm"
    else:
        return "Neutral"

# Helper function to normalize questionnaire scores
def normalize_score(score, min_val, max_val):
    return (score - min_val) / (max_val - min_val) * 4 + 1  # Scale to 1-5 range

# Updated calculate_sensor_score function
def calculate_sensor_score(hr, temp, eda):
    score = 0
    # HR: Stress if HR > 75 bpm, calm if between 60-70 bpm
    if hr > 75:
        score += 2
    elif 60 <= hr <= 70:
        score -= 0.5

    # TEMP: Stress if outside 32.5°C - 34.5°C range
    if temp < 32.5 or temp > 34.5:
        score += 2
    else:
        score -= 0.5

    # EDA: Stress if above 0.3, calm if below 0.1
    if eda > 0.3:
        score += 2
    elif eda < 0.1:
        score -= 0.5

    return score / 3  # Normalize to 0-2 range

# Load the SX_quest.csv file for each subject
def process_questionnaire_and_label(quest_folder, subject_id, sensor_data):
    quest_file = os.path.join(quest_folder, f'{subject_id}', f'{subject_id}_quest.csv')
    
    try:
        # Read the questionnaire file
        with open(quest_file, 'r') as f:
            lines = f.readlines()
        
        # Parse the protocol order
        order_line = [line for line in lines if '# ORDER' in line][0]
        conditions = [cond.strip() for cond in order_line.split(';')[1:] if cond.strip()]

        # Parse times
        start_line = [line for line in lines if '# START' in line][0]
        end_line = [line for line in lines if '# END' in line][0]
        
        start_times = [float(t) for t in start_line.split(';')[1:] if t.strip()]
        end_times = [float(t) for t in end_line.split(';')[1:] if t.strip()]
        
        # Extract questionnaire data
        panas_lines = [line for line in lines if '# PANAS' in line]
        stai_lines = [line for line in lines if '# STAI' in line]
        dim_lines = [line for line in lines if '# DIM' in line]
        
        # Process only the main conditions (baseline, stress, amusement, meditation)
        valid_conditions = ['Base', 'TSST', 'Fun', 'Medi 1', 'Medi 2']
        
        labels = []
        valid_start_times = []
        valid_end_times = []
        
        for i, condition in enumerate(conditions):
            if condition in valid_conditions and i < len(start_times):
                try:
                    # Find corresponding questionnaire responses
                    condition_idx = valid_conditions.index(condition)
                    
                    # Get PANAS scores for this condition
                    panas_scores = [float(x) for x in panas_lines[condition_idx].split(';')[1:] if x.strip()]
                    
                    # Get STAI scores for this condition
                    stai_scores = [float(x) for x in stai_lines[condition_idx].split(';')[1:] if x.strip()]
                    
                    # Get DIM (SAM) scores for this condition
                    dim_scores = [float(x) for x in dim_lines[condition_idx].split(';')[1:] if x.strip()]
                    
                    # Calculate sensor scores for this time window (excluding BVP)
                    start_sample = int(start_times[i] * 700)  # Convert to samples (700 Hz)
                    end_sample = int(end_times[i] * 700)
                    
                    if start_sample < len(sensor_data) and end_sample <= len(sensor_data):
                        sensor_window = sensor_data.iloc[start_sample:end_sample]
                        
                        avg_hr = sensor_window['HR'].mean()
                        avg_temp = sensor_window['TEMP'].mean()
                        avg_eda = sensor_window['EDA'].mean()
                        
                        stress = normalize_score(float(panas_scores[1]), 1, 5)  # Distressed (stress)
                        frustration = normalize_score(float(panas_scores[4]), 1, 5)  # Annoyed (frustration)
                        anxiety = normalize_score(float(stai_scores[2]), 1, 4)  # "I am jittery" from STAI
                        arousal = normalize_score(float(dim_scores[1]), 1, 9)  # Arousal from SAM

                        sensor_score = calculate_sensor_score(avg_hr, avg_temp, avg_eda)
                        
                        # Generate label
                        label = map_labels(
                            sensor_score,
                            stress,
                            frustration,
                            anxiety,
                            arousal,
                            condition
                        )
                        
                        labels.append(label)
                        valid_start_times.append(start_times[i])
                        valid_end_times.append(end_times[i])
                        
                        print(f"\nProcessed {condition} for {subject_id}:")
                        print(f"Time window: {start_times[i]} - {end_times[i]}")
                        print(f"Label: {label}")
                        print(f"Sensor averages - HR: {avg_hr:.2f}, TEMP: {avg_temp:.2f}, EDA: {avg_eda:.2f}")
                        print(f"Sensor score: {sensor_score:.2f}")
                        print(f"Questionnaire scores - Stress: {stress:.2f}, Frustration: {frustration:.2f}, Anxiety: {anxiety:.2f}, Arousal: {arousal:.2f}")
                        print(f"Overall score: {(sensor_score * 0.7) + ((stress + frustration + anxiety + arousal) / 4 * 0.3):.2f}")

                except Exception as e:
                    print(f"Error processing condition {condition} for {subject_id}: {str(e)}")
                    continue
        
        return labels, valid_start_times, valid_end_times
        
    except Exception as e:
        print(f"\nError processing questionnaire for {subject_id}:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        return [], [], []

def label_and_merge_sensor_data(processed_folder, subject_id, labels, start_times, end_times):
    try:
        processed_file = os.path.join(processed_folder, f'{subject_id}_processed.csv')
        sensor_data = pd.read_csv(processed_file)
        
        # Initialize all labels as Neutral
        sensor_data['Label'] = "Neutral"
        
        # Apply labels for valid time windows
        for label, start, end in zip(labels, start_times, end_times):
            start_idx = min(int(start * 700), len(sensor_data) - 1)
            end_idx = min(int(end * 700), len(sensor_data) - 1)
            sensor_data.loc[start_idx:end_idx, 'Label'] = label
            
        sensor_data.to_csv(processed_file, index=False)
        print(f"\nSuccessfully labeled data for {subject_id}")
        print(f"Label distribution:")
        print(sensor_data['Label'].value_counts())
        
    except Exception as e:
        print(f"\nError saving labeled data for {subject_id}: {str(e)}")

def main():
    quest_folder = 'BeforeProcessing'
    processed_folder = 'CorrectedFrequency'
    subjects = [f'S{i}' for i in range(2, 18) if i != 1 and i != 12]
    
    for subject_id in subjects:
        print(f"\nProcessing subject {subject_id}...")
        
        try:
            sensor_file = os.path.join(processed_folder, f'{subject_id}_processed.csv')
            
            if not os.path.exists(sensor_file):
                print(f"Sensor data file not found for {subject_id}")
                continue
                
            sensor_data = pd.read_csv(sensor_file)
            print(f"Loaded sensor data for {subject_id}, shape: {sensor_data.shape}")
            
            labels, start_times, end_times = process_questionnaire_and_label(quest_folder, subject_id, sensor_data)
            
            if labels:
                label_and_merge_sensor_data(processed_folder, subject_id, labels, start_times, end_times)
            else:
                print(f"No valid labels generated for {subject_id}")
                
        except Exception as e:
            print(f"Error processing {subject_id}: {str(e)}")
            continue

if __name__ == "__main__":
    main()



Processing subject S2...
Loaded sensor data for S2, shape: (31497, 8)
Sensor Score: 1.33, Stress: 1.00, Frustration: 1.00, Anxiety: 1.00, Arousal: 1.50, Overall Score: 1.27

Processed Base for S2:
Time window: 7.08 - 26.32
Label: Neutral
Sensor averages - HR: 74.15, TEMP: 34.93, EDA: 0.59
Sensor score: 1.33
Questionnaire scores - Stress: 1.00, Frustration: 1.00, Anxiety: 1.00, Arousal: 1.50
Overall score: 1.27

Successfully labeled data for S2
Label distribution:
Label
Neutral    31497
Name: count, dtype: int64

Processing subject S3...
Loaded sensor data for S3, shape: (30895, 8)
Sensor Score: 0.33, Stress: 1.00, Frustration: 1.00, Anxiety: 1.00, Arousal: 2.50, Overall Score: 0.65

Processed Base for S3:
Time window: 6.44 - 26.04
Label: Calm
Sensor averages - HR: 69.20, TEMP: 32.57, EDA: 1.93
Sensor score: 0.33
Questionnaire scores - Stress: 1.00, Frustration: 1.00, Anxiety: 1.00, Arousal: 2.50
Overall score: 0.65

Successfully labeled data for S3
Label distribution:
Label
Neutral   

In [20]:
import pandas as pd
import os
from collections import defaultdict

def count_labels_in_folder(processed_folder):
    label_counter = defaultdict(int)  # Use a defaultdict to automatically initialize counts to zero

    # Iterate through each file in the specified folder
    for filename in os.listdir(processed_folder):
        if filename.endswith('.csv'):  # Check for CSV files
            file_path = os.path.join(processed_folder, filename)
            
            try:
                # Load the sensor data file
                sensor_data = pd.read_csv(file_path)
                
                # Count occurrences of each label
                for label in sensor_data['Label'].dropna().unique():  # Using dropna() to ignore any NaN labels
                    label_counter[label] += sensor_data['Label'].value_counts().get(label, 0)

            except Exception as e:
                print(f"Error reading {filename}: {str(e)}")
    
    return label_counter

def main():
    processed_folder = 'CorrectedFrequency'  # Adjust the folder path as needed
    label_counts = count_labels_in_folder(processed_folder)

    # Print the final counts for each label
    print("Final Label Counts:")
    for label, count in label_counts.items():
        print(f"{label}: {count}")

if __name__ == "__main__":
    main()


Final Label Counts:
Neutral: 283455
Meltdown: 42004
Pre-Meltdown: 32798
Calm: 69537


In [3]:
import pandas as pd
import os

# Function to merge data from multiple subjects
def merge_subject_data(folder_path):
    merged_data = pd.DataFrame()
    
    # Iterate through all CSV files in the specified folder
    for filename in os.listdir(folder_path):
        if filename.endswith('_processed.csv'):  # Ensure we only process the correct files
            subject_file = os.path.join(folder_path, filename)
            # Load subject data
            subject_data = pd.read_csv(subject_file)
            
            # Extract subject ID from filename
            subject_id = filename.split('_')[0]
            subject_data['SubjectID'] = subject_id  # Add SubjectID column
            
            # Create a new time column starting from 0
            subject_data['Time'] = range(len(subject_data))  # Add a sequential time column
            
            # Merge subject data into the combined dataframe
            merged_data = pd.concat([merged_data, subject_data], ignore_index=True)
    
    return merged_data

# Example usage
folder_path = 'CorrectedFrequency'  # Path to the folder containing subject files
merged_data = merge_subject_data(folder_path)

# Show a sample of the merged data
print(merged_data.head(10))  # Display first 10 rows of the merged data

# Save merged data to a CSV file
merged_data.to_csv('merged_subject_data.csv', index=False)
print("Data merged and saved successfully!")


   ACC_X  ACC_Y  ACC_Z     BVP       EDA    TEMP     HR    Label SubjectID  \
0    5.0    1.0   63.0   -0.00  0.000000  382.21  71.00  Neutral       S10   
1    5.0    1.0   63.0   -0.05  0.226414  382.21  71.00  Neutral       S10   
2    5.0    1.0   63.0    6.20  0.287814  382.21  70.67  Neutral       S10   
3    6.0    1.0   63.0   22.90  0.294210  382.21  71.25  Neutral       S10   
4    5.0    1.0   63.0   93.76  0.295489   31.19  72.20  Neutral       S10   
5    5.0    1.0   63.0  208.53  0.294210   31.19  73.50  Neutral       S10   
6    5.0    1.0   63.0 -537.23  0.296768   31.19  74.57  Neutral       S10   
7    6.0    1.0   63.0   71.43  0.295489   31.19  75.50  Neutral       S10   
8    5.0    1.0   63.0  141.30  0.291652   31.19  76.33  Neutral       S10   
9    5.0    1.0   63.0   37.59  0.294210   31.19  76.80  Neutral       S10   

   Time  
0     0  
1     1  
2     2  
3     3  
4     4  
5     5  
6     6  
7     7  
8     8  
9     9  
Data merged and saved successfu