# Integration of the feature extraction script to generate the Phase 2 dataset


Phase 2's Dataset 2 is generated by extracting sleep metrics from individual subjects' CSV files using the provided code. The code calculates various sleep statistics, including total sleep duration, sleep latency, time spent in different sleep stages, and the Arousal Index (AI), compiling these metrics for each subject. These results are consolidated into a single dataset for further analysis and predictive modeling.

In [39]:
import os
import pandas as pd

In [40]:
# Step 1: Load the Excel file with multiple sheets

# Update with your Excel file path
FILE_PATH = "data/sleepstagingarterialeatures.xlsx"
# Folder to save CSV files
OUTPUT_FOLDER = "data/subjects"

# Create the output folder; exist_ok makes this a no-op if it already exists
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Open the workbook once and reuse the handle for every sheet, instead of
# re-reading the whole file from disk per sheet. The context manager closes
# the underlying file handle when the export is finished.
with pd.ExcelFile(FILE_PATH) as excel_file:
    # Get a list of all sheet names (reused by later cells)
    sheet_names = excel_file.sheet_names

    # Step 2: Extract each sheet and save it as an individual CSV
    for sheet_name in sheet_names:
        # Parse the sheet from the already-open workbook
        data = excel_file.parse(sheet_name)
        csv_filename = f"{OUTPUT_FOLDER}/{sheet_name}.csv"
        # Save as a CSV file without the DataFrame index column
        data.to_csv(csv_filename, index=False)

In [41]:
# Load dataset and calculate sleep metrics
def calculate_sleep_metrics(file_path):
    """
    Calculates sleep metrics from a CSV file of staged, fixed-length epochs.

    The CSV must contain a ``Class`` column with one stage label per row.
    Each row is treated as one 30-second segment. Labels 1-5 are counted as
    sleep stages and label 6 is not; label 5 is the one reported as "REM %".
    (NOTE(review): the clinical meaning of each label is not defined in this
    file - confirm against the dataset description.)

    Args:
        file_path (str): The path to the CSV file containing sleep data.

    Returns:
        dict: A dictionary containing calculated sleep metrics:
            - "Subject": file name without directory and extension.
            - "Sleep Duration (minutes)": whole minutes spent in classes 1-5.
            - "Sleep Latency (minutes)": whole minutes from the first class
              1/2/3 segment to the first class 5 segment (NaN if either
              never occurs).
            - "REM1% (minutes)", "REM2% (minutes)", "REM3% (minutes)",
              "REM %": share (0-100) of the class 1-5 time spent in class
              1, 2, 3 and 5 respectively. Despite the "(minutes)" suffix in
              three of the keys, these are percentages; the keys are kept
              unchanged for downstream compatibility.
            - "Arousal Index": transitions from a class in [2, 3, 4, 5] to a
              class in [1, 6] per hour spent in classes [2, 3, 4, 5].

    Raises:
        KeyError: If the CSV has no ``Class`` column.
    """
    classes = pd.read_csv(file_path)['Class']

    # Define constants for sleep classes and segment duration
    stage_classes = [1, 2, 3, 4, 5]
    onset_classes = [1, 2, 3]
    rem_class = 5
    arousal_from_classes = [2, 3, 4, 5]
    arousal_to_classes = [1, 6]
    segment_duration = 30  # Each segment represents 30 seconds

    def first_index(mask):
        """Return the index label of the first True in mask, or None."""
        return mask.idxmax() if mask.any() else None

    # Time per stage class. Counting rows directly (instead of scanning runs
    # by hand) guarantees that no segment is skipped when one stage is
    # immediately followed by another.
    stage_totals = {
        cls: int((classes == cls).sum()) * segment_duration
        for cls in stage_classes
    }

    # Total time spent in sleep stages [1, 2, 3, 4, 5]
    total_time = sum(stage_totals.values())
    sleep_duration_minutes = total_time // 60

    # Time difference between the first class 1/2/3 segment and the first
    # class 5 segment
    onset_index = first_index(classes.isin(onset_classes))
    rem_index = first_index(classes == rem_class)
    if onset_index is None or rem_index is None:
        # Undefined when either event never occurs
        sleep_difference_minutes = float("nan")
    else:
        segment_difference = int(rem_index - onset_index)
        sleep_difference_minutes = (segment_difference * segment_duration) // 60

    # Convert per-stage totals to percentages of the total stage time
    percentages = {
        cls: (stage_totals[cls] / total_time) * 100 if total_time != 0 else 0
        for cls in stage_classes
    }

    # Arousal index: count segments in [2, 3, 4, 5] whose *next* segment is
    # in [1, 6]. shift(-1) aligns each row with its successor; the last row
    # has no successor (NaN) and therefore never counts.
    in_sleep = classes.isin(arousal_from_classes)
    next_is_arousal = classes.shift(-1).isin(arousal_to_classes)
    arousal_count = int((in_sleep & next_is_arousal).sum())

    total_sleep_time_hours = int(in_sleep.sum()) * segment_duration / 3600
    arousal_index = (
        arousal_count / total_sleep_time_hours if total_sleep_time_hours > 0 else 0
    )

    # Strip only the directory and the final extension so that file names
    # containing dots (e.g. "S01.night2.csv") are not truncated.
    subject = os.path.splitext(os.path.basename(file_path))[0]

    return {
        "Subject": subject,
        "Sleep Duration (minutes)": sleep_duration_minutes,
        "Sleep Latency (minutes)": sleep_difference_minutes,
        "REM1% (minutes)": percentages[1],
        "REM2% (minutes)": percentages[2],
        "REM3% (minutes)": percentages[3],
        "REM %": percentages[5],
        "Arousal Index": arousal_index,
    }


In [42]:
# Step 4: Compute the sleep metrics for every subject CSV written earlier,
# collecting one metrics dictionary per sheet (in sheet order)
all_subjects_data = [
    calculate_sleep_metrics(f"{OUTPUT_FOLDER}/{sheet_name}.csv")
    for sheet_name in sheet_names
]

# Step 5: Assemble the per-subject results into a single DataFrame
# (one row per subject) and save it as a new CSV without the index column
dataset2 = pd.DataFrame(all_subjects_data)
dataset2.to_csv("data/dataset2.csv", index=False)