# Integration of the feature extraction script to generate the Phase 2 dataset.


Phase 2's Dataset 2 is generated by extracting sleep metrics from individual subjects' CSV files using the provided code. The code calculates various sleep statistics, including total sleep duration, sleep latency, time spent in different sleep stages, and the Arousal Index (AI), compiling these metrics for each subject. These results are consolidated into a single dataset for further analysis and predictive modeling.

In [39]:
import os
import pandas as pd

In [40]:
# Step 1: Load the Excel workbook that holds multiple sheets
# (presumably one sheet per subject -- confirm against the workbook).

# Update with your Excel file path
file_path = "data/SleepStagingArterialFeatures.xlsx"
# Folder to save CSV files
output_folder = "data/subjects"

# Create the output folder; exist_ok makes re-running this cell harmless and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

# Open the workbook once. The context manager closes the underlying file
# handle when the block exits, and reusing the open workbook avoids
# re-opening the file for every sheet.
with pd.ExcelFile(file_path) as excel_file:
    # Get a list of all sheet names
    sheet_names = excel_file.sheet_names

    # Step 2: Extract each sheet and save it as an individual CSV
    for sheet_name in sheet_names:
        # Parse the sheet from the already-open workbook
        data = excel_file.parse(sheet_name=sheet_name)
        csv_filename = f"{output_folder}/{sheet_name}.csv"
        # Save as a CSV file (without the DataFrame index column)
        data.to_csv(csv_filename, index=False)

In [41]:
# Load a subject CSV and calculate its sleep metrics.

# Duration represented by one row (epoch) of the 'Class' column, in seconds.
_EPOCH_SECONDS = 30


def _first_label(mask):
    """Return the index label of the first True entry of *mask*, or NaN if none."""
    hits = mask.index[mask.to_numpy()]
    return hits[0] if len(hits) else float("nan")


def _percentage(total_time, class_time):
    """Return *class_time* as a percentage of *total_time* (0 when the total is 0)."""
    return (class_time / total_time) * 100 if total_time != 0 else 0


def calculate_sleep_metrics(csv_filename):
    """Compute per-subject sleep metrics from a CSV with a 'Class' column.

    Every row is treated as one 30-second epoch whose 'Class' value is a
    stage code.  NOTE(review): the codes look like 1-4 = NREM stages,
    5 = REM, 6 = wake -- confirm against the dataset documentation.

    Args:
        csv_filename: Path to the subject CSV.  The part of the last
            '/'-separated component before the first '.' is reported as the
            subject name.

    Returns:
        A dict with these keys (kept verbatim because they become the
        column names of the combined dataset):

        * "Subject": name derived from ``csv_filename``.
        * "Sleep Duration (hours)": total time in classes 1, 2, 3 and 5.
          NOTE(review): the value is in whole *minutes* despite the key,
          and class 4 is not included -- both kept as in the original
          implementation; confirm which is intended.
        * "Sleep Latency (minutes)": whole minutes between the first
          class 1-4 epoch and the first class 5 epoch (NaN when either
          is absent).
        * "REM1% (minutes)", "REM2% (minutes)", "REM3% (minutes)", "REM %":
          share (in percent) of classes 1, 2, 3 and 5 in the total time
          spent in classes 1-5.  The values are percentages even though
          three of the keys say "(minutes)".
        * "Arousal Index": number of class 2-5 epochs that are immediately
          followed by a class 1 or 6 epoch, per hour of class 2-5 time.

    Raises:
        KeyError: If the CSV has no 'Class' column.
    """
    classes = pd.read_csv(csv_filename)['Class']

    # Total time in classes [1, 2, 3, 5].  Counting matching epochs gives
    # the same result as summing the lengths of consecutive runs.
    duration_epochs = int(classes.isin([1, 2, 3, 5]).sum())
    sleep_duration_minutes = (duration_epochs * _EPOCH_SECONDS) // 60

    # Time from the first class 1-4 epoch to the first class 5 epoch.
    first_stage_1_to_4 = _first_label(classes.isin([1, 2, 3, 4]))
    first_stage_5 = _first_label(classes == 5)
    time_difference_seconds = (first_stage_5 - first_stage_1_to_4) * _EPOCH_SECONDS
    sleep_difference_minutes = time_difference_seconds // 60

    # Time per class.  Each epoch is counted exactly once; the previous
    # run-scanning loop advanced the cursor one extra step after every run
    # and so dropped the first epoch of the following run.
    stage_seconds = {
        stage: int((classes == stage).sum()) * _EPOCH_SECONDS
        for stage in (1, 2, 3, 4, 5)
    }
    total_time = sum(stage_seconds.values())

    percent_1 = _percentage(total_time, stage_seconds[1])
    percent_2 = _percentage(total_time, stage_seconds[2])
    percent_3 = _percentage(total_time, stage_seconds[3])
    percent_rem = _percentage(total_time, stage_seconds[5])

    # Arousal index: transitions from a class 2-5 epoch into class 1 or 6.
    asleep = classes.isin([2, 3, 4, 5])
    # Shift so each row sees whether the *next* epoch is class 1/6; the last
    # epoch has no successor and therefore never counts as an arousal.
    followed_by_arousal = classes.isin([1, 6]).shift(-1, fill_value=False)
    arousal_count = int((asleep & followed_by_arousal).sum())

    total_sleep_time_hours = int(asleep.sum()) * _EPOCH_SECONDS / 3600
    arousal_index = (arousal_count / total_sleep_time_hours
                     if total_sleep_time_hours > 0 else 0)

    return {
        "Subject": csv_filename.split('/')[-1].split('.')[0],
        "Sleep Duration (hours)": sleep_duration_minutes,
        "Sleep Latency (minutes)": sleep_difference_minutes,
        "REM1% (minutes)": percent_1,
        "REM2% (minutes)": percent_2,
        "REM3% (minutes)": percent_3,
        "REM %": percent_rem,
        "Arousal Index": arousal_index
    }


In [42]:
# Step 4: Compute the sleep metrics for every subject CSV
# (one CSV per sheet name) and collect the per-subject results.
all_subjects_data = []
for sheet_name in sheet_names:
    # Same "<output_folder>/<sheet>.csv" path the export step wrote to.
    csv_filename = "/".join((output_folder, f"{sheet_name}.csv"))
    subject_metrics = calculate_sleep_metrics(csv_filename)
    all_subjects_data.append(subject_metrics)

# Step 5: Turn the list of per-subject dicts into a DataFrame
# (one row per subject) and save it as a new CSV without the index column.
dataset2 = pd.DataFrame(data=all_subjects_data)
dataset2.to_csv("data/dataset2.csv", index=False)