In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Text file that is read line by line; each non-empty line is treated as one patient ID.
patient_list_file_dir = "/scratch/sshuvo13/scratch_run_dir_sajib/patient_list_wo_two/male_ids.txt"
with open(patient_list_file_dir, 'r', encoding='utf-8') as f:
    # Blank lines are skipped: an empty ID is a substring of every file name, so it
    # would silently pull in some other patient's feature file later on.
    # dict.fromkeys drops repeated IDs while keeping file order, so a patient listed
    # twice cannot end up in both the train and the test split.
    patient_list = list(dict.fromkeys(
        stripped for stripped in (line.strip() for line in f) if stripped
    ))

## The train/test split is performed on the list of patient IDs, so that we train and test on separate patients, which prevents data leakage between the two sets.

In [3]:
# Hold out 20% of the patient IDs for testing; the fixed random_state makes the split repeatable.
train_patient_ids, test_patient_ids = train_test_split(
    patient_list,
    test_size=0.2,
    random_state=42,
)


In [4]:
def get_patient_features(patient_id: str, directory: str) -> pd.DataFrame:
    """
    Load the feature CSV whose file name contains ``patient_id``.

    The directory is scanned for ``.csv`` files whose name contains the
    patient ID as a substring. When several files match, the one that sorts
    first by name is used, so the choice does not depend on the arbitrary
    order in which the operating system lists the directory.

    NOTE: matching is by substring, so an ID that is itself contained in
    another patient's file name will also match that other file.

    Args:
        patient_id (str): The unique patient identifier (e.g. "00000995-100507").
        directory (str): Path to the directory containing the CSV files.

    Returns:
        pd.DataFrame: The patient's feature data.

    Raises:
        ValueError: If ``patient_id`` is empty (an empty string would match
            every file in the directory).
        FileNotFoundError: If no matching CSV file is found.
    """
    if not patient_id:
        raise ValueError("patient_id must be a non-empty string.")

    # Sorted so that the selected file is deterministic when more than one matches.
    csv_candidates = sorted(
        name for name in os.listdir(directory)
        if name.endswith('.csv') and patient_id in name
    )

    if not csv_candidates:
        raise FileNotFoundError(f"No CSV file found for patient ID: {patient_id}")

    return pd.read_csv(os.path.join(directory, csv_candidates[0]))

In [5]:
# Directory that is searched for the per-patient feature CSV files; it is passed to
# combine_feature_dataset as its feature_directory argument.
all_features_dir = "/scratch/sshuvo13/project_shared_folder_bspml_1/whole_dataset_features/aggregated_male"

## This function concatenates the per-patient feature tables and also collapses the labels into two classes, Normal and Apnea: every label other than "Normal" is converted to "Apnea".

In [6]:
def combine_feature_dataset(patient_list, feature_directory) -> pd.DataFrame:
    """
    Build one feature table from the per-patient CSV files and binarize the labels.

    Each patient's features are loaded with ``get_patient_features`` and the
    resulting frames are stacked row-wise. The ``label`` column is then reduced
    to two classes: rows labelled "Normal" are kept as they are, and every other
    value is replaced by "Apnea".

    NOTE: the comparison is an exact match against "Normal", so missing (NaN)
    labels and differently spelled variants also become "Apnea".

    Args:
        patient_list: Iterable of patient IDs whose feature files are combined.
        feature_directory: Directory containing the per-patient CSV files.

    Returns:
        pd.DataFrame: All patients' rows in the order of ``patient_list``, with
        a fresh 0..n-1 row index and a two-class ``label`` column.

    Raises:
        ValueError: If ``patient_list`` is empty.
        FileNotFoundError: Propagated from ``get_patient_features`` when a
            patient has no matching CSV file.
    """
    patient_ids = list(patient_list)
    if not patient_ids:
        # pd.concat would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError("patient_list is empty; there are no feature files to combine.")

    per_patient_frames = [
        get_patient_features(patient_id, feature_directory)
        for patient_id in patient_ids
    ]

    # ignore_index=True: each CSV carries its own 0..k row index, so keeping those
    # would leave duplicate index labels in the combined frame.
    combined = pd.concat(per_patient_frames, axis=0, ignore_index=True)

    is_normal = combined["label"] == "Normal"
    combined["label"] = combined["label"].where(is_normal, "Apnea")
    return combined

In [8]:
# Build the combined feature table for each split: train first, then test.
train_df, test_df = (
    combine_feature_dataset(split_ids, all_features_dir)
    for split_ids in (train_patient_ids, test_patient_ids)
)

## Set the output directory where the new train and test datasets will be stored

In [9]:
# Output location for the combined datasets; the directory is created when it does not exist yet.
writing_dir = "/scratch/sshuvo13/project_shared_folder_bspml_1/whole_dataset_features/train_test_separated_and_combined/male/33_features/"
os.makedirs(writing_dir, exist_ok=True)

# Write each split to its own CSV file, without the row index.
for csv_name, split_frame in (("train_data.csv", train_df), ("test_data.csv", test_df)):
    split_frame.to_csv(os.path.join(writing_dir, csv_name), index=False)