In [25]:
import math
from pathlib import Path
from typing import Optional, Tuple, List, Any

import pandas as pd

# --- assume detect_event_in_window is implemented elsewhere and imported ---
# from my_events import detect_event_in_window

def _coerce_to_number(x: Any, name: str) -> float:
    """
    Try to coerce x into a float. If x is a 1-element tuple/list, take its first element.
    Raise TypeError with a helpful message if coercion fails.
    """
    # if it's a list/tuple of length 1, unpack
    if isinstance(x, (tuple, list)) and len(x) == 1:
        x = x[0]
    try:
        return float(x)
    except Exception as e:
        raise TypeError(f"Parameter '{name}' must be a number (or single-element list/tuple). "
                        f"Got {type(x).__name__}({x!r}). Error: {e}")

def _find_identifier_csv(directory: Path, identifier: str) -> Optional[Path]:
    directory = Path(directory)
    if not directory.exists() or not directory.is_dir():
        return None
    csv_files = sorted(directory.glob("*.csv"))
    if not csv_files:
        return None
    # matching strategies
    for p in csv_files:
        if p.stem == identifier:
            return p
    for p in csv_files:
        if identifier in p.stem:
            return p
    for p in csv_files:
        if p.name == identifier or p.name == f"{identifier}.csv":
            return p
    for p in csv_files:
        if identifier in p.name:
            return p
    return None

def _lookup_total_duration_from_table(total_csv_path: Path, identifier: str) -> float:
    total_csv_path = Path(total_csv_path)
    if not total_csv_path.exists():
        raise FileNotFoundError(f"Total durations CSV not found: {total_csv_path}")
    table = pd.read_csv(total_csv_path)
    cols_lower = {c.lower(): c for c in table.columns}
    id_candidates = ["identifier", "id", "filename", "file", "subject", "recording"]
    dur_candidates = ["total_duration", "duration", "recording_duration", "total_seconds", "total_time"]
    id_col = None
    dur_col = None
    for cand in id_candidates:
        if cand in cols_lower:
            id_col = cols_lower[cand]
            break
    for cand in dur_candidates:
        if cand in cols_lower:
            dur_col = cols_lower[cand]
            break
    if id_col is None or dur_col is None:
        if table.shape[1] >= 2:
            id_col = id_col or table.columns[0]
            dur_col = dur_col or table.columns[1]
        else:
            raise ValueError(
                f"Could not determine identifier and duration columns in {total_csv_path}. "
                "Expect columns like 'identifier' and 'total_duration' or at least two columns."
            )
    table[id_col] = table[id_col].astype(str)
    table[dur_col] = pd.to_numeric(table[dur_col], errors="coerce")
    match = table[table[id_col].str.strip() == identifier]
    if match.empty:
        match = table[table[id_col].str.replace(r"\.csv$", "", regex=True).str.strip() == identifier]
    if match.empty:
        match = table[table[id_col].str.contains(identifier, na=False)]
    if match.empty:
        raise FileNotFoundError(f"Identifier '{identifier}' not found in {total_csv_path}")
    total_val = match.iloc[0][dur_col]
    if pd.isna(total_val):
        raise ValueError(f"Total duration for identifier '{identifier}' is NaN in {total_csv_path}")
    return float(total_val)

def create_segment_csv_from_identifier(
    identifier: str,
    *,
    events_dir: str = "identified_apnea_events_in_csv",
    total_durations_csv: str = "total_duration_of_each_patient_record.csv",
    segment_length: Any = 3600.0,
    overlap: Any = 0.0,
    output_path: Optional[str] = None,
    join_multiple_events_with: str = ";",
    min_segment_duration: Any = 0.0,
) -> Tuple[Path, pd.DataFrame]:
    """
    Create a CSV with fixed-length (possibly overlapping) segments for the recording identified by `identifier`.

    Parameters
    ----------
    identifier : str
        Recording identifier used to find the events CSV and the total-duration row.
    events_dir : str
        Directory searched for the recording's events CSV.
    total_durations_csv : str
        CSV from which the recording's total duration is looked up.
    segment_length, overlap, min_segment_duration : number or 1-element list/tuple
        Coerced to float. ``overlap`` must satisfy ``0 <= overlap < segment_length``.
        A final segment shorter than ``min_segment_duration`` is merged into the previous one.
    output_path : str, optional
        Directory in which ``<identifier>_segments.csv`` is written. ``None`` (or an
        empty string) means the current working directory. The directory is created
        if it does not exist.
    join_multiple_events_with : str
        Separator used when several events overlap one segment.

    Returns
    -------
    (Path, pd.DataFrame)
        Path of the written CSV and the segment table with columns
        segment_number, start_time, end_time, event_name.

    Raises
    ------
    TypeError
        If a numeric parameter cannot be coerced.
    ValueError
        If numeric parameters are out of range, required event columns are missing,
        or the total duration is not positive.
    FileNotFoundError
        If no events CSV matches, or the duration lookup fails.
    """
    # Coerce numeric params (handles e.g. tuple/list coming from widgets)
    seg_len = _coerce_to_number(segment_length, "segment_length")
    ovl = _coerce_to_number(overlap, "overlap")
    min_seg_dur = _coerce_to_number(min_segment_duration, "min_segment_duration")

    # Validate numeric params
    if seg_len <= 0:
        raise ValueError("segment_length must be > 0")
    if ovl < 0 or ovl >= seg_len:
        raise ValueError("overlap must satisfy 0 <= overlap < segment_length")

    events_dir = Path(events_dir)
    match = _find_identifier_csv(events_dir, identifier)
    if match is None:
        raise FileNotFoundError(f"No matching events CSV found for identifier '{identifier}' in {events_dir}")

    # Read events CSV and normalize the required column names (case-insensitive lookup).
    df = pd.read_csv(match)
    df_cols_lower = {c.lower(): c for c in df.columns}
    required = {"start_time", "duration", "event_name"}
    if not required.issubset(set(df_cols_lower.keys())):
        raise ValueError(f"Events CSV must contain columns {required}. Found: {list(df.columns)}")
    df = df.rename(columns={
        df_cols_lower["start_time"]: "start_time",
        df_cols_lower["duration"]: "duration",
        df_cols_lower["event_name"]: "event_name"
    })

    total_csv_path = Path(total_durations_csv)
    total_duration = _lookup_total_duration_from_table(total_csv_path, identifier)
    if total_duration <= 0:
        raise ValueError(f"total_duration for identifier '{identifier}' must be > 0. Got {total_duration}")

    step = seg_len - ovl
    if step <= 0:
        raise ValueError("segment_length - overlap must be > 0 (positive step required)")

    # At least one segment: when total_duration <= overlap the formula yields <= 0,
    # which would otherwise silently produce an empty table for short recordings.
    n_segments = max(1, math.ceil((total_duration - ovl) / step))

    def _label_for(window_start: float, window_end: float) -> str:
        # Turn the detector's result for one window into the stored event_name string.
        events_list = detect_event_in_window(df, window_start, window_end)
        if isinstance(events_list, (list, tuple)):
            return join_multiple_events_with.join(events_list) if events_list else "Normal"
        return str(events_list)

    rows = []
    for seg_idx in range(n_segments):
        start_time = seg_idx * step
        end_time = min(start_time + seg_len, total_duration)

        # Merge final too-short segment into the previous one if requested
        if seg_idx == n_segments - 1 and (end_time - start_time) < min_seg_dur and rows:
            prev = rows[-1]
            prev["end_time"] = end_time
            prev["event_name"] = _label_for(prev["start_time"], prev["end_time"])
            continue

        rows.append({
            "segment_number": seg_idx,
            "start_time": float(start_time),
            "end_time": float(end_time),
            "event_name": _label_for(float(start_time), float(end_time))
        })

        # The recording is fully covered; guards against a rounding-induced extra segment.
        if end_time >= total_duration:
            break

    seg_df = pd.DataFrame(rows, columns=["segment_number", "start_time", "end_time", "event_name"])

    # output_path is a directory; None/"" falls back to the current directory
    # (previously None raised TypeError on the "/" operator).
    out_dir = Path(output_path) if output_path else Path(".")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{identifier}_segments.csv"
    seg_df.to_csv(out_path, index=False)

    return out_path, seg_df

# --------------------
# Example usage:
# --------------------
# out_path, segs = create_segment_csv_from_identifier(
#     "subject_001",
#     events_dir="identified_apnea_events_in_csv",
#     total_durations_csv="total_duration_of_each_patient_record.csv",
#     segment_length=(3600.0,),  # now accepts single-element tuple too
#     overlap=(60.0,),           # single-element tuple/list will be coerced
#     output_path=None
# )
# print("Wrote:", out_path)
# print(segs.head())


In [26]:
def detect_event_in_window(df: pd.DataFrame, window_start: float, window_end: float):
    """
    List the distinct event names whose time span intersects a window.

    Parameters
    ----------
    df : pd.DataFrame
        Events table with columns ['start_time', 'duration', 'event_name'].
        - start_time: event start time in seconds
        - duration: event duration in seconds
        - event_name: name/label of the event
    window_start : float
        Start time of the window (in seconds).
    window_end : float
        End time of the window (in seconds).

    Returns
    -------
    list
        Unique event names, in order of first appearance, for events that start
        before ``window_end`` and end after ``window_start``.
        Returns ["Normal"] when no event overlaps the window.

    Notes
    -----
    The input frame is not modified.
    """
    event_begin = df["start_time"]
    event_finish = event_begin + df["duration"]

    # Strict inequalities: an event that merely touches a window edge does not count.
    intersects = (event_begin < window_end) & (event_finish > window_start)
    names = df.loc[intersects, "event_name"].unique().tolist()
    return names if names else ["Normal"]

In [27]:
# identifier = "00000995-100507"
# Absolute path: the leading "/" is required, otherwise the location resolves
# relative to the current working directory.
events_dir = "/home/sshuvo13/BSPML_project_sbs_files/segmentation_30s/identified_apnea_events_in_csv"
total_durations_csv = "/home/sshuvo13/BSPML_project_sbs_files/segmentation_30s/total_duration_of_each_patient_record.csv"
segment_length = 3600.0  # seconds; a plain float (a trailing comma here would create a 1-tuple)
overlap = 100            # seconds of overlap between consecutive segments
output_path = "./rml_outputs/segment_details"

In [28]:
# out_path, segs = create_segment_csv_from_identifier(
#     identifier,
#     events_dir=events_dir,
#     total_durations_csv=total_durations_csv,
#     segment_length=segment_length,
#     overlap=overlap,    # 60 seconds overlap
#     output_path=output_path
# )

In [29]:
import math
from pathlib import Path
from typing import Optional, Tuple, Any
import pandas as pd

# --- assume detect_event_in_window is available in this namespace ---


def _coerce_to_number(x: Any, name: str) -> float:
    if isinstance(x, (tuple, list)) and len(x) == 1:
        x = x[0]
    try:
        return float(x)
    except Exception as e:
        raise TypeError(f"Parameter '{name}' must be a number (or single-element list/tuple). Got {type(x).__name__}({x!r}). Error: {e}")


def _find_identifier_csv(directory: Path, identifier: str) -> Optional[Path]:
    """
    Find a csv in directory matching identifier. Return Path or None.
    """
    directory = Path(directory)
    if not directory.exists() or not directory.is_dir():
        return None
    csv_files = sorted(directory.glob("*.csv"))
    if not csv_files:
        return None

    # try exact stem, contains, full name
    for p in csv_files:
        if p.stem == identifier:
            return p
    for p in csv_files:
        if identifier in p.stem:
            return p
    for p in csv_files:
        if p.name == identifier or p.name == f"{identifier}.csv":
            return p
    for p in csv_files:
        if identifier in p.name:
            return p
    return None


def _lookup_total_duration_from_table(total_csv_path: Path, identifier: str) -> float:
    """
    Same as before: find total_duration for identifier from totals CSV.
    """
    total_csv_path = Path(total_csv_path)
    if not total_csv_path.exists():
        raise FileNotFoundError(f"Total durations CSV not found: {total_csv_path}")
    table = pd.read_csv(total_csv_path)
    cols_lower = {c.lower(): c for c in table.columns}
    id_candidates = ["identifier", "id", "filename", "file", "subject", "recording"]
    dur_candidates = ["total_duration", "duration", "recording_duration", "total_seconds", "total_time"]
    id_col = None
    dur_col = None
    for cand in id_candidates:
        if cand in cols_lower:
            id_col = cols_lower[cand]
            break
    for cand in dur_candidates:
        if cand in cols_lower:
            dur_col = cols_lower[cand]
            break
    if id_col is None or dur_col is None:
        if table.shape[1] >= 2:
            id_col = id_col or table.columns[0]
            dur_col = dur_col or table.columns[1]
        else:
            raise ValueError(
                f"Could not determine identifier and duration columns in {total_csv_path}. "
                "Expect columns like 'identifier' and 'total_duration' or at least two columns."
            )
    table[id_col] = table[id_col].astype(str)
    table[dur_col] = pd.to_numeric(table[dur_col], errors="coerce")
    match = table[table[id_col].str.strip() == identifier]
    if match.empty:
        match = table[table[id_col].str.replace(r"\.csv$", "", regex=True).str.strip() == identifier]
    if match.empty:
        match = table[table[id_col].str.contains(identifier, na=False)]
    if match.empty:
        raise FileNotFoundError(f"Identifier '{identifier}' not found in {total_csv_path}")
    total_val = match.iloc[0][dur_col]
    if pd.isna(total_val):
        raise ValueError(f"Total duration for identifier '{identifier}' is NaN in {total_csv_path}")
    return float(total_val)


def create_segment_csv_from_identifier(
    identifier: str,
    *,
    events_dir: str = "identified_apnea_events_in_csv",
    total_durations_csv: str = "total_duration_of_each_patient_record.csv",
    segment_length: Any = 3600.0,
    overlap: Any = 0.0,
    output_path: Optional[str] = None,
    join_multiple_events_with: str = ";",
    min_segment_duration: Any = 0.0,
) -> Tuple[Path, pd.DataFrame]:
    """
    Build the fixed-length (possibly overlapping) segment table for one recording and write it to CSV.

    Parameters
    ----------
    identifier : str
        Recording identifier. If no events CSV matches and the identifier ends with
        "-checkpoint", that suffix is stripped and the search is retried; the same
        sanitized identifier is then also accepted for the duration lookup.
    events_dir : str
        Directory searched for the recording's events CSV.
    total_durations_csv : str
        CSV from which the recording's total duration is looked up.
    segment_length, overlap, min_segment_duration : number or 1-element list/tuple
        Coerced to float. ``overlap`` must satisfy ``0 <= overlap < segment_length``.
        A final segment shorter than ``min_segment_duration`` is merged into the previous one.
    output_path : str, optional
        Full path of the CSV file to write. Defaults to ``<identifier>_segments.csv``
        in the current directory. Missing parent directories are created.
    join_multiple_events_with : str
        Separator used when several events overlap one segment.

    Returns
    -------
    (Path, pd.DataFrame)
        The written file path and the segment table
        (segment_number, start_time, end_time, event_name).

    Raises
    ------
    TypeError
        If a numeric parameter cannot be coerced.
    ValueError
        If numeric parameters are out of range, required event columns are missing,
        or the total duration is not positive.
    FileNotFoundError
        If no events CSV matches or the duration lookup fails.
    OSError
        If the output directory cannot be created or the CSV cannot be written.
    """
    seg_len = _coerce_to_number(segment_length, "segment_length")
    ovl = _coerce_to_number(overlap, "overlap")
    min_seg_dur = _coerce_to_number(min_segment_duration, "min_segment_duration")

    if seg_len <= 0:
        raise ValueError("segment_length must be > 0")
    if ovl < 0 or ovl >= seg_len:
        raise ValueError("overlap must satisfy 0 <= overlap < segment_length")

    events_dir_p = Path(events_dir)
    # try exact identifier first
    match = _find_identifier_csv(events_dir_p, identifier)

    # Sanitization fallback: strip only a trailing "-checkpoint" and retry.
    # An empty remainder is skipped because it would match every file by substring.
    cleaned_identifier = None
    checkpoint_suffix = "-checkpoint"
    if match is None and isinstance(identifier, str) and identifier.endswith(checkpoint_suffix):
        stripped = identifier[: -len(checkpoint_suffix)]
        if stripped:
            match = _find_identifier_csv(events_dir_p, stripped)
            if match is not None:
                cleaned_identifier = stripped

    if match is None:
        raise FileNotFoundError(f"No matching events CSV found for identifier '{identifier}' in {events_dir_p}")

    # read events dataframe and normalize the required column names (case-insensitive)
    df = pd.read_csv(match)
    df_cols_lower = {c.lower(): c for c in df.columns}
    required = {"start_time", "duration", "event_name"}
    if not required.issubset(set(df_cols_lower.keys())):
        raise ValueError(f"Events CSV must contain columns {required}. Found: {list(df.columns)}")
    df = df.rename(columns={
        df_cols_lower["start_time"]: "start_time",
        df_cols_lower["duration"]: "duration",
        df_cols_lower["event_name"]: "event_name"
    })

    total_csv_path = Path(total_durations_csv)
    try:
        total_duration = _lookup_total_duration_from_table(total_csv_path, identifier)
    except FileNotFoundError:
        # Retry with the sanitized identifier only when it was what located the
        # events file and the totals CSV itself exists.
        if cleaned_identifier is None or not total_csv_path.exists():
            raise
        total_duration = _lookup_total_duration_from_table(total_csv_path, cleaned_identifier)

    if total_duration <= 0:
        raise ValueError(f"total_duration for identifier '{identifier}' must be > 0. Got {total_duration}")

    step = seg_len - ovl
    if step <= 0:
        raise ValueError("segment_length - overlap must be > 0 (positive step required)")

    # At least one segment: for total_duration <= overlap the formula gives <= 0,
    # which would silently emit an empty CSV for very short recordings.
    n_segments = max(1, math.ceil((total_duration - ovl) / step))

    def _label_for(window_start: float, window_end: float) -> str:
        # Turn the detector's result for one window into the stored event_name string.
        events_list = detect_event_in_window(df, window_start, window_end)
        if isinstance(events_list, (list, tuple)):
            return join_multiple_events_with.join(events_list) if events_list else "Normal"
        return str(events_list)

    rows = []
    for seg_idx in range(n_segments):
        start_time = seg_idx * step
        end_time = min(start_time + seg_len, total_duration)

        # merge too-short final segment into previous if requested
        if seg_idx == n_segments - 1 and (end_time - start_time) < min_seg_dur and rows:
            prev = rows[-1]
            prev["end_time"] = end_time
            prev["event_name"] = _label_for(prev["start_time"], prev["end_time"])
            continue

        rows.append({
            "segment_number": seg_idx,
            "start_time": float(start_time),
            "end_time": float(end_time),
            "event_name": _label_for(float(start_time), float(end_time))
        })

        # The recording is fully covered; guards against a rounding-induced extra segment.
        if end_time >= total_duration:
            break

    seg_df = pd.DataFrame(rows, columns=["segment_number", "start_time", "end_time", "event_name"])

    # Prepare output path and ensure parent exists BEFORE writing
    out_path = Path(output_path) if output_path else Path(f"{identifier}_segments.csv")
    try:
        out_path.parent.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        raise OSError(f"Cannot create output directory '{out_path.parent}': {e}") from e

    try:
        seg_df.to_csv(out_path, index=False)
    except Exception as e:
        # Raise a clearer error so calling code can log it neatly
        raise OSError(f"Failed to write segments CSV to '{out_path}': {e}") from e

    return out_path, seg_df


# ----------------------
# Batch runner (update to call/create parent dir)
# ----------------------
def batch_create_segments(
    totals_csv: str,
    output_dir: str = "segments_output",
    events_dir: str = "identified_apnea_events_in_csv",
    segment_length: float = 3600.0,
    overlap: float = 0.0,
    total_durations_csv: str = "total_duration_of_each_patient_record.csv",
    join_multiple_events_with: str = ";",
    min_segment_duration: float = 0.0,
):
    """
    Run ``create_segment_csv_from_identifier`` for every identifier in a totals CSV.

    Identifiers are the distinct, non-blank, whitespace-stripped values of the first
    column of ``totals_csv``. Each one is written to
    ``<output_dir>/<identifier>_segments.csv``. A failure for one identifier is
    recorded and printed, and processing continues with the next.

    Returns
    -------
    (list, list) or None
        ``(success, failed)`` where ``success`` holds ``(identifier, out_path, n_segments)``
        tuples and ``failed`` holds ``(identifier, error_message)`` tuples.
        ``None`` is returned if the totals CSV has no columns.

    Raises
    ------
    FileNotFoundError
        If ``totals_csv`` does not exist.
    """
    totals_file = Path(totals_csv)
    if not totals_file.exists():
        raise FileNotFoundError(f"Totals CSV not found: {totals_file}")

    base_dir = Path(output_dir)
    base_dir.mkdir(parents=True, exist_ok=True)  # base output directory must exist up front

    # Everything is read as text with blanks kept as "" so identifiers are not reinterpreted.
    totals_table = pd.read_csv(totals_file, dtype=str, keep_default_na=False)
    if totals_table.shape[1] == 0:
        print("No identifiers found in totals CSV.")
        return

    id_column = totals_table[totals_table.columns[0]].astype(str).str.strip()
    identifiers = id_column.replace("", pd.NA).dropna().unique().tolist()

    success, failed = [], []
    for ident in identifiers:
        destination = base_dir / f"{ident}_segments.csv"
        try:
            # An identifier may contain path separators, so the parent can be nested.
            destination.parent.mkdir(parents=True, exist_ok=True)

            written_path, segments = create_segment_csv_from_identifier(
                identifier=ident,
                events_dir=events_dir,
                total_durations_csv=total_durations_csv,
                segment_length=segment_length,
                overlap=overlap,
                output_path=str(destination),
                join_multiple_events_with=join_multiple_events_with,
                min_segment_duration=min_segment_duration,
            )
            n_rows = len(segments)
            success.append((ident, written_path, n_rows))
            print(f"[OK] {ident} -> {written_path} ({n_rows} segments)")
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next identifier.
            failed.append((ident, str(e)))
            print(f"[ERR] {ident} -> {e}")

    print("\nSummary:")
    print(f"Processed: {len(identifiers)}; success: {len(success)}; failed: {len(failed)}")
    if failed:
        print("\nFailures (sample):")
        for ident, msg in failed[:10]:
            print(f"- {ident}: {msg}")

    return success, failed


In [30]:
# Input locations for the batch run.
# NOTE(review): these are hard-coded absolute home-directory paths — adjust before running elsewhere.
# totals_csv: table of per-recording total durations; events_dir: folder of per-recording event CSVs.
totals_csv = "/home/sshuvo13/BSPML_project_sbs_files/segmentation_30s/total_duration_of_each_patient_record.csv"
events_dir = "/home/sshuvo13/BSPML_project_sbs_files/segmentation_30s/identified_apnea_events_in_csv"

In [31]:
# Path("./rml_outputs/segment_details").mkdir(parents=True, exist_ok=True)

In [32]:

# Run the batch segmentation for every identifier listed in the totals CSV.
# `output_path` is the output-directory string assigned in an earlier configuration cell;
# that cell must have been executed before this one.
# segment_length=30 with overlap=20 advances each window by 10 (step = segment_length - overlap).
# The trailing ";" suppresses the cell's return-value display.
batch_create_segments(
    totals_csv=totals_csv,
    output_dir=output_path,
    events_dir=events_dir,
    segment_length=30,
    overlap=20,
    total_durations_csv=totals_csv,  # the same file used by create_segment_csv_from_identifier for lookups
);

[OK] 00000995-100507 -> rml_outputs/segment_details/00000995-100507_segments.csv (1788 segments)
[OK] 00001006-100507 -> rml_outputs/segment_details/00001006-100507_segments.csv (1430 segments)
[OK] 00001008-100507 -> rml_outputs/segment_details/00001008-100507_segments.csv (1512 segments)
[OK] 00001010-100507 -> rml_outputs/segment_details/00001010-100507_segments.csv (1323 segments)
[OK] 00001016-100507 -> rml_outputs/segment_details/00001016-100507_segments.csv (2183 segments)
[OK] 00001018-100507 -> rml_outputs/segment_details/00001018-100507_segments.csv (1525 segments)
[OK] 00001020-100507 -> rml_outputs/segment_details/00001020-100507_segments.csv (1650 segments)
[OK] 00001028-100507 -> rml_outputs/segment_details/00001028-100507_segments.csv (2207 segments)
[OK] 00001041-100507 -> rml_outputs/segment_details/00001041-100507_segments.csv (1498 segments)
[OK] 00001043-100507 -> rml_outputs/segment_details/00001043-100507_segments.csv (1530 segments)
[OK] 00001069-100507 -> rml_ou