# Segment the time series into 1-minute sequences for each user

In [1]:
from Funcs.Utility import *
import numpy as np
import pandas as pd
from typing import Dict, Callable, Union, Tuple, List, Optional, Iterable
from datetime import timedelta as td
from scipy import stats
import ray
import warnings
import time

In [2]:
def _safe_na_check(_v):
    _is_nan_inf = False
    
    try:
        _is_nan_inf = np.isnan(_v) or np.isinf(_v)
    except:
        _is_nan_inf = False
    
    return _is_nan_inf or _v is None

In [3]:
import os
import cloudpickle

# Preprocessed sensor data; used further down as a mapping of
# sensor type -> data whose index has a 'pcode' level.
# NOTE(review): `load` comes from Funcs.Utility; presumably it unpickles the file - confirm.
DATA = load(
    os.path.join(PATH_INTERMEDIATE, 'proc.pkl')
)

# Processed label table, indexed by (pcode, timestamp).
LABELS_PROC = pd.read_csv(
    os.path.join(PATH_INTERMEDIATE, 'proc', 'LABELS_PROC.csv'),
    index_col=['pcode', 'timestamp'],
    parse_dates=True,
)

In [6]:
import pandas as pd
import ray
import logging
from datetime import datetime

# Configure the root logger at DEBUG level. This also surfaces DEBUG records
# emitted by third-party libraries (e.g. the `filelock` messages visible in
# this cell's output).
logging.basicConfig(level=logging.DEBUG)

# Disabled per-sensor resampling table, kept for reference. The driver loop
# below currently uses a fixed 1-second interval for every sensor instead.
# NOTE(review): values are presumably intervals in seconds - confirm before re-enabling.
# RESAMPLE_S = {
    # 'ACC_AXX': 0.25,
    # 'ACC_AXY': 0.25,
    # 'ACC_AXZ': 0.25,
    # 'ACC_MAG': 0.25,
    # 'EDA': 0.5,
# }

@ray.remote
def segment_sensor_data(pcode, sensor_type, sensor_data, label_data):
    """Cut one participant's sensor stream into one-minute windows and label them.

    For every minute boundary spanned by ``sensor_data`` a window of samples
    is extracted. The ``duration`` (in minutes) of the nearest label that lies
    strictly after the window start is looked up, and every label whose
    timestamp ``t`` satisfies ``t - duration < window_start <= t`` counts as
    covering the window. The ``stress_fixed`` value of the last covering label
    (in the stored row order) becomes the window's label.

    Args:
        pcode: Participant code; selects this participant's rows from
            ``label_data`` via ``.loc`` and is written to the ``pcode`` column.
        sensor_type: Name used for the column holding the sensor samples.
        sensor_data: Time-indexed samples of one sensor for one participant
            (must support ``.resample``).
        label_data: Label table whose first index level is the participant
            code and which provides ``duration`` and ``stress_fixed`` columns.

    Returns:
        ``(labeled, unlabeled)``: two DataFrames with the columns
        ``sensor_type``, ``'pcode'``, ``'timestamp'`` (window start) and
        ``'label'`` (``None`` for unlabeled windows). Either may be empty.

    Raises:
        KeyError: If ``pcode`` is not present in ``label_data``.
    """
    import numbers  # local import keeps the remote task self-contained

    user_labels = label_data.loc[pcode]
    if isinstance(user_labels, pd.Series):
        # A single matching row comes back as a Series; restore the frame shape.
        user_labels = user_labels.to_frame().T
    label_times = user_labels.index

    one_minute = pd.Timedelta(minutes=1)

    # Per-window frames are collected and concatenated once at the end;
    # concatenating inside the loop re-copies all accumulated rows each time.
    labeled_frames = []
    unlabeled_frames = []

    # Only the minute boundaries are needed here, not the resampled values.
    for window_start in sensor_data.resample('T').asfreq().index:
        label = None
        is_labeled = False

        future_labels = user_labels[label_times > window_start]
        if not future_labels.empty:
            # Positional lookup stays a scalar even if label timestamps repeat.
            nearest_pos = (future_labels.index - window_start).argmin()
            duration = future_labels['duration'].iloc[nearest_pos]

            # numbers.Real also matches NumPy scalars such as np.int64, which
            # are not instances of the built-in int.
            if not isinstance(duration, numbers.Real) or pd.isna(duration):
                duration = 1  # fall back to a one-minute label span

            span = pd.Timedelta(minutes=float(duration))
            covering = user_labels[
                (label_times >= window_start) & (label_times - span < window_start)
            ]
            if not covering.empty:
                label = covering.iloc[-1]['stress_fixed']
                is_labeled = True

        # NOTE: label-based slicing includes both endpoints, so a sample that
        # sits exactly on the next minute boundary also appears in the
        # following window.
        window_frame = pd.DataFrame({
            sensor_type: sensor_data.loc[window_start:window_start + one_minute],
            'pcode': pcode,
            'timestamp': window_start,
            'label': label,
        })
        if is_labeled:
            labeled_frames.append(window_frame)
        else:
            unlabeled_frames.append(window_frame)

    labeled_sequences_df = (
        pd.concat(labeled_frames, ignore_index=True) if labeled_frames else pd.DataFrame()
    )
    unlabeled_sequences_df = (
        pd.concat(unlabeled_frames, ignore_index=True) if unlabeled_frames else pd.DataFrame()
    )
    return labeled_sequences_df, unlabeled_sequences_df

# Fan out one Ray task per (participant, sensor), then gather the results and
# write the labeled / unlabeled windows to CSV.
with on_ray():
    # Put the label table into the object store once; passing the DataFrame
    # itself would re-serialise it for every task submission. Ray resolves the
    # reference back to the DataFrame before the task body runs.
    labels_ref = ray.put(LABELS_PROC)

    # Participants available per sensor, computed once instead of rebuilding
    # the index level for every (participant, sensor) pair.
    pcodes_by_sensor = {
        sensor_type: set(data.index.get_level_values('pcode').unique())
        for sensor_type, data in DATA.items()
    }

    # resample_interval = RESAMPLE_S.get(sensor_type, 1)
    resample_interval = 1  # seconds, same for every sensor

    segmented_data = []
    for pcode in LABELS_PROC.index.get_level_values('pcode').unique():
        print(f"{datetime.now()} - Segmenting {pcode} data...")
        for sensor_type, data in DATA.items():
            if pcode not in pcodes_by_sensor[sensor_type]:
                continue
            user_data = data.loc[pcode]
            # Regular grid with linear gap filling; rows that are still NaN
            # after interpolation are dropped.
            resampled_sensor_data = user_data.resample(f'{resample_interval}S').interpolate(method='linear').dropna()
            segmented_data.append(segment_sensor_data.remote(pcode, sensor_type, resampled_sensor_data, labels_ref))
        # At this point the tasks are only submitted; they complete in ray.get below.
        print(f"{datetime.now()} - Finished segmenting {pcode} data.")

    results = ray.get(segmented_data)

    print(f"{datetime.now()} - Finished segmenting data.")

    # Aggregate DataFrames (pd.concat raises on an empty list, hence the guard)
    labeled_df = pd.concat([item[0] for item in results], ignore_index=True) if results else pd.DataFrame()
    unlabeled_df = pd.concat([item[1] for item in results], ignore_index=True) if results else pd.DataFrame()

    print(f"{datetime.now()} - Finished aggregating data.")

    labeled_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc', 'labeled_sequences.csv'), index=False)
    unlabeled_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc', 'unlabeled_sequences.csv'), index=False)

    print(f"{datetime.now()} - Finished saving data.")

DEBUG:filelock:Attempting to acquire lock 140331246276032 on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Lock 140331246276032 acquired on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Attempting to release lock 140331246276032 on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Lock 140331246276032 released on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Attempting to acquire lock 140331246276032 on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Lock 140331246276032 acquired on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Attempting to release lock 140331246276032 on /tmp/ray/session_2023-12-05_20-24-59_296894_208978/ports_by_node.json.lock
DEBUG:filelock:Lock 140331246276032 released on /tmp/ray/session_2023-12-05_20-24-59_29

2023-12-05 20:25:02.091339 - Segmenting P01 data...
2023-12-05 20:25:13.862247 - Finished segmenting P01 data.
2023-12-05 20:25:13.862296 - Segmenting P02 data...
2023-12-05 20:25:28.561331 - Finished segmenting P02 data.
2023-12-05 20:25:28.561394 - Segmenting P03 data...
2023-12-05 20:25:49.696808 - Finished segmenting P03 data.
2023-12-05 20:25:49.696875 - Segmenting P04 data...
2023-12-05 20:26:10.021494 - Finished segmenting P04 data.
2023-12-05 20:26:10.021555 - Segmenting P05 data...
2023-12-05 20:26:30.797868 - Finished segmenting P05 data.
2023-12-05 20:26:30.797933 - Segmenting P06 data...
2023-12-05 20:26:50.697657 - Finished segmenting P06 data.
2023-12-05 20:26:50.697723 - Segmenting P07 data...
2023-12-05 20:27:10.809427 - Finished segmenting P07 data.
2023-12-05 20:27:10.809489 - Segmenting P08 data...
2023-12-05 20:27:30.937594 - Finished segmenting P08 data.
2023-12-05 20:27:30.937776 - Segmenting P09 data...
2023-12-05 20:27:51.221353 - Finished segmenting P09 data.
2