# Segment the time series into 1 minute sequences for each user

In [22]:
from Funcs.Utility import *
import numpy as np
import pandas as pd
from typing import Dict, Callable, Union, Tuple, List, Optional, Iterable
from datetime import timedelta as td
from scipy import stats
import ray
import warnings
import time

In [23]:
def _safe_na_check(_v):
    _is_nan_inf = False
    
    try:
        _is_nan_inf = np.isnan(_v) or np.isinf(_v)
    except:
        _is_nan_inf = False
    
    return _is_nan_inf or _v is None

In [24]:
import os
import cloudpickle

DATA = load(os.path.join(PATH_INTERMEDIATE, 'proc.pkl'))
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)

  LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)


In [25]:
SENSOR_TYPES = ['EDA', 'RRI']
DATA = { key: DATA[key] for key in SENSOR_TYPES}

In [31]:
import pandas as pd
import ray
import logging
from datetime import datetime
import os

logging.basicConfig(level=logging.DEBUG)

sequence_length = 1 # minutes
RESAMPLE_S = {
    'RRI': 1.0,
#     'AML': 1.0,
    'EDA': 1.0,
}

# @ray.remote
# def segment_sensor_data(pcode, sensor_type, sensor_data, label_data):

#     print(f"{datetime.now()} - Started segmenting {pcode} {sensor_type} data.")

#     user_labels = label_data.loc[pcode]
#     if isinstance(user_labels, pd.Series):
#         user_labels = user_labels.to_frame().T

#     labeled_sequences_df = pd.DataFrame()
#     unlabeled_sequences_df = pd.DataFrame()

#     resampled_data = sensor_data.resample('T').asfreq()
#     if isinstance(resampled_data, pd.Series):
#         resampled_data = resampled_data.to_frame()

#     for time, row in resampled_data.iterrows():
#         sequence = {sensor_type: sensor_data.loc[time:time + pd.Timedelta(minutes=sequence_length)], 'pcode': pcode, 'timestamp': time}

#         future_labels = user_labels[user_labels.index > time]
#         if not future_labels.empty:

#             time_differences = (future_labels.index - time).total_seconds()
#             abs_time_differences = abs(pd.Series(time_differences, index=future_labels.index))
#             nearest_future_time = abs_time_differences.idxmin()
#             label_row = future_labels.loc[nearest_future_time]

#             #Extend the labels using the duration
#             # # Ensure duration is a valid number
#             # duration = label_row['duration']
#             # if pd.isna(duration) or not isinstance(duration, (int, float)):
#             #     duration = sequence_length  # Default value or use another appropriate handling
            
#             #Instead of extending the labels, just use the sequence aligned with label timestamp
#             duration = sequence_length

#             overlapping_labels = user_labels[(user_labels.index >= time) & (user_labels.index - pd.Timedelta(minutes=duration) < time)]
#             # overlapping_labels = user_labels[(user_labels.index + pd.Timedelta(minutes=duration) >= time) & (user_labels.index - 2 * pd.Timedelta(minutes=duration) < time)]


#             if not overlapping_labels.empty:
#                 label = overlapping_labels.iloc[-1]['stress_fixed']
#                 sequence['label'] = label
#                 labeled_sequence_df = pd.DataFrame(sequence)
#                 labeled_sequences_df = pd.concat([labeled_sequences_df, labeled_sequence_df], ignore_index=True)
#             else:
#                 sequence['label'] = None
#                 unlabeled_sequence_df = pd.DataFrame(sequence)
#                 unlabeled_sequences_df = pd.concat([unlabeled_sequences_df, unlabeled_sequence_df], ignore_index=True)
#         else:
#             sequence['label'] = None
#             unlabeled_sequence_df = pd.DataFrame(sequence)
#             unlabeled_sequences_df = pd.concat([unlabeled_sequences_df, unlabeled_sequence_df], ignore_index=True)

#     print(f"{datetime.now()} - Finished segmenting {pcode} {sensor_type} data.")

#     # Save each user's sequences as separate CSV files
#     if not labeled_sequences_df.empty:
#         labeled_sequences_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc_updated', f"{pcode}_{sensor_type}_labeled.csv"), index=False)
#         print(f"{datetime.now()} - Finished saving {pcode} {sensor_type} labeled data.")
#     if not unlabeled_sequences_df.empty:
#         unlabeled_sequences_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc_updated', f"{pcode}_{sensor_type}_unlabeled.csv"), index=False)
#         print(f"{datetime.now()} - Finished saving {pcode} {sensor_type} unlabeled data.")

#     return pcode, sensor_type  # Just to track progress


@ray.remote
def segment_sensor_data(pcode, sensor_type, sensor_data, label_data):
    print(f"{datetime.now()} - Started segmenting {pcode} {sensor_type} data.")

    user_labels = label_data.loc[pcode]
    if isinstance(user_labels, pd.Series):
        user_labels = user_labels.to_frame().T

    labeled_sequences_df = pd.DataFrame()
    unlabeled_sequences_df = pd.DataFrame()

    resampled_data = sensor_data.resample('T').asfreq()
    if isinstance(resampled_data, pd.Series):
        resampled_data = resampled_data.to_frame()

    for time, row in resampled_data.iterrows():
        sequence = {sensor_type: sensor_data.loc[time:time + pd.Timedelta(minutes=sequence_length)], 'pcode': pcode, 'timestamp': time}

        future_labels = user_labels[user_labels.index > time]
        if not future_labels.empty:
            nearest_future_time = future_labels.index.min()
            label_row = future_labels.loc[nearest_future_time]

            duration = sequence_length
            overlapping_labels = user_labels[(user_labels.index >= time) & (user_labels.index - pd.Timedelta(minutes=duration) < time)]
            if not overlapping_labels.empty:
                label = overlapping_labels.iloc[-1]['stress_fixed']
                sequence['label'] = label
                sequence['label_type'] = 'real'
                labeled_sequence_df = pd.DataFrame(sequence)
                labeled_sequences_df = pd.concat([labeled_sequences_df, labeled_sequence_df], ignore_index=True)

                # # Extend label to adjacent sequences
                # for adj_time in [time - pd.Timedelta(minutes=duration), time + pd.Timedelta(minutes=duration)]:
                #     adj_sequence = {sensor_type: sensor_data.loc[adj_time:adj_time + pd.Timedelta(minutes=sequence_length)], 'pcode': pcode, 'timestamp': adj_time}
                #     adj_sequence['label'] = label
                #     adj_sequence['label_type'] = 'pseudo'
                #     adj_sequence_df = pd.DataFrame(adj_sequence)
                #     labeled_sequences_df = pd.concat([labeled_sequences_df, adj_sequence_df], ignore_index=True)
            else:
                sequence['label'] = None
                sequence['label_type'] = 'unlabeled'
                unlabeled_sequence_df = pd.DataFrame(sequence)
                unlabeled_sequences_df = pd.concat([unlabeled_sequences_df, unlabeled_sequence_df], ignore_index=True)
        else:
            sequence['label'] = None
            sequence['label_type'] = 'unlabeled'
            unlabeled_sequence_df = pd.DataFrame(sequence)
            unlabeled_sequences_df = pd.concat([unlabeled_sequences_df, unlabeled_sequence_df], ignore_index=True)

    print(f"{datetime.now()} - Finished segmenting {pcode} {sensor_type} data.")

    # Save the data
    if not labeled_sequences_df.empty:
        labeled_sequences_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc_updated', f"{pcode}_{sensor_type}_labeled.csv"), index=False)
        print(f"{datetime.now()} - Finished saving {pcode} {sensor_type} labeled data.")
    if not unlabeled_sequences_df.empty:
        unlabeled_sequences_df.to_csv(os.path.join(PATH_INTERMEDIATE, 'proc_updated', f"{pcode}_{sensor_type}_unlabeled.csv"), index=False)
        print(f"{datetime.now()} - Finished saving {pcode} {sensor_type} unlabeled data.")

    return pcode, sensor_type



with on_ray():


    segmented_data = []
    for pcode in LABELS_PROC.index.get_level_values('pcode').unique():
        print(f"{datetime.now()} - Segmenting {pcode} data...")
        for sensor_type, data in DATA.items():
            if pcode not in data.index.get_level_values('pcode'):
                print(f"{datetime.now()} - No data for {pcode} {sensor_type}")
            if pcode in data.index.get_level_values('pcode'):
                resample_interval =  RESAMPLE_S.get(sensor_type) or 1  # Assuming 1 second as the interval
                user_data = data.loc[pcode]
                max_gap_size = int(2 * float(resample_interval))
                resampled_sensor_data = user_data.resample(f'{resample_interval}S').mean().interpolate(method='linear', limit=max_gap_size).fillna(method='ffill',limit=max_gap_size).fillna(method='bfill',limit=max_gap_size).dropna()
                segmented_data.append(segment_sensor_data.remote(pcode, sensor_type, resampled_sensor_data, LABELS_PROC))
        # print(f"{datetime.now()} - Finished segmenting {pcode} data.")

    results = ray.get(segmented_data)
    print(f"{datetime.now()} - Finished segmenting and saving data for all users.")

2023-12-20 19:56:51,368	INFO worker.py:1664 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8266 [39m[22m


2023-12-20 19:56:51.777339 - Segmenting P01 data...


  resampled_sensor_data = user_data.resample(f'{resample_interval}S').mean().interpolate(method='linear', limit=max_gap_size).fillna(method='ffill',limit=max_gap_size).fillna(method='bfill',limit=max_gap_size).dropna()


[36m(segment_sensor_data pid=326051)[0m 2023-12-20 19:56:54.230785 - Started segmenting P01 EDA data.
2023-12-20 19:56:54.829536 - Segmenting P02 data...
[36m(segment_sensor_data pid=326040)[0m 2023-12-20 19:56:54.937640 - Started segmenting P01 RRI data.
2023-12-20 19:56:57.873011 - Segmenting P03 data...
[36m(segment_sensor_data pid=326041)[0m 2023-12-20 19:57:00.338614 - Started segmenting P03 EDA data.[32m [repeated 2x across cluster][0m
[36m(segment_sensor_data pid=326032)[0m 2023-12-20 19:56:57.989540 - Started segmenting P02 RRI data.
2023-12-20 19:57:00.943701 - Segmenting P05 data...
[36m(segment_sensor_data pid=326054)[0m 2023-12-20 19:57:01.078393 - Started segmenting P03 RRI data.
2023-12-20 19:57:04.105470 - Segmenting P06 data...
[36m(segment_sensor_data pid=326045)[0m 2023-12-20 19:57:06.742981 - Started segmenting P06 EDA data.[32m [repeated 2x across cluster][0m
[36m(segment_sensor_data pid=326053)[0m 2023-12-20 19:57:04.293066 - Started segmenting P0