# Segment the time series into 1-minute sequences for each user

In [1]:
from Funcs.Utility import *
import numpy as np
import pandas as pd
from typing import Dict, Callable, Union, Tuple, List, Optional, Iterable
from datetime import timedelta as td
from scipy import stats
import ray
import warnings
import time

In [2]:
def _safe_na_check(_v) -> bool:
    """Return True when *_v* is None, NaN or +/-infinity, otherwise False.

    Values that NumPy cannot test for NaN/inf (strings, arbitrary objects,
    multi-element arrays whose truth value is ambiguous, ...) are treated as
    "not missing" and yield False.
    """
    if _v is None:
        return True

    try:
        return bool(np.isnan(_v) or np.isinf(_v))
    except (TypeError, ValueError):
        # TypeError: the input type is not supported by np.isnan/np.isinf.
        # ValueError: the truth value of a multi-element array is ambiguous.
        # Only these are swallowed so KeyboardInterrupt & co. still propagate.
        return False

In [3]:
import os
import cloudpickle

# Pre-processed data, read through `load` from <PATH_INTERMEDIATE>/proc.pkl.
# NOTE(review): `load` and `PATH_INTERMEDIATE` are not defined in this file;
# presumably they come from the `Funcs.Utility` star import — confirm.
DATA = load(
    os.path.join(PATH_INTERMEDIATE, 'proc.pkl')
)

# Labels, indexed by (pcode, timestamp); pandas is asked to parse the index
# as dates.
LABELS_PROC = pd.read_csv(
    os.path.join(PATH_INTERMEDIATE, 'proc', 'LABELS_PROC.csv'),
    index_col=['pcode', 'timestamp'],
    parse_dates=True,
)

In [None]:
import pandas as pd
import ray

# Per-sensor numeric setting keyed by sensor name.
# NOTE(review): presumably a resampling period in seconds (0.25 for the four
# accelerometer channels, 0.5 for EDA) — confirm; nothing in the visible code
# reads this mapping.
RESAMPLE_S = {
    **dict.fromkeys(('ACC_AXX', 'ACC_AXY', 'ACC_AXZ', 'ACC_MAG'), 0.25),
    'EDA': 0.5,
}

@ray.remote
def segment_data(pcode, sensor_data, label_data):
    """Cut one user's sensor streams into per-minute windows and label them.

    Args:
        pcode: Key selecting the user; used as ``.loc[pcode]`` on every entry
            of ``sensor_data`` and on ``label_data``.
        sensor_data: Mapping ``{sensor_type: data}``. ``data.loc[pcode]`` must
            yield a time-indexed Series/DataFrame (an index that is not a
            ``DatetimeIndex`` is converted with ``pd.to_datetime``).
        label_data: Frame for which ``label_data.loc[pcode]`` yields rows
            indexed by label timestamp with ``'duration'`` and ``'label'``
            columns. ``duration`` is interpreted as minutes counted forward
            from the label timestamp.

    Returns:
        ``(labeled_sequences, unlabeled_sequences)``:
        ``labeled_sequences`` is a list of ``(sequences, label)`` tuples and
        ``unlabeled_sequences`` a list of ``sequences``, where ``sequences``
        is ``{sensor_type: slice of that sensor's data}`` for one window.

    Notes:
        * Windows start at every minute in which the *first* sensor of
          ``sensor_data`` has at least one sample.
        * Windows are half-open ``[start, end)`` so a sample lying exactly on
          a minute boundary belongs to a single window.
        * A window whose start coincides exactly with a label timestamp is
          stretched to that label's end (never shrunk below one minute).
        * A window is labeled when its start lies in
          ``[label_start, label_start + duration)``; if several labels match,
          the one with the latest timestamp wins. Labels with a missing
          duration never match.
    """
    one_minute = pd.Timedelta(minutes=1)

    def _window(data, start, end):
        """Return the rows of *data* whose timestamp lies in [start, end)."""
        lo = data.index.searchsorted(start, side='left')
        hi = data.index.searchsorted(end, side='left')
        return data.iloc[lo:hi]

    user_data = {}
    for sensor_type, data in sensor_data.items():
        data = data.loc[pcode]
        if not isinstance(data.index, pd.DatetimeIndex):
            data.index = pd.to_datetime(data.index)
        # Position-based slicing in `_window` requires a sorted index.
        if not data.index.is_monotonic_increasing:
            data = data.sort_index()
        user_data[sensor_type] = data

    labeled_sequences = []
    unlabeled_sequences = []
    if not user_data:
        # No sensors at all: nothing to segment.
        return labeled_sequences, unlabeled_sequences

    user_labels = label_data.loc[pcode]

    # Ensure user_labels is a DataFrame
    # NOTE(review): a Series here means `label_data` was not indexed by
    # (pcode, timestamp); the transposed frame is then indexed by the former
    # Series name rather than by time — confirm this case can occur at all.
    if isinstance(user_labels, pd.Series):
        user_labels = user_labels.to_frame().T
    if not isinstance(user_labels.index, pd.DatetimeIndex):
        user_labels.index = pd.to_datetime(user_labels.index)
    # Sorted so that "last matching row" means "most recent label".
    user_labels = user_labels.sort_index()

    # Label intervals, computed once instead of on every loop iteration.
    # A missing / non-numeric duration becomes NaT, which never compares True.
    label_starts = user_labels.index
    duration_minutes = pd.to_numeric(user_labels['duration'], errors='coerce')
    label_ends = label_starts + pd.to_timedelta(duration_minutes.to_numpy(), unit='m')
    label_values = user_labels['label']

    # Every minute in which the reference (first) sensor recorded something.
    reference = next(iter(user_data.values()))
    minute_starts = reference.index.floor('min').unique()

    for minute_start in minute_starts:
        window_end = minute_start + one_minute

        # A label stamped exactly at the window start stretches the window
        # over the label's whole duration.
        exact_ends = label_ends[label_starts == minute_start].dropna()
        if len(exact_ends):
            window_end = max(window_end, exact_ends[-1])

        sequences = {
            sensor_type: _window(data, minute_start, window_end)
            for sensor_type, data in user_data.items()
        }

        # `<=` (not `<`) so a label starting exactly at the window start
        # labels its own window.
        active = (label_starts <= minute_start) & (label_ends > minute_start)
        if active.any():
            label = label_values[active].iloc[-1]  # Use the most recent label
            labeled_sequences.append((sequences, label))
        else:
            unlabeled_sequences.append(sequences)

    return labeled_sequences, unlabeled_sequences


with on_ray():
    # Store the shared inputs in Ray's object store once; every task then
    # receives a reference instead of a freshly pickled copy per call.
    data_ref = ray.put(DATA)
    labels_ref = ray.put(LABELS_PROC)

    # One task per user. `segment_data` iterates `sensor_data.items()` as
    # (sensor_type, data) pairs, so it is handed the whole DATA mapping.
    # NOTE(review): assumes DATA is {sensor_type: data indexed by
    # (pcode, timestamp)} — confirm against the contents of proc.pkl.
    pcodes = LABELS_PROC.index.get_level_values('pcode').unique()
    segmented_data = ray.get([
        segment_data.remote(pcode, data_ref, labels_ref) for pcode in pcodes
    ])

    # Gathering labeled and unlabeled sequences across users
    labeled_sequences = [
        (sequences, label)
        for user_labeled, _ in segmented_data
        for sequences, label in user_labeled
    ]
    unlabeled_sequences = [
        sequences
        for _, user_unlabeled in segmented_data
        for sequences in user_unlabeled
    ]

    def _concat_sequences(sequence_dicts):
        """Join each window's sensors column-wise, then stack all windows."""
        frames = [
            pd.concat(list(sequences.values()), axis=1)
            for sequences in sequence_dicts
        ]
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Concatenate sequences into dataframes
    # NOTE(review): as in the original, the label of each labeled sequence is
    # not carried into `labeled_df`, and ignore_index=True discards the
    # timestamps and window boundaries — confirm this is intended.
    labeled_df = _concat_sequences(sequences for sequences, _ in labeled_sequences)
    unlabeled_df = _concat_sequences(unlabeled_sequences)

2023-12-02 22:10:43,880	INFO worker.py:1431 -- Connecting to existing Ray cluster at address: 143.248.57.77:6379...
2023-12-02 22:10:43,884	INFO worker.py:1612 -- Connected to Ray cluster. View the dashboard at 127.0.0.1:8265


KeyboardInterrupt: 