# Settings

In [21]:
from Funcs.Utility import  *

# Dataset Overview

## Participants

In [22]:
import pandas as pd
import os

# Load the participant table, key it by participant code, and save a copy
# (index included) into the intermediate-results directory.
PARTICIPANTS = pd.read_csv(PATH_PARTICIPANT)
PARTICIPANTS = PARTICIPANTS.set_index('pcode')

PARTICIPANTS.to_csv(
    os.path.join(PATH_INTERMEDIATE, 'PARTICIPANT_INFO.csv'),
    index=True,
)

## Labels (via ESM)

In [23]:
import pandas as pd
import os

# Read the ESM responses and index them by (pcode, timestamp), where timestamp
# is 'responseTime' interpreted as epoch milliseconds in UTC and converted to
# the timezone given by DEFAULT_TZ.
LABELS = pd.read_csv(PATH_ESM)
LABELS['timestamp'] = pd.to_datetime(
    LABELS['responseTime'], unit='ms', utc=True
).dt.tz_convert(DEFAULT_TZ)
LABELS = LABELS.set_index(['pcode', 'timestamp'])

# Preprocessing

## Labels

In [24]:
# Keep only responses that have a scheduledTime (rows where it is missing are dropped).
LABELS_VALID = LABELS.loc[LABELS['scheduledTime'].notna(), :]

# Per-participant number of non-null entries in the last column of the table.
responses_per_pcode = LABELS_VALID.groupby('pcode').count().iloc[:, -1]
print(f'# Non-voluntary response: {len(LABELS_VALID)}')
print(summary(responses_per_pcode))

# Participants with fewer than 35 counted responses are excluded entirely.
excl_pcode = responses_per_pcode.loc[responses_per_pcode < 35]

is_kept = ~LABELS_VALID.index.get_level_values('pcode').isin(excl_pcode.index)
LABELS_VALID = LABELS_VALID.loc[is_kept, :]

# Alternative: to apply the 35-response cut without the scheduledTime filter,
# start from LABELS instead of the scheduledTime-filtered frame above.

print(f'# Response from participants with enough responses: {len(LABELS_VALID)}')
print(summary(LABELS_VALID.groupby('pcode').count().iloc[:, -1]))

print('# Participants whose responses to ESM delivery were less then 35')
print(excl_pcode, f'#participants = {len(excl_pcode)} / #response = {sum(excl_pcode)}')

# Non-voluntary response: 3323
{'n': 76, 'sum': 3323, 'mean': 43.723684210526315, 'SD': 19.36291898394835, 'med': 43.5, 'range': (3, 83), 'conf.': (39.29906768359284, 48.14830073745979), 'nan_count': 0}
# Response from participants with enough responses: 2619
{'n': 47, 'sum': 2619, 'mean': 55.723404255319146, 'SD': 13.076201628480542, 'med': 52.0, 'range': (36, 83), 'conf.': (51.88408763344431, 59.56272087719398), 'nan_count': 0}
# Participants whose responses to ESM delivery were less then 35
pcode
P04    34
P07    24
P11    22
P14    11
P16    30
P17    13
P18    32
P20    31
P22    23
P24    10
P25    30
P29    32
P34    22
P36    29
P37    31
P38    33
P41    31
P43    24
P44    23
P46     4
P54    13
P56    31
P58    29
P62     3
P63    34
P64    30
P68    11
P73    31
P74    33
Name: change, dtype: int64 #participants = 29 / #response = 704


In [25]:
# Drop duplicate responses: within each participant, keep only the first row
# recorded for any given timestamp.
def _first_row_per_timestamp(group):
    flat = group.reset_index(drop=False)
    return flat.drop_duplicates(subset='timestamp', keep='first')


LABELS_VALID = (
    LABELS_VALID
    .groupby('pcode')
    .apply(_first_row_per_timestamp)
    .set_index(['pcode', 'timestamp'])
)

In [26]:
import pandas as pd
import numpy as np

# Three-way split of the raw stress rating by its sign.
stress_rating = LABELS_VALID['stress']
conditions = [stress_rating < 0, stress_rating == 0, stress_rating > 0]
choices = [0, 1, 2]  # negative, zero, positive (same order as `conditions`)

# Binary labels: 1 where the rating is strictly positive, otherwise 0.
binary_labels = {
    f'{rating}_fixed': np.where(LABELS_VALID[rating] > 0, 1, 0)
    for rating in ('valence', 'arousal', 'stress', 'disturbance')
}

LABELS_PROC = LABELS_VALID.assign(
    **binary_labels,
    # Rows matching none of the three conditions get NaN.
    stress_fixed_tri=np.select(conditions, choices, default=np.nan),
)
LABELS_PROC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,responseTime,scheduledTime,valence,arousal,attention,stress,duration,disturbance,change,valence_fixed,arousal_fixed,stress_fixed,disturbance_fixed,stress_fixed_tri
pcode,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
P01,2019-05-08 10:29:46+09:00,1557278986000,1557279000000.0,-3,3,3,3,5.0,-1,-3,0,1,1,0,2.0
P01,2019-05-08 11:16:12+09:00,1557281772000,1557282000000.0,-3,-2,2,2,15.0,3,-2,0,0,1,1,2.0
P01,2019-05-08 15:58:22+09:00,1557298702000,1557299000000.0,3,3,3,-3,20.0,2,0,1,1,0,1,0.0
P01,2019-05-08 16:41:51+09:00,1557301311000,1557301000000.0,3,3,3,-3,30.0,1,2,1,1,0,1,0.0
P01,2019-05-08 17:27:42+09:00,1557304062000,1557304000000.0,3,3,3,-3,20.0,2,2,1,1,0,1,0.0


In [27]:
import numpy as np

def zscore(col):
    """
    Standardize a column: subtract its mean and divide by its standard deviation.

    Parameters
    ----------
    col : pandas.Series
        The values to standardize. Mean and standard deviation are taken from
        `col.mean()` and `col.std()`.

    Returns
    -------
    pandas.Series
        `(col - mean) / std`, aligned with `col`.
    """
    centered = col - col.mean()
    return centered / col.std()

# Standardize the stress ratings over the whole table and take the mean of the
# resulting z-scores as the binarization threshold.
# NOTE(review): despite the column name 'stress_user_mean', the z-score is
# computed on the un-grouped column (all participants pooled), not per
# participant - confirm this is intended.
stress_z = LABELS_PROC['stress'].transform(zscore)
overall_mean_zscore = stress_z.mean()

LABELS_PROC['zscore'] = stress_z
# 1 when the z-score is strictly above the overall mean z-score, otherwise 0.
LABELS_PROC['stress_user_mean'] = (stress_z > overall_mean_zscore).astype(int)

In [28]:
# Show how many responses fall into each class of the binary 'stress_fixed' label.
LABELS_PROC['stress_fixed'].value_counts()

0    1702
1     917
Name: stress_fixed, dtype: int64

In [29]:
# Drop neutral responses (stress == 0, i.e. stress_fixed_tri == 1) and collapse the
# remaining classes to binary: 0 = negative stress rating, 1 = positive (previously 2).
#
# The filter keys on the raw 'stress' rating rather than on 'stress_fixed_tri' so the
# cell can be re-run safely: once 2 has been relabelled to 1, filtering on
# `stress_fixed_tri != 1` a second time would silently delete every positive row.
# On the first run both filters select the same rows, because stress_fixed_tri is 1
# exactly where stress == 0.
#
# Building the result with .assign() also avoids writing into a filtered slice
# (SettingWithCopyWarning).
LABELS_PROC = LABELS_PROC.loc[LABELS_PROC['stress'] != 0].assign(
    stress_fixed_tri=lambda x: x['stress_fixed_tri'].replace(2, 1)
)

In [30]:
import numpy as np


# Per-participant number of non-null entries in the last column of LABELS_PROC.
inst = LABELS_PROC.groupby('pcode').count().iloc[:, -1]


def _response_gaps_in_seconds(pcode):
    # Differences, in seconds, between each timestamp of one participant and the
    # timestamp stored just before it (the first entry has no predecessor and is dropped).
    stamps = LABELS_PROC.loc[(pcode,), :].index.array
    return (stamps - stamps.shift(1)).dropna().total_seconds()


# All participants' gaps joined into one flat array.
sam = np.concatenate([
    _response_gaps_in_seconds(p)
    for p in LABELS_PROC.index.get_level_values('pcode').unique()
])

# Summarize every label column whose name ends in '_dyn' or '_fixed'.
label_columns = [name for name in LABELS_PROC.columns if name.endswith(('_dyn', '_fixed'))]
for c in label_columns:
    print(f'- {c}:', summary(LABELS_PROC[c].astype(object)))

- valence_fixed: {'n': 2181, 'cardinality': 2, 'value_count': '1:1364, 0:817'}
- arousal_fixed: {'n': 2181, 'cardinality': 2, 'value_count': '0:1258, 1:923'}
- stress_fixed: {'n': 2181, 'cardinality': 2, 'value_count': '0:1264, 1:917'}
- disturbance_fixed: {'n': 2181, 'cardinality': 2, 'value_count': '0:1188, 1:993'}


In [31]:
# Persist the processed labels, including the index, as an intermediate CSV.
labels_proc_path = os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv')
LABELS_PROC.to_csv(labels_proc_path, index=True)

In [12]:
# import os
# import pandas as pd
# dir = os.path.join(PATH_INTERMEDIATE, 'labeled_joined')# Get the list of files in the directory
# file_list = [file for file in os.listdir(dir) if file.endswith('_labeled.csv')]

# # Concatenate the CSV files
# dfs = []
# for file in file_list:
#     file_path = os.path.join(dir, file)
#     df = pd.read_csv(file_path)
#     dfs.append(df)

# concatenated_df = pd.concat(dfs)

# # Print the concatenated dataframe
# print(concatenated_df)


In [13]:
# concatenated_df.label.value_counts()/61

## Sensor Data

In [2]:
import pandas as pd
import scipy.spatial.distance as dist
from typing import Dict, Union
import pygeohash as geo
from datetime import timedelta
from collections import defaultdict  
from scipy.signal import medfilt
from sklearn.preprocessing import MinMaxScaler

def trim_outlier(col, threshold=3.0):
    """
    Remove the values in a dataframe column based on the median and the median absolute deviation.

    Parameters
    ----------
    col : pandas.Series
        The column to be trimmed.
    threshold : float, optional
        The threshold for trimming, expressed in units of the Median Absolute Deviation (MAD).
        Observations with a distance greater than `threshold` times the MAD value from the median are removed.
        Observations exactly at that distance are kept.
        Default is 3.0.

    Returns
    -------
    pandas.Series
        The column without outliers. The original index labels of the retained
        observations are preserved. Missing values are never retained.
    """
    median = col.median()
    mad = (col - median).abs().median()
    threshold_value = threshold * mad
    # Inclusive bounds: only observations strictly farther than threshold * MAD
    # from the median are removed, as documented. With strict inequalities a
    # column whose MAD is 0 (more than half of the values identical) would lose
    # every observation, the median included.
    mask = (col >= median - threshold_value) & (col <= median + threshold_value)
    return col[mask]
    

In [3]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
from typing import Dict, Union
#import pygeohash as geo
from sklearn.cluster import DBSCAN
from datetime import timedelta
from collections import defaultdict
from poi import PoiCluster
import warnings
from pandas.errors import PerformanceWarning
import neurokit2 as nk
from scipy.signal import find_peaks
from scipy.integrate import simps
import scipy.signal
from typing import Union, Dict


# Ignore pandas PerformanceWarning and every RuntimeWarning from here on.
# NOTE(review): the RuntimeWarning filter is not limited to a module, so it also
# hides NumPy divide-by-zero / invalid-value warnings - confirm that is intended.
warnings.simplefilter(action='ignore', category=PerformanceWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)



# AmbientLight.csv
def _proc_ambient_light(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['brightness'].astype('float32')
    

# StepCount.csv
def _proc_step_count(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            steps=lambda x: (x['totalSteps'] - x['totalSteps'].shift(1)),
            pcode=pcode
        ).reset_index()
        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return new_data['steps'].dropna().astype('float32')
    


# Acceleration.csv
def _proc_acceleration(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.assign(
        mag=lambda x: np.sqrt(np.square(x['x']) + np.square(x['y']) + np.square(x['z']))
    )

    return {
        'AXX': data['x'].astype('float32'),
        'AXY': data['y'].astype('float32'),
        'AXZ': data['z'].astype('float32'),
        'MAG': data['mag'].astype('float32')
    }

# SkinTemperature.csv
def _proc_skin_temperature(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    """
    Trim outliers from, and z-score, the 'temperature' column per participant.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by ('pcode', 'timestamp') with a 'temperature' column.

    Returns
    -------
    pandas.Series
        float32 'temperature' values indexed by ('pcode', 'timestamp'). For
        each participant, values rejected by `trim_outlier` (threshold 3.0)
        and missing values are removed, and the remainder is standardized with
        that participant's own mean and standard deviation.
    """
    def _clean_participant(pcode):
        rows = (
            data.loc[(pcode, ), :]
            .sort_index(axis=0, level='timestamp')
            .assign(pcode=pcode)
            .reset_index()
        )
        # trim_outlier returns only the retained values; assigning them back
        # aligns on the row index, so trimmed rows become NaN and are dropped.
        rows = rows.assign(temperature=trim_outlier(rows['temperature'], threshold=3.0))
        rows = rows.loc[rows['temperature'].notnull()]
        # Z-score normalize column 'temperature'
        kept = rows['temperature']
        return rows.assign(temperature=(kept - kept.mean()) / kept.std())

    per_participant = [
        _clean_participant(pcode)
        for pcode in data.index.get_level_values('pcode').unique()
    ]
    combined = pd.concat(per_participant, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return combined['temperature'].astype('float32')


# RRI.csv
def _proc_rri(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    """
    Trim outliers from, and z-score, the 'interval' column per participant.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by ('pcode', 'timestamp') with an 'interval' column.

    Returns
    -------
    pandas.Series
        float32 'interval' values indexed by ('pcode', 'timestamp'). For each
        participant, values rejected by `trim_outlier` (threshold 3.0) and
        missing values are removed, and the remainder is standardized with
        that participant's own mean and standard deviation.
    """
    cleaned = []
    for pcode in data.index.get_level_values('pcode').unique():
        rows = data.loc[(pcode, ), :].sort_index(axis=0, level='timestamp')
        rows = rows.assign(pcode=pcode).reset_index()

        # Assigning the trimmed series back aligns on the row index: rows that
        # trim_outlier rejected become NaN and are filtered out next.
        rows = rows.assign(interval=trim_outlier(rows['interval'], threshold=3.0))
        rows = rows.loc[rows['interval'].notnull()]

        # Z-score normalize column 'interval'
        center = rows['interval'].mean()
        spread = rows['interval'].std()
        cleaned.append(rows.assign(interval=(rows['interval'] - center) / spread))

    combined = pd.concat(cleaned, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )
    return combined['interval'].astype('float32')



# HR.csv
def _proc_hr(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    """
    Clean and standardize heart-rate readings per participant.

    Rows whose 'bpm' is missing or outside the closed range [30, 220] are
    discarded first (presumably the plausible heart-rate range - verify).
    Then, for each participant, values rejected by `trim_outlier`
    (threshold 3.0) are removed and the remainder is z-scored with that
    participant's own mean and standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by ('pcode', 'timestamp') with a 'bpm' column. The frame
        passed in is not modified.

    Returns
    -------
    pandas.Series
        float32 'bpm' z-scores indexed by ('pcode', 'timestamp').

    Raises
    ------
    ValueError
        From `pandas.concat` when no participant remains after the range filter.
    """
    # Both bounds must hold at the same time. Combining them with `|` is true
    # for every non-null value, so nothing would ever be filtered out.
    in_range = (data['bpm'] >= 30) & (data['bpm'] <= 220)
    # Select rows into a new frame instead of writing back into the caller's.
    data = data.loc[in_range]

    HRT = []
    for pcode in data.index.get_level_values('pcode').unique():
        v = data.loc[(pcode, ), :].sort_index(axis=0, level='timestamp').assign(pcode=pcode)
        v = v.reset_index()
        # trim_outlier returns only the retained values; assigning them back
        # aligns on the row index, so trimmed rows become NaN and are dropped.
        v = v.assign(bpm=trim_outlier(v['bpm'], threshold=3.0))
        v = v.loc[v['bpm'].notnull()]
        # Z-score normalize column 'bpm'
        v = v.assign(bpm=(v['bpm'] - v['bpm'].mean()) / v['bpm'].std())
        HRT.append(v)

    HRT = pd.concat(HRT, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return HRT['bpm'].astype('float32')
    

# # EDA.csv
# def _proc_eda(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
#     sampling_rate = 8

#     # Apply a median filter with a window size of window_size_sec seconds
#     window_size_sec = 5
#     window_size = window_size_sec * sampling_rate  # Multiply by the sampling frequency (8 Hz)

#    #Make the window size odd if it is even
#     if window_size % 2 == 0:
#         window_size += 1

#     data["conductance"] = 1 / (data["resistance"] / 1000) # divide by 1000 to convert kΩ to Ω
#     data['conductance'] =data.loc[(data['conductance'] >= 0.01) & (data['conductance'] <= 100), 'conductance']
#     data= data[~data['conductance'].isnull()]


#     eda = []
#     for pcode in data.index.get_level_values('pcode').unique():
#         v = data.loc[(pcode, ), :].sort_index(axis=0,level='timestamp').assign(pcode=pcode)
#         v = v.reset_index()

#         eda_data = v['conductance'].to_numpy()
#         eda_data = medfilt(eda_data, window_size)
#         # Reshape to 2D with a single column
#         eda_data = eda_data.reshape(-1, 1)
# #         eda_data = eda_data.reshape(-1)
#         # assuming your data is a numpy array with shape (n_samples, n_features)
#         scaler = MinMaxScaler()
#         eda_data_scaled = scaler.fit_transform(eda_data)
#         eda_data = scaler.inverse_transform(eda_data_scaled).reshape(-1)

#         v['conductance'] =eda_data
#         v= v[~v['conductance'].isnull()]

#         eda.append(v)

#     eda = pd.concat(eda, axis=0, ignore_index=True).set_index(
#                 ['pcode', 'timestamp']
#             ) 
    
    
#     return eda['conductance'].astype('float32')



def _proc_eda(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    """
    Derive a smoothed, low-pass filtered conductance signal per participant.

    Conductance is computed as ``1 / (resistance / 1000)``; rows whose
    conductance falls outside [0.01, 100] are discarded. Each participant's
    conductance, in timestamp order, is passed through a median filter and
    then a zero-phase second-order Butterworth low-pass filter.

    Parameters
    ----------
    data : pandas.DataFrame
        Table indexed by ('pcode', 'timestamp') with a 'resistance' column.
        The frame passed in is not modified.

    Returns
    -------
    pandas.Series
        float32 series named 'tonic_conductance', indexed by
        ('pcode', 'timestamp').

    Raises
    ------
    ValueError
        From `scipy.signal.filtfilt` when a participant has too few samples
        for the filter's default padding, or from `pandas.concat` when no
        participant remains after the range filter.
    """
    sampling_rate = 8  # Hz; used to size the median window and normalize the cutoff

    # Low-pass filter parameters
    lowpass_cutoff = 0.05  # cutoff frequency for the low-pass filter in Hz

    # Median filter window of window_size_sec seconds; medfilt needs an odd kernel size.
    window_size_sec = 5
    window_size = window_size_sec * sampling_rate
    if window_size % 2 == 0:
        window_size += 1

    # The filter design depends only on the constants above, so do it once
    # rather than once per participant. Wn is the cutoff as a fraction of Nyquist.
    b, a = scipy.signal.butter(N=2, Wn=lowpass_cutoff / (0.5 * sampling_rate), btype='low')

    # Build the conductance column on a new frame so the caller's table is not
    # given an extra column as a side effect.
    # NOTE(review): the physical units of 'resistance' are not visible here - confirm
    # that dividing by 1000 is the intended conversion.
    data = data.assign(conductance=lambda x: 1 / (x["resistance"] / 1000))
    data = data.loc[(data['conductance'] >= 0.01) & (data['conductance'] <= 100)]

    eda_tonic = []
    for pcode in data.index.get_level_values('pcode').unique():
        v = data.loc[(pcode, ), :].sort_index(axis=0, level='timestamp').assign(pcode=pcode)
        v = v.reset_index()

        eda_data = v['conductance'].to_numpy()
        eda_data = medfilt(eda_data, window_size)

        # Forward-backward filtering, so the low-pass adds no phase shift.
        tonic_eda = scipy.signal.filtfilt(b, a, eda_data)

        # NOTE(review): fit_transform followed by inverse_transform on the same
        # scaler maps the signal back to its original scale, so this round trip
        # leaves the values unchanged up to floating-point rounding. Confirm
        # whether min-max scaled values were meant to be kept instead.
        tonic_eda = tonic_eda.reshape(-1, 1)
        scaler = MinMaxScaler()
        tonic_eda_scaled = scaler.fit_transform(tonic_eda)
        tonic_eda = scaler.inverse_transform(tonic_eda_scaled).reshape(-1)

        # Add to the dataframe
        v['tonic_conductance'] = tonic_eda
        eda_tonic.append(v)

    eda_tonic = pd.concat(eda_tonic, axis=0, ignore_index=True).set_index(['pcode', 'timestamp'])

    return eda_tonic['tonic_conductance'].astype('float32')


# Distance.csv
def _proc_distance(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            distance=lambda x: x['totalDistance'] - x['totalDistance'].shift(1),
            pcode=pcode
        ).reset_index()

        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return {
        'DST': new_data['distance'].dropna().astype('float32'),
        # 'MOT': new_data['motionType'].astype('object'),
        'PAC': new_data['pace'].astype('float32'),
        'SPD': new_data['speed'].astype('float32')
    }


# Calorie.csv
def _proc_calories(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            calories=lambda x: x['totalCalories'] - x['totalCalories'].shift(1),
            pcode=pcode
        ).reset_index()

        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return new_data['calories'].dropna().astype('float32')    

In [4]:
import pandas as pd
import gc
from functools import reduce
import warnings
from pandas.errors import PerformanceWarning
from Funcs.Utility import _load_data

# Ignore pandas PerformanceWarning and every RuntimeWarning from here on.
warnings.simplefilter(action='ignore', category=PerformanceWarning)
warnings.simplefilter(action="ignore", category=RuntimeWarning)

# Registry mapping a data-type name to the function that preprocesses its raw
# table; `_process` looks the handler up here by data type. Entries that are
# commented out are disabled, and requesting such a type raises KeyError.
FUNC_PROC = {
    # 'Acceleration': _proc_acceleration,
    # 'AmbientLight': _proc_ambient_light,
    # 'Calorie': _proc_calories,
    # 'Distance': _proc_distance,
    'EDA': _proc_eda,
    # 'HR': _proc_hr,
    'RRI': _proc_rri,
    # 'SkinTemperature': _proc_skin_temperature,
    # 'StepCount': _proc_step_count
}


def _process(data_type: str):
    """
    Load one data type, run its registered processor, and key the output.

    Parameters
    ----------
    data_type : str
        Key into both `DATA_TYPES` (for the abbreviation) and `FUNC_PROC`
        (for the processing function).

    Returns
    -------
    dict
        When the processor returns a plain dict, one entry per item keyed
        '<abbrev>_<item key>'; otherwise a single entry keyed '<abbrev>'.

    Raises
    ------
    KeyError
        If `data_type` is missing from `DATA_TYPES` or `FUNC_PROC`.
    """
    log(f'Begin to processing data: {data_type}')

    prefix = DATA_TYPES[data_type]
    raw = _load_data(data_type)
    processed = FUNC_PROC[data_type](raw)

    # Exact type check: only a plain dict is expanded into prefixed entries.
    if type(processed) is dict:
        outputs = {f'{prefix}_{suffix}': series for suffix, series in processed.items()}
    else:
        outputs = {prefix: processed}

    log(f'Complete processing data: {data_type}')
    return outputs



#with on_ray(num_cpus=6):
with on_ray():
    # Submit one remote `_process` task per entry of DATA_TYPES.
    # NOTE(review): `_process` looks the handler up in FUNC_PROC, so any
    # DATA_TYPES entry without a registered handler fails with KeyError in its
    # worker - keep the two in sync.
    remote_process = ray.remote(_process).remote
    futures = [remote_process(data_type) for data_type in DATA_TYPES]

    # Every task returns a {feature name: series} dict. Merge them into one;
    # on a key collision the later task wins. Starting from an empty dict
    # keeps the merge valid when there are no tasks at all (reduce without an
    # initial value raises TypeError on an empty list).
    merged = reduce(lambda acc, part: {**acc, **part}, ray.get(futures), {})
    dump(merged, os.path.join(PATH_INTERMEDIATE, 'proc.pkl'))

    # Release the processed data before leaving the block.
    del futures, merged
    gc.collect()

2023-12-19 14:44:17,884	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8267 [39m[22m


[2m[36m(_process pid=274274)[0m [23-12-19 14:44:19] Begin to processing data: RRI
[2m[36m(_process pid=274273)[0m [23-12-19 14:44:19] Begin to processing data: EDA




[2m[36m(_process pid=274274)[0m [23-12-19 14:44:52] Complete processing data: RRI




[2m[36m(_process pid=274273)[0m [23-12-19 14:47:57] Complete processing data: EDA
