In [1]:
import pytz
import os


DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

PATH_INTERMEDIATE = './intermediate'

DATA_TYPES = {
    'Acceleration': 'ACC',
    'AmbientLight': 'AML',
    'Calorie': 'CAL',
    'Distance': 'DST',
    'EDA': 'EDA',
    'HR': 'HRT',
    'RRI': 'RRI',
    'SkinTemperature': 'SKT',
    'StepCount': 'STP',
    'UltraViolet': 'ULV',
    'ActivityEvent': 'ACE',
    'ActivityTransition': 'ACT',
    'AppUsageEvent': 'APP',
    'BatteryEvent': 'BAT',
    'CallEvent': 'CAE',
    'Connectivity': 'CON',
    'DataTraffic': 'DAT',
    'InstalledApp': 'INS',
    'Location': 'LOC',
    'MediaEvent': 'MED',
    'MessageEvent': 'MSG',
    'WiFi': 'WIF',
    'ScreenEvent': 'SCR',
    'RingerModeEvent': 'RNG',
    'ChargeEvent': 'CHG',
    'PowerSaveEvent': 'PWS',
    'OnOffEvent': 'ONF'
}


In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    with open(path, mode='rb') as f:
        return cloudpickle.load(f)

    
def dump(obj, path: str):
    with open(path, mode='wb') as f:
        cloudpickle.dump(obj, f)
        
    
def log(msg: any):
    print('[{}] {}'.format(datetime.now().strftime('%y-%m-%d %H:%M:%S'), msg))


def summary(x):
    x = np.asarray(x)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b': 
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            cnt = cnt[:20]                
            cnt_str = ', '.join([f'{u}:{c}' for u, c in zip(cnt.index, cnt)])
            if card > 30:
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }
        else: 
            x_nan = x[np.isnan(x)]
            x_norm = x[~np.isnan(x)]
            
            tot = np.sum(x_norm)
            m = np.mean(x_norm)
            me = np.median(x_norm)
            s = np.std(x_norm, ddof=1)
            l, u = np.min(x_norm), np.max(x)
            conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
            n_nan = len(x_nan)
            
            return {
                'n': n,
                'sum': tot,
                'mean': m,
                'SD': s,
                'med': me,
                'range': (l, u),
                'conf.': (conf_l, conf_u),
                'nan_count': n_nan
            }


@contextmanager
def on_ray(*args, **kwargs):
    try:
        if ray.is_initialized():
            ray.shutdown()
        ray.init(*args, **kwargs)
        yield None
    finally:
        ray.shutdown()

In [3]:
%load_ext rpy2.ipython

In [5]:
%%R

library(tidyverse)
library(ggforce)
library(ggpubr)
library(showtext)
library(rmcorr)
library(patchwork)

font_add_google(
    name='Source Serif 4',
    family='ssp',
    db_cache=FALSE
)

showtext_auto()

THEME_DEFAULT <- theme_bw(
    base_size=10,
    base_family='ssp',
) + theme(
        axis.title.x=element_text(colour='grey20', size=10, face='bold'),
        axis.title.y=element_text(colour='grey20', size=10, face='bold'),
        axis.text.x=element_text(colour='grey20', size=10),
        axis.text.y=element_text(colour='grey20', size=10),
        strip.text.x=element_text(colour='grey20', size=10, face='bold'),
        strip.text.y=element_text(colour='grey20', size=10, face='bold'),
        legend.background=element_blank(),
        legend.title=element_text(colour='grey20', size=10, face='bold'),
        legend.text=element_text(colour='grey20', size=10),
        legend.position='top',
        legend.box.spacing= unit(0, 'cm'),
        plot.subtitle=element_text(colour='grey20', size=10, hjust=.5),
    )


In [6]:
import pandas as pd
import os


PARTICIPANTS = pd.read_csv(PATH_PARTICIPANT).set_index('pcode').assign(
    particpationStartDateTime=lambda x: pd.to_datetime(
        x['participationStartDate'], format='%Y-%m-%d'
    ).dt.tz_convert(DEFAULT_TZ)
)
PARTICIPANTS.head()

Unnamed: 0_level_0,participationStartDate,participationStartTimestamp,age,gender,openness,conscientiousness,neuroticism,extraversion,agreeableness,PSS10,PHQ9,GHQ12,particpationStartDateTime
pcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
P01,2019-05-08 00:00:00+09:00,1557241200000,27,M,11,11,3,4,13,13,0,1,2019-05-08 00:00:00+09:00
P02,2019-05-08 00:00:00+09:00,1557241200000,21,M,14,5,12,14,5,27,6,18,2019-05-08 00:00:00+09:00
P03,2019-05-08 00:00:00+09:00,1557241200000,24,F,10,15,8,7,11,18,2,6,2019-05-08 00:00:00+09:00
P04,2019-05-08 00:00:00+09:00,1557241200000,23,M,12,11,8,6,11,20,1,9,2019-05-08 00:00:00+09:00
P05,2019-05-08 00:00:00+09:00,1557241200000,27,F,10,11,13,10,6,25,14,9,2019-05-08 00:00:00+09:00


In [7]:
for c in PARTICIPANTS.columns:
    print(f'- {c}:', summary(PARTICIPANTS[c]))

- participationStartDate: {'n': 77, 'cardinality': 3, 'value_count': '2019-05-08 00:00:00+09:00:27, 2019-05-16 00:00:00+09:00:25, 2019-04-30 00:00:00+09:00:25'}
- participationStartTimestamp: {'n': 77, 'sum': 119907572400000, 'mean': 1557241200000.0, 'SD': 560637231.4279153, 'med': 1557241200000.0, 'range': (1556550000000, 1557932400000), 'conf.': (1557113950957.346, 1557368449042.654), 'nan_count': 0}
- age: {'n': 77, 'sum': 1686, 'mean': 21.896103896103895, 'SD': 3.8613619617422406, 'med': 21.0, 'range': (17, 38), 'conf.': (21.019682236199852, 22.77252555600794), 'nan_count': 0}
- gender: {'n': 77, 'cardinality': 2, 'value_count': 'M:53, F:24'}
- openness: {'n': 77, 'sum': 787, 'mean': 10.220779220779221, 'SD': 2.8956563505732467, 'med': 11.0, 'range': (3, 15), 'conf.': (9.563545848092234, 10.878012593466208), 'nan_count': 0}
- conscientiousness: {'n': 77, 'sum': 820, 'mean': 10.64935064935065, 'SD': 2.3662441579221882, 'med': 11.0, 'range': (5, 15), 'conf.': (10.112279104861539, 11.

In [8]:
import pandas as pd
import os


LABELS = pd.read_csv(PATH_ESM).set_index(
    ['pcode']
)
LABELS.head()

Unnamed: 0_level_0,responseTime,scheduledTime,valence,arousal,attention,stress,duration,disturbance,change
pcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
P01,1557278103000,,0,0,0,-1,20.0,3,-2
P01,1557278986000,1557279000000.0,-3,3,3,3,5.0,-1,-3
P01,1557281772000,1557282000000.0,-3,-2,2,2,15.0,3,-2
P01,1557287138000,,2,-1,2,0,15.0,1,-1
P01,1557291117000,,3,3,3,-3,20.0,1,0


In [9]:
for c in LABELS.columns:
    print(f'- {c}:', summary(LABELS[c]))

- responseTime: {'n': 5582, 'sum': 8694314195328000, 'mean': 1557562557385.8833, 'SD': 590915040.4254278, 'med': 1557562969500.0, 'range': (1556582982000, 1558545246000), 'conf.': (1557547052362.8618, 1557578062408.9048), 'nan_count': 0}
- scheduledTime: {'n': 5582, 'sum': 5175814282500000.0, 'mean': 1557572760306.9517, 'SD': 591697484.8543198, 'med': 1557565860000.0, 'range': (1556586120000.0, nan), 'conf.': (1557552635074.4736, 1557592885539.4297), 'nan_count': 2259}
- valence: {'n': 5582, 'sum': 3665, 'mean': 0.6565747044070226, 'SD': 1.4184297545899174, 'med': 1.0, 'range': (-3, 3), 'conf.': (0.6193565182132938, 0.6937928906007513), 'nan_count': 0}
- arousal: {'n': 5582, 'sum': -529, 'mean': -0.09476890003582945, 'SD': 1.6675313128774563, 'med': 0.0, 'range': (-3, 3), 'conf.': (-0.13852326339835566, -0.051014536673303246), 'nan_count': 0}
- attention: {'n': 5582, 'sum': 2236, 'mean': 0.4005732712289502, 'SD': 1.6113242733571864, 'med': 1.0, 'range': (-3, 3), 'conf.': (0.35829372468

In [10]:
inst = LABELS.groupby('pcode').count().iloc[:, -1]
inst_sch = LABELS.loc[lambda x: ~x['scheduledTime'].isna(), :].groupby('pcode').count().iloc[:, -1]
inst_vol = LABELS.loc[lambda x: x['scheduledTime'].isna(), :].groupby('pcode').count().iloc[:, -1]
resp_time = LABELS.assign(
    timestamp=lambda x: pd.to_datetime(x['responseTime'], unit='ms', utc=True).dt.tz_convert(DEFAULT_TZ)
)
sam = np.concatenate([
    (resp_time.loc[p, 'timestamp'].array - resp_time.loc[p, 'timestamp'].array.shift(1)).dropna().total_seconds()
    for p in LABELS.index.unique()
])

print('- # Inst.:', summary(inst))
print('- # Inst. - Scheduled:', summary(inst_sch))
print('- # Inst. - Voluntary:', summary(inst_vol))
print('- Samp. period:', summary(sam))
for c in LABELS.columns:
    print(f'- {c}:', summary(LABELS[c]))

- # Inst.: {'n': 77, 'sum': 5582, 'mean': 72.49350649350649, 'SD': 16.02270048911147, 'med': 74.0, 'range': (20, 110), 'conf.': (68.85679957559911, 76.13021341141386), 'nan_count': 0}
- # Inst. - Scheduled: {'n': 76, 'sum': 3323, 'mean': 43.723684210526315, 'SD': 19.36291898394835, 'med': 43.5, 'range': (3, 83), 'conf.': (39.29906768359284, 48.14830073745979), 'nan_count': 0}
- # Inst. - Voluntary: {'n': 77, 'sum': 2259, 'mean': 29.337662337662337, 'SD': 16.297893300742235, 'med': 27.0, 'range': (2, 74), 'conf.': (25.638494313245726, 33.03683036207895), 'nan_count': 0}
- Samp. period: {'n': 5505, 'sum': 42240670.0, 'mean': 7673.146230699364, 'SD': 13193.471538029606, 'med': 3090.0, 'range': (1.0, 136446.0), 'conf.': (7324.548923384188, 8021.743538014541), 'nan_count': 0}
- responseTime: {'n': 5582, 'sum': 8694314195328000, 'mean': 1557562557385.8833, 'SD': 590915040.4254278, 'med': 1557562969500.0, 'range': (1556582982000, 1558545246000), 'conf.': (1557547052362.8618, 1557578062408.904

In [11]:
data = LABELS.reset_index()[[
    'pcode', 'valence', 'arousal', 'stress', 'attention', 'disturbance', 'change'
]]

In [12]:
%%R -i data 

com <- combn(c('valence', 'arousal', 'stress', 'attention', 'disturbance', 'change'), 2)

for(i in 1:ncol(com)) {
    a <- com[, i][1]
    b <- com[, i][2]
    r <- rmcorr(participant = 'pcode', measure1=a, measure2=b, dataset=data)
    cat(a, '-', b, ': R =', r$r, '(p =', r$p, ') \n')
}

valence - arousal : R = 0.3858505 (p = 6.005105e-195 ) 
valence - stress : R = -0.5918317 (p = 0 ) 
valence - attention : R = 0.2880379 (p = 1.138238e-105 ) 
valence - disturbance : R = -0.02946216 (p = 0.02880521 ) 
valence - change : R = 0.3163849 (p = 2.978458e-128 ) 
arousal - stress : R = -0.2020498 (p = 8.095612e-52 ) 
arousal - attention : R = 0.4354836 (p = 1.162341e-253 ) 
arousal - disturbance : R = 0.0284022 (p = 0.03507786 ) 
arousal - change : R = 0.1673468 (p = 7.1899e-36 ) 
stress - attention : R = -0.1515681 (p = 1.176714e-29 ) 
stress - disturbance : R = 0.08679844 (p = 1.108432e-10 ) 
stress - change : R = -0.2907816 (p = 9.560751e-108 ) 
attention - disturbance : R = 0.1182107 (p = 1.363257e-18 ) 
attention - change : R = 0.1163492 (p = 4.682081e-18 ) 
disturbance - change : R = -0.2216444 (p = 3.009815e-62 ) 


In [13]:
import os
import pandas as pd
from typing import Optional


def _load_data(
    name: str
) -> Optional[pd.DataFrame]:
    paths = [
        (d, os.path.join(PATH_SENSOR, d, f'{name}.csv'))
        for d in os.listdir(PATH_SENSOR)
        if d.startswith('P')
    ]
    return pd.concat(
        filter(
            lambda x: len(x.index), 
            [
                pd.read_csv(p).assign(pcode=pcode)
                for pcode, p in paths
                if os.path.exists(p)
            ]
        ), ignore_index=True
    ).assign(
        timestamp=lambda x: pd.to_datetime(x['timestamp'], unit='ms', utc=True).dt.tz_convert(DEFAULT_TZ)
    ).set_index(
        ['pcode', 'timestamp']
    )


In [14]:
import pandas as pd
import gc
from datetime import timedelta as td


STATS = []

for data_type in DATA_TYPES:
    dat = _load_data(data_type)
    inst = dat.groupby('pcode').count().iloc[:, -1]
    samp = np.concatenate([
        (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()
        for p in dat.index.get_level_values('pcode').unique()
    ])
    inst, samp = summary(inst), summary(samp)
    
    print('#'*5, data_type, '#'*5)
    print('- # Inst.:', inst)
    print('- Samp. period:', samp)
    
    for c in dat.columns:
        print(f'- {c}:', summary(dat[c]))
        
    del dat
    gc.collect()
    
STATS = pd.DataFrame(STATS)

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### Acceleration #####
- # Inst.: {'n': 77, 'sum': 123260001, 'mean': 1600779.2337662338, 'SD': 473426.34254662704, 'med': 1616756.0, 'range': (489289, 2464554), 'conf.': (1493324.634822134, 1708233.8327103336), 'nan_count': 0}
- Samp. period: {'n': 123259924, 'sum': 41213652.71599982, 'mean': 0.33436376868121237, 'SD': 113.32762322571334, 'med': 0.13, 'range': (0.001, 347544.66), 'conf.': (0.3143571853179472, 0.35437035204447753), 'nan_count': 0}
- x: {'n': 123260001, 'sum': -5999685.698486212, 'mean': -0.0486750417800679, 'SD': 0.619149989039202, 'med': -0.066162109375, 'range': (-7.9951171875, 7.99975585938), 'conf.': (-0.04878434498125062, -0.04856573857888518), 'nan_count': 0}
- y: {'n': 123260001, 'sum': 21869550.39282226, 'mean': 0.17742617406616976, 'SD': 0.5751403091314417, 'med': 0.223876953125, 'range': (-7.9951171875, 7.9990234375), 'conf.': (0.17732464022456404, 0.17752770790777547), 'nan_count': 0}
- z: {'n': 123260001, 'sum': 27721322.122802727, 'mean': 0.2249011998856

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### AmbientLight #####
- # Inst.: {'n': 77, 'sum': 31948974, 'mean': 414921.74025974027, 'SD': 122176.94279182944, 'med': 416936.0, 'range': (125402, 643940), 'conf.': (387190.9758132746, 442652.5047062059), 'nan_count': 0}
- Samp. period: {'n': 31948897, 'sum': 41213635.99199994, 'mean': 1.2899861923871718, 'SD': 222.607981838843, 'med': 0.5, 'range': (0.001, 347544.869), 'conf.': (1.2127962165083057, 1.367176168266038), 'nan_count': 0}
- brightness: {'n': 31948974, 'sum': 11939891469, 'mean': 373.71752435618123, 'SD': 2573.666971017758, 'med': 39.0, 'range': (0, 65535), 'conf.': (372.82509879993745, 374.609949912425), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### Calorie #####
- # Inst.: {'n': 77, 'sum': 15601514, 'mean': 202617.06493506493, 'SD': 59660.549453893436, 'med': 203676.0, 'range': (61197, 314216), 'conf.': (189075.78123744018, 216158.34863268968), 'nan_count': 0}
- Samp. period: {'n': 15601437, 'sum': 41213542.37799993, 'mean': 2.641650405536357, 'SD': 318.5582067426913, 'med': 1.025, 'range': (0.001, 347545.58), 'conf.': (2.483578524354583, 2.7997222867181315), 'nan_count': 0}
- caloriesToday: {'n': 15601514, 'sum': 17690514158, 'mean': 1133.897271636586, 'SD': 429.6894403067457, 'med': 1142.0, 'range': (0, 2387), 'conf.': (1133.684055812589, 1134.1104874605833), 'nan_count': 0}
- totalCalories: {'n': 15601514, 'sum': 90234460224, 'mean': 5783.6989553706135, 'SD': 3364.915210229474, 'med': 5451.0, 'range': (0, 14838), 'conf.': (5782.029253716513, 5785.368657024714), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### Distance #####
- # Inst.: {'n': 77, 'sum': 15658461, 'mean': 203356.63636363635, 'SD': 64968.13128573705, 'med': 206892.0, 'range': (31, 319157), 'conf.': (188610.67936026378, 218102.59336700893), 'nan_count': 0}
- Samp. period: {'n': 15658384, 'sum': 41219633.3800005, 'mean': 2.632432144977445, 'SD': 318.2560407682015, 'med': 1.009000192, 'range': (0.000999936, 347546.808999936), 'conf.': (2.4747976315300506, 2.790066658424839), 'nan_count': 0}
- motionType: {'n': 15658461, 'cardinality': 4, 'value_count': 'IDLE:14312951, WALKING:1312097, JOGGING:33213, RUNNING:200'}
- pace: {'n': 15658461, 'sum': 1729412434, 'mean': 110.4458754918507, 'SD': 366.63067086702637, 'med': 0.0, 'range': (0, 3225), 'conf.': (110.26428112155037, 110.62746986215103), 'nan_count': 0}
- speed: {'n': 15658461, 'sum': 187088259, 'mean': 11.948061753961644, 'SD': 36.67668738662747, 'med': 0.0, 'range': (0, 695), 'conf.': (11.929895570715365, 11.966227937207924), 'nan_count': 0}
- distanceToday: {'n': 1565846

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### EDA #####
- # Inst.: {'n': 77, 'sum': 80150329, 'mean': 1040913.3636363636, 'SD': 306422.1210160014, 'med': 1046093.0, 'range': (314771, 1615971), 'conf.': (971364.0733618538, 1110462.6539108735), 'nan_count': 0}
- Samp. period: {'n': 80150252, 'sum': 41213631.25599986, 'mean': 0.5142046372605275, 'SD': 140.5468260735239, 'med': 0.199, 'range': (0.001, 347546.153), 'conf.': (0.48343540259511586, 0.544973871925939), 'nan_count': 0}
- resistance: {'n': 80150329, 'sum': 4456740676426, 'mean': 55604.77083538858, 'SD': 121898.5131897955, 'med': 1385.0, 'range': (0, 340330), 'conf.': (55578.08419819559, 55631.45747258158), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### HR #####
- # Inst.: {'n': 77, 'sum': 13621023, 'mean': 176896.4025974026, 'SD': 52558.47787145981, 'med': 191711.0, 'range': (38545, 266374), 'conf.': (164967.0914179474, 188825.7137768578), 'nan_count': 0}
- Samp. period: {'n': 13620946, 'sum': 40968427.92100002, 'mean': 3.0077520255201082, 'SD': 362.9172458474613, 'med': 0.996, 'range': (0.001, 351677.643), 'conf.': (2.815020792256888, 3.2004832587833283), 'nan_count': 0}
- bpm: {'n': 13621023, 'sum': 1029728342, 'mean': 75.59845850051057, 'SD': 9.820564496209338, 'med': 75.0, 'range': (35, 199), 'conf.': (75.59324319542789, 75.60367380559326), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### RRI #####
- # Inst.: {'n': 77, 'sum': 20764471, 'mean': 269668.45454545453, 'SD': 81037.2444152014, 'med': 279148.0, 'range': (81185, 439154), 'conf.': (251275.25624700362, 288061.65284390544), 'nan_count': 0}
- Samp. period: {'n': 20764394, 'sum': 41213534.8499999, 'mean': 1.9848176089319003, 'SD': 276.2002925233017, 'med': 0.762, 'range': (0.001, 347548.529), 'conf.': (1.8660186540337294, 2.1036165638300712), 'nan_count': 0}
- interval: {'n': 20764471, 'sum': 15700581045.231997, 'mean': 756.1271869257828, 'SD': 178.71396844200436, 'med': 763.232, 'range': (298.65599999999995, 1493.28), 'conf.': (756.0503188199689, 756.2040550315967), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### SkinTemperature #####
- # Inst.: {'n': 77, 'sum': 535095, 'mean': 6949.285714285715, 'SD': 2044.9192742858738, 'med': 7010.0, 'range': (2095, 10770), 'conf.': (6485.145972273351, 7413.425456298079), 'nan_count': 0}
- Samp. period: {'n': 535018, 'sum': 41212170.55200001, 'mean': 77.02950284289503, 'SD': 1719.1916269814033, 'med': 30.082, 'range': (0.006, 347555.636), 'conf.': (72.42281097588157, 81.63619470990848), 'nan_count': 0}
- temperature: {'n': 535095, 'sum': 17310047.448570266, 'mean': 32.349484574833006, 'SD': 2.1008397544761457, 'med': 32.6599998474, 'range': (0.0, 41.1699981689), 'conf.': (32.343855635681685, 32.35511351398433), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### StepCount #####
- # Inst.: {'n': 77, 'sum': 15848456, 'mean': 205824.1038961039, 'SD': 60607.39763375781, 'med': 206903.0, 'range': (62164, 319175), 'conf.': (192067.91202315246, 219580.29576905532), 'nan_count': 0}
- Samp. period: {'n': 15848379, 'sum': 41213527.357999936, 'mean': 2.600488501568516, 'SD': 316.0710676012733, 'med': 1.009, 'range': (0.001, 347546.809), 'conf.': (2.4448774479214284, 2.7560995552156036), 'nan_count': 0}
- stepsToday: {'n': 15848456, 'sum': 51743117282, 'mean': 3264.8680276488763, 'SD': 2878.269178322284, 'med': 2686.0, 'range': (0, 20756), 'conf.': (3263.450974785182, 3266.2850805125704), 'nan_count': 0}
- totalSteps: {'n': 15848456, 'sum': 343389233538, 'mean': 21667.0465273084, 'SD': 15147.988032074709, 'med': 19338.0, 'range': (0, 93938), 'conf.': (21659.588746784702, 21674.5043078321), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### UltraViolet #####
- # Inst.: {'n': 77, 'sum': 264692, 'mean': 3437.5584415584417, 'SD': 1017.6239287455036, 'med': 3495.0, 'range': (1044, 5373), 'conf.': (3206.5861417259757, 3668.5307413909077), 'nan_count': 0}
- Samp. period: {'n': 264615, 'sum': 41109668.714999996, 'mean': 155.35653199931974, 'SD': 2474.851673896092, 'med': 59.998, 'range': (0.012, 347610.692), 'conf.': (145.92695907860184, 164.78610492003764), 'nan_count': 0}
- intensity: {'n': 264692, 'cardinality': 4, 'value_count': 'NONE:260364, LOW:4301, MEDIUM:26, HIGH:1'}
- exposureToday: {'n': 264692, 'sum': 14238420000, 'mean': 53792.407779607995, 'SD': 193566.12706067285, 'med': 0.0, 'range': (0, 2220000), 'conf.': (53054.9977676023, 54529.81779161369), 'nan_count': 0}
- totalExposure: {'n': 264692, 'sum': 50604600000, 'mean': 191182.9598174482, 'SD': 362878.33463728055, 'med': 0.0, 'range': (0, 2700000), 'conf.': (189800.53758704624, 192565.38204785014), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### ActivityEvent #####
- # Inst.: {'n': 77, 'sum': 857437, 'mean': 11135.545454545454, 'SD': 4325.836148396645, 'med': 11085.0, 'range': (2640, 22938), 'conf.': (10153.701085845014, 12117.389823245894), 'nan_count': 0}
- Samp. period: {'n': 857360, 'sum': 43179845.234, 'mean': 50.36372729541849, 'SD': 631.318232649463, 'med': 15.19, 'range': (0.001, 260838.132), 'conf.': (49.02739203647506, 51.70006255436193), 'nan_count': 0}
- confidenceStill: {'n': 857437, 'sum': 384491.49999999994, 'mean': 0.4484195340299053, 'SD': 0.4346116026395941, 'med': 0.24, 'range': (0.0, 1.0), 'conf.': (0.44749961646725483, 0.4493394515925558), 'nan_count': 0}
- confidenceUnknown: {'n': 857437, 'sum': 77627.03000000003, 'mean': 0.09053380015091492, 'SD': 0.15446505481813255, 'med': 0.01, 'range': (0.0, 1.0), 'conf.': (0.09020685278945478, 0.09086074751237505), 'nan_count': 0}
- confidenceOnFoot: {'n': 857437, 'sum': 141488.60000000003, 'mean': 0.16501340623276117, 'SD': 0.30429163345470095, 'med': 0.01, '

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### ActivityTransition #####
- # Inst.: {'n': 77, 'sum': 51009, 'mean': 662.4545454545455, 'SD': 344.9931938861276, 'med': 608.0, 'range': (232, 2394), 'conf.': (584.1506955080739, 740.7583954010171), 'nan_count': 0}
- Samp. period: {'n': 50932, 'sum': 42154858.737, 'mean': 827.6694168106495, 'SD': 4482.627081784924, 'med': 0.0, 'range': (0.0, 261515.903), 'conf.': (788.7384080176117, 866.6004256036873), 'nan_count': 0}
- transitionType: {'n': 51009, 'cardinality': 10, 'value_count': 'EXIT_WALKING:11158, ENTER_WALKING:11151, ENTER_STILL:10419, EXIT_STILL:10414, ENTER_IN_VEHICLE:2441, EXIT_IN_VEHICLE:2431, EXIT_ON_BICYCLE:1170, ENTER_ON_BICYCLE:1169, ENTER_RUNNING:328, EXIT_RUNNING:328'}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### AppUsageEvent #####
- # Inst.: {'n': 77, 'sum': 1404657, 'mean': 18242.2987012987, 'SD': 7252.845122441942, 'med': 16956.0, 'range': (5179, 38243), 'conf.': (16596.10478728152, 19888.492615315878), 'nan_count': 0}
- Samp. period: {'n': 1404580, 'sum': 43369161.539999984, 'mean': 30.876960756952244, 'SD': 518.8103586669805, 'med': 0.379, 'range': (0.0, 260988.331), 'conf.': (30.018967521349424, 31.734953992555063), 'nan_count': 0}
- name: {'n': 1404657, 'cardinality': 873, 'value_count': '카카오톡:485233, 안드로이드 시스템:109284, Samsung Experience 홈:94274, One UI 홈:47717, Facebook:47637, 기본홈:47156, Paco:42692, 시스템 UI:28225, 트위터:24874, Chrome:19664, Polar Beat:19492, 캐시워크:17301, 네이버 웹툰:16860, 삼성 인터넷:16780, Instagram:13360, 설정:13134, YouTube:12091, 메시지:11815, 시계:10724, 디시인사이드:10693, ...'}
- packageName: {'n': 1404657, 'cardinality': 883, 'value_count': 'com.kakao.talk:485233, com.sec.android.app.launcher:150200, android:118565, com.facebook.katana:47637, com.lge.launcher3:46200, com.pacoapp.p

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### BatteryEvent #####
- # Inst.: {'n': 77, 'sum': 1160326, 'mean': 15069.16883116883, 'SD': 7677.663567303528, 'med': 12795.0, 'range': (2872, 42124), 'conf.': (13326.552957674054, 16811.784704663605), 'nan_count': 0}
- Samp. period: {'n': 1160249, 'sum': 43417685.226999976, 'mean': 37.4210063762175, 'SD': 425.2680918274715, 'med': 10.168, 'range': (0.001, 261248.657), 'conf.': (36.64719387577137, 38.194818876663625), 'nan_count': 0}
- status: {'n': 1160326, 'cardinality': 4, 'value_count': 'CHARGING:651709, DISCHARGING:423437, FULL:81850, NOT_CHARGING:3330'}
- temperature: {'n': 1160326, 'sum': 37831986.300000004, 'mean': 32.6046182710721, 'SD': 4.162447912169574, 'med': 32.7, 'range': (9.7, 81.3), 'conf.': (32.59704458422733, 32.61219195791686), 'nan_count': 0}
- level: {'n': 1160326, 'sum': 73058432.0, 'mean': 62.963711922339066, 'SD': 28.007237523882562, 'med': 66.0, 'range': (0.0, 100.0), 'conf.': (62.912751994222575, 63.01467185045556), 'nan_count': 0}
- plugged: {'n': 1160326

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()
  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### CallEvent #####
- # Inst.: {'n': 67, 'sum': 1546, 'mean': 23.074626865671643, 'SD': 25.421485444543677, 'med': 13.0, 'range': (1, 123), 'conf.': (16.87383934740705, 29.275414383936237), 'nan_count': 0}
- Samp. period: {'n': 1479, 'sum': 16861406.623, 'mean': 11400.545384043273, 'SD': 27689.5710025381, 'med': 1337.943, 'range': (0.408, 338163.378), 'conf.': (9988.215903119613, 12812.874864966932), 'nan_count': 0}
- number: {'n': 1546, 'cardinality': 492, 'value_count': '0104$070dd33ca6bb9f397ef5d50b5ff85a68:152, 0103$40b5d03a8b8ea68054d5ac4c72b382d1:42, 0107$7a44af8ea4fa07ae50542fcb560dd529:41, 0104$953efebd0af0a0f6f73948961fd239bd:37, 0102$7891680e5c259162e79235c578895ebb:33, 0104$fab8457f6f6fe2575d1ba5205177cbee:28, 0105$b23b2674f2e76dabd69e76a2cd0cb438:23, 0105$5539bf1719ca24553f5cdc026780ab6d:21, 0158$af629036eeea7f946ff562aab2254d49:18, 0107$2dd389f6280caf7443d513a834c201b1:16, 1544$5f2c22cb4a5380af7ca75622a6426917:14, 0109$b11158939336f7a81a6eff9f662f0a3e:14, 0107$8c43cd4cd9

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### DataTraffic #####
- # Inst.: {'n': 77, 'sum': 692935, 'mean': 8999.155844155845, 'SD': 3664.6841935630578, 'med': 8428.0, 'range': (1584, 19668), 'conf.': (8167.374561859926, 9830.937126451763), 'nan_count': 0}
- Samp. period: {'n': 692858, 'sum': 43067429.304000005, 'mean': 62.15909941719661, 'SD': 807.6512193118496, 'med': 15.033, 'range': (11.534, 261869.448), 'conf.': (60.25736185376411, 64.0608369806291), 'nan_count': 0}
- rxKiloBytes: {'n': 692935, 'sum': 937801669, 'mean': 1353.3761016545564, 'SD': 9590.345100079721, 'med': 40.0, 'range': (0, 847233), 'conf.': (1330.795431037939, 1375.9567722711738), 'nan_count': 0}
- txKiloBytes: {'n': 692935, 'sum': 186407124, 'mean': 269.0109808279276, 'SD': 8837.797315547428, 'med': 11.0, 'range': (0, 844800), 'conf.': (248.20219990542003, 289.8197617504352), 'nan_count': 0}
- duration: {'n': 692935, 'sum': 10394025000, 'mean': 15000.0, 'SD': 0.0, 'med': 15000.0, 'range': (15000, 15000), 'conf.': (nan, nan), 'nan_count': 0}


  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### InstalledApp #####
- # Inst.: {'n': 77, 'sum': 1635350, 'mean': 21238.31168831169, 'SD': 5286.182229120261, 'med': 21459.0, 'range': (4046, 31192), 'conf.': (20038.495496588894, 22438.127880034484), 'nan_count': 0}
- Samp. period: {'n': 1635273, 'sum': 42803911.341000006, 'mean': 26.175391718080103, 'SD': 631.0867933465355, 'med': 0.0, 'range': (0.0, 271385.388), 'conf.': (25.208133627055055, 27.14264980910515), 'nan_count': 0}
- name: {'n': 1635350, 'cardinality': 3307, 'value_count': '전화:10225, 소프트웨어 업데이트:9364, 설정:7278, SmartThings:4777, 캘린더:4585, 메시지:3970, HTML 뷰어:3916, ABC Logger:3916, YouTube:3916, com.android.wallpapercropper:3916, Google One Time Init:3916, 외부 저장소:3916, 패키지 액세스 도움말:3915, ConfigUpdater:3915, AhnLab V3 Mobile Plus 2.0:3915, Google Play 스토어:3915, Polar Beat:3915, MmsService:3915, PacProcessor:3915, Gmail:3914, ...'}
- lastUpdateTime: {'n': 1635350, 'cardinality': 15829, 'value_count': '1230735600000:509400, 1230735600000:22951, 1217592000000:18601, 1481885422

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### Location #####
- # Inst.: {'n': 77, 'sum': 788430, 'mean': 10239.35064935065, 'SD': 5938.968016340164, 'med': 8854.0, 'range': (2047, 27742), 'conf.': (8891.370258346496, 11587.331040354804), 'nan_count': 0}
- Samp. period: {'n': 788353, 'sum': 42733537.05999998, 'mean': 54.20609430039586, 'SD': 888.3962695759781, 'med': 2.0, 'range': (0.001, 261313.101), 'conf.': (52.245015153086875, 56.16717344770485), 'nan_count': 0}
- altitude: {'n': 788430, 'sum': 28759191.682146616, 'mean': 36.47653143861423, 'SD': 0.43770958085690265, 'med': 36.3706519, 'range': (35.0889784, 37.7498106), 'conf.': (36.47556526929999, 36.47749760792847), 'nan_count': 0}
- longitude: {'n': 788430, 'sum': 100424855.4541156, 'mean': 127.37320428461068, 'SD': 0.7948267295443736, 'med': 127.359652, 'range': (-150.2128574, 189.4486178), 'conf.': (127.37144984004331, 127.37495872917805), 'nan_count': 0}
- latitude: {'n': 788430, 'sum': 28759191.682146616, 'mean': 36.47653143861423, 'SD': 0.43770958085690265, 'med':

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### MediaEvent #####
- # Inst.: {'n': 68, 'sum': 1814, 'mean': 26.676470588235293, 'SD': 32.195967660966296, 'med': 15.0, 'range': (1, 159), 'conf.': (18.883386171450674, 34.46955500501991), 'nan_count': 0}
- Samp. period: {'n': 1746, 'sum': 19530305.077, 'mean': 11185.741739404353, 'SD': 32116.61130432838, 'med': 149.0435, 'range': (0.063, 458846.323), 'conf.': (9678.242774201177, 12693.24070460753), 'nan_count': 0}
- mimetype: {'n': 1814, 'cardinality': 4, 'value_count': 'image/jpeg:1573, image/png:183, video/mp4:50, image/gif:8'}
- bucketDisplay: {'n': 1814, 'cardinality': 30, 'value_count': 'Camera:865, Screenshots:487, KakaoTalk:132, Capture+:57, Foodie:57, Download:48, Facebook:36, SNOW:34, B612:28, Twitter:13, Pictures:9, video:8, CandyCam:7, Pictail:6, KakaoStory:5, sCAM:5, ImageResized:2, GIF Capture:2, 19 이음:2, Ticket:1'}
##### MessageEvent #####
- # Inst.: {'n': 66, 'sum': 2591, 'mean': 39.25757575757576, 'SD': 61.801247337688686, 'med': 22.0, 'range': (1, 442), 'conf.': (

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()
  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### WiFi #####
- # Inst.: {'n': 77, 'sum': 15258102, 'mean': 198157.16883116882, 'SD': 147795.70290664234, 'med': 178686.0, 'range': (28475, 900435), 'conf.': (164611.65909872198, 231702.67856361566), 'nan_count': 0}
- Samp. period: {'n': 15258025, 'sum': 43178434.54099999, 'mean': 2.8298835885378346, 'SD': 148.96371508723627, 'med': 0.0, 'range': (0.0, 260934.687), 'conf.': (2.755139050978287, 2.904628126097382), 'nan_count': 0}
- bssid: {'n': 15258102, 'cardinality': 310874, 'value_count': '6b186e57-cc96-4f85-ad3c-80734ba597b8:20300, 777ae97a-7584-4414-9862-ee8a4645ae8b:15749, 6dd66e08-a0b7-481e-af4b-9d70e82d49cb:15458, adab9bd0-15e5-409f-929c-57f4f0424c50:15331, bdb0522c-5453-4e8f-8bb9-93b2483cced5:13912, 454820e6-9e3b-4938-9cf6-74a955c47e05:13777, 2fac2da8-d0f8-408d-9608-b98caa359823:13745, 14e48f9a-2b95-4f53-9ee4-90add0183eb0:13287, da75aa19-6ec8-490f-8830-fd62fd315af9:13186, fd5ba903-8eea-4771-af9d-86f84f037d8b:13170, 87b30ad2-34b8-49c7-b150-c057943d8b22:12703, b5d5d05d-6a0a-46

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### ScreenEvent #####
- # Inst.: {'n': 77, 'sum': 207137, 'mean': 2690.090909090909, 'SD': 998.2933091534162, 'med': 2718.0, 'range': (512, 5034), 'conf.': (2463.5061217158936, 2916.6756964659244), 'nan_count': 0}
- Samp. period: {'n': 207060, 'sum': 43288647.885000005, 'mean': 209.06330476673432, 'SD': 1404.1554471892762, 'med': 13.744499999999999, 'range': (0.003, 261385.771), 'conf.': (203.0152221646858, 215.11138736878283), 'nan_count': 0}
- type: {'n': 207137, 'cardinality': 3, 'value_count': 'ON:77317, OFF:77233, UNLOCK:52587'}
##### RingerModeEvent #####
- # Inst.: {'n': 76, 'sum': 1242, 'mean': 16.342105263157894, 'SD': 15.946203419898165, 'med': 11.5, 'range': (1, 87), 'conf.': (12.698241688042252, 19.985968838273536), 'nan_count': 0}
- Samp. period: {'n': 1166, 'sum': 30162187.284, 'mean': 25868.085149228133, 'SD': 46793.844568577675, 'med': 5975.3595000000005, 'range': (0.013, 434379.337), 'conf.': (23179.406018772534, 28556.76427968373), 'nan_count': 0}
- type: {'n': 1242

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()
  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### ChargeEvent #####
- # Inst.: {'n': 77, 'sum': 11570, 'mean': 150.25974025974025, 'SD': 272.06749368673997, 'med': 72.0, 'range': (4, 2112), 'conf.': (88.5079939108227, 212.0114866086578), 'nan_count': 0}
- Samp. period: {'n': 11493, 'sum': 41418375.384, 'mean': 3603.791471678413, 'SD': 9696.835793352431, 'med': 33.34, 'range': (0.003, 287550.279), 'conf.': (3426.4921099203157, 3781.0908334365104), 'nan_count': 0}
- type: {'n': 11570, 'cardinality': 2, 'value_count': 'DISCONNECTED:5818, CONNECTED:5752'}
##### PowerSaveEvent #####
- # Inst.: {'n': 16, 'sum': 370, 'mean': 23.125, 'SD': 62.0127675026146, 'med': 4.0, 'range': (1, 254), 'conf.': (-9.919271278080963, 56.16927127808096), 'nan_count': 0}
- Samp. period: {'n': 354, 'sum': 3364281.088, 'mean': 9503.618892655368, 'SD': 27354.303418008076, 'med': 283.078, 'range': (0.008, 347353.123), 'conf.': (6644.29275453458, 12362.945030776154), 'nan_count': 0}
- type: {'n': 370, 'cardinality': 2, 'value_count': 'ACTIVATE:211, DEACTIVATE:

  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()
  (dat.loc[(p,), :].index.array - dat.loc[(p,), :].index.array.shift(1)).dropna().total_seconds()


##### OnOffEvent #####
- # Inst.: {'n': 47, 'sum': 184, 'mean': 3.9148936170212765, 'SD': 2.857711057548792, 'med': 3.0, 'range': (1, 16), 'conf.': (3.0758382188645337, 4.753949015178019), 'nan_count': 0}
- Samp. period: {'n': 137, 'sum': 7283051.445, 'mean': 53160.95945255474, 'SD': 87596.26514794471, 'med': 5687.382, 'range': (84.24, 429360.778), 'conf.': (38361.18334128552, 67960.73556382397), 'nan_count': 0}
- type: {'n': 184, 'cardinality': 2, 'value_count': 'ON:95, OFF:89'}


In [15]:
LABELS_VALID = LABELS.loc[
    lambda x: ~x['scheduledTime'].isna(), :
]
print(f'# Non-voluntary response: {len(LABELS_VALID)}')
print(summary(LABELS_VALID.groupby('pcode').count().iloc[:, -1]))

excl_pcode = LABELS_VALID.loc[
    lambda x: ~x['scheduledTime'].isna()
].groupby('pcode').count().iloc[:, -1].loc[lambda y: y < 35]

LABELS_VALID = LABELS_VALID.loc[
    lambda x:  ~x.index.get_level_values('pcode').isin(excl_pcode.index), :
]
print(f'# Response from participants with enough responses: {len(LABELS_VALID)}')
print(summary(LABELS_VALID.groupby('pcode').count().iloc[:, -1]))

print('# Participants whose responses to ESM delivery were less then 35')
print(excl_pcode, f'#participants = {len(excl_pcode)} / #response = {sum(excl_pcode)}')

# Non-voluntary response: 3323
{'n': 76, 'sum': 3323, 'mean': 43.723684210526315, 'SD': 19.36291898394835, 'med': 43.5, 'range': (3, 83), 'conf.': (39.29906768359284, 48.14830073745979), 'nan_count': 0}
# Response from participants with enough responses: 2619
{'n': 47, 'sum': 2619, 'mean': 55.723404255319146, 'SD': 13.076201628480542, 'med': 52.0, 'range': (36, 83), 'conf.': (51.88408763344431, 59.56272087719398), 'nan_count': 0}
# Participants whose responses to ESM delivery were less then 35
pcode
P04    34
P07    24
P11    22
P14    11
P16    30
P17    13
P18    32
P20    31
P22    23
P24    10
P25    30
P29    32
P34    22
P36    29
P37    31
P38    33
P41    31
P43    24
P44    23
P46     4
P54    13
P56    31
P58    29
P62     3
P63    34
P64    30
P68    11
P73    31
P74    33
Name: change, dtype: int64 #participants = 29 / #response = 704


In [16]:
import pandas as pd
import numpy as np


LABELS_PROC = LABELS_VALID.reset_index().assign(
    timestamp=lambda x: pd.to_datetime(x['scheduledTime'], unit='ms', utc=True).dt.tz_convert(DEFAULT_TZ),
    valence_bin = lambda x: np.where(x['valence'] > 0, 1, 0),
    arousal_bin = lambda x: np.where(x['arousal'] > 0, 1, 0),
    stress_bin = lambda x: np.where(x['stress'] > 0, 1, 0),
    disturbance_bin = lambda x: np.where(x['disturbance'] > 0, 1, 0)
).set_index(
    ['pcode', 'timestamp']
)

LABELS_PROC.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,responseTime,scheduledTime,valence,arousal,attention,stress,duration,disturbance,change,valence_bin,arousal_bin,stress_bin,disturbance_bin
pcode,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
P01,2019-05-08 10:26:00+09:00,1557278986000,1557279000000.0,-3,3,3,3,5.0,-1,-3,0,1,1,0
P01,2019-05-08 11:13:00+09:00,1557281772000,1557282000000.0,-3,-2,2,2,15.0,3,-2,0,0,1,1
P01,2019-05-08 15:56:00+09:00,1557298702000,1557299000000.0,3,3,3,-3,20.0,2,0,1,1,0,1
P01,2019-05-08 16:41:00+09:00,1557301311000,1557301000000.0,3,3,3,-3,30.0,1,2,1,1,0,1
P01,2019-05-08 17:23:00+09:00,1557304062000,1557304000000.0,3,3,3,-3,20.0,2,2,1,1,0,1


In [17]:
import numpy as np


inst = LABELS_PROC.groupby('pcode').count().iloc[:, -1]
for c in [c for c in LABELS_PROC.columns if c.endswith('_bin')]:
    print(f'- {c}:', summary(LABELS_PROC[c].astype(object)))

- valence_bin: {'n': 2619, 'cardinality': 2, 'value_count': '1:1556, 0:1063'}
- arousal_bin: {'n': 2619, 'cardinality': 2, 'value_count': '0:1586, 1:1033'}
- stress_bin: {'n': 2619, 'cardinality': 2, 'value_count': '0:1702, 1:917'}
- disturbance_bin: {'n': 2619, 'cardinality': 2, 'value_count': '0:1509, 1:1110'}


In [18]:
import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
from typing import Dict, Union
import pygeohash as geo
from datetime import timedelta
from collections import defaultdict


# AppUsageEvent.csv
def _proc_app_usage(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.loc[
        lambda x: x['type'].isin(['MOVE_TO_FOREGROUND', 'MOVE_TO_BACKGROUND']), :
    ].assign(
        packageName=lambda x: np.where(x['type'] == 'MOVE_TO_FOREGROUND', x['packageName'], None),
        category=lambda x: np.where(x['type'] == 'MOVE_TO_FOREGROUND', x['category'], None),
    )

    return {
        'PAC': data['packageName'].astype('object'),
        'CAT': data['category'].astype('object')
    }


# Connectivity.csv
def _proc_connectivity(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.assign(
        type=lambda x: np.where(x['isConnected'] == True, x['type'], 'DISCONNECTED')
    )

    return data['type'].astype('object')


# BatteryEvent.csv
def _proc_battery(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return {
        'LEV': data['level'].astype('float32'),
        'STA': data['status'].astype('object'),
        'TMP': data['temperature'].astype('float32')
    }
        

# CallEvent.csv
def _proc_call(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.loc[
        lambda x: x['duration'] > 0, :
    ]
    new_data = []

    for row in data.itertuples():
        pcode, timestamp = row.Index
        
        new_data.append({
            'pcode': pcode,
            'timestamp': timestamp,                
            'state': 'CALL',
        })
        new_data.append({
            'pcode': pcode,
            'timestamp': timestamp + timedelta(milliseconds=row.duration),
            'state': 'IDLE'
        })

    new_data = pd.DataFrame(new_data).set_index(
        ['pcode', 'timestamp']
    )

    return new_data['state'].astype('object')


# DataTraffic.csv
def _proc_data_traffic(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return {
        'RCV': data['rxKiloBytes'].astype('float32'),
        'SNT': data['txKiloBytes'].astype('float32')
    }


# RingerModeEvent.csv
def _proc_ringer_mode(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['type'].astype('object')


# ScreenEvent.csv
def _proc_screen(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['type'].astype('object')


# OnOffEvent.csv
def _proc_on_off(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['type'].astype('object')


# PowerSaveEvent.csv
def _proc_power_save(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['type'].astype('object')


# ChargeEvent.csv
def _proc_charge(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['type'].astype('object')


# Location.csv
def _proc_location(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    def _haversine(_lat1, _lat2, _lng1, _lng2) -> float:
        if np.isnan(_lat1) or np.isnan(_lat2) or np.isnan(_lng1) or np.isnan(_lng2):
            return 0.0

        _lat1_r, _lat2_r, _lng1_r, _lng2_r = np.radians(_lat1), np.radians(_lat2), np.radians(_lng1), np.radians(_lng2)
        _lat = _lat2_r - _lat1_r
        _lng = _lng2_r - _lng1_r
        _R = 6371008.8
        _d = np.sin(_lat * 0.5) ** 2 + np.cos(_lat1_r) * np.cos(_lat2_r) * np.sin(_lng * 0.5) ** 2
        return 2 * _R * np.arcsin(np.sqrt(_d))

    new_data = []
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            _latitude=lambda x: x['latitude'].shift(1),
            _longitude=lambda x: x['longitude'].shift(1),
            dist=lambda x: x.apply(
                lambda y: _haversine(y['latitude'], y['_latitude'], y['longitude'], y['_longitude']),
                axis=1
            ),
            cluster=lambda x: x.apply(
                lambda y: geo.encode(y['latitude'], y['longitude'], precision=7),
                axis=1
            ),
            pcode=pcode
        ).reset_index()
        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return {
        'CLS': new_data['cluster'].astype('object'),
        'DST': new_data['dist'].astype('float32')
    }


# ActivityEvent.csv
def _proc_activity_event(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return {
        'UNK': data['confidenceUnknown'].astype('float32'),
        'FOT': data['confidenceOnFoot'].astype('float32'),
        'WLK': data['confidenceWalking'].astype('float32'),
        'VHC': data['confidenceInVehicle'].astype('float32'),
        'BCC': data['confidenceOnBicycle'].astype('float32'),
        'RUN': data['confidenceRunning'].astype('float32'),
        'TLT': data['confidenceTilting'].astype('float32')
    }


# ActivityTransition.csv
def _proc_activity_transition(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.loc[
        lambda x: x['transitionType'].isin(['ENTER_WALKING', 'ENTER_STILL', 'ENTER_IN_VEHICLE', 'ENTER_ON_BICYCLE', 'ENTER_RUNNING']), :
    ].assign(
        type=lambda x: x['transitionType'].str.replace('ENTER_', '')
    )
    
    return data['type'].astype('object')


# WiFi.csv
def _proc_wifi(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            bssid=lambda x: x['bssid'].str.cat(x['frequency'].astype(str), sep='-')
        )
        t = sub.index.unique().array
        for cur_t, prev_t in zip(t, t.shift(1)):
            if cur_t is pd.NaT or prev_t is pd.NaT:
                continue

            prev = sub.loc[[prev_t], :]
            cur = sub.loc[[cur_t], :]
            intersect = np.intersect1d(prev['bssid'], cur['bssid'])
            union = np.union1d(prev['bssid'], cur['bssid'])
            w = np.repeat(1 / len(intersect), len(intersect)) if len(intersect) else 1.0
            prev_intersect = prev.loc[
                lambda x: x['bssid'].isin(intersect), :
            ].sort_values('bssid')
            cur_intersect = cur.loc[
                lambda x: x['bssid'].isin(intersect), :
            ].sort_values('bssid')
            prev_rssi = prev_intersect['rssi']
            cur_rssi = cur_intersect['rssi']

            new_data.append(dict(
                pcode=pcode,
                timestamp=cur_t,
                cosine=1 - dist.cosine(prev_rssi, cur_rssi) if len(intersect) > 0 else 0,
                euclidean=1 / (1 + dist.euclidean(prev_rssi, cur_rssi, w)) if len(intersect) > 0 else 0,
                manhattan=1 / (1 + dist.cityblock(prev_rssi, cur_rssi, w)) if len(intersect) > 0 else 0,
                jaccard = len(intersect) / len(union) if len(union) > 0 else 0
            ))
            
    new_data = pd.DataFrame(new_data).set_index(
        ['pcode', 'timestamp']
    )
    
    return {
        'COS': new_data['cosine'].astype('float32'),
        'EUC': new_data['euclidean'].astype('float32'),
        'MAN': new_data['manhattan'].astype('float32'),
        'JAC': new_data['jaccard'].astype('float32')
    }


# InstalledApp.csv
def _proc_installed_app(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(axis=0, level='timestamp')
        t = sub.index.unique().array
        for cur_t, prev_t in zip(t, t.shift(1)):
            if cur_t is pd.NaT or prev_t is pd.NaT:
                continue

            prev = sub.loc[[prev_t], :]
            cur = sub.loc[[cur_t], :]
            intersect = np.intersect1d(prev['packageName'], cur['packageName'])
            union = np.union1d(prev['packageName'], cur['packageName'])
            new_data.append(dict(
                pcode=pcode,
                timestamp=cur_t,
                jaccard = len(intersect) / len(union) if len(union) > 0 else 0
            ))
            
    new_data = pd.DataFrame(new_data).set_index(
        ['pcode', 'timestamp']
    )
    
    return {
       'JAC': new_data['jaccard'].astype('float32')
    }


# MediaEvent.csv
def _proc_media_event(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = defaultdict(list)
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        )

        video = sub.loc[
            lambda x: x['mimetype'].str.startswith('video'), :
        ].assign(
            event=1,
            pcode=pcode
        ).reset_index()

        image = sub.loc[
            lambda x: x['mimetype'].str.startswith('image'), :
        ].assign(
            event=1,
            pcode=pcode
        ).reset_index()

        media = sub.assign(
            event=1,
            pcode=pcode
        ).reset_index()

        new_data['VID'].append(video)
        new_data['IMG'].append(image)
        new_data['ALL'].append(media)

    return {
        k: pd.concat(
            v, axis=0, ignore_index=True
        ).set_index(
            ['pcode', 'timestamp']
        )['event'].astype('float32') 
        for k, v in new_data.items()
    }


# MessageEvent.csv
def _proc_message_event(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = defaultdict(list)
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        )

        sent = sub.loc[
            lambda x: x['messageBox'] == 'SENT', :
        ].assign(
            event=1,
            pcode=pcode
        ).reset_index()

        recv = sub.loc[
            lambda x: x['messageBox'] == 'INBOX', :
        ].assign(
            event=1,
            pcode=pcode
        ).reset_index()

        msg = sub.assign(
            event=1,
            pcode=pcode
        ).reset_index()

        new_data['SNT'].append(sent)
        new_data['RCV'].append(recv)
        new_data['ALL'].append(msg)

    return {
        k: pd.concat(
            v, axis=0, ignore_index=True
        ).set_index(
            ['pcode', 'timestamp']
        )['event'].astype('float32') 
        for k, v in new_data.items()
    }


# Acceleration.csv
def _proc_acceleration(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    data = data.assign(
        mag=lambda x: np.sqrt(np.square(x['x']) + np.square(x['y']) + np.square(x['z']))
    )

    return {
        'AXX': data['x'].astype('float32'),
        'AXY': data['y'].astype('float32'),
        'AXZ': data['z'].astype('float32'),
        'MAG': data['mag'].astype('float32')
    }

# UltraViolet.csv
def _proc_ultra_violet(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []
    
    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            exposure=lambda x: (x['totalExposure'] - x['totalExposure'].shift(1)),
            pcode=pcode
        ).reset_index()

        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return {
        'INT': new_data['intensity'].astype('object'),
        'EXP': new_data['exposure'].dropna().astype('float32')
    }


# SkinTemperature.csv
def _proc_skin_temperature(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['temperature'].astype('float32')


# RRI.csv
def _proc_rri(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['interval'].astype('float32')


# AmbientLight.csv
def _proc_ambient_light(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['brightness'].astype('float32')
    

# StepCount.csv
def _proc_step_count(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            steps=lambda x: (x['totalSteps'] - x['totalSteps'].shift(1)),
            pcode=pcode
        ).reset_index()
        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return new_data['steps'].dropna().astype('float32')
    

# HR.csv
def _proc_hr(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['bpm'].astype('float32')
    

# EDA.csv
def _proc_eda(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    return data['resistance'].astype('float32')


# Distance.csv
def _proc_distance(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            distance=lambda x: x['totalDistance'] - x['totalDistance'].shift(1),
            pcode=pcode
        ).reset_index()

        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return {
        'DST': new_data['distance'].dropna().astype('float32'),
        'MOT': new_data['motionType'].astype('object'),
        'PAC': new_data['pace'].astype('float32'),
        'SPD': new_data['speed'].astype('float32')
    }


# Calorie.csv
def _proc_calories(data: pd.DataFrame) -> Union[pd.Series, Dict[str, pd.Series]]:
    new_data = []

    for pcode in data.index.get_level_values('pcode').unique():
        sub = data.loc[(pcode, ), :].sort_index(
            axis=0, level='timestamp'
        ).assign(
            calories=lambda x: x['totalCalories'] - x['totalCalories'].shift(1),
            pcode=pcode
        ).reset_index()

        new_data.append(sub)

    new_data = pd.concat(new_data, axis=0, ignore_index=True).set_index(
        ['pcode', 'timestamp']
    )

    return new_data['calories'].dropna().astype('float32')      

In [22]:
import pandas as pd
import gc
from functools import reduce

FUNC_PROC = {
    'Acceleration': _proc_acceleration,
    'AmbientLight': _proc_ambient_light,
    'Calorie': _proc_calories,
    'Distance': _proc_distance,
    'EDA': _proc_eda,
    'HR': _proc_hr,
    'RRI': _proc_rri,
    'SkinTemperature': _proc_skin_temperature,
    'StepCount': _proc_step_count,
    'UltraViolet': _proc_ultra_violet,
    'ActivityEvent': _proc_activity_event,
    'ActivityTransition': _proc_activity_transition,
    'AppUsageEvent': _proc_app_usage,
    'BatteryEvent': _proc_battery,
    'CallEvent': _proc_call,
    'Connectivity': _proc_connectivity,
    'DataTraffic': _proc_data_traffic,
    'InstalledApp': _proc_installed_app,
    'Location': _proc_location,
    'MediaEvent': _proc_media_event,
    'MessageEvent': _proc_message_event,
    'WiFi': _proc_wifi,
    'ScreenEvent': _proc_screen,
    'RingerModeEvent': _proc_ringer_mode,
    'ChargeEvent': _proc_charge,
    'PowerSaveEvent': _proc_power_save,
    'OnOffEvent': _proc_on_off
}

def _process(data_type: str):
    log(f'Begin to processing data: {data_type}')
    
    abbrev = DATA_TYPES[data_type]
    data_raw = _load_data(data_type)
    data_proc = FUNC_PROC[data_type](data_raw)
    result = dict()
    
    if type(data_proc) is dict:
        for k, v in data_proc.items():
            result[f'{abbrev}_{k}'] = v
    else:
        result[abbrev] = data_proc
        
    log(f'Complete processing data: {data_type}')
    return result



with on_ray(num_cpus=12):
    jobs = []
    
    func = ray.remote(_process).remote
    
    for data_type in DATA_TYPES:
        job = func(data_type)
        jobs.append(job)
        del job
        gc.collect()

    jobs = ray.get(jobs)
    jobs = reduce(lambda a, b: {**a, **b}, jobs)
    dump(jobs, os.path.join(PATH_INTERMEDIATE, 'original_data' 'proc_original.pkl'))

    del jobs
    gc.collect()

2023-07-12 05:48:47,268	INFO worker.py:1616 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


[2m[36m(_process pid=407591)[0m [23-07-12 05:48:52] Begin to processing data: Acceleration
[2m[36m(_process pid=407590)[0m [23-07-12 05:48:52] Begin to processing data: AmbientLight
[2m[36m(_process pid=407594)[0m [23-07-12 05:48:52] Begin to processing data: Calorie
[2m[36m(_process pid=407585)[0m [23-07-12 05:48:53] Begin to processing data: Distance
[2m[36m(_process pid=407584)[0m [23-07-12 05:48:53] Begin to processing data: EDA
[2m[36m(_process pid=407592)[0m [23-07-12 05:48:55] Begin to processing data: HR
[2m[36m(_process pid=407588)[0m [23-07-12 05:48:55] Begin to processing data: RRI
[2m[36m(_process pid=407586)[0m [23-07-12 05:48:56] Begin to processing data: SkinTemperature
[2m[36m(_process pid=407587)[0m [23-07-12 05:48:57] Begin to processing data: StepCount
[2m[36m(_process pid=407589)[0m [23-07-12 05:48:58] Begin to processing data: UltraViolet
[2m[36m(_process pid=407583)[0m [23-07-12 05:49:00] Begin to processing data: ActivityEvent
[2



[2m[36m(_process pid=407589)[0m [23-07-12 05:49:18] Complete processing data: CallEvent
[2m[36m(_process pid=407589)[0m [23-07-12 05:49:18] Begin to processing data: Connectivity
[2m[36m(_process pid=407589)[0m [23-07-12 05:49:21] Complete processing data: Connectivity
[2m[36m(_process pid=407589)[0m [23-07-12 05:49:21] Begin to processing data: DataTraffic
[2m[36m(_process pid=407583)[0m [23-07-12 05:49:22] Complete processing data: ActivityEvent
[2m[36m(_process pid=407583)[0m [23-07-12 05:49:23] Begin to processing data: InstalledApp
[2m[36m(_process pid=407593)[0m [23-07-12 05:49:33] Complete processing data: BatteryEvent
[2m[36m(_process pid=407589)[0m [23-07-12 05:49:34] Complete processing data: DataTraffic
[2m[36m(_process pid=407589)[0m [23-07-12 05:49:35] Begin to processing data: Location
[2m[36m(_process pid=407593)[0m [23-07-12 05:49:35] Begin to processing data: MediaEvent






[2m[36m(_process pid=407593)[0m [23-07-12 05:49:43] Complete processing data: MediaEvent
[2m[36m(_process pid=407593)[0m [23-07-12 05:49:44] Begin to processing data: MessageEvent




[2m[36m(_process pid=407593)[0m [23-07-12 05:49:55] Complete processing data: MessageEvent
[2m[36m(_process pid=407593)[0m [23-07-12 05:49:55] Begin to processing data: WiFi
[2m[36m(_process pid=407586)[0m [23-07-12 05:49:59] Complete processing data: AppUsageEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:01] Begin to processing data: ScreenEvent




[2m[36m(_process pid=407586)[0m [23-07-12 05:50:06] Complete processing data: ScreenEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:06] Begin to processing data: RingerModeEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:08] Complete processing data: RingerModeEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:09] Begin to processing data: ChargeEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:12] Complete processing data: ChargeEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:12] Begin to processing data: PowerSaveEvent




[2m[36m(_process pid=407586)[0m [23-07-12 05:50:15] Complete processing data: PowerSaveEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:15] Begin to processing data: OnOffEvent
[2m[36m(_process pid=407586)[0m [23-07-12 05:50:19] Complete processing data: OnOffEvent


[2m[33m(raylet)[0m [2023-07-12 05:52:47,179 E 407416 407416] (raylet) node_manager.cc:3071: 2 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=408984)[0m [23-07-12 05:52:58] Begin to processing data: WiFi
[2m[36m(_process pid=407583)[0m [23-07-12 05:52:58] Complete processing data: InstalledApp
[2m[36m(_process pid=408983)[0m [23-07-12 05:53:10] Begin to processing data: Location
[2m[36m(_process pid=407583)[0m [23-07-12 05:53:10] Begin to processing data: Location


[2m[33m(raylet)[0m [2023-07-12 05:53:47,180 E 407416 407416] (raylet) node_manager.cc:3071: 5 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=409194)[0m [23-07-12 05:53:48] Begin to processing data: Location
[2m[36m(_process pid=409193)[0m [23-07-12 05:53:48] Begin to processing data: WiFi
[2m[36m(_process pid=409195)[0m [23-07-12 05:53:49] Begin to processing data: StepCount
[2m[36m(_process pid=407592)[0m [23-07-12 05:53:53] Complete processing data: HR
[2m[36m(_process pid=409192)[0m [23-07-12 05:54:00] Begin to processing data: StepCount




[2m[36m(_process pid=407592)[0m [23-07-12 05:54:04] Begin to processing data: WiFi


[2m[33m(raylet)[0m [2023-07-12 05:54:47,182 E 407416 407416] (raylet) node_manager.cc:3071: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=407588)[0m [23-07-12 05:54:48] Complete processing data: RRI




[2m[36m(_process pid=407588)[0m [23-07-12 05:54:56] Begin to processing data: WiFi




[2m[36m(_process pid=409759)[0m [23-07-12 05:56:38] Begin to processing data: StepCount
[2m[36m(_process pid=409758)[0m [23-07-12 05:56:40] Begin to processing data: Location




[2m[36m(_process pid=409863)[0m [23-07-12 05:56:44] Begin to processing data: EDA


[2m[33m(raylet)[0m [2023-07-12 05:56:47,185 E 407416 407416] (raylet) node_manager.cc:3071: 4 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=409862)[0m [23-07-12 05:56:49] Begin to processing data: WiFi




[2m[36m(_process pid=407590)[0m [23-07-12 05:56:58] Complete processing data: AmbientLight


[2m[33m(raylet)[0m [2023-07-12 05:57:47,186 E 407416 407416] (raylet) node_manager.cc:3071: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=407594)[0m [23-07-12 05:57:49] Complete processing data: Calorie




[2m[36m(_process pid=407585)[0m [23-07-12 05:58:03] Complete processing data: Distance




[2m[36m(_process pid=407585)[0m [23-07-12 05:58:28] Begin to processing data: EDA




[2m[36m(_process pid=407594)[0m [23-07-12 05:58:43] Begin to processing data: WiFi
[2m[36m(_process pid=407590)[0m [23-07-12 05:58:44] Begin to processing data: EDA
[2m[36m(_process pid=409758)[0m [23-07-12 05:58:46] Complete processing data: Location


[2m[33m(raylet)[0m [2023-07-12 05:58:47,187 E 407416 407416] (raylet) node_manager.cc:3071: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=409758)[0m [23-07-12 05:58:54] Begin to processing data: EDA
[2m[36m(_process pid=409759)[0m [23-07-12 05:59:06] Complete processing data: StepCount
[2m[36m(_process pid=409759)[0m [23-07-12 05:59:21] Begin to processing data: EDA


[2m[33m(raylet)[0m [2023-07-12 05:59:47,188 E 407416 407416] (raylet) node_manager.cc:3071: 3 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac, IP: 172.17.0.3) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 172.17.0.3`
[2m[33m(raylet)[0m 
[2m[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


[2m[36m(_process pid=410524)[0m [23-07-12 06:00:06] Begin to processing data: EDA
[2m[36m(_process pid=410523)[0m [23-07-12 06:00:07] Begin to processing data: WiFi


OutOfMemoryError: Task was killed due to the node running low on memory.
Memory on the node (IP: 172.17.0.3, ID: c8a3a5d049b60fd58830b4be6eef90aaeff8923efb89e01708f23cac) where the task (task ID: a67da5a546caba6c758df50841b74c98145e3b8c01000000, name=_process, pid=407591, memory used=18.91GB) was running was 29.91GB / 31.28GB (0.956201), which exceeds the memory usage threshold of 0.95. Ray killed this worker (ID: 3dac2190a9425b9bd54dbab124c23460d4166a08dadba30842d2d8e7) because it was the most recently scheduled task; to see more information about memory usage on this node, use `ray logs raylet.out -ip 172.17.0.3`. To see the logs of the worker, use `ray logs worker-3dac2190a9425b9bd54dbab124c23460d4166a08dadba30842d2d8e7*out -ip 172.17.0.3. Top 10 memory users:
PID	MEM(GB)	COMMAND
407591	18.91	ray::_process
394786	0.43	/home/user/miniconda3/envs/sci-data/bin/python -m ipykernel_launcher -f /home/user/.local/share/jupy...
393639	0.12	/home/user/miniconda3/envs/sci-data/bin/python -m ipykernel_launcher -f /home/user/.local/share/jupy...
393221	0.07	/home/user/.vscode-server/bin/660393deaaa6d1996740ff4880f1bad43768c814/node /home/user/.vscode-serve...
407175	0.07	/home/user/miniconda3/envs/sci-data/lib/python3.9/site-packages/ray/core/src/ray/gcs/gcs_server --lo...
407566	0.05	/home/user/miniconda3/envs/sci-data/bin/python -u /home/user/miniconda3/envs/sci-data/lib/python3.9/...
407284	0.05	/home/user/miniconda3/envs/sci-data/bin/python /home/user/miniconda3/envs/sci-data/lib/python3.9/sit...
407243	0.04	/home/user/miniconda3/envs/sci-data/bin/python -u /home/user/miniconda3/envs/sci-data/lib/python3.9/...
407459	0.04	/home/user/miniconda3/envs/sci-data/bin/python -u /home/user/miniconda3/envs/sci-data/lib/python3.9/...
392700	0.03	/home/user/.vscode-server/bin/660393deaaa6d1996740ff4880f1bad43768c814/node /home/user/.vscode-serve...
Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.

In [None]:
import os
import gc


DATA = load(os.path.join(PATH_INTERMEDIATE, 'proc_original.pkl'))
N_NUMERIC, N_CATEGORICAL = 0, 0

for k, v in DATA.items():
    if v.dtype.kind.isupper() or v.dtype.kind == 'b': 
        N_CATEGORICAL = N_CATEGORICAL + 1
    else:
        N_NUMERIC = N_NUMERIC + 1
        
    inst = v.groupby('pcode').count()
    sam = np.concatenate([
        (v.loc[(p,)].index.array - v.loc[(p,)].index.array.shift(1)).dropna().total_seconds()
        for p in v.index.get_level_values('pcode').unique()
    ])
    
    print('#'*5, k, '#'*5, )
    print('- # Inst.:', summary(inst))
    print('- Samp. period:', summary(sam))
    print('- Values', summary(v))
    print('')
    
    
print(f'# categorical data: {N_CATEGORICAL}/# numeric data: {N_NUMERIC}')
del DATA
gc.collect()