In [1]:
import pytz
import os


# Fixed UTC+09:00 offset (540 minutes) used as the default timezone.
DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

# Locations of the raw dataset files, all relative to the working directory.
PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

# Directory holding intermediate artefacts read by later cells.
PATH_INTERMEDIATE = os.path.join('data/intermediate')
# Default output directory for the exported *.data files; later cells reassign it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH_SAVE = '/home/user/Collab/MT/logo/3_clusters/data_MT'

# Maps a data-type name to its three-letter abbreviation.
# NOTE(review): the abbreviations look like the prefixes of the feature columns
# (e.g. 'ONF#...'), but that link is not established in this notebook — confirm.
DATA_TYPES = {
    'Acceleration': 'ACC',
    'AmbientLight': 'AML',
    'Calorie': 'CAL',
    'Distance': 'DST',
    'EDA': 'EDA',
    'HR': 'HRT',
    'RRI': 'RRI',
    'SkinTemperature': 'SKT',
    'StepCount': 'STP',
    'UltraViolet': 'ULV',
    'ActivityEvent': 'ACE',
    'ActivityTransition': 'ACT',
    'AppUsageEvent': 'APP',
    'BatteryEvent': 'BAT',
    'CallEvent': 'CAE',
    'Connectivity': 'CON',
    'DataTraffic': 'DAT',
    'InstalledApp': 'INS',
    'Location': 'LOC',
    'MediaEvent': 'MED',
    'MessageEvent': 'MSG',
    'WiFi': 'WIF',
    'ScreenEvent': 'SCR',
    'RingerModeEvent': 'RNG',
    'ChargeEvent': 'CHG',
    'PowerSaveEvent': 'PWS',
    'OnOffEvent': 'ONF'
}

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    """Read the file at `path` in binary mode and return the object cloudpickle decodes from it."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        restored = cloudpickle.load(handle)
    return restored

    
def dump(obj, path: str):
    """Serialise `obj` with cloudpickle into the file at `path`, overwriting any existing file."""
    handle = open(path, 'wb')
    with handle:
        cloudpickle.dump(obj, handle)
        
    
def log(msg: object):
    """Print `msg` prefixed with the current local time.

    The line has the form ``[yy-mm-dd HH:MM:SS] <msg>``; `msg` may be any object
    and is rendered with ``str.format``.
    """
    stamp = datetime.now().strftime('%y-%m-%d %H:%M:%S')
    print('[{}] {}'.format(stamp, msg))


def summary(x):
    """Return descriptive statistics for a sequence of values as a dict.

    Parameters:
        x: array-like; converted with ``np.asarray``.

    Returns:
        For non-numeric data (uppercase ``dtype.kind``) and booleans:
            ``{'n', 'cardinality', 'value_count'}`` where ``value_count`` lists
            the 20 most frequent values as ``value:count`` and ends with
            ``', ...'`` whenever more distinct values exist than are listed.
        For numeric data:
            ``{'n', 'sum', 'mean', 'SD', 'med', 'range', 'conf.', 'nan_count'}``.
            All statistics are computed on the non-NaN values only; ``SD`` is the
            sample standard deviation (``ddof=1``) and ``conf.`` is the 95 %
            t-interval around the mean.

    Raises:
        ValueError: if numeric input contains no non-NaN value (min/max of an
            empty array).
    """
    max_listed = 20  # number of value:count pairs shown for categorical data

    x = np.asarray(x)
    with warnings.catch_warnings():
        # Degenerate inputs (e.g. a single value) trigger RuntimeWarnings that
        # only clutter the notebook output.
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b':
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            cnt = cnt.iloc[:max_listed]
            cnt_str = ', '.join([f'{u}:{c}' for u, c in zip(cnt.index, cnt)])
            # Mark the listing as truncated as soon as anything was cut off.
            if card > max_listed:
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }
        else:
            nan_mask = np.isnan(x)
            x_norm = x[~nan_mask]

            tot = np.sum(x_norm)
            m = np.mean(x_norm)
            me = np.median(x_norm)
            s = np.std(x_norm, ddof=1)
            # Both bounds come from the NaN-free values; using `x` here would
            # turn the upper bound into NaN whenever the input contains a NaN.
            l, u = np.min(x_norm), np.max(x_norm)
            conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
            n_nan = int(np.count_nonzero(nan_mask))

            return {
                'n': n,
                'sum': tot,
                'mean': m,
                'SD': s,
                'med': me,
                'range': (l, u),
                'conf.': (conf_l, conf_u),
                'nan_count': n_nan
            }


@contextmanager
def on_ray(*args, **kwargs):
    """Context manager that runs its body inside a freshly initialised ray session.

    An already-initialised ray is shut down first, then ``ray.init`` is called with
    the given arguments. ``ray.shutdown`` is always called on exit, including when
    the body (or the initialisation itself) raises.
    """
    try:
        already_running = ray.is_initialized()
        if already_running:
            ray.shutdown()
        ray.init(*args, **kwargs)
        yield
    finally:
        ray.shutdown()

# Maps an app category name to one of the coarse groups
# ENTER / INFO / SYSTEM / SOCIAL / WORK / HEALTH / UNKNOWN.
# Every key appears exactly once.
transform = {
    # --- ENTER ---
    'GAME': 'ENTER',
    'GAME_TRIVIA': 'ENTER',
    'GAME_CASINO': 'ENTER',
    # NOTE(review): hyphenated spelling kept next to 'GAME_ACTION'; presumably it
    # occurs in the raw data — confirm before removing.
    'GAME-ACTION': 'ENTER',
    'GAME_SPORTS': 'ENTER',
    'GAME_PUZZLE': 'ENTER',
    'GAME_SIMULATION': 'ENTER',
    'GAME_STRATEGY': 'ENTER',
    'GAME_ROLE_PLAYING': 'ENTER',
    'GAME_ACTION': 'ENTER',
    'GAME_ARCADE': 'ENTER',
    'GAME_RACING': 'ENTER',
    'GAME_CASUAL': 'ENTER',
    'GAME_MUSIC': 'ENTER',
    'GAME_CARD': 'ENTER',
    'GAME_ADVENTURE': 'ENTER',
    'GAME_BOARD': 'ENTER',
    'GAME_EDUCATIONAL': 'ENTER',
    'PHOTOGRAPHY': 'ENTER',
    'ENTERTAINMENT': 'ENTER',
    'SPORTS': 'ENTER',
    'MUSIC_AND_AUDIO': 'ENTER',
    'COMICS': 'ENTER',
    'VIDEO_PLAYERS_AND_EDITORS': 'ENTER',
    'VIDEO_PLAYERS': 'ENTER',
    'ART_AND_DESIGN': 'ENTER',
    # --- INFO ---
    'TRAVEL_AND_LOCAL': 'INFO',
    'FOOD_AND_DRINK': 'INFO',
    'NEWS_AND_MAGAZINES': 'INFO',
    'MAPS_AND_NAVIGATION': 'INFO',
    'WEATHER': 'INFO',
    'HOUSE_AND_HOME': 'INFO',
    'BOOKS_AND_REFERENCE': 'INFO',
    'SHOPPING': 'INFO',
    'LIBRARIES_AND_DEMO': 'INFO',
    'BEAUTY': 'INFO',
    'AUTO_AND_VEHICLES': 'INFO',
    'LIFESTYLE': 'INFO',
    # --- SYSTEM ---
    'PERSONALIZATION': 'SYSTEM',
    'TOOLS': 'SYSTEM',
    # --- SOCIAL ---
    'COMMUNICATION': 'SOCIAL',
    'SOCIAL': 'SOCIAL',
    'DATING': 'SOCIAL',
    'PARENTING': 'SOCIAL',
    # --- WORK ---
    'FINANCE': 'WORK',
    'BUSINESS': 'WORK',
    'PRODUCTIVITY': 'WORK',
    'EDUCATION': 'WORK',
    # --- HEALTH ---
    'HEALTH_AND_FITNESS': 'HEALTH',
    'MEDICAL': 'HEALTH',
    # --- SYSTEM (continued) ---
    'SYSTEM': 'SYSTEM',
    'MISC': 'SYSTEM',  # ABC logger
    # --- UNKNOWN ---
    None: 'UNKNOWN',
    'UNKNOWN': 'UNKNOWN'
}

In [3]:
# Load the preprocessed dataset tuple from the intermediate directory.
# NOTE(review): `load` unpickles the file, which can execute arbitrary code —
# only point this at files you trust.
p = os.path.join(PATH_INTERMEDIATE, 'stress-fixed.pkl')
# Later cells use X as the feature frame, y as the labels and groups as the
# per-row participant code; `t` and `datetimes` are not used in this notebook.
X, y, groups, t, datetimes = load(p)

In [4]:
# Participant table, indexed by participant code.
PARTICIPANTS = pd.read_csv(
    os.path.join(PATH_INTERMEDIATE, 'PARTICIPANT_INFO.csv'),
    index_col='pcode',
)

# Keep only the five personality scores, under short BFI_* column names.
PINFO = pd.DataFrame({
    'BFI_OPN': PARTICIPANTS['openness'],
    'BFI_CON': PARTICIPANTS['conscientiousness'],
    'BFI_NEU': PARTICIPANTS['neuroticism'],
    'BFI_EXT': PARTICIPANTS['extraversion'],
    'BFI_AGR': PARTICIPANTS['agreeableness'],
})

# One-hot encode any non-numeric column as '<column>=<value>' boolean columns.
PINFO = pd.get_dummies(PINFO, prefix_sep='=', dtype=bool)

In [5]:
# Display the BFI_* score table (one row per participant code).
PINFO

Unnamed: 0_level_0,BFI_OPN,BFI_CON,BFI_NEU,BFI_EXT,BFI_AGR
pcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
P01,11,11,3,4,13
P02,14,5,12,14,5
P03,10,15,8,7,11
P04,12,11,8,6,11
P05,10,11,13,10,6
...,...,...,...,...,...
P76,8,8,12,6,8
P77,11,12,7,11,10
P78,12,11,9,12,10
P79,9,10,7,12,11


In [6]:
# Rows whose five scores repeat an earlier participant's scores exactly
# (`duplicated()` flags every occurrence after the first).
duplicate_rows = PINFO[PINFO.duplicated()]

# Show the repeated rows first, then the full table for reference.
print(duplicate_rows)
print(PINFO)

       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P80         13        7        5        4       12
       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P01         11       11        3        4       13
P02         14        5       12       14        5
P03         10       15        8        7       11
P04         12       11        8        6       11
P05         10       11       13       10        6
...        ...      ...      ...      ...      ...
P76          8        8       12        6        8
P77         11       12        7       11       10
P78         12       11        9       12       10
P79          9       10        7       12       11
P80         13        7        5        4       12

[77 rows x 5 columns]


In [7]:
# Processed label rows, indexed by (pcode, timestamp); `parse_dates=True` asks
# pandas to parse the index as dates.
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)

In [8]:
# Work on a copy in which `timestamp` is a regular column. LABELS_PROC keeps its
# (pcode, timestamp) index, so this cell can be re-run without raising.
_df = LABELS_PROC.reset_index(level='timestamp')

# Overall time span covered by the label rows.
print('First timestamp:', _df['timestamp'].min())
print('Last timestamp:', _df['timestamp'].max())

First timestamp: 2019-04-30 10:03:28+09:00
Last timestamp: 2019-05-22 22:02:03+09:00


In [9]:
# First and last label timestamp per participant.
time_ranges = _df.groupby('pcode')['timestamp'].agg(['min', 'max'])
print(time_ranges)

                            min                       max
pcode                                                    
P01   2019-05-08 10:29:46+09:00 2019-05-14 21:12:31+09:00
P02   2019-05-08 10:52:29+09:00 2019-05-14 21:13:14+09:00
P03   2019-05-08 11:13:13+09:00 2019-05-14 20:23:23+09:00
P05   2019-05-08 10:40:49+09:00 2019-05-14 21:59:16+09:00
P06   2019-05-08 10:32:09+09:00 2019-05-14 21:56:51+09:00
P08   2019-05-08 10:42:48+09:00 2019-05-14 21:12:32+09:00
P09   2019-05-08 13:44:51+09:00 2019-05-14 20:31:01+09:00
P10   2019-05-08 10:40:26+09:00 2019-05-14 15:13:12+09:00
P12   2019-05-09 14:18:30+09:00 2019-05-14 21:09:22+09:00
P13   2019-05-08 10:30:38+09:00 2019-05-14 21:51:35+09:00
P15   2019-05-08 12:09:34+09:00 2019-05-14 19:44:17+09:00
P19   2019-05-08 10:41:14+09:00 2019-05-14 22:01:12+09:00
P21   2019-05-08 15:49:36+09:00 2019-05-14 21:43:22+09:00
P23   2019-05-08 10:20:41+09:00 2019-05-14 18:13:07+09:00
P26   2019-05-08 10:11:53+09:00 2019-05-14 21:59:17+09:00
P28   2019-05-

In [10]:
# Participant codes that have at least one label row.
# Note: despite its name, `list_pid` is a set, so it has no defined order.
list_pid = set(LABELS_PROC.index.get_level_values('pcode').values)
list_pid

{'P01',
 'P02',
 'P03',
 'P05',
 'P06',
 'P08',
 'P09',
 'P10',
 'P12',
 'P13',
 'P15',
 'P19',
 'P21',
 'P23',
 'P26',
 'P28',
 'P30',
 'P31',
 'P32',
 'P33',
 'P35',
 'P39',
 'P40',
 'P42',
 'P45',
 'P47',
 'P48',
 'P49',
 'P50',
 'P51',
 'P52',
 'P53',
 'P55',
 'P57',
 'P60',
 'P61',
 'P66',
 'P67',
 'P69',
 'P70',
 'P72',
 'P75',
 'P76',
 'P77',
 'P78',
 'P79',
 'P80'}

In [11]:
# Restrict the score table to participants that appear in the label data.
PINFO_valid = PINFO.loc[PINFO.index.isin(list_pid)]
# Non-null count per column.
PINFO_valid.count()

BFI_OPN    47
BFI_CON    47
BFI_NEU    47
BFI_EXT    47
BFI_AGR    47
dtype: int64

In [12]:
# Same duplicate check as before, now on the participants kept in PINFO_valid.
# Note: this rebinds `duplicate_rows`.
duplicate_rows = PINFO_valid[PINFO_valid.duplicated()]

print(duplicate_rows)
print(PINFO_valid)

       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P80         13        7        5        4       12
       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P01         11       11        3        4       13
P02         14        5       12       14        5
P03         10       15        8        7       11
P05         10       11       13       10        6
P06          3        6       11        3        6
P08         10        8        9        9       12
P09         12       12        4       11        9
P10          6        7        9        9       11
P12          9       12        7        7       12
P13          5       12        3       12       13
P15          6       12        5        6       11
P19         12       13        3        9       10
P21         12        5        8        8        8
P23         13       12        6        9       14
P26         13        8        

In [13]:
#Divide the features into different categories
feat_current = X.loc[:,[('#VAL' in str(x)) or ('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_dsc = X.loc[:,[('#DSC' in str(x))  for x in X.keys()]]  
feat_yesterday = X.loc[:,[('Yesterday' in str(x))  for x in X.keys()]]  
feat_today = X.loc[:,[('Today' in str(x))  for x in X.keys()]]  
feat_sleep = X.loc[:,[('Sleep' in str(x))  for x in X.keys()]]  
feat_time = X.loc[:,[('Time' in str(x))  for x in X.keys()]]  
feat_pif = X.loc[:,[('PIF' in str(x))  for x in X.keys()]]  
feat_ImmediatePast = X.loc[:,[('ImmediatePast_15' in str(x))  for x in X.keys()]]
#Divide the time window features into sensor/past stress label
feat_current_sensor = X.loc[:,[('#VAL' in str(x))  for x in X.keys()]]  
feat_current_ESM = X.loc[:,[('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_ImmediatePast_sensor = feat_ImmediatePast.loc[:,[('ESM' not in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_ImmediatePast_ESM = feat_ImmediatePast.loc[:,[('ESM'  in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_today_sensor = feat_today.loc[:,[('ESM' not in str(x))  for x in feat_today.keys()]]  
feat_today_ESM = feat_today.loc[:,[('ESM'  in str(x)) for x in feat_today.keys()]]  
feat_yesterday_sensor = feat_yesterday.loc[:,[('ESM' not in str(x)) for x in feat_yesterday.keys()]]  
feat_yesterday_ESM = feat_yesterday.loc[:,[('ESM'  in str(x)) for x in feat_yesterday.keys()]]
#Prepare the final feature set
feat_baseline = pd.concat([ feat_time,feat_dsc,feat_current_sensor, feat_ImmediatePast_sensor],axis=1)
feat_final = pd.concat([feat_baseline ], axis=1)
X = feat_final

In [14]:
# Display the selected feature matrix.
X

Unnamed: 0,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,Time#HRN=DAWN,...,ONF#ASC##ImmediatePast_15,ONF#RLV_SUP#ImmediatePast_15,MED_VID#AVG#ImmediatePast_15,MED_VID#STD#ImmediatePast_15,MED_VID#SKW#ImmediatePast_15,MED_VID#KUR#ImmediatePast_15,MED_VID#ASC#ImmediatePast_15,MED_VID#BEP#ImmediatePast_15,MED_VID#MED#ImmediatePast_15,MED_VID#TSC#ImmediatePast_15
0,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,-0.301735,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,2.184325,1.377054,-1.377054,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,-0.805437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Check for infinity or negative infinity
is_inf = np.isinf(X)

# Count how many are infinity or negative infinity
count = is_inf.sum().sum()

print(f"There are {count} values that are either infinity or negative infinity.")

There are 0 values that are either infinity or negative infinity.


In [16]:
def process_dataframe(X):
    """
    Return 'X' with its columns relabelled 0, 1, ..., n_columns - 1.

    The input frame is left untouched; a new frame with the new column labels
    is returned. Values are not converted in any way (boolean columns stay
    boolean).

    Parameters:
        X (pandas.DataFrame): The input DataFrame.

    Returns:
        pandas.DataFrame: A frame with the same rows and values as 'X' and
        integer column labels.
    """
    # set_axis returns a new object, so the caller's column labels survive.
    return X.set_axis(range(X.shape[1]), axis=1)
def save_data_to_data_file(X, y, filename, out_dir=None):
    """
    Write features and labels as a text file with one line per row:
    ``<label> <col>:<value> <col>:<value> ...``.

    Parameters:
        X (pandas.DataFrame): Feature rows; column labels are written as-is.
        y (pandas.Series): Labels, matched to the rows of 'X' by position.
        filename (str): Name of the file to create inside the output directory.
        out_dir (str, optional): Output directory. Defaults to the module-level
            PATH_SAVE, read at call time. The directory is created if missing.
    """
    target_dir = PATH_SAVE if out_dir is None else out_dir
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)

    file_path = os.path.join(target_dir, filename)

    # Pull each column out once, by position, instead of doing a Series lookup
    # per cell. Per-column arrays keep each column's own dtype, so integers are
    # not written as floats, and positional access is safe with duplicate labels.
    col_labels = list(X.columns)
    col_values = [X.iloc[:, j].to_numpy() for j in range(len(col_labels))]

    with open(file_path, 'w') as f:
        for i in range(len(X)):
            parts = [str(y.iloc[i])]
            parts.extend(
                "{}:{}".format(label, values[i])
                for label, values in zip(col_labels, col_values)
            )
            f.write(" ".join(parts) + '\n')

def split_train_test(df, labels, indices):
    """
    Split features and labels into the rows named by 'indices' (test) and all
    remaining rows (train).

    Parameters:
        df (pandas.DataFrame): Feature rows.
        labels (pandas.Series): Labels sharing the index of 'df'.
        indices: Index labels of the rows to hold out.

    Returns:
        tuple: (train_X, train_y, test_X, test_y)
    """
    held_out_X, held_out_y = df.loc[indices], labels.loc[indices]
    remaining_X, remaining_y = df.drop(indices), labels.drop(indices)
    return remaining_X, remaining_y, held_out_X, held_out_y

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



# Export leave-one-participant-out train/validation files that carry a
# personality-cluster feature, once per cluster count.
#
# For every held-out participant the scaler and the KMeans model are fitted on the
# BFI scores of the remaining participants only; the held-out participant is then
# assigned to a cluster with those fitted objects. The per-row cluster id is
# prepended to a copy of X, so neither X nor PINFO_valid is modified by this cell
# and an interrupted run leaves no half-updated state behind.
groups_df = pd.DataFrame(groups, columns=['pcode'])
labels = pd.Series(y)
pcodes = list(PINFO_valid.index)

# KMeans needs at least one cluster, so start at 1; 46 is exclusive.
for num_clusters in range(1, 46):
    PATH_SAVE = '/home/user/Collab/MT/logo/{}_clusters/data_MT'.format(num_clusters)
    print(PATH_SAVE)

    for turn, held_out in enumerate(pcodes):
        # The held-out participant (kept as a one-row frame) and everyone else.
        selected = PINFO_valid.loc[[held_out]]
        # Rows after the held-out participant first, then the rows before it.
        # KMeans initialisation depends on row order, so keep this order stable.
        others = PINFO_valid.loc[pcodes[turn + 1:] + pcodes[:turn]]

        # Fit scaling and clustering on the remaining participants only.
        scaler = StandardScaler()
        df = scaler.fit_transform(others)
        kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        pred_y = kmeans.fit_predict(df)

        # Use transform, not fit_transform: refitting the scaler on this single row
        # would scale it to all zeros and make the cluster assignment meaningless.
        selected_scaled = scaler.transform(selected)
        selected_cluster = kmeans.predict(selected_scaled)

        # pcode -> cluster id for every participant, including the held-out one.
        cluster_map = pd.concat([
            pd.Series(pred_y, index=others.index, name='cluster'),
            pd.Series(selected_cluster, index=selected.index, name='cluster'),
        ])
        groups_df['cluster'] = groups_df['pcode'].map(cluster_map)

        # Prepend the cluster id as the first feature column of a copy of X.
        X_clustered = X.copy()
        X_clustered.insert(0, 'cluster', groups_df['cluster'])

        # Rows belonging to the held-out participant form the validation split.
        matching_indices = groups_df.index[groups_df['pcode'] == held_out].tolist()
        train_X, train_y, test_X, test_y = split_train_test(X_clustered,
                                                            labels,
                                                            matching_indices)
        test_X = process_dataframe(test_X)
        train_X = process_dataframe(train_X)

        save_data_to_data_file(train_X, train_y, f'{turn}_train.data')
        save_data_to_data_file(test_X, test_y, f'{turn}_val.data')
    # print(PINFO_valid)

Data For ST

In [17]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
PATH_SAVE = '/home/user/Collab/MT/logo/data_ST3'

# Export leave-one-participant-out train/validation files without a cluster
# feature: for each participant in PINFO_valid (in index order), that
# participant's rows go to '<turn>_val.data' and all other rows to
# '<turn>_train.data'.
#
# No clustering is performed here: the cluster labels never entered the exported
# features, so fitting a scaler and KMeans per turn only cost time. X and
# PINFO_valid are not modified by this cell.
groups_df = pd.DataFrame(groups, columns=['pcode'])
labels = pd.Series(y)

for turn, held_out in enumerate(PINFO_valid.index):
    # Rows belonging to the held-out participant form the validation split.
    matching_indices = groups_df.index[groups_df['pcode'] == held_out].tolist()

    train_X, train_y, test_X, test_y = split_train_test(X,
                                                        labels,
                                                        matching_indices)
    test_X = process_dataframe(test_X)
    train_X = process_dataframe(train_X)

    save_data_to_data_file(train_X, train_y, f'{turn}_train.data')
    save_data_to_data_file(test_X, test_y, f'{turn}_val.data')