In [1]:
import pytz
import os


# Fixed UTC offset of 540 minutes, used as the default timezone.
DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

# Locations of the raw dataset files, all relative to PATH_DATA.
PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

# Folder holding the intermediate artefacts (pickles / CSVs) read by later cells.
PATH_INTERMEDIATE = os.path.join('data/intermediate')

# Seed handed to the estimators, splitters and samplers used further down.
RANDOM_STATE = 42 

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    """Read the file at ``path`` in binary mode and return the unpickled object."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, mode='rb') as stream:
        restored = cloudpickle.load(stream)
    return restored

    
def dump(obj, path: str):
    """Serialize ``obj`` with cloudpickle into the file at ``path`` (overwriting it)."""
    with open(path, mode='wb') as stream:
        cloudpickle.dump(obj, stream)
        
    
def log(msg: object):
    """Print ``msg`` prefixed with the current local time.

    The prefix has the form ``[yy-mm-dd HH:MM:SS]``. ``msg`` may be any object;
    it is rendered with ``str.format``.
    """
    # The original annotation was the builtin function ``any`` rather than a type.
    stamp = datetime.now().strftime('%y-%m-%d %H:%M:%S')
    print('[{}] {}'.format(stamp, msg))


def summary(x):
    """Return a dict of descriptive statistics for a 1-D array-like.

    Non-numeric and boolean data are summarised as categorical data with keys
    ``n``, ``cardinality`` and ``value_count`` (a string listing at most the 20
    most frequent values, followed by ``', ...'`` when values were left out).

    Numeric data are summarised with keys ``n``, ``sum``, ``mean``, ``SD``
    (sample standard deviation, ``ddof=1``), ``med``, ``range``, ``conf.``
    (95% t-interval for the mean) and ``nan_count``. All numeric statistics
    are computed on the non-NaN values only.

    Warnings raised while computing the statistics are suppressed.
    """
    x = np.asarray(x)
    max_shown = 20  # number of distinct values spelled out in 'value_count'

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b':
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            shown = cnt.iloc[:max_shown]
            cnt_str = ', '.join(f'{u}:{c}' for u, c in shown.items())
            # Flag truncation whenever values were dropped; the previous
            # threshold (30) did not match the number of values kept (20).
            if card > max_shown:
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }

        is_missing = np.isnan(x)
        x_norm = x[~is_missing]

        tot = np.sum(x_norm)
        m = np.mean(x_norm)
        me = np.median(x_norm)
        s = np.std(x_norm, ddof=1)
        # Both bounds come from the NaN-free values; taking the max over the
        # raw array returned NaN as soon as a single NaN was present.
        l, u = np.min(x_norm), np.max(x_norm)
        conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
        n_nan = int(np.count_nonzero(is_missing))

        return {
            'n': n,
            'sum': tot,
            'mean': m,
            'SD': s,
            'med': me,
            'range': (l, u),
            'conf.': (conf_l, conf_u),
            'nan_count': n_nan
        }

In [3]:
# Load the pickled 5-tuple produced by an earlier stage: X, y, groups, t and
# datetimes. Later cells use `groups` as a per-row participant code ('pcode')
# column and `y` as the labels.
# NOTE(review): this unpickles a local file; only run it on trusted artefacts.
p = os.path.join(PATH_INTERMEDIATE, 'stress-fixed.pkl')
X, y, groups, t, datetimes = load(p)

In [4]:
# Participant questionnaire table, indexed by participant code.
PARTICIPANTS = pd.read_csv(
    os.path.join(PATH_INTERMEDIATE, 'PARTICIPANT_INFO.csv'),
    index_col='pcode',
)

# Source column -> short trait label, in the order the traits appear in PINFO.
_BFI_COLUMNS = {
    'openness': 'BFI_OPN',
    'conscientiousness': 'BFI_CON',
    'neuroticism': 'BFI_NEU',
    'extraversion': 'BFI_EXT',
    'agreeableness': 'BFI_AGR',
}

# Keep only the five trait columns under their short names.
PINFO = PARTICIPANTS[list(_BFI_COLUMNS)].rename(columns=_BFI_COLUMNS)
# One-hot encode any categorical columns (label and level joined with '=').
PINFO = pd.get_dummies(PINFO, prefix_sep='=', dtype=bool)

In [5]:
# Rows of PINFO whose five trait values repeat an earlier row
# (DataFrame.duplicated marks every occurrence except the first).
duplicate_rows = PINFO[PINFO.duplicated()]

# Show the repeated rows first, then the whole table for reference.
print(duplicate_rows)
print(PINFO)

       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P80         13        7        5        4       12
       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P01         11       11        3        4       13
P02         14        5       12       14        5
P03         10       15        8        7       11
P04         12       11        8        6       11
P05         10       11       13       10        6
...        ...      ...      ...      ...      ...
P76          8        8       12        6        8
P77         11       12        7       11       10
P78         12       11        9       12       10
P79          9       10        7       12       11
P80         13        7        5        4       12

[77 rows x 5 columns]


In [6]:
# Processed labels, indexed by (pcode, timestamp); parse_dates=True asks pandas
# to parse the index as dates.
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)

In [7]:
# Move 'timestamp' out of the index into a regular column of a NEW frame.
# The previous version aliased LABELS_PROC and reset its index in place, which
# silently changed LABELS_PROC and made this cell fail when run a second time.
_df = LABELS_PROC.reset_index(level='timestamp')

# Overall time span covered by the labels.
print('First timestamp:', _df['timestamp'].min())
print('Last timestamp:', _df['timestamp'].max())

First timestamp: 2019-04-30 10:03:28+09:00
Last timestamp: 2019-05-22 22:02:03+09:00


In [8]:
# Earliest and latest label timestamp for each participant.
time_ranges = _df.groupby('pcode')['timestamp'].agg(['min', 'max'])
print(time_ranges)

                            min                       max
pcode                                                    
P01   2019-05-08 10:29:46+09:00 2019-05-14 21:12:31+09:00
P02   2019-05-08 10:52:29+09:00 2019-05-14 21:13:14+09:00
P03   2019-05-08 11:13:13+09:00 2019-05-14 20:23:23+09:00
P05   2019-05-08 10:40:49+09:00 2019-05-14 21:59:16+09:00
P06   2019-05-08 10:32:09+09:00 2019-05-14 21:56:51+09:00
P08   2019-05-08 10:42:48+09:00 2019-05-14 21:12:32+09:00
P09   2019-05-08 13:44:51+09:00 2019-05-14 20:31:01+09:00
P10   2019-05-08 10:40:26+09:00 2019-05-14 15:13:12+09:00
P12   2019-05-09 14:18:30+09:00 2019-05-14 21:09:22+09:00
P13   2019-05-08 10:30:38+09:00 2019-05-14 21:51:35+09:00
P15   2019-05-08 12:09:34+09:00 2019-05-14 19:44:17+09:00
P19   2019-05-08 10:41:14+09:00 2019-05-14 22:01:12+09:00
P21   2019-05-08 15:49:36+09:00 2019-05-14 21:43:22+09:00
P23   2019-05-08 10:20:41+09:00 2019-05-14 18:13:07+09:00
P26   2019-05-08 10:11:53+09:00 2019-05-14 21:59:17+09:00
P28   2019-05-

In [9]:
# Distinct participant codes that have at least one label.
# Despite its name, `list_pid` is a set, not a list.
list_pid = set(LABELS_PROC.index.get_level_values('pcode').values)
list_pid

{'P01',
 'P02',
 'P03',
 'P05',
 'P06',
 'P08',
 'P09',
 'P10',
 'P12',
 'P13',
 'P15',
 'P19',
 'P21',
 'P23',
 'P26',
 'P28',
 'P30',
 'P31',
 'P32',
 'P33',
 'P35',
 'P39',
 'P40',
 'P42',
 'P45',
 'P47',
 'P48',
 'P49',
 'P50',
 'P51',
 'P52',
 'P53',
 'P55',
 'P57',
 'P60',
 'P61',
 'P66',
 'P67',
 'P69',
 'P70',
 'P72',
 'P75',
 'P76',
 'P77',
 'P78',
 'P79',
 'P80'}

In [10]:
# Restrict the trait table to participants present in list_pid, then show the
# number of non-null entries per column.
PINFO_valid = PINFO.loc[PINFO.index.isin(list_pid)]
PINFO_valid.count()

BFI_OPN    47
BFI_CON    47
BFI_NEU    47
BFI_EXT    47
BFI_AGR    47
dtype: int64

In [11]:
# Same duplicate check as before, now on the filtered table; this rebinds
# `duplicate_rows`.
duplicate_rows = PINFO_valid[PINFO_valid.duplicated()]

print(duplicate_rows)
print(PINFO_valid)

       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P80         13        7        5        4       12
       BFI_OPN  BFI_CON  BFI_NEU  BFI_EXT  BFI_AGR
pcode                                             
P01         11       11        3        4       13
P02         14        5       12       14        5
P03         10       15        8        7       11
P05         10       11       13       10        6
P06          3        6       11        3        6
P08         10        8        9        9       12
P09         12       12        4       11        9
P10          6        7        9        9       11
P12          9       12        7        7       12
P13          5       12        3       12       13
P15          6       12        5        6       11
P19         12       13        3        9       10
P21         12        5        8        8        8
P23         13       12        6        9       14
P26         13        8        

In [12]:
def _pick(frame, keep):
    """Return the columns of ``frame`` whose stringified label satisfies ``keep``."""
    return frame.loc[:, [keep(str(label)) for label in frame.keys()]]


# Divide the features into different categories (matched by substring of the column label)
feat_current = _pick(X, lambda name: ('#VAL' in name) or ('ESM#LastLabel' in name))
feat_dsc = _pick(X, lambda name: '#DSC' in name)
feat_yesterday = _pick(X, lambda name: 'Yesterday' in name)
feat_today = _pick(X, lambda name: 'Today' in name)
feat_sleep = _pick(X, lambda name: 'Sleep' in name)
feat_time = _pick(X, lambda name: 'Time' in name)
feat_pif = _pick(X, lambda name: 'PIF' in name)
feat_ImmediatePast = _pick(X, lambda name: 'ImmediatePast_15' in name)

# Divide the time window features into sensor / past stress label
feat_current_sensor = _pick(X, lambda name: '#VAL' in name)
feat_current_ESM = _pick(X, lambda name: 'ESM#LastLabel' in name)
feat_ImmediatePast_sensor = _pick(feat_ImmediatePast, lambda name: 'ESM' not in name)
feat_ImmediatePast_ESM = _pick(feat_ImmediatePast, lambda name: 'ESM' in name)
feat_today_sensor = _pick(feat_today, lambda name: 'ESM' not in name)
feat_today_ESM = _pick(feat_today, lambda name: 'ESM' in name)
feat_yesterday_sensor = _pick(feat_yesterday, lambda name: 'ESM' not in name)
feat_yesterday_ESM = _pick(feat_yesterday, lambda name: 'ESM' in name)

# Prepare the final feature set: time, '#DSC', current sensor and
# 15-minute immediate-past sensor features, concatenated column-wise.
feat_baseline = pd.concat(
    [feat_time, feat_dsc, feat_current_sensor, feat_ImmediatePast_sensor],
    axis=1,
)
feat_final = pd.concat([feat_baseline], axis=1)
# NOTE: from here on X is the reduced feature set, not the frame that was loaded.
X = feat_final
# Boolean columns are treated as the categorical features.
cats = X.columns[X.dtypes == bool]

In [13]:
# Display the assembled feature matrix.
X

Unnamed: 0,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,Time#HRN=DAWN,...,ONF#ASC##ImmediatePast_15,ONF#RLV_SUP#ImmediatePast_15,MED_VID#AVG#ImmediatePast_15,MED_VID#STD#ImmediatePast_15,MED_VID#SKW#ImmediatePast_15,MED_VID#KUR#ImmediatePast_15,MED_VID#ASC#ImmediatePast_15,MED_VID#BEP#ImmediatePast_15,MED_VID#MED#ImmediatePast_15,MED_VID#TSC#ImmediatePast_15
0,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,False,False,False,False,False,False,True,True,False,False,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,True,False,False,False,False,False,False,False,True,False,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,True,False,False,False,False,False,False,False,True,False,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Element-wise mask of the cells in X that are +inf or -inf
is_inf = np.isinf(X)

# Total number of infinite cells: sum per column, then across columns
count = is_inf.sum().sum()

print(f"There are {count} values that are either infinity or negative infinity.")

There are 0 values that are either infinity or negative infinity.


In [15]:
def process_dataframe(X):
    """
    Return a copy of 'X' whose columns are relabelled 0..n_columns-1.

    Values and dtypes are left untouched; in particular boolean columns are
    NOT converted to 1/0 (that conversion was disabled in the original code,
    although its docstring still advertised it).

    The input frame is not modified. The previous version relabelled the
    caller's DataFrame in place as a side effect.

    Parameters:
        X (pandas.DataFrame): The input DataFrame.

    Returns:
        pandas.DataFrame: A copy of 'X' with integer column labels.
    """
    relabelled = X.copy()
    # Set column names as a range from 0 to the number of columns
    relabelled.columns = range(relabelled.shape[1])
    return relabelled
def save_data_to_data_file(X, y, filename, path_save=None):
    """
    Write 'X' and 'y' to a text file, one sample per line.

    Each line has the form ``<label> <col>:<value> <col>:<value> ...`` where
    ``<col>`` is the column label of 'X' and values are rendered with
    ``str.format``. An existing file of the same name is overwritten.

    Parameters:
        X (pandas.DataFrame): Feature rows to write.
        y (pandas.Series): Labels, read positionally; one per row of 'X'.
        filename (str): Name of the file to create inside the target folder.
        path_save (str, optional): Target folder. Defaults to the module-level
            ``PATH_SAVE`` so existing three-argument calls keep working.
    """
    if path_save is None:
        path_save = PATH_SAVE
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path_save, exist_ok=True)

    file_path = os.path.join(path_save, filename)

    # Pull every column out once instead of doing a pandas lookup per cell.
    columns = [(col, X.iloc[:, pos].to_numpy()) for pos, col in enumerate(X.columns)]

    with open(file_path, 'w') as f:
        for i in range(len(X)):
            fields = [str(y.iloc[i])]
            fields.extend("{}:{}".format(col, values[i]) for col, values in columns)
            f.write(" ".join(fields) + '\n')

def split_train_test(df, labels, indices):
    """
    Split 'df' and 'labels' by index label.

    Rows whose index label is in 'indices' form the test part; all remaining
    rows form the train part.

    Returns:
        tuple: (train_X, train_y, test_X, test_y)
    """
    held_out = (df.loc[indices], labels.loc[indices])
    remaining = (df.drop(index=indices), labels.drop(index=indices))
    return remaining[0], remaining[1], held_out[0], held_out[1]

In [16]:
def mkdir(path):
    """
    Create the directory 'path' (and any missing parents) if it does not exist.

    Surrounding whitespace is stripped from 'path' first.

    Returns:
        bool: True if the directory was created, False if the path already existed.
    """
    target = path.strip()
    if os.path.exists(target):
        return False
    os.makedirs(target)
    return True

In [17]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

# Switch read by the export loop: standardise the numeric features when True.
normalize =True

# Feature selector: keeps the features whose L1-penalised logistic-regression
# importance reaches the 0.005 threshold. The commented-out lines below are an
# alternative LinearSVC estimator that is currently disabled.
selector = SelectFromModel(
        estimator=LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state= RANDOM_STATE, max_iter=4000),
        threshold=0.005
#         estimator=LinearSVC(
#         penalty='l1',
#         loss='squared_hinge',
#         dual=False,
#         tol=1e-3,
#         C=1e-2,
#         max_iter=5000,
#         random_state=RANDOM_STATE
#     )  
    )
# The export loop iterates over `select`, so wrap an unfitted clone in a list.
select = [clone(selector)]
# Switch read by the export loop: oversample the training split when True.
oversample = True

# Sorted names of the categorical (boolean) columns and of all other columns of X.
C_cat = np.asarray(sorted(cats))
C_num = np.asarray(sorted(X.columns[~X.columns.isin(C_cat)]))

In [18]:
# List the columns that are treated as numeric features.
print(C_num)

['ACC_AXX#ASC#ImmediatePast_15' 'ACC_AXX#AVG#ImmediatePast_15'
 'ACC_AXX#BEP#ImmediatePast_15' 'ACC_AXX#KUR#ImmediatePast_15'
 'ACC_AXX#MED#ImmediatePast_15' 'ACC_AXX#SKW#ImmediatePast_15'
 'ACC_AXX#STD#ImmediatePast_15' 'ACC_AXX#TSC#ImmediatePast_15'
 'ACC_AXX#VAL' 'ACC_AXY#ASC#ImmediatePast_15'
 'ACC_AXY#AVG#ImmediatePast_15' 'ACC_AXY#BEP#ImmediatePast_15'
 'ACC_AXY#KUR#ImmediatePast_15' 'ACC_AXY#MED#ImmediatePast_15'
 'ACC_AXY#SKW#ImmediatePast_15' 'ACC_AXY#STD#ImmediatePast_15'
 'ACC_AXY#TSC#ImmediatePast_15' 'ACC_AXY#VAL'
 'ACC_AXZ#ASC#ImmediatePast_15' 'ACC_AXZ#AVG#ImmediatePast_15'
 'ACC_AXZ#BEP#ImmediatePast_15' 'ACC_AXZ#KUR#ImmediatePast_15'
 'ACC_AXZ#MED#ImmediatePast_15' 'ACC_AXZ#SKW#ImmediatePast_15'
 'ACC_AXZ#STD#ImmediatePast_15' 'ACC_AXZ#TSC#ImmediatePast_15'
 'ACC_AXZ#VAL' 'ACC_MAG#ASC#ImmediatePast_15'
 'ACC_MAG#AVG#ImmediatePast_15' 'ACC_MAG#BEP#ImmediatePast_15'
 'ACC_MAG#KUR#ImmediatePast_15' 'ACC_MAG#MED#ImmediatePast_15'
 'ACC_MAG#SKW#ImmediatePast_15' 'ACC_MAG#ST

In [19]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTENC, SMOTE
from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold, TimeSeriesSplit, LeavePGroupsOut, train_test_split, GroupShuffleSplit
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

In [20]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



# ---------------------------------------------------------------------------
# Leave-one-participant-out export, repeated for several cluster counts.
#
# For every cluster count and every participant:
#   1. cluster the OTHER participants on their standardised trait scores,
#   2. assign the held-out participant to the nearest of those clusters,
#   3. prepend the per-row cluster id to the feature matrix,
#   4. hold out that participant's rows as test data, split the rest into
#      train / validation, then normalise, select features and oversample,
#   5. write '<turn>_train.data', '<turn>_val.data', '<turn>_test.data'.
#
# Unlike the previous version, neither X nor PINFO_valid is modified, so the
# cell can be re-run (also after an interruption) without leftover state.
# ---------------------------------------------------------------------------

# Tolerate a 'cluster' column left behind by an interrupted earlier run.
pinfo_base = PINFO_valid.drop(columns=['cluster'], errors='ignore')
X_base = X.drop(columns=['cluster'], errors='ignore')

# One turn per participant (was a hard-coded 47).
n_users = len(pinfo_base)

# Loop-invariant inputs: participant code per row, and the labels as a Series.
groups_df = pd.DataFrame(groups, columns=['pcode'])
labels = pd.Series(y)

# Accept a single selector as well as a list of selectors.
if isinstance(select, SelectFromModel):
    select = [select]

for num_clusters in range(2, 46):  # 46 is exclusive
    # NOTE(review): machine-specific absolute path; consider moving it to the config cell.
    # PATH_SAVE stays a module-level name because save_data_to_data_file reads it.
    PATH_SAVE = '/home/uzair/Stress/StressDetection_Collaboration/Models/logo/{}_clusters/data_MT'.format(num_clusters)
    mkdir(PATH_SAVE)
    print(PATH_SAVE)

    for turn in range(n_users):
        print(turn)
        print("\n")

        # Rotate so that participant `turn` comes first. This reproduces the row
        # order of the former drop-and-append rotation without mutating PINFO_valid.
        rotated = pd.concat([pinfo_base.iloc[turn:], pinfo_base.iloc[:turn]])
        selected = rotated.iloc[0:1]   # held-out participant (one row)
        others = rotated.iloc[1:]      # participants used to build the clusters

        # Clustering on the standardised traits of the remaining participants
        trait_scaler = StandardScaler()
        df = trait_scaler.fit_transform(others)
        kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        pred_y = kmeans.fit_predict(df)

        # Cluster of the held-out participant. The scaler must only be APPLIED
        # here: re-fitting it on this single row (as before) maps every
        # participant to the all-zero vector, i.e. always to the same cluster.
        selected_scaled = trait_scaler.transform(selected)
        selected_cluster = kmeans.predict(selected_scaled)
        selected = selected.assign(cluster=selected_cluster)

        # pcode -> cluster label for all participants of this turn
        cluster_map = pd.Series(
            np.concatenate([pred_y, selected_cluster]),
            index=others.index.append(selected.index),
            name='cluster',
        )

        # Add the cluster label as the first column of a per-turn copy of the features
        X_turn = X_base.copy()
        X_turn.insert(0, 'cluster', groups_df['pcode'].map(cluster_map))

        # Row labels belonging to the held-out participant
        held_out_pcode = selected.index[0]
        matching_indices = groups_df.index[groups_df['pcode'] == held_out_pcode].tolist()

        train_X, train_y, X_test, y_test = split_train_test(X_turn,
                                                            labels,
                                                            matching_indices)

        X_train, X_eval, y_train, y_eval = train_test_split(
            train_X, train_y, test_size=0.2, random_state=RANDOM_STATE, stratify=train_y
        )
        # Own the data before assigning into it (avoids SettingWithCopyWarning).
        X_train, X_eval, X_test = X_train.copy(), X_eval.copy(), X_test.copy()

        if normalize:
            # Normalize numeric features; statistics come from the training split only
            scaler = StandardScaler().fit(X_train[C_num])
            X_train[C_num] = scaler.transform(X_train[C_num])
            X_eval[C_num] = scaler.transform(X_eval[C_num])
            X_test[C_num] = scaler.transform(X_test[C_num])

        if select:
            for s in select:
                # Fit feature selector only on training data
                s.fit(X_train, y_train)
                kept = X_train.columns[s.get_support()].tolist()

                # 'cluster' is always kept and always comes first
                selected_features = ['cluster'] + [c for c in kept if c != 'cluster']

                # Apply feature selection to train, eval, and test
                X_train = X_train[selected_features].copy()
                X_eval = X_eval[selected_features].copy()
                X_test = X_test[selected_features].copy()

        if oversample:
            # Categorical features for SMOTENC: the boolean columns plus the
            # cluster id. The cluster id is a nominal label, so it must not be
            # interpolated like a continuous feature (it was before, because
            # 'cluster' is not part of C_cat).
            M = np.isin(X_train.columns, C_cat) | (X_train.columns == 'cluster')
            if M.any():
                sampler = SMOTENC(categorical_features=M, random_state=RANDOM_STATE)
            else:
                sampler = SMOTE(random_state=RANDOM_STATE)
            # Only oversample training data
            X_train, y_train = sampler.fit_resample(X_train, y_train)

        # Apply consistent processing to train, eval, and test dataframes
        X_train = process_dataframe(X_train)
        X_eval = process_dataframe(X_eval)
        X_test = process_dataframe(X_test)

        save_data_to_data_file(X_train, y_train, f'{turn}_train.data')
        save_data_to_data_file(X_eval, y_eval, f'{turn}_val.data')
        save_data_to_data_file(X_test, y_test, f'{turn}_test.data')

/home/uzair/Stress/StressDetection_Collaboration/Models/logo/2_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/3_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/4_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/5_clusters/data_MT
0


1

35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/29_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/30_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/StressDetection_Collaboration/Models/logo/31_clusters/data_MT
0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


/home/uzair/Stress/Str