In [1]:
import pytz
import os


DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

PATH_INTERMEDIATE = os.path.join('/home/user/Collab/Data_Processing_D1/data/intermediate')
PATH_SAVE = '/home/user/Collab/MT/logo/3_clusters/data_MT'

DATA_TYPES = {
    'Acceleration': 'ACC',
    'AmbientLight': 'AML',
    'Calorie': 'CAL',
    'Distance': 'DST',
    'EDA': 'EDA',
    'HR': 'HRT',
    'RRI': 'RRI',
    'SkinTemperature': 'SKT',
    'StepCount': 'STP',
    'UltraViolet': 'ULV',
    'ActivityEvent': 'ACE',
    'ActivityTransition': 'ACT',
    'AppUsageEvent': 'APP',
    'BatteryEvent': 'BAT',
    'CallEvent': 'CAE',
    'Connectivity': 'CON',
    'DataTraffic': 'DAT',
    'InstalledApp': 'INS',
    'Location': 'LOC',
    'MediaEvent': 'MED',
    'MessageEvent': 'MSG',
    'WiFi': 'WIF',
    'ScreenEvent': 'SCR',
    'RingerModeEvent': 'RNG',
    'ChargeEvent': 'CHG',
    'PowerSaveEvent': 'PWS',
    'OnOffEvent': 'ONF'
}

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    with open(path, mode='rb') as f:
        return cloudpickle.load(f)

    
def dump(obj, path: str):
    with open(path, mode='wb') as f:
        cloudpickle.dump(obj, f)
        
    
def log(msg: any):
    print('[{}] {}'.format(datetime.now().strftime('%y-%m-%d %H:%M:%S'), msg))


def summary(x):
    x = np.asarray(x)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b': 
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            cnt = cnt[:20]                
            cnt_str = ', '.join([f'{u}:{c}' for u, c in zip(cnt.index, cnt)])
            if card > 30:
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }
        else: 
            x_nan = x[np.isnan(x)]
            x_norm = x[~np.isnan(x)]
            
            tot = np.sum(x_norm)
            m = np.mean(x_norm)
            me = np.median(x_norm)
            s = np.std(x_norm, ddof=1)
            l, u = np.min(x_norm), np.max(x)
            conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
            n_nan = len(x_nan)
            
            return {
                'n': n,
                'sum': tot,
                'mean': m,
                'SD': s,
                'med': me,
                'range': (l, u),
                'conf.': (conf_l, conf_u),
                'nan_count': n_nan
            }


@contextmanager
def on_ray(*args, **kwargs):
    try:
        if ray.is_initialized():
            ray.shutdown()
        ray.init(*args, **kwargs)
        yield None
    finally:
        ray.shutdown()

transform = {
    'GAME': 'ENTER',
    'GAME_TRIVIA': 'ENTER',
    'GAME_CASINO': 'ENTER',
    'GAME-ACTION': 'ENTER',
    'GAME_SPORTS': 'ENTER',
    'GAME_PUZZLE': 'ENTER',
    'GAME_SIMULATION': 'ENTER',
    'GAME_STRATEGY': 'ENTER',
    'GAME_ROLE_PLAYING': 'ENTER',
    'GAME_ACTION': 'ENTER',
    'GAME_ARCADE': 'ENTER',
    'GAME_RACING': 'ENTER',
    'GAME_CASUAL': 'ENTER',
    'GAME_MUSIC': 'ENTER',
    'GAME_CARD': 'ENTER',
    'GAME_ADVENTURE': 'ENTER',
    'GAME_BOARD': 'ENTER',
    'GAME_EDUCATIONAL': 'ENTER',
    'GAME_RACING': 'ENTER',
    'PHOTOGRAPHY': 'ENTER',
    'ENTERTAINMENT': 'ENTER',
    'SPORTS': 'ENTER',
    'MUSIC_AND_AUDIO': 'ENTER',
    'COMICS': 'ENTER',
    'VIDEO_PLAYERS_AND_EDITORS': 'ENTER',
    'VIDEO_PLAYERS': 'ENTER',
    'ART_AND_DESIGN': 'ENTER',
    'TRAVEL_AND_LOCAL': 'INFO',
    'FOOD_AND_DRINK': 'INFO',
    'NEWS_AND_MAGAZINES': 'INFO',
    'MAPS_AND_NAVIGATION': 'INFO',
    'WEATHER': 'INFO',
    'HOUSE_AND_HOME': 'INFO',
    'BOOKS_AND_REFERENCE': 'INFO',
    'SHOPPING': 'INFO',
    'LIBRARIES_AND_DEMO': 'INFO',
    'BEAUTY': 'INFO',
    'AUTO_AND_VEHICLES': 'INFO',
    'LIFESTYLE': 'INFO',
    'PERSONALIZATION': 'SYSTEM',
    'TOOLS': 'SYSTEM',
    'COMMUNICATION': 'SOCIAL',
    'SOCIAL': 'SOCIAL',
    'DATING': 'SOCIAL',
    'PARENTING':'SOCIAL',
    'FINANCE': 'WORK',
    'BUSINESS': 'WORK',
    'PRODUCTIVITY': 'WORK',
    'EDUCATION': 'WORK',
    'HEALTH_AND_FITNESS': 'HEALTH',
    'MEDICAL': 'HEALTH',
    'SYSTEM': 'SYSTEM',
    'MISC': 'SYSTEM', # ABC logger
     None: 'UNKNOWN',
    'UNKNOWN':'UNKNOWN'
}

In [3]:
p = os.path.join(PATH_INTERMEDIATE, 'stress-fixed.pkl')
X, y, groups, t, datetimes = load(p)

In [4]:
X

Unnamed: 0,ACC_AXX#VAL,ACC_AXX#AVG#ImmediatePast_15,ACC_AXX#STD#ImmediatePast_15,ACC_AXX#SKW#ImmediatePast_15,ACC_AXX#KUR#ImmediatePast_15,ACC_AXX#ASC#ImmediatePast_15,ACC_AXX#BEP#ImmediatePast_15,ACC_AXX#MED#ImmediatePast_15,ACC_AXX#TSC#ImmediatePast_15,ACC_AXX#AVG#YesterdayDawn,...,MED_VID#MED_TodayEvening,MED_VID#TSC_TodayEvening,MED_VID#AVG_TodayNight,MED_VID#STD_TodayNight,MED_VID#SKW_TodayNight,MED_VID#KUR_TodayNight,MED_VID#ASC_TodayNight,MED_VID#BEP_TodayNight,MED_VID#MED_TodayNight,MED_VID#TSC_TodayNight
0,1.887458,-1.099064,1.070059,0.156294,-0.147795,3.558316,-0.558527,-1.493644,3.058955,0.200464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.545471,0.443328,0.027409,0.114711,-0.157560,-0.288708,-0.344570,0.563097,0.116904,0.200464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.701875,0.454280,0.119796,-0.007248,-0.150215,-0.344461,-0.178299,0.248825,-0.157025,0.200464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001830,0.606667,-0.075702,0.453520,-0.129572,-0.176964,-0.976626,-0.026113,0.446548,0.200464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.756951,0.252484,-1.514013,0.122577,-0.175276,-0.737175,1.168138,0.106270,-1.136328,0.200464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,0.264884,0.298440,-1.060438,-0.385416,-0.545398,-0.679873,0.876889,0.298620,-1.063744,-1.455125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,0.264884,-1.263076,-1.060438,-0.385440,-0.545398,-0.679873,0.876889,-0.952464,-1.063743,0.253645,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,0.264884,-1.334018,-1.060438,-0.385331,-0.545398,-0.679873,0.876889,-1.009302,-1.063743,0.253645,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,-1.147279,-0.093421,0.959098,0.710459,0.118918,-0.028970,-1.061706,0.164136,0.520428,0.253645,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
PARTICIPANTS = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'PARTICIPANT_INFO.csv'),index_col = 'pcode')
PINFO = PARTICIPANTS.assign(
    BFI_OPN=lambda x: x['openness'],
    BFI_CON=lambda x: x['conscientiousness'],
    BFI_NEU=lambda x: x['neuroticism'],
    BFI_EXT=lambda x: x['extraversion'],
    BFI_AGR=lambda x: x['agreeableness'],
)[[
    'BFI_OPN', 'BFI_CON', 'BFI_NEU', 'BFI_EXT', 'BFI_AGR'
]]
PINFO = pd.get_dummies(PINFO, prefix_sep='=', dtype=bool)

In [6]:
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'LABELS_PROC.csv'), index_col=['pcode','timestamp'],parse_dates=True)

In [7]:
_df =LABELS_PROC
_df.reset_index(level='timestamp', inplace=True)
print('First timestamp:', _df['timestamp'].min())
print('Last timestamp:', _df['timestamp'].max())

First timestamp: 2019-04-30 10:03:28+09:00
Last timestamp: 2019-05-22 22:02:03+09:00


In [8]:
time_ranges = _df.groupby('pcode')['timestamp'].agg(['min', 'max'])
print(time_ranges)

                            min                       max
pcode                                                    
P01   2019-05-08 10:29:46+09:00 2019-05-14 21:12:31+09:00
P02   2019-05-08 10:52:29+09:00 2019-05-14 21:13:14+09:00
P03   2019-05-08 11:13:13+09:00 2019-05-14 20:23:23+09:00
P05   2019-05-08 10:40:49+09:00 2019-05-14 21:59:16+09:00
P06   2019-05-08 10:32:09+09:00 2019-05-14 21:56:51+09:00
P08   2019-05-08 10:42:48+09:00 2019-05-14 21:12:32+09:00
P09   2019-05-08 13:44:51+09:00 2019-05-14 20:31:01+09:00
P10   2019-05-08 10:40:26+09:00 2019-05-14 15:13:12+09:00
P12   2019-05-09 14:18:30+09:00 2019-05-14 21:09:22+09:00
P13   2019-05-08 10:30:38+09:00 2019-05-14 21:51:35+09:00
P15   2019-05-08 12:09:34+09:00 2019-05-14 19:44:17+09:00
P19   2019-05-08 10:41:14+09:00 2019-05-14 22:01:12+09:00
P21   2019-05-08 15:49:36+09:00 2019-05-14 21:43:22+09:00
P23   2019-05-08 10:20:41+09:00 2019-05-14 18:13:07+09:00
P26   2019-05-08 10:11:53+09:00 2019-05-14 21:59:17+09:00
P28   2019-05-

In [9]:
list_pid = set(LABELS_PROC.index.get_level_values('pcode').values)

In [10]:
PINFO_valid = PINFO.loc[PINFO.index.isin(list_pid)]

In [11]:
#Divide the features into different categories
feat_current = X.loc[:,[('#VAL' in str(x)) or ('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_dsc = X.loc[:,[('#DSC' in str(x))  for x in X.keys()]]  
feat_yesterday = X.loc[:,[('Yesterday' in str(x))  for x in X.keys()]]  
feat_today = X.loc[:,[('Today' in str(x))  for x in X.keys()]]  
feat_sleep = X.loc[:,[('Sleep' in str(x))  for x in X.keys()]]  
feat_time = X.loc[:,[('Time' in str(x))  for x in X.keys()]]  
feat_pif = X.loc[:,[('PIF' in str(x))  for x in X.keys()]]  
feat_ImmediatePast = X.loc[:,[('ImmediatePast_15' in str(x))  for x in X.keys()]]
#Divide the time window features into sensor/past stress label
feat_current_sensor = X.loc[:,[('#VAL' in str(x))  for x in X.keys()]]  
feat_current_ESM = X.loc[:,[('ESM#LastLabel' in str(x)) for x in X.keys()]]  
feat_ImmediatePast_sensor = feat_ImmediatePast.loc[:,[('ESM' not in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_ImmediatePast_ESM = feat_ImmediatePast.loc[:,[('ESM'  in str(x)) for x in feat_ImmediatePast.keys()]]  
feat_today_sensor = feat_today.loc[:,[('ESM' not in str(x))  for x in feat_today.keys()]]  
feat_today_ESM = feat_today.loc[:,[('ESM'  in str(x)) for x in feat_today.keys()]]  
feat_yesterday_sensor = feat_yesterday.loc[:,[('ESM' not in str(x)) for x in feat_yesterday.keys()]]  
feat_yesterday_ESM = feat_yesterday.loc[:,[('ESM'  in str(x)) for x in feat_yesterday.keys()]]
#Prepare the final feature set
feat_baseline = pd.concat([ feat_time,feat_dsc,feat_current_sensor, feat_ImmediatePast_sensor],axis=1)
feat_final = pd.concat([feat_baseline ], axis=1)
X = feat_final

In [12]:
X

Unnamed: 0,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,Time#HRN=DAWN,...,ONF#ASC##ImmediatePast_15,ONF#RLV_SUP#ImmediatePast_15,MED_VID#AVG#ImmediatePast_15,MED_VID#STD#ImmediatePast_15,MED_VID#SKW#ImmediatePast_15,MED_VID#KUR#ImmediatePast_15,MED_VID#ASC#ImmediatePast_15,MED_VID#BEP#ImmediatePast_15,MED_VID#MED#ImmediatePast_15,MED_VID#TSC#ImmediatePast_15
0,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,0.0,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,-0.301735,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,2.184325,1.377054,-1.377054,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,0.0,...,-0.145865,-0.805437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
similar_user = pd.read_csv(os.path.join(PATH_INTERMEDIATE,  'similar_user_5.csv'))
similar_user

Unnamed: 0,pcode,BFI_OPN,BFI_CON,BFI_NEU,BFI_EXT,BFI_AGR,cluster
0,P01,11,11,3,4,13,2
1,P02,14,5,12,14,5,3
2,P03,10,15,8,7,11,4
3,P05,10,11,13,10,6,3
4,P06,3,6,11,3,6,1
5,P08,10,8,9,9,12,0
6,P09,12,12,4,11,9,3
7,P10,6,7,9,9,11,0
8,P12,9,12,7,7,12,4
9,P13,5,12,3,12,13,4


In [14]:
import pandas as pd

# Convert groups to a pandas DataFrame
groups_df = pd.DataFrame(groups, columns=['pcode'])

# Ensure the data types of the 'pcode' and 'pcode' columns are the same
groups_df['pcode'] = groups_df['pcode'].astype(str)
similar_user['pcode'] = similar_user['pcode'].astype(str)

# Create a new Series that maps Pcode to cluster label
cluster_map = similar_user.set_index('pcode')['cluster']

# Create the 'cluster' column in the 'groups' DataFrame
groups_df['cluster'] = groups_df['pcode'].map(cluster_map)

In [15]:
# Add the 'cluster' column from 'groups_df' to 'X' as the first column
X.insert(0, 'cluster', groups_df['cluster'])
X

Unnamed: 0,cluster,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,...,ONF#ASC##ImmediatePast_15,ONF#RLV_SUP#ImmediatePast_15,MED_VID#AVG#ImmediatePast_15,MED_VID#STD#ImmediatePast_15,MED_VID#SKW#ImmediatePast_15,MED_VID#KUR#ImmediatePast_15,MED_VID#ASC#ImmediatePast_15,MED_VID#BEP#ImmediatePast_15,MED_VID#MED#ImmediatePast_15,MED_VID#TSC#ImmediatePast_15
0,2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,-0.493710,-0.32914,2.143928,-0.373210,-0.373210,-0.373210,-0.414800,-0.608135,0.608135,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,2,-0.301735,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,2.184325,1.377054,-1.377054,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,2,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,2,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,...,-0.145865,1.242816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,2,3.243652,0.00000,-0.514315,-0.546859,-0.378455,-0.448067,-0.448067,-0.710737,0.710737,...,-0.145865,-0.805437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# import os
# os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

Neural Network

In [17]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Dense, Dropout
# from tensorflow.keras.metrics import AUC, Precision, Recall

# # One-hot encoding the cluster column
# clusters_onehot = pd.get_dummies(X['cluster'], prefix='cluster')
# X = pd.concat([X, clusters_onehot], axis=1)
# X.drop(columns=['cluster'], inplace=True)

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the shared layers
# input_layer = Input(shape=(X_train.shape[1],))
# shared_layer = Dense(64, activation='relu')(input_layer)
# shared_layer = Dropout(0.5)(shared_layer)

# # Define task-specific layers and outputs for each cluster
# outputs = []

# # Masks for each cluster
# train_masks = [X_train[f'cluster_{i}'] == 1 for i in range(5)]
# test_masks = [X_test[f'cluster_{i}'] == 1 for i in range(5)]

# # Adjusted y_train and y_test dictionaries
# y_train_dict = {}
# y_test_dict = {}

# for i in range(5):
#     # Process labels
#     y_train_cluster = np.full(X_train.shape[0], -1.)  
#     y_train_cluster[train_masks[i]] = y_train[train_masks[i]]
#     y_train_dict[f'cluster_{i}_output'] = y_train_cluster

#     y_test_cluster = np.full(X_test.shape[0], -1.)
#     y_test_cluster[test_masks[i]] = y_test[test_masks[i]]
#     y_test_dict[f'cluster_{i}_output'] = y_test_cluster

#     # Task-specific layers
#     task_specific_layer = Dense(32, activation='relu')(shared_layer)
#     task_specific_output = Dense(1, activation='sigmoid', name=f'cluster_{i}_output')(task_specific_layer)
#     outputs.append(task_specific_output)

# # Construct the model
# model = Model(inputs=input_layer, outputs=outputs)

# # Using a custom loss function to mask the dummy values
# import tensorflow as tf

# def masked_binary_crossentropy(target, output):
#     mask = tf.math.greater(target, -0.5)
#     loss = tf.keras.losses.binary_crossentropy(target, output)
#     return tf.where(mask, loss, 0.)

# # Compile the model with the custom loss
# model.compile(optimizer='adam',
#               loss=[masked_binary_crossentropy]*5,
#               metrics=[[AUC(name='auc'), Precision(name='precision'), Recall(name='recall')] for _ in range(5)])

# # Train the model
# history = model.fit(X_train, y_train_dict, 
#           validation_data=(X_test, y_test_dict),
#           epochs=50, batch_size=32)

# # Evaluate the model
# evaluation = model.evaluate(X_test, y_test_dict, verbose=0)

# # Extract and compute metrics
# eval_results = {name: value for name, value in zip(model.metrics_names, evaluation)}

# for i in range(5):
#     precision = eval_results[f'cluster_{i}_output_precision']
#     recall = eval_results[f'cluster_{i}_output_recall']
#     f1_score = 2 * (precision * recall) / (precision + recall if precision + recall else 1) # added a check to avoid division by zero
#     auc = eval_results[f'cluster_{i}_output_auc']

#     print(f"\nMetrics for cluster {i}:")
#     print(f"AUC: {auc:.4f}")
#     print(f"F1 score: {f1_score:.4f}")
#     print(f"Precision: {precision:.4f}")
#     print(f"Recall: {recall:.4f}")

In [18]:
# import numpy as np
# import pandas as pd
# import keras
# import tensorflow as tf
# from sklearn.model_selection import KFold
# from sklearn.metrics import roc_auc_score
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
# from sklearn.preprocessing import OneHotEncoder
# from tqdm.autonotebook import tqdm
# import keras.backend as K


# def custom_gelu(x):
#     return 0.5 * x * (1 + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))



# def focal_loss(gamma=2., alpha=.25, weights=[2.0, 1.0, 1.0, 1.0]):
#     def focal_loss_fixed(y_true, y_pred):
#         pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
#         pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
#         return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(K.epsilon() + pt_1) * weights) - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0 + K.epsilon()) * weights)
#     return focal_loss_fixed


# def create_model(X_tr):
#     inps = Input(shape=(X_tr.shape[1],))
#     shared = Dense(512, activation=custom_gelu)(inps)
#     shared = BatchNormalization()(shared)
#     shared = Dropout(0.3)(shared)

#     # Task 1 specific layers
#     t1 = Dense(128, activation=custom_gelu)(shared)
#     t1 = BatchNormalization()(t1)
#     t1_output = Dense(1, activation='sigmoid', name='task1_output')(t1)

#     # Task 2 specific layers
#     t2 = Dense(128, activation=custom_gelu)(shared)
#     t2 = BatchNormalization()(t2)
#     t2_output = Dense(1, activation='sigmoid', name='task2_output')(t2)

#     # Task 3 specific layers
#     t3 = Dense(128, activation=custom_gelu)(shared)
#     t3 = BatchNormalization()(t3)
#     t3_output = Dense(1, activation='sigmoid', name='task3_output')(t3)

#     # Task 4 specific layers
#     t4 = Dense(128, activation=custom_gelu)(shared)
#     t4 = BatchNormalization()(t4)
#     t4_output = Dense(1, activation='sigmoid', name='task4_output')(t4)

#     model = Model(inputs=inps, outputs=[t1_output, t2_output, t3_output, t4_output])
#     model.compile(optimizer='adam', loss=[focal_loss(), focal_loss(), focal_loss(), focal_loss()])
#     return model


# # Dummy data (you can replace with your own data)
# X_train = pd.DataFrame(np.random.randn(1000, 10))
# y_train = pd.DataFrame(np.random.randint(2, size=(1000, 4)))
# X_test = pd.DataFrame(np.random.randn(300, 10))
# y_test = pd.DataFrame(np.random.randint(2, size=(300, 4)))

# # Training the model
# folds = 3
# kf = KFold(n_splits=folds, shuffle=True)
# y_preds22 = np.zeros((X_test.shape[0], 4))
# y_oof = np.zeros((X_train.shape[0], 4))

# for tr_idx, val_idx in kf.split(X_train, y_train):

#     X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
#     y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

#     model = create_model(X_tr)
#     model.fit(X_tr, [y_tr.iloc[:, i] for i in range(4)], epochs=8, batch_size=2048, validation_data=(X_test, [y_test.iloc[:, i] for i in range(4)]), verbose=True)

#     y_pred_train = np.hstack([model.predict(X_vl)[i] for i in range(4)])
#     y_oof[val_idx, :] = y_pred_train
#     for i, task in enumerate(["Task1", "Task2", "Task3", "Task4"]):
#         print(f'ROC AUC for {task}: {roc_auc_score(y_vl.iloc[:, i], y_pred_train[:, i])}')

#     temp = np.hstack([model.predict(X_test)[i] for i in range(4)])
#     y_preds22 += temp / folds

In [19]:
print(len(y))

2619


In [20]:
y = pd.DataFrame(y)

In [24]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Assume df is your DataFrame
# and your targets are y1, y2, y3, y4, y5

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    {'y1': y1, 'y2': y2, 'y3': y3, 'y4': y4, 'y5': y5},
                                                    test_size=0.2,
                                                    random_state=42)

# Splitting data based on cluster for multi-task learning
X_train_split = [X_train[X_train['cluster'] == i].drop('cluster', axis=1) for i in range(5)]
X_test_split = [X_test[X_test['cluster'] == i].drop('cluster', axis=1) for i in range(5)]

y_train_split = [y_train['y' + str(i+1)][X_train['cluster'] == i] for i in range(5)]
y_test_split = [y_test['y' + str(i+1)][X_test['cluster'] == i] for i in range(5)]

# Multi-task model
def multi_task_model(input_dim):
    shared_input = Input(shape=(input_dim,))
    
    shared_dense = Dense(128, activation='relu')(shared_input)
    
    # Cluster specific layers
    cluster_inputs = [Input(shape=(input_dim,)) for _ in range(5)]
    cluster_outputs = []
    
    for cluster_input in cluster_inputs:
        merged = Concatenate()([shared_dense, cluster_input])
        d1 = Dense(64, activation='relu')(merged)
        d2 = Dense(32, activation='relu')(d1)
        output = Dense(1, activation='sigmoid')(d2)
        cluster_outputs.append(output)
    
    model = Model([shared_input] + cluster_inputs, cluster_outputs)
    
    return model

# Prepare data for shared layer
shared_X_train = pd.concat(X_train_split, ignore_index=True)

model = multi_task_model(shared_X_train.shape[1])
model.compile(optimizer='adam', loss=['binary_crossentropy']*5, metrics=['accuracy'])

model.fit([shared_X_train] + X_train_split, y_train_split, 
          validation_data=([pd.concat(X_test_split, ignore_index=True)] + X_test_split, y_test_split), 
          epochs=20, batch_size=32)

# Evaluations and Predictions
predictions = model.predict([pd.concat(X_test_split, ignore_index=True)] + X_test_split)

NameError: name 'y1' is not defined