In [138]:
import pytz
import os


# Default timezone for the dataset, expressed as a fixed offset of 540 minutes (+09:00).
DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

# Raw dataset directory and the files/folders read from it.
PATH_DATA = 'data/D'
PATH_ESM = os.path.join(PATH_DATA, 'EsmResponse.csv')
PATH_PARTICIPANT = os.path.join(PATH_DATA, 'UserInfo.csv')
PATH_SENSOR = os.path.join(PATH_DATA, 'Sensor')

# Directory holding intermediate artifacts (e.g. stress.pkl, similar_user_11.csv).
PATH_INTERMEDIATE = os.path.join('data', 'intermediate')
# Output directory for the exported .data files. The default is machine-specific,
# so it can be overridden with the PATH_SAVE environment variable.
PATH_SAVE = os.environ.get('PATH_SAVE', '/home/user/Collab/MT/11_cluster/l_data')

# Maps each data-stream name to a three-letter uppercase abbreviation.
# NOTE(review): the abbreviations look like the prefixes of the feature column
# names (e.g. 'MED' in 'MED_VID#...'); this dict is not referenced in the
# visible cells, so confirm against the feature-extraction code.
DATA_TYPES = {
    'Acceleration': 'ACC',
    'AmbientLight': 'AML',
    'Calorie': 'CAL',
    'Distance': 'DST',
    'EDA': 'EDA',
    'HR': 'HRT',
    'RRI': 'RRI',
    'SkinTemperature': 'SKT',
    'StepCount': 'STP',
    'UltraViolet': 'ULV',
    'ActivityEvent': 'ACE',
    'ActivityTransition': 'ACT',
    'AppUsageEvent': 'APP',
    'BatteryEvent': 'BAT',
    'CallEvent': 'CAE',
    'Connectivity': 'CON',
    'DataTraffic': 'DAT',
    'InstalledApp': 'INS',
    'Location': 'LOC',
    'MediaEvent': 'MED',
    'MessageEvent': 'MSG',
    'WiFi': 'WIF',
    'ScreenEvent': 'SCR',
    'RingerModeEvent': 'RNG',
    'ChargeEvent': 'CHG',
    'PowerSaveEvent': 'PWS',
    'OnOffEvent': 'ONF'
}

In [139]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    """Read the file at ``path`` in binary mode and return the object cloudpickle restores from it.

    Unpickling executes code embedded in the file, so only use this on files
    produced by this project.
    """
    with open(path, 'rb') as stream:
        restored = cloudpickle.load(stream)
    return restored

    
def dump(obj, path: str):
    """Serialize ``obj`` with cloudpickle into the file at ``path``, overwriting it if it exists."""
    with open(path, 'wb') as stream:
        cloudpickle.dump(obj, stream)
        
    
def log(msg: object):
    """Print ``msg`` prefixed with the current local time as ``[yy-mm-dd HH:MM:SS]``.

    ``msg`` may be any object; it is rendered with ``str.format``.
    """
    print('[{}] {}'.format(datetime.now().strftime('%y-%m-%d %H:%M:%S'), msg))


def summary(x):
    """Return a dict of descriptive statistics for a 1-D collection of values.

    Parameters:
        x: array-like; converted with ``np.asarray``.

    Returns:
        For non-numeric or boolean data:
            ``{'n', 'cardinality', 'value_count'}`` where ``value_count`` lists
            the 20 most frequent values as ``value:count`` pairs, followed by
            ``', ...'`` when further distinct values were left out.
        For numeric data:
            ``{'n', 'sum', 'mean', 'SD', 'med', 'range', 'conf.', 'nan_count'}``.
            Every statistic except ``n`` and ``nan_count`` is computed over the
            non-NaN values only. ``SD`` uses ``ddof=1`` and ``conf.`` is the 95%
            t-interval around the mean.
    """
    max_shown = 20  # number of distinct values listed in 'value_count'

    x = np.asarray(x)
    with warnings.catch_warnings():
        # Silence numpy/scipy warnings raised while computing the statistics.
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b':
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            shown = cnt.head(max_shown)
            cnt_str = ', '.join([f'{u}:{c}' for u, c in zip(shown.index, shown)])
            # Mark the listing as truncated whenever any value was dropped
            # (previously values 21..30 were dropped without an ellipsis).
            if card > len(shown):
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }
        else:
            is_nan = np.isnan(x)
            x_norm = x[~is_nan]

            tot = np.sum(x_norm)
            m = np.mean(x_norm)
            me = np.median(x_norm)
            s = np.std(x_norm, ddof=1)
            # Both bounds come from the NaN-free values; taking the max over
            # the raw array would return NaN as soon as one NaN is present.
            l, u = np.min(x_norm), np.max(x_norm)
            conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))
            n_nan = int(is_nan.sum())

            return {
                'n': n,
                'sum': tot,
                'mean': m,
                'SD': s,
                'med': me,
                'range': (l, u),
                'conf.': (conf_l, conf_u),
                'nan_count': n_nan
            }


@contextmanager
def on_ray(*args, **kwargs):
    """Context manager that runs its body inside a freshly initialized Ray session.

    An already-initialized Ray is shut down first, then ``ray.init`` is called
    with the given arguments. ``ray.shutdown`` is always called on exit, even
    if initialization or the body raises.
    """
    try:
        already_running = ray.is_initialized()
        if already_running:
            ray.shutdown()
        ray.init(*args, **kwargs)
        yield
    finally:
        ray.shutdown()

# Collapses fine-grained app category names into seven coarse groups:
# ENTER, INFO, SYSTEM, SOCIAL, WORK, HEALTH and UNKNOWN.
# NOTE(review): 'GAME-ACTION' (hyphen) sits next to 'GAME_ACTION'; presumably
# both spellings occur in the raw data -- confirm before removing either.
transform = {
    # Entertainment
    'GAME': 'ENTER',
    'GAME_TRIVIA': 'ENTER',
    'GAME_CASINO': 'ENTER',
    'GAME-ACTION': 'ENTER',
    'GAME_SPORTS': 'ENTER',
    'GAME_PUZZLE': 'ENTER',
    'GAME_SIMULATION': 'ENTER',
    'GAME_STRATEGY': 'ENTER',
    'GAME_ROLE_PLAYING': 'ENTER',
    'GAME_ACTION': 'ENTER',
    'GAME_ARCADE': 'ENTER',
    'GAME_RACING': 'ENTER',
    'GAME_CASUAL': 'ENTER',
    'GAME_MUSIC': 'ENTER',
    'GAME_CARD': 'ENTER',
    'GAME_ADVENTURE': 'ENTER',
    'GAME_BOARD': 'ENTER',
    'GAME_EDUCATIONAL': 'ENTER',
    'PHOTOGRAPHY': 'ENTER',
    'ENTERTAINMENT': 'ENTER',
    'SPORTS': 'ENTER',
    'MUSIC_AND_AUDIO': 'ENTER',
    'COMICS': 'ENTER',
    'VIDEO_PLAYERS_AND_EDITORS': 'ENTER',
    'VIDEO_PLAYERS': 'ENTER',
    'ART_AND_DESIGN': 'ENTER',
    # Information
    'TRAVEL_AND_LOCAL': 'INFO',
    'FOOD_AND_DRINK': 'INFO',
    'NEWS_AND_MAGAZINES': 'INFO',
    'MAPS_AND_NAVIGATION': 'INFO',
    'WEATHER': 'INFO',
    'HOUSE_AND_HOME': 'INFO',
    'BOOKS_AND_REFERENCE': 'INFO',
    'SHOPPING': 'INFO',
    'LIBRARIES_AND_DEMO': 'INFO',
    'BEAUTY': 'INFO',
    'AUTO_AND_VEHICLES': 'INFO',
    'LIFESTYLE': 'INFO',
    # System
    'PERSONALIZATION': 'SYSTEM',
    'TOOLS': 'SYSTEM',
    # Social
    'COMMUNICATION': 'SOCIAL',
    'SOCIAL': 'SOCIAL',
    'DATING': 'SOCIAL',
    'PARENTING': 'SOCIAL',
    # Work
    'FINANCE': 'WORK',
    'BUSINESS': 'WORK',
    'PRODUCTIVITY': 'WORK',
    'EDUCATION': 'WORK',
    # Health
    'HEALTH_AND_FITNESS': 'HEALTH',
    'MEDICAL': 'HEALTH',
    # Already-coarse or missing categories
    'SYSTEM': 'SYSTEM',
    'MISC': 'SYSTEM',  # ABC logger
    None: 'UNKNOWN',
    'UNKNOWN': 'UNKNOWN'
}

In [140]:
# Load the pickled stress dataset from the intermediate directory.
# The pickle unpacks into five objects: X (feature table), y (label array),
# groups (participant code per row), plus t and datetimes.
# NOTE(review): t and datetimes are not used in the visible cells -- their
# exact contents should be confirmed against the code that wrote stress.pkl.
p = os.path.join(PATH_INTERMEDIATE, 'stress.pkl')
X, y, groups, t, datetimes = load(p)

In [141]:
X

Unnamed: 0,PIF#AGE,PIF#BFI_OPN,PIF#BFI_CON,PIF#BFI_NEU,PIF#BFI_EXT,PIF#BFI_AGR,PIF#PSS10,PIF#PHQ9,PIF#GHQ12,PIF#GEN=F,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
y.size

2619

In [143]:
print(list(groups))

['P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05'

In [144]:
# Per-participant table with a `pcode`, five BFI_* score columns, and a
# precomputed `cluster` assignment used to group participants below.
similar_user = pd.read_csv(os.path.join(PATH_INTERMEDIATE,  'similar_user_11.csv'))
similar_user

Unnamed: 0,pcode,BFI_OPN,BFI_CON,BFI_NEU,BFI_EXT,BFI_AGR,cluster
0,P01,11,11,3,4,13,6
1,P02,14,5,12,14,5,5
2,P03,10,15,8,7,11,4
3,P05,10,11,13,10,6,1
4,P06,3,6,11,3,6,1
5,P08,10,8,9,9,12,3
6,P09,12,12,4,11,9,9
7,P10,6,7,9,9,11,8
8,P12,9,12,7,7,12,4
9,P13,5,12,3,12,13,0


In [145]:
def _columns_matching(frame, *needles, exclude=False):
    """Select the columns of ``frame`` whose name contains any of ``needles``.

    With ``exclude=True`` the selection is inverted: only columns whose name
    contains none of ``needles`` are kept.
    """
    keep = [
        any(needle in str(name) for needle in needles) != exclude
        for name in frame.keys()
    ]
    return frame.loc[:, keep]


# Feature groups, selected by substring of the column name.
feat_current = _columns_matching(X, '#VAL', 'ESM#LastLabel')
feat_dsc = _columns_matching(X, '#DSC')
feat_yesterday = _columns_matching(X, 'Yesterday')
feat_today = _columns_matching(X, 'Today')
feat_sleep = _columns_matching(X, 'Sleep')
feat_time = _columns_matching(X, 'Time')
feat_pif = _columns_matching(X, 'PIF')
feat_ImmediatePast = _columns_matching(X, 'ImmediatePast')

# Split each time-window group into sensor columns and past-stress-label (ESM) columns.
feat_current_sensor = _columns_matching(X, '#VAL')
feat_current_ESM = _columns_matching(X, 'ESM#LastLabel')
feat_ImmediatePast_sensor = _columns_matching(feat_ImmediatePast, 'ESM', exclude=True)
feat_ImmediatePast_ESM = _columns_matching(feat_ImmediatePast, 'ESM')
feat_today_sensor = _columns_matching(feat_today, 'ESM', exclude=True)
feat_today_ESM = _columns_matching(feat_today, 'ESM')
feat_yesterday_sensor = _columns_matching(feat_yesterday, 'ESM', exclude=True)
feat_yesterday_ESM = _columns_matching(feat_yesterday, 'ESM')

# Final feature set: time, #DSC, current sensor, immediate-past sensor and today's sensor columns.
feat_baseline = pd.concat(
    [feat_time, feat_dsc, feat_current_sensor, feat_ImmediatePast_sensor],
    axis=1,
)
feat_final = pd.concat([feat_baseline, feat_today_sensor], axis=1)
# NOTE: X is rebound to the reduced frame, so re-running this cell without
# reloading the data selects from the already-reduced columns.
X = feat_final

In [146]:
X

Unnamed: 0,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,Time#HRN=DAWN,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,False,False,False,False,False,False,True,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
import pandas as pd

# Lookup table: participant code -> cluster label.
# Codes are cast to str on both sides so the lookup keys compare equal.
similar_user['pcode'] = similar_user['pcode'].astype(str)
cluster_map = similar_user.set_index('pcode')['cluster']

# One row per sample holding the participant code that produced it ...
groups_df = pd.DataFrame(groups, columns=['pcode'])
groups_df['pcode'] = groups_df['pcode'].astype(str)

# ... and the cluster that participant belongs to.
groups_df['cluster'] = groups_df['pcode'].map(cluster_map)

In [148]:
print(groups_df)

     pcode  cluster
0      P01        6
1      P01        6
2      P01        6
3      P01        6
4      P01        6
...    ...      ...
2614   P80        6
2615   P80        6
2616   P80        6
2617   P80        6
2618   P80        6

[2619 rows x 2 columns]


In [149]:
# Add the 'cluster' column from 'groups_df' to 'X' as the first column.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is refreshed in place instead.
if 'cluster' in X.columns:
    X['cluster'] = groups_df['cluster']
else:
    X.insert(0, 'cluster', groups_df['cluster'])

In [150]:
X

Unnamed: 0,cluster,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,6,False,False,False,False,False,False,True,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
y

array([1, 1, 0, ..., 1, 0, 1])

In [152]:
X

Unnamed: 0,cluster,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,6,False,False,False,False,False,False,True,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,6,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
import pandas as pd
import numpy as np

# Partition the feature table X, the label array y and groups_df by cluster id.

# Cluster ids in order of first appearance in X
clusters = X['cluster'].unique()

# Per-cluster containers, keyed by cluster id
cluster_dfs = {}
cluster_labels = {}
cluster_groups = {}

for cluster in clusters:
    # Boolean row mask selecting the samples that belong to this cluster
    in_cluster = X['cluster'] == cluster

    cluster_dfs[cluster] = X[in_cluster]
    cluster_labels[cluster] = y[in_cluster]
    cluster_groups[cluster] = groups_df[in_cluster]

# The per-cluster pieces are now available as:
# cluster_dfs[cluster_number]     -> feature rows (DataFrame)
# cluster_labels[cluster_number]  -> labels (numpy array)
# cluster_groups[cluster_number]  -> pcode/cluster rows (DataFrame)

In [154]:
cluster_dfs[0]


Unnamed: 0,cluster,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
451,0,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
452,0,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
453,0,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
454,0,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
455,0,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004,0,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2005,0,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2006,0,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2007,0,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [155]:
cluster_labels[2].size

287

In [156]:
cluster_groups[0]

Unnamed: 0,pcode,cluster
451,P13,0
452,P13,0
453,P13,0
454,P13,0
455,P13,0
...,...,...
2004,P61,0
2005,P61,0
2006,P61,0
2007,P61,0


In [157]:
# Group the participant table by cluster id: cluster_similarity[c] holds the
# rows of similar_user whose 'cluster' equals c.
clusters = similar_user['cluster'].unique()

cluster_similarity = {}

for cluster in clusters:
    members = similar_user['cluster'] == cluster
    cluster_similarity[cluster] = similar_user.loc[members]


In [158]:
cluster_similarity[2]

Unnamed: 0,pcode,BFI_OPN,BFI_CON,BFI_NEU,BFI_EXT,BFI_AGR,cluster
21,P39,12,11,7,8,7,2
24,P45,11,9,6,10,6,2
27,P49,11,11,9,10,8,2
39,P70,12,10,8,8,7,2
41,P75,12,12,4,5,7,2


In [159]:
# Peek at one randomly drawn participant row from cluster 0.
# No random_state is passed to sample(), so the printed row can differ
# between runs.
random_element = cluster_similarity[0].sample().iloc[0]
print(random_element)

pcode      P23
BFI_OPN     13
BFI_CON     12
BFI_NEU      6
BFI_EXT      9
BFI_AGR     14
cluster      0
Name: 13, dtype: object


In [160]:
def process_dataframe(X):
    """
    Return a processed copy of the DataFrame 'X':
    1. Boolean values are replaced with 1 for True and 0 for False.
    2. Column names are replaced by a range from 0 to the number of columns.

    The input DataFrame is left untouched; previously its column labels were
    overwritten in place as a side effect.

    Parameters:
        X (pandas.DataFrame): The input DataFrame.

    Returns:
        pandas.DataFrame: The processed DataFrame.
    """
    # Multiplying by 1 turns booleans into integers and yields a new frame,
    # so the renaming below never touches the caller's object.
    processed = X * 1

    # Set column names as a range from 0 to the number of columns
    processed.columns = range(processed.shape[1])

    return processed

def save_data_to_data_file(X, y, filename, save_dir=None):
    """Write features and labels to a text file, one line per row.

    Each line is the row's label followed by space-separated ``column:value``
    pairs, one pair per column of ``X`` in column order.

    Parameters:
        X (pandas.DataFrame): Feature rows.
        y (pandas.DataFrame): Labels; the first column is used and it must
            have the same number of rows as ``X``.
        filename (str): Name of the file to create inside the output directory.
        save_dir (str, optional): Output directory. Defaults to ``PATH_SAVE``.

    Raises:
        ValueError: If ``X`` and ``y`` have a different number of rows.
    """
    if save_dir is None:
        save_dir = PATH_SAVE
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(save_dir, exist_ok=True)

    file_path = os.path.join(save_dir, filename)

    labels = y.iloc[:, 0].to_numpy()
    if len(labels) != len(X):
        raise ValueError(
            'X and y must have the same number of rows (got {} and {})'.format(len(X), len(labels))
        )

    # Pull every column out once (by position, so the lookup does not depend
    # on the labels); indexing the arrays is far cheaper than a pandas
    # .iloc lookup per cell and yields the same scalars.
    columns = [(name, X.iloc[:, pos].to_numpy()) for pos, name in enumerate(X.columns)]

    with open(file_path, 'w') as f:
        for i in range(len(X)):
            pairs = ''.join(" {}:{}".format(name, values[i]) for name, values in columns)
            f.write(str(labels[i]) + pairs + '\n')

In [162]:
import os
import pandas as pd
import numpy as np

def split_train_test(df, labels, indices):
    """Split features and labels into train and test parts by index label.

    Rows whose index label is in ``indices`` form the test part; every other
    row forms the train part.

    Returns:
        tuple: ``(train_X, train_y, test_X, test_y)``.
    """
    held_out = (df.loc[indices], labels.loc[indices])
    remaining = (df.drop(indices), labels.drop(indices))
    return (*remaining, *held_out)
# Build two train/validation splits ("turns"). In every turn one participant
# per cluster is drawn at random and all of that participant's samples become
# the validation data of the cluster; the remaining samples go to training.
# Files written per turn:
#   {turn}_val_cluster_{cluster}.data  validation rows of a single cluster
#   {turn}_train.data                  training rows of all clusters
#   {turn}_val.data                    validation rows of all clusters
random_seed = 0
for turn in range(2):
    train_X_parts = []
    train_y_parts = []
    test_X_parts = []
    test_y_parts = []
    random_seed = random_seed + 25
    # The loop variable is deliberately not named `y`: that name holds the
    # global label array, and overwriting it breaks any re-run of the cells
    # above. Cluster ids come from the data instead of a hard-coded range.
    for cluster_id in sorted(cluster_similarity):
        # Select 1 person from this cluster randomly (seeded, so reproducible)
        random_element = cluster_similarity[cluster_id].sample(random_state=random_seed).iloc[0]
        random_seed = random_seed + 100

        # Index labels of every sample recorded by the selected participant
        members = cluster_groups[cluster_id]
        matching_indices = members.loc[members['pcode'] == random_element['pcode']].index.tolist()

        # Wrap the label array in a DataFrame that shares the feature index,
        # so features and labels can be split by the same index labels
        labels_df = pd.DataFrame(cluster_labels[cluster_id], index=cluster_dfs[cluster_id].index)
        train_X, train_y, test_X, test_y = split_train_test(cluster_dfs[cluster_id],
                                                            labels_df,
                                                            matching_indices)
        test_X = process_dataframe(test_X)
        train_X = process_dataframe(train_X)
        train_X_parts.append(train_X)
        test_X_parts.append(test_X)
        train_y_parts.append(train_y)
        test_y_parts.append(test_y)
        save_data_to_data_file(test_X, test_y, f'{turn}_val_cluster_{cluster_id}.data')
    all_train_X = pd.concat(train_X_parts)
    all_train_y = pd.concat(train_y_parts)
    all_test_X = pd.concat(test_X_parts)
    all_test_y = pd.concat(test_y_parts)
    save_data_to_data_file(all_train_X, all_train_y, f'{turn}_train.data')
    save_data_to_data_file(all_test_X, all_test_y, f'{turn}_val.data')