MT Data modeling

In [25]:
import pytz
import os


DEFAULT_TZ = pytz.FixedOffset(540)  # GMT+09:00; Asia/Seoul

# Locations of the raw inputs; every raw path below is derived from PATH_DATA.
PATH_DATA = 'data/D'
PATH_ESM, PATH_PARTICIPANT, PATH_SENSOR = (
    os.path.join(PATH_DATA, leaf)
    for leaf in ('EsmResponse.csv', 'UserInfo.csv', 'Sensor')
)

# Directory holding intermediate artefacts (read by later cells).
PATH_INTERMEDIATE = 'data/intermediate'

# Maps each data-type name to its three-letter abbreviation.
DATA_TYPES = dict(
    Acceleration='ACC',
    AmbientLight='AML',
    Calorie='CAL',
    Distance='DST',
    EDA='EDA',
    HR='HRT',
    RRI='RRI',
    SkinTemperature='SKT',
    StepCount='STP',
    UltraViolet='ULV',
    ActivityEvent='ACE',
    ActivityTransition='ACT',
    AppUsageEvent='APP',
    BatteryEvent='BAT',
    CallEvent='CAE',
    Connectivity='CON',
    DataTraffic='DAT',
    InstalledApp='INS',
    Location='LOC',
    MediaEvent='MED',
    MessageEvent='MSG',
    WiFi='WIF',
    ScreenEvent='SCR',
    RingerModeEvent='RNG',
    ChargeEvent='CHG',
    PowerSaveEvent='PWS',
    OnOffEvent='ONF',
)

In [26]:
import pandas as pd
import numpy as np
import scipy.stats as st
import cloudpickle
import ray
from datetime import datetime
from contextlib import contextmanager
import warnings
import time


def load(path: str):
    """Read the file at `path` in binary mode and return the object cloudpickle restores from it.

    NOTE(review): unpickling executes code embedded in the file, so only
    open files that come from a trusted source.
    """
    with open(path, 'rb') as stream:
        restored = cloudpickle.load(stream)
    return restored

    
def dump(obj, path: str):
    """Serialize `obj` with cloudpickle into the file at `path` (opened for binary writing, so existing content is replaced)."""
    with open(path, 'wb') as stream:
        cloudpickle.dump(obj, stream)
        
    
def log(msg: object):
    """Print `msg` prefixed with the current local time as ``[yy-mm-dd HH:MM:SS]``.

    `msg` may be any object; it is rendered through ``str.format``.
    (The annotation used to be the builtin function ``any``, which is not a type.)
    """
    stamp = datetime.now().strftime('%y-%m-%d %H:%M:%S')
    print('[{}] {}'.format(stamp, msg))


def summary(x):
    """Return a dict of descriptive statistics for the 1-D array-like `x`.

    Categorical input (any dtype whose ``np.dtype.kind`` is an uppercase
    letter, plus booleans) yields:
        'n'           -- number of elements
        'cardinality' -- number of distinct values (NaN counted as a value)
        'value_count' -- 'value:count' pairs for the 20 most frequent values,
                         followed by ', ...' when values had to be left out

    Numeric input yields 'n', 'sum', 'mean', 'SD' (ddof=1), 'med', 'range'
    (min, max), 'conf.' (95% t-interval of the mean) and 'nan_count'.
    All statistics are computed on the non-NaN values only. When there are
    no non-NaN values the sum is 0.0 and the remaining statistics are NaN.

    Warnings raised while computing the statistics are suppressed.
    """
    max_listed = 20  # number of value:count pairs spelled out for categorical data

    x = np.asarray(x)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        n = len(x)
        # Here, uppercase np.dtype.kind corresponds to non-numeric data.
        # Also, we view the boolean data as dichotomous categorical data.
        if x.dtype.kind.isupper() or x.dtype.kind == 'b':
            cnt = pd.Series(x).value_counts(dropna=False)
            card = len(cnt)
            listed = cnt.iloc[:max_listed]
            cnt_str = ', '.join(f'{u}:{c}' for u, c in zip(listed.index, listed))
            # Signal truncation whenever any value was dropped (the previous
            # threshold of 30 hid the truncation for 21-30 distinct values).
            if card > len(listed):
                cnt_str = f'{cnt_str}, ...'
            return {
                'n': n,
                'cardinality': card,
                'value_count': cnt_str
            }

        nan_mask = np.isnan(x)
        x_norm = x[~nan_mask]
        n_nan = int(np.count_nonzero(nan_mask))

        if x_norm.size == 0:
            # np.min / np.max raise on empty input; report NaN statistics instead.
            nan = float('nan')
            return {
                'n': n,
                'sum': 0.0,
                'mean': nan,
                'SD': nan,
                'med': nan,
                'range': (nan, nan),
                'conf.': (nan, nan),
                'nan_count': n_nan
            }

        tot = np.sum(x_norm)
        m = np.mean(x_norm)
        me = np.median(x_norm)
        s = np.std(x_norm, ddof=1)
        # Both bounds come from the NaN-free values; taking the max over the
        # raw array turned the upper bound into NaN whenever a NaN was present.
        l, u = np.min(x_norm), np.max(x_norm)
        conf_l, conf_u = st.t.interval(0.95, len(x_norm) - 1, loc=m, scale=st.sem(x_norm))

        return {
            'n': n,
            'sum': tot,
            'mean': m,
            'SD': s,
            'med': me,
            'range': (l, u),
            'conf.': (conf_l, conf_u),
            'nan_count': n_nan
        }


@contextmanager
def on_ray(*args, **kwargs):
    """Context manager that runs its body inside a freshly initialised Ray session.

    All positional and keyword arguments are forwarded to ``ray.init``.
    A session that is already initialised is shut down first, and
    ``ray.shutdown()`` is always called when the block exits, including
    when it exits through an exception.
    """
    try:
        # Restart from scratch so the arguments given here take effect.
        if ray.is_initialized():
            ray.shutdown()
        ray.init(*args, **kwargs)
        yield
    finally:
        ray.shutdown()

# Maps a fine-grained app category label to one of the coarse groups
# ENTER / INFO / SYSTEM / SOCIAL / WORK / HEALTH / UNKNOWN.
# The literal previously listed 'GAME_RACING' twice; the redundant entry was
# removed (the resulting dict, including key order, is unchanged).
transform = {
    'GAME': 'ENTER',
    'GAME_TRIVIA': 'ENTER',
    'GAME_CASINO': 'ENTER',
    # NOTE(review): hyphenated spelling kept as-is next to 'GAME_ACTION' below;
    # confirm whether it occurs in the data or is a typo.
    'GAME-ACTION': 'ENTER',
    'GAME_SPORTS': 'ENTER',
    'GAME_PUZZLE': 'ENTER',
    'GAME_SIMULATION': 'ENTER',
    'GAME_STRATEGY': 'ENTER',
    'GAME_ROLE_PLAYING': 'ENTER',
    'GAME_ACTION': 'ENTER',
    'GAME_ARCADE': 'ENTER',
    'GAME_RACING': 'ENTER',
    'GAME_CASUAL': 'ENTER',
    'GAME_MUSIC': 'ENTER',
    'GAME_CARD': 'ENTER',
    'GAME_ADVENTURE': 'ENTER',
    'GAME_BOARD': 'ENTER',
    'GAME_EDUCATIONAL': 'ENTER',
    'PHOTOGRAPHY': 'ENTER',
    'ENTERTAINMENT': 'ENTER',
    'SPORTS': 'ENTER',
    'MUSIC_AND_AUDIO': 'ENTER',
    'COMICS': 'ENTER',
    'VIDEO_PLAYERS_AND_EDITORS': 'ENTER',
    'VIDEO_PLAYERS': 'ENTER',
    'ART_AND_DESIGN': 'ENTER',
    'TRAVEL_AND_LOCAL': 'INFO',
    'FOOD_AND_DRINK': 'INFO',
    'NEWS_AND_MAGAZINES': 'INFO',
    'MAPS_AND_NAVIGATION': 'INFO',
    'WEATHER': 'INFO',
    'HOUSE_AND_HOME': 'INFO',
    'BOOKS_AND_REFERENCE': 'INFO',
    'SHOPPING': 'INFO',
    'LIBRARIES_AND_DEMO': 'INFO',
    'BEAUTY': 'INFO',
    'AUTO_AND_VEHICLES': 'INFO',
    'LIFESTYLE': 'INFO',
    'PERSONALIZATION': 'SYSTEM',
    'TOOLS': 'SYSTEM',
    'COMMUNICATION': 'SOCIAL',
    'SOCIAL': 'SOCIAL',
    'DATING': 'SOCIAL',
    'PARENTING': 'SOCIAL',
    'FINANCE': 'WORK',
    'BUSINESS': 'WORK',
    'PRODUCTIVITY': 'WORK',
    'EDUCATION': 'WORK',
    'HEALTH_AND_FITNESS': 'HEALTH',
    'MEDICAL': 'HEALTH',
    'SYSTEM': 'SYSTEM',
    'MISC': 'SYSTEM',  # ABC logger
    None: 'UNKNOWN',
    'UNKNOWN': 'UNKNOWN'
}

In [27]:
# Load the cached bundle from the intermediate directory. The pickle holds a
# 5-item sequence that is unpacked into X, y, groups, t and datetimes.
# NOTE(review): load() unpickles the file, so 'stress.pkl' must come from a trusted source.
p = os.path.join(PATH_INTERMEDIATE, 'stress.pkl')
X, y, groups, t, datetimes = load(p)

In [28]:
X

Unnamed: 0,PIF#AGE,PIF#BFI_OPN,PIF#BFI_CON,PIF#BFI_NEU,PIF#BFI_EXT,PIF#BFI_AGR,PIF#PSS10,PIF#PHQ9,PIF#GHQ12,PIF#GEN=F,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27.0,11.0,11.0,3.0,4.0,13.0,13.0,0.0,1.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,25.0,13.0,7.0,5.0,4.0,12.0,29.0,15.0,15.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
print(list(X))

['PIF#AGE', 'PIF#BFI_OPN', 'PIF#BFI_CON', 'PIF#BFI_NEU', 'PIF#BFI_EXT', 'PIF#BFI_AGR', 'PIF#PSS10', 'PIF#PHQ9', 'PIF#GHQ12', 'PIF#GEN=F', 'PIF#GEN=M', 'ACC_AXX#VAL', 'ACC_AXX#AVG_ImmediatePast', 'ACC_AXX#STD_ImmediatePast', 'ACC_AXX#SKW_ImmediatePast', 'ACC_AXX#KUR_ImmediatePast', 'ACC_AXX#ASC_ImmediatePast', 'ACC_AXX#BEP_ImmediatePast', 'ACC_AXX#MED_ImmediatePast', 'ACC_AXX#TSC_ImmediatePast', 'ACC_AXX#AVGYesterdayDawn', 'ACC_AXX#STDYesterdayDawn', 'ACC_AXX#SKWYesterdayDawn', 'ACC_AXX#KURYesterdayDawn', 'ACC_AXX#ASCYesterdayDawn', 'ACC_AXX#BEPYesterdayDawn', 'ACC_AXX#MEDYesterdayDawn', 'ACC_AXX#TSCYesterdayDawn', 'ACC_AXX#AVGYesterdayMorning', 'ACC_AXX#STDYesterdayMorning', 'ACC_AXX#SKWYesterdayMorning', 'ACC_AXX#KURYesterdayMorning', 'ACC_AXX#ASCYesterdayMorning', 'ACC_AXX#BEPYesterdayMorning', 'ACC_AXX#MEDYesterdayMorning', 'ACC_AXX#TSCYesterdayMorning', 'ACC_AXX#AVGYesterdayAfternoon', 'ACC_AXX#STDYesterdayAfternoon', 'ACC_AXX#SKWYesterdayAfternoon', 'ACC_AXX#KURYesterdayAfternoon'

In [30]:
y.size

2619

In [31]:
print(list(groups))

['P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P01', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P02', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P03', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05', 'P05'

In [32]:
groups.size

2619

In [33]:
datetimes

array([Timestamp('2019-05-08 10:29:46+0900', tz='pytz.FixedOffset(540)'),
       Timestamp('2019-05-08 11:16:12+0900', tz='pytz.FixedOffset(540)'),
       Timestamp('2019-05-08 15:58:22+0900', tz='pytz.FixedOffset(540)'),
       ...,
       Timestamp('2019-05-06 15:55:56+0900', tz='pytz.FixedOffset(540)'),
       Timestamp('2019-05-06 19:43:21+0900', tz='pytz.FixedOffset(540)'),
       Timestamp('2019-05-06 21:11:41+0900', tz='pytz.FixedOffset(540)')],
      dtype=object)

In [34]:
# Read the per-participant table (one row per pcode, with a 'cluster' column
# that later cells map onto the samples) and display it.
similar_user = pd.read_csv(os.path.join(PATH_INTERMEDIATE,  'similar_user.csv'))
similar_user

Unnamed: 0,pcode,BFI_OPN,BFI_CON,BFI_NEU,BFI_EXT,BFI_AGR,cluster
0,P01,11,11,3,4,13,5
1,P02,14,5,12,14,5,2
2,P03,10,15,8,7,11,1
3,P05,10,11,13,10,6,3
4,P06,3,6,11,3,6,3
5,P08,10,8,9,9,12,6
6,P09,12,12,4,11,9,0
7,P10,6,7,9,9,11,4
8,P12,9,12,7,7,12,1
9,P13,5,12,3,12,13,1


In [35]:
        def _columns_matching(frame, *needles, keep_matches=True):
            """Select the columns of `frame` by substring match on their stringified labels.

            A column matches when its label contains at least one of `needles`.
            With keep_matches=True the matching columns are returned, otherwise
            the non-matching ones. Column order is preserved.
            """
            mask = [
                any(needle in str(label) for needle in needles) == keep_matches
                for label in frame.keys()
            ]
            return frame.loc[:, mask]

        # Feature families, picked out by a marker substring in the column name.
        feat_current = _columns_matching(X, '#VAL', 'ESM#LastLabel')
        feat_dsc = _columns_matching(X, '#DSC')
        feat_yesterday = _columns_matching(X, 'Yesterday')
        feat_today = _columns_matching(X, 'Today')
        feat_sleep = _columns_matching(X, 'Sleep')
        feat_time = _columns_matching(X, 'Time')
        feat_pif = _columns_matching(X, 'PIF')
        feat_ImmediatePast = _columns_matching(X, 'ImmediatePast')

        # Divide the time-window features into sensor columns and ESM columns
        # (a column counts as ESM when its name contains 'ESM').
        feat_current_sensor = _columns_matching(X, '#VAL')
        feat_current_ESM = _columns_matching(X, 'ESM#LastLabel')
        feat_ImmediatePast_sensor = _columns_matching(feat_ImmediatePast, 'ESM', keep_matches=False)
        feat_ImmediatePast_ESM = _columns_matching(feat_ImmediatePast, 'ESM')
        feat_today_sensor = _columns_matching(feat_today, 'ESM', keep_matches=False)
        feat_today_ESM = _columns_matching(feat_today, 'ESM')
        feat_yesterday_sensor = _columns_matching(feat_yesterday, 'ESM', keep_matches=False)
        feat_yesterday_ESM = _columns_matching(feat_yesterday, 'ESM')

        # Prepare the final feature set: time + '#DSC' + current sensor values
        # + immediate-past sensor windows, extended with today's sensor windows.
        feat_baseline = pd.concat(
            [feat_time, feat_dsc, feat_current_sensor, feat_ImmediatePast_sensor],
            axis=1,
        )
        feat_final = pd.concat([feat_baseline, feat_today_sensor], axis=1)
        # From here on X refers to the reduced feature set (same object as feat_final).
        X = feat_final

In [36]:
X

Unnamed: 0,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,Time#HRN=DAWN,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,False,True,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,False,False,False,False,False,False,True,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,True,False,False,False,False,False,False,False,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
print(type(groups))

<class 'numpy.ndarray'>


In [38]:
import pandas as pd

# Lookup Series: participant code (as str) -> cluster label.
# The 'pcode' column of similar_user is normalised to str in place so that it
# compares equal to the codes taken from `groups`.
similar_user['pcode'] = similar_user['pcode'].astype(str)
cluster_map = similar_user.set_index('pcode')['cluster']

# One row per sample: the participant code (as str) and that participant's
# cluster. Codes absent from cluster_map end up with a missing cluster value.
groups_df = pd.DataFrame(groups, columns=['pcode']).astype({'pcode': str})
groups_df = groups_df.assign(cluster=groups_df['pcode'].map(cluster_map))


In [39]:
print(groups_df)

     pcode  cluster
0      P01        5
1      P01        5
2      P01        5
3      P01        5
4      P01        5
...    ...      ...
2614   P80        5
2615   P80        5
2616   P80        5
2617   P80        5
2618   P80        5

[2619 rows x 2 columns]


In [40]:
# Add the 'cluster' column from 'groups_df' to 'X' as the first column.
# DataFrame.insert mutates X in place and raises ValueError when the column
# already exists, so the guard keeps this cell safe to re-run while the column
# still carries its name. Values are aligned on the row index.
if 'cluster' not in X.columns:
    X.insert(0, 'cluster', groups_df['cluster'])

In [41]:
X

Unnamed: 0,cluster,Time#DOW=MON,Time#DOW=TUE,Time#DOW=WED,Time#DOW=THU,Time#DOW=FRI,Time#DOW=SAT,Time#DOW=SUN,Time#WKD=Y,Time#WKD=N,...,MED_VID#MEDTodayEvening,MED_VID#TSCTodayEvening,MED_VID#AVGTodayNight,MED_VID#STDTodayNight,MED_VID#SKWTodayNight,MED_VID#KURTodayNight,MED_VID#ASCTodayNight,MED_VID#BEPTodayNight,MED_VID#MEDTodayNight,MED_VID#TSCTodayNight
0,5,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,False,False,True,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,5,False,False,False,False,False,False,True,True,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,5,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,5,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,5,True,False,False,False,False,False,False,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
import pandas as pd

# Replace the column labels with their positions 0..n_columns-1, in place.
# After this step column 0 is the 'cluster' column inserted earlier.
X.columns = pd.RangeIndex(X.shape[1])

# Multiplying by 1 turns boolean columns into integers (True -> 1, False -> 0);
# X is rebound to the resulting new frame.
X = X.mul(1)

In [43]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3400,3401,3402,3403,3404,3405,3406,3407,3408,3409
0,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,5,0,0,0,0,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Print "<column label>: <dtype>" for every column of X, in column order.
for col, col_dtype in X.dtypes.items():
    print(f"{col}: {col_dtype}")


0: int64
1: int64
2: int64
3: int64
4: int64
5: int64
6: int64
7: int64
8: int64
9: int64
10: int64
11: int64
12: int64
13: int64
14: int64
15: int64
16: int64
17: float32
18: float32
19: float32
20: float32
21: float32
22: float32
23: float32
24: float32
25: float32
26: float32
27: float32
28: float32
29: float32
30: float32
31: float32
32: float32
33: float32
34: float32
35: float32
36: float32
37: float32
38: float32
39: float32
40: float32
41: float32
42: float32
43: float32
44: float32
45: float32
46: float32
47: float32
48: float32
49: float32
50: float32
51: float32
52: float32
53: float32
54: float32
55: float32
56: float32
57: float32
58: float32
59: float32
60: float32
61: float32
62: float32
63: float32
64: float32
65: float32
66: float32
67: float32
68: float32
69: float32
70: float32
71: float32
72: float32
73: float32
74: float32
75: float32
76: float32
77: float32
78: float32
79: int64
80: int64
81: int64
82: int64
83: float32
84: float32
85: float32
86: float32
87: floa

In [46]:
y.size

2619

In [47]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3400,3401,3402,3403,3404,3405,3406,3407,3408,3409
0,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614,5,0,0,0,0,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2615,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2616,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2617,5,1,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
import pandas as pd

# Wrap a one-dimensional y in a pandas Series so the export below can index it with .iloc.
if y.ndim == 1:
    y = pd.Series(y)
    
def save_data_to_data_file(X, y, filename):
    """Write X and y to `filename` as text, one sample per line.

    Each line has the form ``<label> <col>:<value> <col>:<value> ...`` where
    <label> is the i-th element of `y` (by position) and every column of `X`
    contributes one ``<col>:<value>`` field, in column order. An existing
    file is overwritten.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; column labels are written as the field names.
    y : pandas.Series
        Labels; must hold at least ``len(X)`` elements.
    filename : str
        Destination path.

    Notes
    -----
    Every column is converted to a NumPy array once up front, instead of
    doing a pandas ``.iloc`` lookup per cell, which is very slow for wide frames.
    NOTE(review): the formatted text is the same as before for numeric, boolean
    and string columns; datetime or extension dtypes may render differently.
    """
    labels = y.to_numpy()
    # Keep each column in its own dtype so values are formatted exactly as stored.
    columns = [(col, X[col].to_numpy()) for col in X.columns]

    with open(filename, 'w') as f:
        for i in range(len(X)):
            fields = [str(labels[i])]
            fields.extend("{}:{}".format(col, values[i]) for col, values in columns)
            f.write(' '.join(fields) + '\n')

# Save the complete data set (every row of X with its label from y) to 'all.data'
# in the current working directory.
save_data_to_data_file(X, y, 'all.data')

In [33]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Wrap a one-dimensional y in a pandas Series so it can be indexed with .iloc afterwards.
if y.ndim == 1:
    y = pd.Series(y)

# Hold out 20% of the rows for validation. The split is stratified on column 0
# of X (the cluster label), not on y, and is reproducible through the fixed seed.
# NOTE(review): rows are split individually, so samples from one participant
# can land in both sets -- confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=X[0],
    random_state=42,
)

# Function to save data in .data format.
# (Same function as the one defined in the earlier export cell; it is
# re-defined here so this cell also runs on its own.)
def save_data_to_data_file(X, y, filename):
    """Write X and y to `filename` as text, one sample per line.

    Each line has the form ``<label> <col>:<value> <col>:<value> ...`` where
    <label> is the i-th element of `y` (by position) and every column of `X`
    contributes one ``<col>:<value>`` field, in column order. An existing
    file is overwritten.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; column labels are written as the field names.
    y : pandas.Series
        Labels; must hold at least ``len(X)`` elements.
    filename : str
        Destination path.

    Notes
    -----
    Every column is converted to a NumPy array once up front, instead of
    doing a pandas ``.iloc`` lookup per cell, which is very slow for wide frames.
    NOTE(review): the formatted text is the same as before for numeric, boolean
    and string columns; datetime or extension dtypes may render differently.
    """
    labels = y.to_numpy()
    # Keep each column in its own dtype so values are formatted exactly as stored.
    columns = [(col, X[col].to_numpy()) for col in X.columns]

    with open(filename, 'w') as f:
        for i in range(len(X)):
            fields = [str(labels[i])]
            fields.extend("{}:{}".format(col, values[i]) for col, values in columns)
            f.write(' '.join(fields) + '\n')

# Save the training set to 'train.data' in the current working directory
save_data_to_data_file(X_train, y_train, 'train.data')

# Save the validation set to 'val.data' in the current working directory
save_data_to_data_file(X_val, y_val, 'val.data')

In [35]:
# Distinct values of column 0 (the cluster label) present in the validation set,
# in order of first appearance.
unique_clusters = X_val[0].unique()

In [36]:
# Write one validation file per cluster: 'val_cluster_<cluster>.data'.
for cluster in unique_clusters:
    # Boolean row selector: True where column 0 equals the current cluster.
    cluster_mask = X_val[0].eq(cluster)

    # Restrict features and labels to the rows of this cluster.
    X_val_cluster = X_val.loc[cluster_mask]
    y_val_cluster = y_val.loc[cluster_mask]

    save_data_to_data_file(
        X_val_cluster,
        y_val_cluster,
        f'val_cluster_{cluster}.data',
    )

In [34]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3400,3401,3402,3403,3404,3405,3406,3407,3408,3409
86,2,0,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
571,1,0,0,0,0,0,1,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,6,0,0,0,0,0,0,1,1,0,...,1.0,0.0,0.0,0.0,-3.0,-3.0,0.0,0.0,0.0,0.0
138,3,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,-3.0,-3.0,0.0,0.0,0.0,0.0
1373,4,0,0,0,0,0,1,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,1,0,0,0,0,0,1,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,4,0,0,0,0,0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
912,4,0,1,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2148,3,0,0,1,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,-3.0,-3.0,0.0,0.0,0.0,0.0


In [14]:
y_train.shape

(2095,)

Sample multi-task learning (MTL) neural network

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class BaseModel(nn.Module):
    """Shared trunk: two fully connected layers, each followed by a ReLU.

    Args:
        input_size: number of features in each input vector.
        hidden_size: width of both layers, and therefore of the output.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        """Map `x` of shape (..., input_size) to non-negative features of shape (..., hidden_size)."""
        hidden = self.layer1(x).relu()
        return self.layer2(hidden).relu()

class MultiTaskModel(nn.Module):
    """Shared trunk followed by one sigmoid output head per task.

    Args:
        base_model: module producing the shared representation. It must expose
            ``layer2.out_features``, which is used as the input width of every head.
        num_tasks: number of heads to create (default 5, the previous fixed count).

    The heads are registered as attributes ``head_0`` ... ``head_{num_tasks-1}``,
    so attribute names and state-dict keys match the former hand-written version.

    Raises:
        ValueError: if `num_tasks` is smaller than 1.
    """

    def __init__(self, base_model, num_tasks=5):
        super().__init__()
        if num_tasks < 1:
            raise ValueError('num_tasks must be at least 1, got {}'.format(num_tasks))
        self.base_model = base_model
        self.num_tasks = num_tasks
        feature_dim = base_model.layer2.out_features
        # Assigning a Module through setattr registers it as a submodule;
        # creating the heads in index order keeps initialisation reproducible.
        for task in range(num_tasks):
            setattr(self, f'head_{task}', nn.Linear(feature_dim, 1))

    def forward(self, x):
        """Return a tuple with one tensor of shape (batch, 1) per task, each holding values in (0, 1)."""
        shared = self.base_model(x)
        return tuple(
            torch.sigmoid(getattr(self, f'head_{task}')(shared))
            for task in range(self.num_tasks)
        )

# Specify the size of the input vectors and the size of the hidden layers.
# NOTE(review): input_size only has to match the synthetic inputs created
# below; set it to the real number of feature columns before training on X.
input_size = 3365
hidden_size = 64

# Fix the seed so weight initialisation, the synthetic data and the batch
# order are reproducible.
torch.manual_seed(42)

# Create the base model
base_model = BaseModel(input_size, hidden_size)

# Create the full multi-task model (five heads)
model = MultiTaskModel(base_model)

# Synthetic placeholder data: 100 samples with one binary target per head.
inputs = torch.rand(100, input_size)
targets = torch.randint(0, 2, (100, 5)).float()
dataset = TensorDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Specify class weights.
# The previous version referenced the undefined names label1..label5; the
# per-head counts are now derived from the targets themselves.
# NOTE(review): "count" is taken to mean the number of positive samples per
# head -- confirm against the intended weighting scheme.
# Weights are inversely proportional to the counts; the lower bound of 1
# avoids a division by zero for a head without positives.
counts = np.maximum(targets.sum(dim=0).numpy(), 1.0)
class_weights = torch.tensor(counts.sum() / counts, dtype=torch.float32)

# Binary cross entropy kept element-wise so every head can be weighted separately
loss_fn = nn.BCELoss(reduction='none')

# Define optimizer
optimizer = torch.optim.Adam(model.parameters())

# Train the model
for epoch in range(50):
    # Distinct loop names keep the full `inputs` / `targets` tensors intact.
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # Weighted per-sample loss of each head, each of shape (batch, 1)
        losses = [
            loss_fn(output, target.view(-1, 1)) * weight
            for output, target, weight in zip(outputs, batch_targets.t(), class_weights)
        ]
        # Sum over heads, then average over the batch: backward() needs a
        # scalar, which the unreduced sum only was for batch size 1.
        total_loss = torch.stack(losses).sum(dim=0).mean()
        total_loss.backward()
        optimizer.step()
