# mBrain: stress features

In [502]:
from mobiledna.core.appevents import Appevents
from mobiledna.core.sessions import Sessions
from mobiledna.core.notifications import Notifications

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Open the needed files

In [None]:
# --- Load the raw inputs ----------------------------------------------------
# End-of-day questionnaire answers; this table holds the stress labels.
df_eod_experiencekit = pd.read_parquet("../data/data_nervosity/df_eod_experiencekit.parquet")

# Lookup table linking each mobileDNA id to its panelkit id.
df_mapping = pd.read_csv("../data/data_nervosity/MobileDNA_mapping_panelkitid.csv", sep=';')

# mobileDNA logs used for the feature calculation.
ae = Appevents.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_appevents.csv", sep=';')
no = Notifications.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_notifications.parquet")
se = Sessions.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_sessions.csv", sep=';')

# --- Enrich the logs with app category and time-of-day -----------------------
ae.add_category(scrape=False).add_time_of_day()
no.add_category(scrape=False)
no.add_time_of_day(time_col='time')
# HACK: the 'TOD' column is renamed to 'startTOD'; per the original author's
# note the later notification calls do not work otherwise.
# TODO confirm which column name the Notifications helpers expect.
no.__data__ = no.__data__.rename(columns={'TOD': 'startTOD'})

# --- Trim the mapping and label tables to the columns that are used ----------
df_mapping = df_mapping[['panelkit_id', 'MobileDNA_id']]
# .copy() makes the subset an independent frame, so adding the 'day' column
# below is an unambiguous write (no SettingWithCopyWarning).
df_eod_experiencekit = df_eod_experiencekit[['panelkit_id', 'daily_stress_level', 'timestamp']].copy()
# Only the calendar date of the answer is needed to match it with app events.
df_eod_experiencekit['day'] = df_eod_experiencekit['timestamp'].dt.date
df_eod_experiencekit = df_eod_experiencekit.drop(columns='timestamp')

# Same 'day' key on the app events, so both tables can be joined per day.
ae.__data__['day'] = ae.__data__['startTime'].dt.date
print('Modified appEvents data file: ')
print(ae.__data__.head())

2021-10-11 13:02:45 - Recognized file type as <csv>.




  res = f(*args, **kwargs)


2021-10-11 13:02:49 - 'load' took 3.416 seconds to complete.
2021-10-11 13:02:50 - Recognized file type as <parquet>.
2021-10-11 13:03:01 - 'load' took 10.717 seconds to complete.
2021-10-11 13:03:33 - Recognized file type as <csv>.
2021-10-11 13:03:35 - 'load' took 1.33 seconds to complete.


Adding category: 100%|██████████| 1489208/1489208 [00:01<00:00, 1176999.63it/s]
Adding tod <startTime>: 100%|██████████| 1489208/1489208 [00:00<00:00, 1778810.83it/s]
Adding category:  90%|████████▉ | 24913775/27772223 [00:17<00:02, 1418873.25it/s]

In [None]:
# Small notifications file that is only used for testing.
no_dummy = Notifications.load_data(
    "./dummy_notifications.csv",
    sep=";",
)

## Output some info about the files
Shapes of the files:

In [None]:
# Row/column counts of the three inputs. Each report ends with a divider line
# followed by an empty line.

# Stress labels (experiencekit end-of-day answers)
print("stress LABELS file: ")
print(f'> (rows, columns): {df_eod_experiencekit.shape[0], df_eod_experiencekit.shape[1]}')
# Optional label diagnostics, currently switched off:
#print(df_eod_experiencekit.daily_stress_level.head(3))
#print(df_eod_experiencekit['daily_stress_level'].describe())
#print(df_eod_experiencekit['daily_stress_level'].value_counts(normalize=True))
print("----------------", "", sep="\n")

# mobileDNA id <-> panelkit id mapping
print("MAPPINGS file: ")
print(f'> (rows, columns): {df_mapping.shape[0], df_mapping.shape[1]}')
print("----------------", "", sep="\n")

# App events (source of the features)
print("MobileDNA DATA file: ")
print(f'> (rows, columns): {ae.__data__.shape[0], ae.__data__.shape[1]}')
print("----------------", "", sep="\n")

Number of unique IDs in each file:

In [None]:

# How many distinct participants each table knows about, per id type.
print(f"number of unique id's in the mobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of unique id's in the mapping file: {df_mapping['MobileDNA_id'].nunique()}")
print(f"number of unique panelkit id's in the mapping file: {df_mapping['panelkit_id'].nunique()}")
print(f"number of unique panelkit id's in the experiencekit data: {df_eod_experiencekit['panelkit_id'].nunique()}")
# Divider plus an empty line, emitted in one call.
print("-----------", "", sep="\n")

## Join the dataframes

In [None]:
# Attach the panelkit id to every app event. The mapping's id column is renamed
# first so both frames share the key 'id'. The join is an inner join, so app
# events without a mapping entry are dropped.
df_mapping = df_mapping.rename(columns={'MobileDNA_id': 'id'})
ae.__data__ = pd.merge(ae.__data__, df_mapping, how='inner', on='id')

print(f"number of unique id's after adding the panelkit-id to the MobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of entries in the MobileDNA data: {ae.__data__.shape[0]}")

# Attach the daily stress level. This is again an inner join, now on
# (day, panelkit_id), so only events on a day with a label remain.
ae.__data__ = ae.__data__.merge(
    df_eod_experiencekit,
    how='inner',
    on=['day', 'panelkit_id'],
)
print(f"number of entries in the MobileDNA data after adding the stresslabels: {ae.__data__.shape[0]}")

#print(ae.__data__['category'].unique())

The resulting dataframe, with the stress levels mapped to the mobileDNA IDs:

## Feature calculation
#### General screen time

In [None]:
# Overall usage duration with series_unit="day", divided by 60.
# NOTE(review): the division presumably converts seconds to minutes -- confirm
# the unit that get_daily_duration returns.
# Earlier idea, kept for reference:
#   df_features = ae.__data__.groupby(['id', 'day']).agg()
general_screen_time = ae.get_daily_duration(series_unit="day") / 60
print(general_screen_time.head())

#### Smartphone use frequency

In [None]:
# Number of app events, requested with series_unit="day".
smartphone_use_freq = ae.get_daily_events(series_unit="day")
print(smartphone_use_freq.head(2))

#### Checking behaviour

In [None]:
# Session-based measure of how often the phone is checked; presumably the
# number of sessions per day -- confirm against the mobiledna Sessions API.
# NOTE(review): unlike the app-event features, no series_unit='day' is passed
# here, and this result is not part of the feature merge further down.
checking_behaviour = se.get_daily_sessions()
print(checking_behaviour.head())

#### Smartphone multitasking

In [None]:
# todo

#### Duration MIM applications

In [None]:
# MIM = mobile instant messaging. The 'chat' category is used as its stand-in
# (open question from the original author: is MIM the same as chat?).
duration_MIM_applications = (
    ae.get_daily_duration(category="chat", series_unit="day") / 60
)
print(duration_MIM_applications.head())

#### Frequency MIM applications

In [None]:
# App-event count restricted to the 'chat' category, with series_unit="day".
freq_MIM_applications = ae.get_daily_events(
    category="chat",
    series_unit="day",
)
print(freq_MIM_applications.head())

#### Notifications MIM applications

In [None]:
# Notifications coming from apps in the 'chat' category.
avg_daily_MIM_notifications = no.get_daily_notifications(
    category="chat",
)
print(avg_daily_MIM_notifications.head())

#### Average daily use of MIM applications during work hours

In [None]:
# TODO: replace the 'morning' + 'noon' buckets by real work hours
# (8-16 => 9-17).
avg_daily_use_work_hours = (
    ae.get_daily_duration(time_of_day=["morning", "noon"], category="chat") / 60
)
print(avg_daily_use_work_hours.head())
# Histogram of the resulting values.
avg_daily_use_work_hours.plot.hist()

#### Average amount of social media notifications

In [None]:
# Package names grouped under a category label.
# NOTE(review): this dict is not read anywhere else in the visible notebook.
unknown_categories = {
    "banking": [
        "com.coinbase.pro",
        "com.kraken.trade",
        "com.kraken.invest.app",
    ],
    "medical": [
        "be.imec.apt.stressy",
        "be.imec.apt.ichange.chillplusclient",
        "be.ilabt.contextaware.empatica",
        "be.ilabt.contextaware.mbrain",
        "be.sciensano.coronalert",
        "com.j_ware.polarsensorlogger",
        "com.urbandroid.sleep",
        "heartzones.com.heartzonestraining",
        "com.empatica.e4realtime",
    ],
    "calling": ["com.oneplus.dialer"],
    "calendar": ["com.komorebi.SimpleCalendar"],
    "productivity": ["partl.workinghours"],
}

# Collapses the fine-grained categories into a handful of general ones
# (e.g. 'chat', 'social' and 'communication' all become 'Social').
category_map = {
    "medical": "Health",
    "chat": "Social",
    "email": "Productivity",
    "system": "none",
    "unknown": "none",
    "social": "Social",
    "tools": "Productivity",
    "browser": "Web",
    "productivity": "Productivity",
    "photography": "none",
    "business": "Productivity",
    "music&audio": "Entertainment",
    "clock": "none",
    "banking": "Finance",
    "lifestyle": "none",
    "health&fitness": "Health",
    "news&magazines": "News",
    "gaming": "Entertainment",
    "calling": "Calling",
    "calendar": "Productivity",
    "video": "Entertainment",
    "contacts": "Calling",
    "video&audioconferencing": "Productivity",
    "health & fitness": "Health",
    "maps&navigation": "Navigation",
    "food & drink": "none",
    "finance": "Finance",
    "communication": "Social",
    "ecommerce": "Shopping",
    "retail": "Shopping",
    "weather": "none",
    "sports": "none",
    "smartconnectivity": "none",
    "card": "Entertainment",
    "travel & local": "none",
    "education": "Productivity",
    "entertainment": "Entertainment",
    "music & audio": "Entertainment",
    "books & reference": "none",
    "shopping": "Shopping",
    "mobility": "Navigation",
    "news & magazines": "News",
    "puzzle": "Entertainment",
}

# Relabel the notification categories; values missing from the map are kept
# unchanged.
no.__data__['category'] = no.__data__['category'].apply(
    lambda cat: category_map.get(cat, cat)
)

avg_daily_social_notifications = no.get_daily_notifications(category="Social")
print(avg_daily_social_notifications.head())

#### Average daily use of social media applications

In [None]:
# Category count before (i) and after (j) collapsing the app-event categories
# through category_map; values missing from the map are kept unchanged.
i = ae.__data__['category'].nunique()
ae.__data__['category'] = ae.__data__['category'].apply(
    lambda cat: category_map.get(cat, cat)
)
j = ae.__data__['category'].nunique()
print(f'number of categories is reduced by {i-j}')

avg_daily_social_applications = (
    ae.get_daily_duration(category="Social", series_unit="day") / 60
)
print(avg_daily_social_applications.head())

#### Average daily amount of social media app events

In [None]:
# App-event count restricted to the merged 'Social' category.
freq_social_applications = ae.get_daily_events(
    category="Social",
    series_unit="day",
)
print(freq_social_applications.head())

#### Duration of different social media applications

In [None]:
#todo

#### Average daily use during evening time

In [None]:
# Usage duration in the 'eve' time-of-day bucket, divided by 60.
# For a per-day series instead, pass series_unit='day'.
avg_daily_use_evening = ae.get_daily_duration(time_of_day="eve") / 60
print(avg_daily_use_evening.head())


#### Average daily use during night time

In [None]:
# Usage duration in the 'night' time-of-day bucket, divided by 60.
# For a per-day series instead, pass series_unit='day'.
avg_daily_use_night = ae.get_daily_duration(time_of_day="night") / 60
print(avg_daily_use_night.head())

#### Average daily amount of app events during evening time

In [None]:
# App-event count in the 'eve' time-of-day bucket.
freq_evening_use = ae.get_daily_events(time_of_day="eve")
print(freq_evening_use.head())

#### Average daily amount of app events during night time

In [None]:
# App-event count in the 'night' time-of-day bucket.
freq_night_use = ae.get_daily_events(time_of_day='night')
# Preview the night-time result; the original printed freq_evening_use here,
# a copy-paste slip from the evening cell.
print(freq_night_use.head())

#### Average daily amount of notifications during evening time

In [None]:
# Notifications received in the 'eve' time-of-day bucket.
avg_daily_eve_notifications = no.get_daily_notifications(
    time_of_day="eve",
)
print(avg_daily_eve_notifications.head())
# Observation from the original run: the amount of notifications is high.

#### Average daily amount of notifications during night time

In [None]:
# Notifications received in the 'night' time-of-day bucket.
avg_daily_night_notifications = no.get_daily_notifications(
    time_of_day="night",
)
print(avg_daily_night_notifications.head())
# Observation from the original run: the amount of notifications is high.

## Merge all features in one dataframe

In [None]:
# Inner-join the per-day feature tables one after another on (id, day).
# The join order is the same as in the original chain of merges, so any
# suffixes pandas adds for clashing column names come out the same.
temp_res = avg_daily_social_applications
for feature in (
    freq_social_applications,
    general_screen_time,
    duration_MIM_applications,
    smartphone_use_freq,
    freq_MIM_applications,
):
    temp_res = pd.merge(temp_res, feature, on=['id', 'day'])
#print(temp_res.head())

#### Add stress labels

In [None]:
# One stress label per (id, day): the first value found in each group.
labels = (
    ae.__data__[["id", "day", "daily_stress_level"]]
    .groupby(["id", "day"])
    .first()
)
# Attach the label to the feature table through an inner join on (id, day).
res = pd.merge(temp_res, labels, on=["id", "day"])

This is the resulting dataframe with the features and labels per day, per user:

In [None]:
# Preview the first rows of the merged feature/label table.
print(res.head())

## Feature importance/ correlation/...
Now the importance of the features for the stress level can be estimated.

In [None]:
import seaborn as sn
# Pairwise correlation between the columns of `res` (features and label).
# NOTE(review): if `res` still holds non-numeric columns (e.g. 'day'), newer
# pandas versions raise here unless numeric_only=True is passed -- confirm
# against the pandas version in use.
corrMatrix = res.corr()
# annot=True writes the coefficient into every heatmap cell.
sn.heatmap(corrMatrix, annot=True)

In [None]:
# 10-fold cross-validation of a random forest that predicts the daily stress
# level from the feature table. Predictions from all folds are pooled and
# scored once at the end.
# NOTE(review): neither KFold nor the classifier gets a random_state, so the
# report differs between runs.
k_fold = KFold(n_splits=10, shuffle=True)
# Split the target off; `res` keeps only the remaining columns.
labels = res.pop('daily_stress_level')

all_test_labels = []
all_predicted_labels = []

for i_fold, (tr, tst) in enumerate(k_fold.split(res, labels)):
    #print(f"fold number {i_fold}")
    # KFold.split yields positional row indices, so features AND labels must
    # both be selected with .iloc. Plain labels[tr] is label-based and only
    # matches by accident when the index happens to be 0..n-1.
    res_train, res_test = res.iloc[tr], res.iloc[tst]
    labels_train, labels_test = labels.iloc[tr], labels.iloc[tst]
    clf = RandomForestClassifier()
    clf.fit(res_train, labels_train)
    predicted_label = clf.predict(res_test)

    all_test_labels.extend(labels_test)
    all_predicted_labels.extend(predicted_label)

print(classification_report(all_test_labels, all_predicted_labels))

In [None]:
# Printed once the last cell is reached.
print("FINISHED PROGRAM")
