# mBrain: Feature calculation

In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

from mobiledna.core.appevents import Appevents
from mobiledna.core.notifications import Notifications
from mobiledna.core.sessions import Sessions

## 1. Open the needed files

In [2]:
# file with the stress labels
df_eod_experiencekit = pd.read_parquet("../data/data_nervosity/df_eod_experiencekit.parquet")

# file with the mapping of the mobileDNA-id and the panelkit-id
df_mapping = pd.read_csv("../data/data_nervosity/MobileDNA_mapping_panelkitid.csv", sep=';')

# files needed for the feature calculation
ae = Appevents.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_appevents.csv", sep=';')
no = Notifications.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_notifications.parquet")
se = Sessions.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_sessions.csv", sep=';')

# Add the app category and the time-of-day label to the app events
# (the printed frame shows the resulting 'category' and 'startTOD' columns).
ae.add_category(scrape=False).add_time_of_day()
# Same for the notifications, whose timestamp column is called 'time'.
no.add_category(scrape=False)
no.add_time_of_day(time_col='time')
# The notification time-of-day column is renamed from 'TOD' to 'startTOD', the name used in the app events.
# NOTE(review): presumably the Notifications filters look for 'startTOD' — confirm in the library
# and fix it there instead of reaching into the private __data__ attribute.
no.__data__ = no.__data__.rename(columns={'TOD': 'startTOD'})  # workaround: without the rename the later calls fail

# Only keep the columns we need in the mapping file and the experiencekit data.
df_mapping = df_mapping[['panelkit_id', 'MobileDNA_id']]
# Build the label table in one chain so a new frame is produced: assigning a column on a
# column slice (as before) triggers pandas' SettingWithCopyWarning. The timestamp is reduced
# to its calendar date ('day') because labels are joined to app events per day.
df_eod_experiencekit = (
    df_eod_experiencekit[['panelkit_id', 'daily_stress_level', 'timestamp']]
    .assign(day=lambda eod: eod['timestamp'].dt.date)
    .drop(columns='timestamp')
)

# add a column with the day for each appEvent
ae.__data__['day'] = ae.__data__['startTime'].dt.date
print('Modified appEvents data file: ')
print(ae.__data__.head())

2021-10-12 16:45:42 - Recognized file type as <csv>.




  res = f(*args, **kwargs)


2021-10-12 16:45:45 - 'load' took 2.222 seconds to complete.
2021-10-12 16:45:46 - Recognized file type as <parquet>.
2021-10-12 16:45:50 - 'load' took 3.977 seconds to complete.
2021-10-12 16:46:12 - Recognized file type as <csv>.
2021-10-12 16:46:13 - 'load' took 0.976 seconds to complete.


Adding category: 100%|██████████| 1489208/1489208 [00:00<00:00, 1652794.85it/s]
Adding tod <startTime>: 100%|██████████| 1489208/1489208 [00:00<00:00, 2112025.59it/s]
Adding category: 100%|██████████| 27772223/27772223 [00:15<00:00, 1741453.90it/s]
Adding tod <time>: 100%|██████████| 27772223/27772223 [00:12<00:00, 2269963.03it/s]


Modified appEvents data file: 
                                           id    model     session               startTime                 endTime  notification  notificationId                        application  battery   latitude  longitude   studyKey        surveyId data_version  startDate    endDate  duration category startTOD         day
1320339  001b6dc4-9c95-4c7c-a21a-56b5deca6689  ELE-L29  1622720128 2021-06-03 13:35:58.071 2021-06-03 13:36:02.945         False               0        com.huawei.android.launcher       63  51.057247   3.706872  nervocity  03062021lepato        1.6.1 2021-06-03 2021-06-03     4.874  unknown     noon  2021-06-03
1320340  001b6dc4-9c95-4c7c-a21a-56b5deca6689  ELE-L29  1622720128 2021-06-03 13:36:05.643 2021-06-03 13:36:22.971         False               0  com.google.android.apps.messaging       63  51.057247   3.706872  nervocity  03062021lepato        1.6.1 2021-06-03 2021-06-03    17.328     chat     noon  2021-06-03
1320293  001b6dc4-9c95-4c7c-a2

In [3]:
# load dummy data (for testing)
no_dummy = Notifications.load_data("./dummy_notifications.csv", sep=';')

2021-10-12 16:46:55 - Recognized file type as <csv>.
2021-10-12 16:46:55 - 'load' took 0.003 seconds to complete.




  res = f(*args, **kwargs)


## Output some info about the files
Shapes of the files:

In [4]:
# Print the (rows, columns) shape of each loaded table: stress labels, id mapping, app events.
shape_report = (
    ("stress LABELS file: ", df_eod_experiencekit),
    ("MAPPINGS file: ", df_mapping),
    ("MobileDNA DATA file: ", ae.__data__),
)

for heading, frame in shape_report:
    n_rows, n_cols = frame.shape
    print(heading)
    print(f'> (rows, columns): {n_rows, n_cols}')
    print("----------------")
    print("")

stress LABELS file: 
> (rows, columns): (4257, 3)
----------------

MAPPINGS file: 
> (rows, columns): (385, 2)
----------------

MobileDNA DATA file: 
> (rows, columns): (1489208, 20)
----------------



Number of unique IDs in each file:

In [5]:

print(f"number of unique id's in the mobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of unique id's in the mapping file: {df_mapping['MobileDNA_id'].nunique()}")
print(f"number of unique panelkit id's in the mapping file: {df_mapping['panelkit_id'].nunique()}")
print(f"number of unique panelkit id's in the experiencekit data: {df_eod_experiencekit['panelkit_id'].nunique()}")
print("-----------")
print("")

number of unique id's in the mobileDNA data: 326
number of unique id's in the mapping file: 274
number of unique panelkit id's in the mapping file: 377
number of unique panelkit id's in the experiencekit data: 447
-----------



## 2. Join the dataframes

In [6]:
# Add the panelkit-id's to the mobileDNA data for shared MobileDNA id (inner join).
# The mapping file has more rows than unique ids, so it is not guaranteed to be one-to-one.
# Exact duplicate mapping rows are dropped first: each duplicate would copy every app event
# of that id and silently inflate all duration/count features computed afterwards.
# NOTE(review): this cell overwrites ae.__data__; re-running it without reloading the data
# merges twice — reload (cell 2) before re-running.
df_mapping = df_mapping.rename(columns={'MobileDNA_id': 'id'}).drop_duplicates()
ae.__data__ = ae.__data__.merge(df_mapping, on='id')

print(f"number of unique id's after adding the panelkit-id to the MobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of entries in the MobileDNA data: {ae.__data__.shape[0]}")

# Add the stress-level from experience-kit data to the mobileDNA data (inner join on
# panelkit-id and day). Exact duplicate label rows are dropped for the same reason as above.
# NOTE(review): two *different* stress levels reported on the same day would still duplicate
# the app events of that day — verify that (panelkit_id, day) is unique in the label file.
ae.__data__ = pd.merge(ae.__data__, df_eod_experiencekit.drop_duplicates(), on=['day', 'panelkit_id'])
print(f"number of entries in the MobileDNA data after adding the stresslabels: {ae.__data__.shape[0]}")

number of unique id's after adding the panelkit-id to the MobileDNA data: 216
number of entries in the MobileDNA data: 932781
number of entries in the MobileDNA data after adding the stresslabels: 300105


The resulting file with the stress levels mapped to the mobileDNA-id's:

In [7]:
print(ae.__data__.head())

                                     id           model     session               startTime                 endTime  notification  notificationId                   application  battery   latitude  longitude   studyKey        surveyId data_version  startDate    endDate  duration category       startTOD         day                           panelkit_id daily_stress_level
0  00534866-5655-4e6c-ba15-dd5b5437041a  moto g(8) plus  1623038430 2021-06-07 06:15:00.433 2021-06-07 06:15:10.007          True              66  com.google.android.deskclock      100  51.069824   3.755381  nervocity  21061975frgear        1.6.1 2021-06-07 2021-06-07     9.574    clock  early_morning  2021-06-07  4af7b991-a5b3-11eb-9dd4-cb04303e036e                  5
1  00534866-5655-4e6c-ba15-dd5b5437041a  moto g(8) plus  1623039300 2021-06-07 06:30:00.377 2021-06-07 06:30:10.109          True              61  com.google.android.deskclock      100  51.069817   3.755354  nervocity  21061975frgear        1.6.1 2021-06-0

## 3. Feature calculation
All features from literature are listed [here](./Constructlijst_features.xlsx).
### Stress Features
#### General screen time

In [8]:
#df_features = ae.__data__.groupby(['id', 'day']).agg()
general_screen_time = (ae.get_daily_duration(series_unit='day') / 60)
print(general_screen_time.head())

id                                    day       
00534866-5655-4e6c-ba15-dd5b5437041a  2021-06-07    131.344917
                                      2021-06-08    122.027483
                                      2021-06-09    136.289317
                                      2021-06-11    190.306950
                                      2021-06-12     80.732533
Name: daily_durations, dtype: float64


#### Smartphone use frequency

In [9]:
smartphone_use_freq = ae.get_daily_events(series_unit=('day'))
print(smartphone_use_freq.head(2))

id                                    day       
00534866-5655-4e6c-ba15-dd5b5437041a  2021-06-07    155.0
                                      2021-06-08    106.0
Name: daily_events, dtype: float64


#### Checking behaviour

In [10]:
# Number of phone sessions from the Sessions object: once with avg=False and once with avg=True.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: rename() got an unexpected keyword argument 'columns'", so neither variable was created.
# `checking_behaviour` is required by the feature merge in section 4 — the notebook cannot run
# top to bottom until this call works (the failing rename call is not in this notebook).
checking_behaviour = se.get_daily_sessions(avg=False)
print(checking_behaviour.head())

avg_checking_behaviour = se.get_daily_sessions(avg=True)
print(avg_checking_behaviour.head())

TypeError: rename() got an unexpected keyword argument 'columns'

#### Smartphone multitasking

In [None]:
# todo

#### Duration MIM applications

In [None]:
duration_MIM_applications = (ae.get_daily_duration(category='chat', series_unit='day') / 60)
print(duration_MIM_applications.head())

#### Frequency MIM applications

In [None]:
freq_MIM_applications = ae.get_daily_events(category='chat', series_unit='day')
print(freq_MIM_applications.head())

#### Notifications MIM applications

In [None]:
daily_MIM_notifications = no.get_daily_notifications(category='chat', avg=False)
print(daily_MIM_notifications.head())

avg_daily_MIM_notifications = no.get_daily_notifications(category='chat', avg=True)
print(avg_daily_MIM_notifications.head())

#### (Average) daily use of MIM applications during work hours

In [None]:
# TODO change morning & noon to work hours (8-16 => 9-17)
daily_use_work_hours = (ae.get_daily_duration(time_of_day=['morning', 'noon'], category='chat', series_unit='day') / 60)
print(daily_use_work_hours.head())

avg_daily_use_work_hours = (ae.get_daily_duration(time_of_day=['morning', 'noon'], category='chat') / 60)
print(avg_daily_use_work_hours.head())

#### (Average) amount of social media notifications

In [None]:
# first put some more specific social categories in one general 'Social' category
unknown_categories = {"banking": ["com.coinbase.pro", "com.kraken.trade", "com.kraken.invest.app"],"medical": ["be.imec.apt.stressy","be.imec.apt.ichange.chillplusclient","be.ilabt.contextaware.empatica","be.ilabt.contextaware.mbrain","be.sciensano.coronalert","com.j_ware.polarsensorlogger","com.urbandroid.sleep","heartzones.com.heartzonestraining","com.empatica.e4realtime",],"calling": ["com.oneplus.dialer"],"calendar": ["com.komorebi.SimpleCalendar"],"productivity": ["partl.workinghours"],}

category_map = {"medical": "Health","chat": "Social","email": "Productivity","system": "none", "unknown": "none",
                "social": "Social","tools": "Productivity","browser": "Web","productivity": "Productivity",
                "photography": "none","business": "Productivity","music&audio": "Entertainment","clock": "none",
                "banking": "Finance","lifestyle": "none","health&fitness": "Health","news&magazines": "News",
                "gaming": "Entertainment","calling": "Calling","calendar": "Productivity","video": "Entertainment",
                "maps&navigation": "Navigation","food & drink": "none","finance": "Finance","communication": "Social",
                "ecommerce": "Shopping","retail": "Shopping","weather": "none","sports": "none","smartconnectivity": "none",
                "card": "Entertainment","travel & local": "none","education": "Productivity","entertainment": "Entertainment",
                "music & audio": "Entertainment","books & reference": "none","shopping": "Shopping","mobility": "Navigation",
                "news & magazines": "News","puzzle": "Entertainment",}

# Collapse the fine-grained notification categories into the coarse groups of `category_map`;
# categories without an entry in the map are kept unchanged.
no.__data__['category'] = no.__data__['category'].apply(lambda x: category_map.get(x, x))

# Non-averaged series. It used to be stored as `avg_daily_social_notifications` and was
# immediately overwritten by the averaged call below, so the result was lost.
daily_social_notifications = no.get_daily_notifications(category='Social')
print(daily_social_notifications.head())

# Averaged series (avg=True).
avg_daily_social_notifications = no.get_daily_notifications(category='Social', avg=True)
print(avg_daily_social_notifications.head())

#### (Average) daily use of social media applications

In [None]:
# Fold the detailed app categories into the coarse groups of `category_map` (categories
# without an entry stay as they are) and report how many distinct labels disappeared.
n_categories_before = ae.__data__['category'].nunique()
ae.__data__['category'] = ae.__data__['category'].apply(
    lambda category: category_map.get(category, category)
)
n_categories_after = ae.__data__['category'].nunique()
print(f'number of categories is reduced by {n_categories_before - n_categories_after}')

# Duration of 'Social' apps; the library result is divided by 60.
daily_social_applications = ae.get_daily_duration(category='Social', series_unit='day') / 60
print(daily_social_applications.head())

avg_daily_social_applications = ae.get_daily_duration(category='Social') / 60
print(avg_daily_social_applications.head())

#### (Average) daily amount of social media app events

In [None]:
freq_social_applications = ae.get_daily_events(category='Social', series_unit='day')
print(freq_social_applications.head())

avg_freq_social_applications = ae.get_daily_events(category='Social')
print(avg_freq_social_applications.head())

#### (Average) daily use during evening time

In [None]:
# Per-day evening use (series_unit='day'). It used to be stored as `avg_daily_use_evening`
# and was overwritten by the averaged call below, so the per-day series was lost.
daily_use_evening = (ae.get_daily_duration(time_of_day='eve', series_unit='day') / 60)
print(daily_use_evening.head())

# Evening use without series_unit, i.e. not split per day.
avg_daily_use_evening = (ae.get_daily_duration(time_of_day='eve') / 60)
print(avg_daily_use_evening.head())


#### (Average) daily use during night time

In [None]:
daily_use_night = (ae.get_daily_duration(time_of_day='night', series_unit='day') / 60)
print(daily_use_night.head())

avg_daily_use_night = (ae.get_daily_duration(time_of_day='night') / 60)
print(avg_daily_use_night.head())

#### (Average) daily amount of app events during evening/night time

In [None]:
# App events during the evening: per day (series_unit='day') and without series_unit.
freq_evening_use = ae.get_daily_events(time_of_day='eve', series_unit='day')
print(freq_evening_use.head())

avg_freq_evening_use = ae.get_daily_events(time_of_day='eve')
print(avg_freq_evening_use.head())

# App events during the night. The prints now show the night series; they used to
# print the evening series again (copy-paste error).
freq_night_use = ae.get_daily_events(time_of_day='night', series_unit='day')
print(freq_night_use.head())

avg_freq_night_use = ae.get_daily_events(time_of_day='night')
print(avg_freq_night_use.head())

#### (Average) daily amount of notifications during evening/night time

In [None]:
# Notifications received during the evening: default call and avg=True.
daily_eve_notifications = no.get_daily_notifications(time_of_day='eve')
print(daily_eve_notifications.head())
avg_daily_eve_notifications = no.get_daily_notifications(time_of_day='eve', avg=True)
print(avg_daily_eve_notifications.head())
# NOTE: the number of notifications is high

# Notifications received during the night: default call and avg=True.
daily_night_notifications = (no.get_daily_notifications(time_of_day='night'))
print(daily_night_notifications.head())
avg_daily_night_notifications = (no.get_daily_notifications(time_of_day='night', avg=True))
print(avg_daily_night_notifications.head())
# NOTE: the number of notifications is high

### Depression features
#### General screen time
#### Smartphone use frequency
#### Screen unlocks (=checking behaviour)
#### Average daily social smartphone use/ appevents/ notifications
#### Average smartphone appevents/ use during evening/night hours
 &#8594; All done above
#### (Average) time between sessions started on notification

In [None]:
def calc_time_between_notification_sessions(df: pd.DataFrame, avg=False) -> pd.Series:
    """
    Minutes between consecutive sessions that were started by a notification.

    A session counts as notification-started when its first app event has
    ``notification == True``. Within each (id, startDate) group, the gap is the
    start time of the next notification-started session minus the end time of the
    current session's first app event.

    :param df: appevents DataFrame with the columns 'id', 'session', 'startDate',
        'startTime', 'endTime' and 'notification'; row order does not matter
    :param avg: if True, return the mean of the daily means per id; otherwise
        return the daily mean per (id, startDate)
    :return: Series named 'mins_between_notif_sessions'
    """
    # Enforce chronological order: head(1) and shift(-1) below both rely on it, and the
    # previous version silently produced wrong (even negative) gaps on unsorted input.
    ordered = df.sort_values(["id", "startTime"])

    # First app event of every session, restricted to sessions opened by a notification.
    session_firsts = ordered.groupby(["id", "session"]).head(1)
    session_firsts_notif = session_firsts[session_firsts["notification"].eq(True)]

    # Start of the next notification-started session on the same day (NaT for the last one).
    next_start = session_firsts_notif.groupby(["id", "startDate"])["startTime"].shift(-1)
    # NOTE(review): the gap starts at the end of the session's FIRST app event, not at the
    # end of the whole session — confirm this is the intended definition.
    gap_seconds = (next_start - session_firsts_notif["endTime"]).dt.total_seconds()
    session_firsts_notif = session_firsts_notif.assign(start_shift=next_start, duration_shift=gap_seconds)

    # Daily mean gap in minutes; the mean skips the NaN of each day's last session.
    mean_shift_pd = session_firsts_notif.groupby(["id", "startDate"])["duration_shift"].mean() / 60

    if avg:
        return mean_shift_pd.groupby("id").mean().rename("mins_between_notif_sessions")
    return mean_shift_pd.rename("mins_between_notif_sessions")


In [None]:
time_between_notif_sessions = calc_time_between_notification_sessions(ae.__data__) # minutes
print(time_between_notif_sessions.head())

avg_time_between_notif_sessions = calc_time_between_notification_sessions(ae.__data__, avg=True) # minutes
print(avg_time_between_notif_sessions.head())

#### Variability smartphone use during week

In [None]:
def calc_weekly_use_variability(df: pd.DataFrame, duration: bool = False) -> pd.Series:
    """
    Standard deviation of a participant's weekly smartphone use.

    The app events are grouped per id and per calendar week of 'startDate'
    (pandas frequency 'W'); the weekly totals are then reduced to one standard
    deviation per id.

    :param df: appevents DataFrame with the columns 'id', 'startDate' (datetime) and
        'duration' (when duration is truthy) or 'application' (otherwise)
    :param duration: if truthy, use the weekly sum of 'duration'; otherwise use the
        weekly number of app events. Defaults to False (the former signature spelled
        this as the annotation ``duration: None``, which left the argument mandatory)
    :return: Series indexed by id, named 'weekly_variability_duration' or
        'weekly_variability_appevents'
    """
    weekly = df.groupby(["id", pd.Grouper(key="startDate", freq="W")])

    if duration:
        name = "duration"
        per_week = weekly["duration"].sum()
    else:
        name = "appevents"
        per_week = weekly["application"].count()

    # std over the weeks of each participant (NaN when only one week is available)
    return per_week.groupby("id").std().rename(f"weekly_variability_{name}")

In [None]:
weekly_use_variability = calc_weekly_use_variability(ae.__data__, duration=True)
print(weekly_use_variability.head())

#### (Average) daily use/events/notifications
##### non-social (process) related apps

In [None]:
social_cat = ["Social", "Calling"]
all_cat = ae.__data__.category.unique().tolist()
non_social_cat = list(set(all_cat) - set(social_cat))

daily_non_social_applications = (ae.get_daily_duration(category=non_social_cat, series_unit='day') / 60)
print(daily_non_social_applications.head())
avg_daily_non_social_applications = (ae.get_daily_duration(category=non_social_cat) / 60)
print(avg_daily_non_social_applications.head())

In [None]:
freq_non_social_applications = ae.get_daily_events(category=non_social_cat, series_unit='day')
print(freq_non_social_applications.head())
avg_freq_non_social_applications = ae.get_daily_events(category=non_social_cat)
print(avg_freq_non_social_applications.head())

In [None]:
daily_non_social_notifications = no.get_daily_notifications(category=non_social_cat)
print(daily_non_social_notifications.head())

avg_daily_non_social_notifications = no.get_daily_notifications(category=non_social_cat, avg=True)
print(avg_daily_non_social_notifications.head())

##### browser application

In [None]:
# Duration of apps in the 'Web' category; the library result is divided by 60.
# NOTE(review): unlike the other "daily" features in this notebook, this call does not pass
# series_unit='day', while freq_browser_use below does — so the two browser features are
# presumably on different aggregation levels. Confirm which one is intended.
browser_use = (ae.get_daily_duration(category='Web') / 60)
print(browser_use.head())

# Number of app events in the 'Web' category, with series_unit='day'.
freq_browser_use = ae.get_daily_events(category='Web', series_unit='day')
print(freq_browser_use.head())

##### news applications

In [None]:
news_use = (ae.get_daily_duration(category='News', series_unit='day') / 60)
print(news_use.head())

avg_news_use = (ae.get_daily_duration(category='News') / 60)
print(avg_news_use.head())

freq_news_use = ae.get_daily_events(category='News', series_unit='day')
print(freq_news_use.head())

avg_freq_news_use = ae.get_daily_events(category='News')
print(avg_freq_news_use.head())

##### instagram

In [None]:
daily_instagram_use = (ae.get_daily_duration(application="com.instagram.android", series_unit='day') / 60)
print(daily_instagram_use.head())

avg_daily_instagram_use = (ae.get_daily_duration(application="com.instagram.android") / 60)
print(avg_daily_instagram_use.head())

freq_instagram_use = ae.get_daily_events(application="com.instagram.android", series_unit='day')
print(freq_instagram_use.head())

avg_freq_instagram_use = ae.get_daily_events(application="com.instagram.android")
print(avg_freq_instagram_use.head())

### Headaches features
##### Daily screen time
&#8594; Already done above

##### (Average) Daily call duration/frequency

In [None]:
daily_call_duration = (ae.get_daily_duration(category='Calling', series_unit='day') /60)
print(daily_call_duration.head())

avg_daily_call_duration = (ae.get_daily_duration(category='Calling') /60)
print(avg_daily_call_duration.head())

freq_daily_call = ae.get_daily_events(category='Calling', series_unit='day')
print(freq_daily_call.head())

avg_freq_daily_call = ae.get_daily_events(category='Calling')
print(avg_freq_daily_call.head())

### Activity features
##### Average daily number of (unique) used apps

In [None]:
# TODO unique apps

##### (Average) daily duration/ frequency of app use

In [None]:
# TODO

##### Increase/decrease in battery status

In [None]:
def calc_battery_status(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates four battery status variables per participant:
    - battery_daily_charge: mean over days of the summed battery increases
    - battery_daily_discharge: mean over days of the summed battery decreases (absolute value)
    - avg_daily_battery: mean over days of the daily mean battery level
    - battery_std: standard deviation over days of the daily mean battery level

    Battery changes are the difference between the battery level of an app event and
    that of the chronologically next app event of the same id on the same 'startDate'.

    :param df: the appevents DataFrame with the columns 'id', 'startTime', 'startDate'
        and 'battery'; it is not modified and its row order does not matter
    :return: results DataFrame indexed by id with the 4 variables as columns; the
        charge and discharge columns come first, as in the previous two-column result
    """
    # Sort BEFORE shifting. The previous version shifted on the unsorted frame and only
    # aligned the result to the sorted one, so "next event" followed the input row order.
    df = df.sort_values(['id', 'startTime'])
    next_battery = df.groupby(['id', 'startDate'])['battery'].shift(-1)
    df = df.assign(battery_shift=next_battery, battery_change=next_battery - df['battery'])

    # Level statistics: daily mean first, then mean / std of those daily means per participant.
    daily_mean_level = df.groupby(['id', 'startDate'])['battery'].mean()
    battery_avg = daily_mean_level.groupby('id').mean().rename('avg_daily_battery')
    battery_std = daily_mean_level.groupby('id').std().rename('battery_std')

    # Charge / discharge: sum the positive / negative changes per day, then average over
    # the days on which such changes occurred.
    battery_discharge = (df[df['battery_change'] < 0].groupby(['id', 'startDate'])['battery_change'].sum()
                         .abs().groupby('id').mean()).rename('battery_daily_discharge')
    battery_charge = (df[df['battery_change'] > 0].groupby(['id', 'startDate'])['battery_change'].sum()
                      .abs().groupby('id').mean()).rename('battery_daily_charge')

    # avg/std were computed but dropped before, although the docstring promises four variables.
    res = pd.concat([
        battery_charge,
        battery_discharge,
        battery_avg,
        battery_std
    ], axis=1)

    return res

In [None]:
    # NOTE(review): this top-level cell is indented by one level; it has no enclosing block.
    battery_status = calc_battery_status(ae.__data__)
    print(battery_status.head())
    # TODO not completely right: averaging over app events is not the best way to summarise the battery level
    # TODO also compute these variables per day

##### Average daily time between consecutive phone use sessions

In [None]:
def calc_time_between_consecutive_sessions(df: pd.DataFrame, avg=False) -> pd.Series:
    """
    Minutes between consecutive phone-use sessions.

    Each session is represented by its first app event. Within each
    (id, startDate) group, the gap is the start time of the next session minus
    the end time of the current session's first app event.

    :param df: appevents DataFrame with the columns 'id', 'session', 'startDate',
        'startTime' and 'endTime'; row order does not matter
    :param avg: if True, return the mean of the daily means per id; otherwise
        return the daily mean per (id, startDate)
    :return: Series named 'mins_between_sessions'
    """
    # Enforce chronological order: head(1) and shift(-1) below both rely on it, and the
    # previous version silently produced wrong (even negative) gaps on unsorted input.
    ordered = df.sort_values(["id", "startTime"])
    session_firsts = ordered.groupby(["id", "session"]).head(1)

    # Start of the next session on the same day (NaT for the last session of the day).
    next_start = session_firsts.groupby(["id", "startDate"])["startTime"].shift(-1)
    # NOTE(review): the gap starts at the end of the session's FIRST app event, not at the
    # end of the whole session — confirm this is the intended definition.
    gap_seconds = (next_start - session_firsts["endTime"]).dt.total_seconds()
    session_firsts = session_firsts.assign(start_shift=next_start, duration_shift=gap_seconds)

    # pd = per day; the mean skips the NaN of each day's last session
    mean_shift_pd = session_firsts.groupby(["id", "startDate"])["duration_shift"].mean() / 60

    if avg:
        # average of the daily means per user
        return mean_shift_pd.groupby("id").mean().rename("mins_between_sessions")
    return mean_shift_pd.rename("mins_between_sessions")

In [None]:
daily_time_between_sessions = calc_time_between_consecutive_sessions(ae.__data__)
print(daily_time_between_sessions.head())

avg_daily_time_between_sessions = calc_time_between_consecutive_sessions(ae.__data__, avg=True)
print(avg_daily_time_between_sessions.head())

## 4. Merge all features in one dataframe

In [None]:
# Combine the per-day feature series into one table keyed on (id, day).
# No `how` is passed, so every merge is pandas' default inner join: only (id, day) pairs
# present in ALL merged features are kept.
# `checking_behaviour` is matched on 'startDate' instead of 'day'.
# NOTE(review): confirm that 'day' and 'startDate' hold comparable values (date vs. timestamp),
# otherwise this join returns no rows.
# NOTE(review): only five of the features computed above are merged here.
temp_res = (pd.merge(general_screen_time, smartphone_use_freq, on=['id', 'day']
).merge(checking_behaviour,left_on=['id', 'day'], right_on=['id', 'startDate']
).merge(duration_MIM_applications, on=['id', 'day']
).merge(freq_MIM_applications, on=['id', 'day']
))

#print(temp_res.head())

#### Add stress labels

In [None]:
# One stress label per (id, day): take the first value found among that day's app events,
# then attach it to the feature table (inner join on id and day).
labels = ae.__data__.groupby(['id', 'day'])[['daily_stress_level']].first()
res = temp_res.merge(labels, on=['id', 'day'])

The resulting dataframe with the features and labels per day, per user:

In [None]:
print(res.head())


## Feature importance/ correlation/...
Now the importance of each feature for the stress level can be estimated.

In [None]:
# Pairwise correlation between the features and the stress label.
# The frame is restricted to numeric/bool columns explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, while older versions dropped them silently.
# (seaborn is imported as `sn` in the import cell at the top of the notebook.)
corrMatrix = res.select_dtypes(include=['number', 'bool']).corr()
sn.heatmap(corrMatrix, annot=True);

In [None]:
# 10-fold cross-validation of a random forest predicting the daily stress level.
# The out-of-fold predictions of all folds are pooled into one classification report.
RANDOM_STATE = 42  # fixed seed: fold assignment and forests are reproducible across runs

# Split features and target WITHOUT mutating `res`: the former `res.pop(...)` removed the
# column in place, so re-running this cell raised a KeyError.
labels = res['daily_stress_level']
features = res.drop(columns='daily_stress_level')

# NOTE(review): plain KFold can place days of the same participant in both the training and
# the test fold; consider GroupKFold on the participant id if that leakage matters.
k_fold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

all_test_labels = []
all_predicted_labels = []

for tr, tst in k_fold.split(features, labels):
    # KFold yields positional indices, so both frames must be indexed with .iloc
    # (`labels[tr]` was a label-based lookup and only worked by accident on a RangeIndex).
    res_train, res_test = features.iloc[tr], features.iloc[tst]
    labels_train, labels_test = labels.iloc[tr], labels.iloc[tst]

    clf = RandomForestClassifier(random_state=RANDOM_STATE)
    clf.fit(res_train, labels_train)
    predicted_label = clf.predict(res_test)

    all_test_labels.extend(labels_test)
    all_predicted_labels.extend(predicted_label)

print(classification_report(all_test_labels, all_predicted_labels))

In [None]:
print("FINISHED PROGRAM")
