# mBrain: Feature calculation

In [303]:
from mobiledna.core.appevents import Appevents
from mobiledna.core.sessions import Sessions
from mobiledna.core.notifications import Notifications

import pandas as pd
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LogisticRegression, ElasticNet, Lasso
from sklearn.svm import SVR
import xgboost
from xgboost import XGBRegressor, plot_importance


## 1. Open the needed files

In [None]:
# Load the raw inputs and derive the per-day join key used by the later cells.

# file with the stress labels
df_eod_experiencekit = pd.read_parquet("../data/data_nervosity/df_eod_experiencekit.parquet")

# file with the mapping of the mobileDNA-id and the panelkit-id
df_mapping = pd.read_csv("../data/data_nervosity/MobileDNA_mapping_panelkitid.csv", sep=';')

# files needed for the feature calculation
ae = Appevents.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_appevents.csv", sep=';')
no = Notifications.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_notifications.parquet")
se = Sessions.load_data("../data/data_nervosity/wave_3_2/210803_nervocity_sessions.csv", sep=';')

ae.add_category(scrape=False).add_time_of_day()
no.add_category(scrape=False)
no.add_time_of_day(time_col='time')
# Workaround kept from the original notebook: the time-of-day column is renamed
# from 'TOD' to 'startTOD'. Presumably the time_of_day filters used further down
# look for 'startTOD' — TODO confirm against the mobiledna Notifications class.
no.__data__ = no.__data__.rename(columns={'TOD': 'startTOD'})

# only keep the columns we need in the mapping file and experiencekit data
df_mapping = df_mapping[['panelkit_id', 'MobileDNA_id']]
# Build the reduced label frame in one chain. Assigning a new column onto a
# column-subset slice (as before) raises pandas' SettingWithCopyWarning; the
# chain below produces a fresh frame with the same columns in the same order:
# panelkit_id, daily_stress_level, day.
df_eod_experiencekit = (
    df_eod_experiencekit[['panelkit_id', 'daily_stress_level', 'timestamp']]
    .assign(day=lambda labels_df: labels_df['timestamp'].dt.date)  # only keep date from timestamp
    .drop(columns='timestamp')
)

# add a column with the day for each app event and each session
ae.__data__['day'] = ae.__data__['startTime'].dt.date
se.__data__['day'] = se.__data__['startDate'].dt.date

print('Modified appEvents data file: ')
print(ae.__data__.head())

2021-10-18 09:33:25 - Recognized file type as <csv>.




  res = f(*args, **kwargs)


2021-10-18 09:33:28 - 'load' took 2.666 seconds to complete.
2021-10-18 09:33:30 - Recognized file type as <parquet>.
2021-10-18 09:33:38 - 'load' took 8.515 seconds to complete.
2021-10-18 09:34:03 - Recognized file type as <csv>.
2021-10-18 09:34:04 - 'load' took 1.028 seconds to complete.


Adding category: 100%|██████████| 1489208/1489208 [00:00<00:00, 1620248.56it/s]
Adding tod <startTime>: 100%|██████████| 1489208/1489208 [00:00<00:00, 2213734.40it/s]
Adding category: 100%|██████████| 27772223/27772223 [00:15<00:00, 1770514.36it/s]
Adding tod <time>:  75%|███████▌  | 20897884/27772223 [00:09<00:03, 2164245.66it/s]

## Output some info about the files
Shapes of the files:

In [None]:
# Report (rows, columns) for the label file, the id mapping and the app-event data.
frames_to_report = [
    ("stress LABELS file: ", df_eod_experiencekit),
    ("MAPPINGS file: ", df_mapping),
    ("MobileDNA DATA file: ", ae.__data__),
]

for title, frame in frames_to_report:
    print(title)
    print(f'> (rows, columns): {frame.shape[0], frame.shape[1]}')
    print("----------------")
    print("")

Number of unique IDs in each file:

In [None]:

# Compare how many distinct participants each source knows about. The app-event
# data and the mapping file share the MobileDNA id; the mapping file and the
# experiencekit labels share the panelkit id.
print(f"number of unique id's in the mobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of unique id's in the mapping file: {df_mapping['MobileDNA_id'].nunique()}")
print(f"number of unique panelkit id's in the mapping file: {df_mapping['panelkit_id'].nunique()}")
print(f"number of unique panelkit id's in the experiencekit data: {df_eod_experiencekit['panelkit_id'].nunique()}")
print("-----------")
print("")

## 2. Join the dataframes

In [None]:
# add the panelkit-id's to the mobileDNA data for shared MobileDNA id (inner join)
# NOTE(review): this cell overwrites df_mapping and ae.__data__ in place, so it is
# not safe to run twice on the same kernel (a second merge on 'id' would add
# suffixed panelkit_id columns).
df_mapping = df_mapping.rename(columns={'MobileDNA_id': 'id'})
ae.__data__ = ae.__data__.merge(df_mapping, on='id')

print(f"number of unique id's after adding the panelkit-id to the MobileDNA data: {ae.__data__['id'].nunique()}")
print(f"number of entries in the MobileDNA data: {ae.__data__.shape[0]}")
# add the stress-level from experience-kit data to the mobileDNA data (inner join on panelkit-id and day)
# NOTE(review): if a participant has more than one label row for the same day,
# every app event of that day is duplicated by this merge — verify that
# (panelkit_id, day) is unique in df_eod_experiencekit.
ae.__data__ = pd.merge(ae.__data__, df_eod_experiencekit, on=['day', 'panelkit_id'])
print(f"number of entries in the MobileDNA data after adding the stresslabels: {ae.__data__.shape[0]}")

#print(ae.__data__['category'].unique())

The resulting file with the stress levels mapped to the mobileDNA-id's:

In [None]:
# Preview the app events after both joins (panelkit id and daily stress label added).
print(ae.__data__.head())

## 3. Feature calculation
All features from literature are listed [here](./Constructlijst_features.xlsx).
### Stress Features
#### General screen time

In [None]:
# Total app-usage duration per participant per day, divided by 60
# (presumably seconds -> minutes; the unit returned by get_daily_duration is
# not visible in this notebook — TODO confirm).
general_screen_time = (ae.get_daily_duration(series_unit='day') / 60)
print(general_screen_time.head())

#### Smartphone use frequency

In [None]:
# Number of app events with series_unit='day' (merged later on ['id', 'day']).
smartphone_use_freq = ae.get_daily_events(series_unit=('day'))
print(smartphone_use_freq.head(2))

#### Checking behaviour

In [None]:
# Session counts from the sessions file, used as a proxy for checking behaviour.
# avg=False is merged later on ['id', 'startDate']; avg=True is merged on ['id'].
checking_behaviour = se.get_daily_sessions(avg=False)
print(checking_behaviour.head())

avg_checking_behaviour = se.get_daily_sessions(avg=True)
print(avg_checking_behaviour.head())

#### Smartphone multitasking

In [None]:
# todo

#### Duration MIM applications

In [None]:
# Daily duration of apps in the 'chat' category (MIM = mobile instant messaging), / 60.
# NOTE(review): must run before the cell that remaps 'chat' to 'Social', otherwise
# the 'chat' filter presumably matches nothing.
duration_MIM_applications = (ae.get_daily_duration(category='chat', series_unit='day') / 60)
print(duration_MIM_applications.head())

#### Frequency MIM applications

In [None]:
# Daily number of app events in the 'chat' category.
# NOTE(review): depends on the un-remapped category labels (see the later remap cell).
freq_MIM_applications = ae.get_daily_events(category='chat', series_unit='day')
print(freq_MIM_applications.head())

#### Notifications MIM applications

In [None]:
# Notifications from 'chat' apps: per-day values (avg=False) and the averaged
# variant (avg=True). The daily series is merged later on ['id', 'date'].
daily_MIM_notifications = no.get_daily_notifications(category='chat', avg=False)
print(daily_MIM_notifications.head())

avg_daily_MIM_notifications = no.get_daily_notifications(category='chat', avg=True)
print(avg_daily_MIM_notifications.head())

#### (Average) daily use of MIM applications during work hours

In [None]:
# 'chat' usage restricted to the 'morning' and 'noon' time-of-day buckets, used
# here as an approximation of working hours.
# TODO change morning & noon to work hours (8-16 => 9-17)
daily_use_work_hours = (ae.get_daily_duration(time_of_day=['morning', 'noon'], category='chat', series_unit='day') / 60)
print(daily_use_work_hours.head())

# Same filter without series_unit: the averaged variant, merged later on ['id'].
avg_daily_use_work_hours = (ae.get_daily_duration(time_of_day=['morning', 'noon'], category='chat') / 60)
print(avg_daily_use_work_hours.head())

#### (Average) amount of social media notifications

In [None]:
# Hand-assigned categories for packages that were left uncategorised.
# NOTE(review): this mapping is defined here but not referenced anywhere in the
# visible notebook.
unknown_categories = {
    "banking": ["com.coinbase.pro", "com.kraken.trade", "com.kraken.invest.app"],
    "medical": [
        "be.imec.apt.stressy",
        "be.imec.apt.ichange.chillplusclient",
        "be.ilabt.contextaware.empatica",
        "be.ilabt.contextaware.mbrain",
        "be.sciensano.coronalert",
        "com.j_ware.polarsensorlogger",
        "com.urbandroid.sleep",
        "heartzones.com.heartzonestraining",
        "com.empatica.e4realtime",
    ],
    "calling": ["com.oneplus.dialer"],
    "calendar": ["com.komorebi.SimpleCalendar"],
    "productivity": ["partl.workinghours"],
}

# Collapse the fine-grained app categories into a handful of general ones
# (e.g. 'chat', 'social' and 'communication' all become 'Social').
category_map = {
    "medical": "Health",
    "chat": "Social",
    "email": "Productivity",
    "system": "none",
    "unknown": "none",
    "social": "Social",
    "tools": "Productivity",
    "browser": "Web",
    "productivity": "Productivity",
    "photography": "none",
    "business": "Productivity",
    "music&audio": "Entertainment",
    "clock": "none",
    "banking": "Finance",
    "lifestyle": "none",
    "health&fitness": "Health",
    "news&magazines": "News",
    "gaming": "Entertainment",
    "calling": "Calling",
    "calendar": "Productivity",
    "video": "Entertainment",
    "maps&navigation": "Navigation",
    "food & drink": "none",
    "finance": "Finance",
    "communication": "Social",
    "ecommerce": "Shopping",
    "retail": "Shopping",
    "weather": "none",
    "sports": "none",
    "smartconnectivity": "none",
    "card": "Entertainment",
    "travel & local": "none",
    "education": "Productivity",
    "entertainment": "Entertainment",
    "music & audio": "Entertainment",
    "books & reference": "none",
    "shopping": "Shopping",
    "mobility": "Navigation",
    "news & magazines": "News",
    "puzzle": "Entertainment",
}

# Rewrite the notification categories in place; labels missing from the map are kept as-is.
notification_categories = no.__data__['category']
no.__data__['category'] = notification_categories.apply(
    lambda raw_label: category_map.get(raw_label, raw_label)
)

daily_social_notifications = no.get_daily_notifications(category='Social')
print(daily_social_notifications.head())

avg_daily_social_notifications = no.get_daily_notifications(category='Social', avg=True)
print(avg_daily_social_notifications.head())

#### (Average) daily use of social media applications

In [None]:
# Apply the same general-category mapping to the app events and report how many
# distinct category labels disappeared as a result.
n_categories_before = ae.__data__['category'].nunique()
appevent_categories = ae.__data__['category']
ae.__data__['category'] = appevent_categories.apply(
    lambda raw_label: category_map.get(raw_label, raw_label)
)
n_categories_after = ae.__data__['category'].nunique()
print(f'number of categories is reduced by {n_categories_before - n_categories_after}')

# Duration of 'Social' apps: per day first, then the averaged variant.
daily_social_applications = ae.get_daily_duration(category='Social', series_unit='day') / 60
print(daily_social_applications.head())

avg_daily_social_applications = ae.get_daily_duration(category='Social') / 60
print(avg_daily_social_applications.head())

#### (Average) daily amount of social media app events

In [None]:
# Number of app events in the (remapped) 'Social' category: per day, then averaged.
freq_social_applications = ae.get_daily_events(category='Social', series_unit='day')
print(freq_social_applications.head())

avg_freq_social_applications = ae.get_daily_events(category='Social')
print(avg_freq_social_applications.head())

#### Average daily use during evening time

In [None]:
# Usage duration restricted to the 'eve' time-of-day bucket, / 60: per day, then averaged.
daily_use_evening = (ae.get_daily_duration(time_of_day='eve', series_unit='day') / 60)
print(daily_use_evening.head())

avg_daily_use_evening = (ae.get_daily_duration(time_of_day='eve') / 60)
print(avg_daily_use_evening.head())


#### (Average) daily use during night time

In [None]:
# Usage duration restricted to the 'night' time-of-day bucket, / 60: per day, then averaged.
daily_use_night = (ae.get_daily_duration(time_of_day='night', series_unit='day') / 60)
print(daily_use_night.head())

avg_daily_use_night = (ae.get_daily_duration(time_of_day='night') / 60)
print(avg_daily_use_night.head())

#### (Average) daily amount of app events during evening/night time

In [None]:
# App-event counts in the 'eve' bucket: per day, then averaged.
freq_evening_use = ae.get_daily_events(time_of_day='eve', series_unit='day')
print(freq_evening_use.head())

avg_freq_evening_use = ae.get_daily_events(time_of_day='eve')
print(avg_freq_evening_use.head())

# App-event counts in the 'night' bucket: per day, then averaged.
freq_night_use = ae.get_daily_events(time_of_day='night', series_unit='day')
# Fixed: the preview previously printed freq_evening_use again (copy-paste slip).
print(freq_night_use.head())

avg_freq_night_use = ae.get_daily_events(time_of_day='night')
print(avg_freq_night_use.head())

#### (Average) daily amount of notifications during evening/night time

In [None]:
# Notifications received in the 'eve' bucket: per day, then averaged.
daily_eve_notifications = no.get_daily_notifications(time_of_day='eve')
print(daily_eve_notifications.head())
avg_daily_eve_notifications = no.get_daily_notifications(time_of_day='eve', avg=True)
print(avg_daily_eve_notifications.head())
# observation: the amount of notifications is high

# Notifications received in the 'night' bucket: per day, then averaged.
daily_night_notifications = (no.get_daily_notifications(time_of_day='night'))
print(daily_night_notifications.head())
avg_daily_night_notifications = (no.get_daily_notifications(time_of_day='night', avg=True))
print(avg_daily_night_notifications.head())
# observation: the amount of notifications is high

### Depression features
#### General screen time
#### Smartphone use frequency
#### Screen unlocks (=checking behaviour)
#### Average daily social smartphone use/ appevents/ notifications
#### Average smartphone appevents/ use during evening/night hours
 &#8594; All done above
#### (Average) time between sessions started on notification

In [None]:
def calc_time_between_notification_sessions(df: pd.DataFrame, avg=False):
    """
    Minutes between consecutive sessions that were started by a notification.

    For every (id, session) the chronologically first app event is kept, and only
    those whose ``notification`` flag is True are retained. Within each
    (id, startDate) the gap is the ``startTime`` of the next retained event minus
    the ``endTime`` of the current one. Note that this measures from the end of
    the session's *first app event*, not from the end of the whole session.

    :param df: app events with columns id, session, notification, startDate,
        startTime and endTime (the latter two datetime-like)
    :param avg: False -> mean gap per (id, startDate); True -> mean of those
        daily means per id
    :return: Series named 'mins_between_notif_sessions'; days with fewer than two
        notification sessions yield NaN
    """
    # head(1) and shift(-1) both depend on row order, so order the events in time
    # first. Without this, unsorted input silently produces wrong (even negative) gaps.
    ordered = df.sort_values(["id", "startTime"])

    session_firsts = ordered.groupby(["id", "session"]).head(1)
    notif_firsts = session_firsts[session_firsts["notification"] == True]

    # start of the next notification session on the same day (NaT for the last one)
    next_start = notif_firsts.groupby(["id", "startDate"])["startTime"].shift(-1)
    notif_firsts = notif_firsts.assign(
        start_shift=next_start,
        duration_shift=(next_start - notif_firsts["endTime"]).dt.total_seconds(),
    )

    mean_shift_pd = notif_firsts.groupby(["id", "startDate"])["duration_shift"].mean() / 60  # per day
    if avg:
        # average the daily means per participant
        return mean_shift_pd.groupby("id").mean().rename("mins_between_notif_sessions")
    return mean_shift_pd.rename("mins_between_notif_sessions")


In [None]:
# Per-day values (indexed by id and startDate) ...
time_between_notif_sessions = calc_time_between_notification_sessions(ae.__data__) # minutes
print(time_between_notif_sessions.head())

# ... and the per-participant mean of those daily values.
avg_time_between_notif_sessions = calc_time_between_notification_sessions(ae.__data__, avg=True) # minutes
print(avg_time_between_notif_sessions.head())

#### Variability smartphone use during week

In [None]:
def calc_weekly_use_variability(df: pd.DataFrame, duration: bool = False):
    """
    Standard deviation, per participant, of their weekly smartphone use.

    Events are bucketed per id and per calendar week of ``startDate``
    (``pd.Grouper(freq="W")``); the weekly totals are then reduced to one sample
    standard deviation per id.

    :param df: app events with columns id, startDate (datetime-like) and either
        duration or application
    :param duration: True -> weekly sum of the ``duration`` column;
        falsy -> weekly count of app events (non-null ``application`` values).
        The original signature wrote ``duration: None`` (an annotation, not a
        default), which made the flag accidentally required; it now defaults to
        the falsy branch.
    :return: Series indexed by id, named 'weekly_variability_duration' or
        'weekly_variability_appevents'; NaN for ids with a single week of data
    """
    weekly_groups = df.groupby(["id", pd.Grouper(key="startDate", freq="W")])

    if duration:
        name = "duration"
        weekly_totals = weekly_groups["duration"].sum()
    else:
        name = "appevents"
        weekly_totals = weekly_groups["application"].count()

    return weekly_totals.groupby("id").std().rename(f"weekly_variability_{name}")

In [None]:
# Week-to-week variability of total usage duration, one value per participant.
weekly_use_variability = calc_weekly_use_variability(ae.__data__, duration=True)
print(weekly_use_variability.head())

#### (Average) daily use/events/notifications
##### non-social (process) related apps

In [None]:
# Everything that is not 'Social' or 'Calling' counts as non-social (process-related) use.
# NOTE(review): this relies on the category remap having already been applied to
# ae.__data__ in an earlier cell; the set difference also makes the order of
# non_social_cat arbitrary.
social_cat = ["Social", "Calling"]
all_cat = ae.__data__.category.unique().tolist()
non_social_cat = list(set(all_cat) - set(social_cat))

# Duration of non-social apps, / 60: per day, then averaged. Both series are
# renamed explicitly so they get distinct column names when merged.
daily_non_social_applications = (ae.get_daily_duration(category=non_social_cat, series_unit='day') / 60).rename('daily_durations_non_social')
print(daily_non_social_applications.head())
avg_daily_non_social_applications = (ae.get_daily_duration(category=non_social_cat) / 60).rename('avg_daily_durations_non_social')
print(avg_daily_non_social_applications.head())

In [None]:
# Number of non-social app events: per day, then averaged.
freq_non_social_applications = ae.get_daily_events(category=non_social_cat, series_unit='day').rename('daily_events_non_social')
print(freq_non_social_applications.head())
# Fixed: the averaged series used to be renamed to 'daily_events_non_social',
# the same name as the per-day series; it now follows the 'avg_' prefix
# convention used by the other averaged non-social features.
avg_freq_non_social_applications = ae.get_daily_events(category=non_social_cat).rename('avg_daily_events_non_social')
print(avg_freq_non_social_applications.head())

In [None]:
# Notifications from non-social categories: per day, then averaged.
# NOTE(review): non_social_cat is derived from the app-event categories; categories
# that only occur in the notification data are therefore not included — confirm
# this is intended.
daily_non_social_notifications = no.get_daily_notifications(category=non_social_cat).rename('daily_non_social_notifications')
print(daily_non_social_notifications.head())

avg_daily_non_social_notifications = no.get_daily_notifications(category=non_social_cat, avg=True).rename('avg_daily_non_social_notifications')
print(avg_daily_non_social_notifications.head())

##### browser application

In [None]:
# Browser ('Web' category) usage duration, / 60: per day, then averaged.
browser_use = (ae.get_daily_duration(category='Web', series_unit='day') / 60)
print(browser_use.head())

avg_browser_use = (ae.get_daily_duration(category='Web') / 60)
print(avg_browser_use.head())


# Browser app-event counts: per day, then averaged.
freq_browser_use = ae.get_daily_events(category='Web', series_unit='day')
print(freq_browser_use.head())

# Fixed: this used series_unit='day' as well, so the "avg" feature was really the
# per-day series and produced one row per day when merged on 'id'. Dropping
# series_unit matches every other averaged feature in this notebook.
avg_freq_browser_use = ae.get_daily_events(category='Web')
print(avg_freq_browser_use.head())


##### news applications

In [None]:
# 'News' category usage duration, / 60: per day, then averaged.
news_use = (ae.get_daily_duration(category='News', series_unit='day') / 60)
print(news_use.head())

avg_news_use = (ae.get_daily_duration(category='News') / 60)
print(avg_news_use.head())

# 'News' category app-event counts: per day, then averaged.
freq_news_use = ae.get_daily_events(category='News', series_unit='day')
print(freq_news_use.head())

avg_freq_news_use = ae.get_daily_events(category='News')
print(avg_freq_news_use.head())

##### instagram

In [None]:
# Instagram is selected by package name instead of category. Each series is
# renamed explicitly so the four features get distinct column names when merged.
daily_instagram_use = (ae.get_daily_duration(application="com.instagram.android", series_unit='day') / 60).rename('daily_durations_instagram')
print(daily_instagram_use.head())

avg_daily_instagram_use = (ae.get_daily_duration(application="com.instagram.android") / 60).rename('avg_daily_durations_instagram')
print(avg_daily_instagram_use.head())

freq_instagram_use = ae.get_daily_events(application="com.instagram.android", series_unit='day').rename('daily_events_instagram')
print(freq_instagram_use.head())

avg_freq_instagram_use = ae.get_daily_events(application="com.instagram.android").rename('avg_daily_events_instagram')
print(avg_freq_instagram_use.head())

### Headaches features
##### Daily screen time
&#8594; Already done above

##### (Average) Daily call duration/frequency

In [None]:
# 'Calling' category usage duration, / 60: per day, then averaged.
daily_call_duration = (ae.get_daily_duration(category='Calling', series_unit='day') /60)
print(daily_call_duration.head())

avg_daily_call_duration = (ae.get_daily_duration(category='Calling') /60)
print(avg_daily_call_duration.head())

# 'Calling' category app-event counts: per day, then averaged.
freq_daily_call = ae.get_daily_events(category='Calling', series_unit='day')
print(freq_daily_call.head())

avg_freq_daily_call = ae.get_daily_events(category='Calling')
print(avg_freq_daily_call.head())

### Activity features
##### Average daily number of (unique) used apps

In [None]:
# TODO unique apps

##### (Average) daily duration/ frequency of app use

In [None]:
# TODO

##### Increase/decrease in battery status

In [None]:
def calc_battery_status(df: pd.DataFrame):
    """
    Calculates four battery status variables per participant:
    - daily average charge % (battery_daily_charge)
    - daily average discharge % (battery_daily_discharge)
    - daily average battery level (avg_daily_battery)
    - std dev of the daily average battery level (battery_std)

    The battery change of an app event is the battery level of the next app
    event of the same participant on the same startDate minus its own level.
    Positive changes are summed per day as charge, negative ones as discharge
    (reported as a positive number); both are then averaged over days.

    :param df: the appevents DataFrame (needs id, startDate, startTime, battery);
        it is not modified
    :return: results DataFrame indexed by id with the 4 variables as columns, in
        the order listed above (the first two columns are unchanged from the
        previous two-column result)
    """
    day_keys = ['id', 'startDate']

    # Sort BEFORE shifting. Previously the shift was computed on the unsorted
    # frame and only then aligned onto the sorted one, so "next event" meant
    # "next row in input order" rather than "next event in time".
    df = df.sort_values(['id', 'startTime'])
    df = df.assign(battery_shift=df.groupby(day_keys)['battery'].shift(-1))
    df = df.assign(battery_change=df['battery_shift'] - df['battery'])

    # mean level per day, then mean / std of those daily means per participant
    daily_mean_level = df.groupby(day_keys)['battery'].mean()
    battery_avg = daily_mean_level.groupby('id').mean().rename('avg_daily_battery')
    battery_std = daily_mean_level.groupby('id').std().rename('battery_std')

    battery_discharge = (df[df['battery_change'] < 0].groupby(day_keys)['battery_change'].sum()
                         .abs().groupby('id').mean()).rename('battery_daily_discharge')
    battery_charge = (df[df['battery_change'] > 0].groupby(day_keys)['battery_change'].sum()
                      .abs().groupby('id').mean()).rename('battery_daily_charge')

    # The average and std were computed before but never returned, contradicting
    # the docstring; they are appended after the two existing columns.
    res = pd.concat([
        battery_charge,
        battery_discharge,
        battery_avg,
        battery_std,
    ], axis=1)

    return res

In [None]:
    # Per-participant battery variables computed from the app events.
    # NOTE(review): this cell is indented although it is top-level code; IPython
    # tolerates that, a plain Python script would not.
    battery_status = calc_battery_status(ae.__data__)
    print(battery_status.head())
    # TODO not completely right, averaging app events not best way
    # TODO per day

##### Average daily time between consecutive phone use sessions

In [None]:
def calc_time_between_consecutive_sessions(df: pd.DataFrame, avg=False):
    """
    Minutes between consecutive phone-use sessions.

    For every (id, session) the chronologically first app event is kept. Within
    each (id, startDate) the gap is the ``startTime`` of the next session's first
    event minus the ``endTime`` of the current session's first event. Note that
    this measures from the end of the *first app event*, not from the end of the
    whole session.

    :param df: app events with columns id, session, startDate, startTime and
        endTime (the latter two datetime-like)
    :param avg: False -> mean gap per (id, startDate); True -> mean of those
        daily means per id
    :return: Series named 'mins_between_sessions'; days with a single session
        yield NaN
    """
    # head(1) and shift(-1) both depend on row order, so order the events in time
    # first. Without this, unsorted input silently produces wrong (even negative) gaps.
    ordered = df.sort_values(["id", "startTime"])

    session_firsts = ordered.groupby(["id", "session"]).head(1)

    # start of the next session on the same day (NaT for the last one)
    next_start = session_firsts.groupby(["id", "startDate"])["startTime"].shift(-1)
    session_firsts = session_firsts.assign(
        start_shift=next_start,
        duration_shift=(next_start - session_firsts["endTime"]).dt.total_seconds(),
    )

    mean_shift_pd = session_firsts.groupby(["id", "startDate"])["duration_shift"].mean() / 60  # pd = per day
    if avg:
        # average the daily means per participant
        return mean_shift_pd.groupby("id").mean().rename("mins_between_sessions")
    return mean_shift_pd.rename("mins_between_sessions")

In [None]:
# Per-day values (indexed by id and startDate); only the index dtype is printed here.
daily_time_between_sessions = calc_time_between_consecutive_sessions(ae.__data__)
print(daily_time_between_sessions.index.dtype)
# Per-participant mean of the daily values.
avg_daily_time_between_sessions = calc_time_between_consecutive_sessions(ae.__data__, avg=True)
print(avg_daily_time_between_sessions.head())

## 4. Merge all features in one dataframe
### Daily counted features

In [None]:
# The daily features come in three groups that differ only in the name of their
# date index level ('day', 'startDate' or 'date'). Each group is inner-joined on
# its own keys, the date level is then renamed to 'day', and finally the three
# groups are joined together.
# NOTE(review): every join is an inner join, so a (participant, day) that lacks
# any single feature (e.g. no Instagram use that day) is dropped entirely.

# group 1: features indexed by ('id', 'day')
day_keys = ['id', 'day']
day_indexed_features = [
    smartphone_use_freq,
    duration_MIM_applications,
    freq_MIM_applications,
    daily_use_work_hours,
    daily_social_applications,
    freq_social_applications,
    daily_use_evening,
    daily_use_night,
    freq_evening_use,
    freq_night_use,
    daily_non_social_applications,
    freq_non_social_applications,
    browser_use,
    freq_browser_use,
    news_use,
    freq_news_use,
    daily_instagram_use,
    freq_instagram_use,
    daily_call_duration,
    freq_daily_call,
]
temp1 = general_screen_time
for feature in day_indexed_features:
    temp1 = pd.merge(temp1, feature, on=day_keys)

# group 2: features indexed by ('id', 'startDate')
temp2 = pd.merge(checking_behaviour, time_between_notif_sessions, on=['id', 'startDate'])

# group 3: notification features indexed by ('id', 'date')
date_keys = ['id', 'date']
temp3 = pd.merge(daily_MIM_notifications, daily_social_notifications, on=date_keys)
for feature in [daily_eve_notifications, daily_night_notifications, daily_non_social_notifications]:
    temp3 = pd.merge(temp3, feature, on=date_keys)

# give the date level the same name ('day') everywhere; groups 1 and 3 are also
# cast to datetime64[ns]
temp1 = (
    temp1.reset_index()
    .astype({'day': 'datetime64[ns]'})
    .set_index(['id', 'day'])
)
temp2 = (
    temp2.reset_index()
    .rename(columns={'startDate': 'day'})
    .set_index(['id', 'day'])
)
temp3 = (
    temp3.reset_index()
    .astype({'date': 'datetime64[ns]'})
    .rename(columns={'date': 'day'})
    .set_index(['id', 'day'])
)

# merge into one dataframe
temp = pd.merge(temp1, temp2, on=['id', 'day'])
temp = pd.merge(temp, temp3, on=['id', 'day'])
print(temp.head())

#### Add stress labels

In [None]:
# One stress label per (id, day): the label is repeated on every app event of
# that day, so the first value of each group is taken.
# NOTE(review): the int64 cast raises if daily_stress_level contains NaN.
labels = ae.__data__[['id', 'day','daily_stress_level']].reset_index(drop=True)\
    .astype({'day': 'datetime64[ns]', 'daily_stress_level': 'int64'}).set_index(['id','day'])\
    .groupby(['id','day']).first()

print(labels.head(15))
# attach the label to the daily feature vectors (inner join on id and day)
daily_feat = pd.merge(temp, labels, on=['id', 'day'])

The resulting dataframe with the features and labels per day, per user:

In [None]:
# Preview of the per-day, per-participant feature table including the stress label.
print(daily_feat.head())



### Averaged features:

In [None]:
# Inner-join every per-participant (averaged) feature on 'id', in this fixed order,
# then keep the first row per id.
# NOTE(review): being inner joins, a participant missing any one feature is dropped.
avg_feature_series = [
    avg_daily_MIM_notifications,
    avg_daily_use_work_hours,
    avg_daily_social_notifications,
    avg_daily_social_applications,
    avg_freq_social_applications,
    avg_daily_use_evening,
    avg_daily_use_night,
    avg_freq_evening_use,
    avg_freq_night_use,
    avg_daily_eve_notifications,
    avg_daily_night_notifications,
    avg_time_between_notif_sessions,
    weekly_use_variability,
    avg_daily_non_social_applications,
    avg_freq_non_social_applications,
    avg_daily_non_social_notifications,
    avg_browser_use,
    avg_freq_browser_use,
    avg_news_use,
    avg_freq_news_use,
    avg_daily_instagram_use,
    avg_freq_instagram_use,
    avg_daily_call_duration,
    avg_freq_daily_call,
    avg_daily_time_between_sessions,
]

avg_feat = avg_checking_behaviour
for feature in avg_feature_series:
    avg_feat = pd.merge(avg_feat, feature, on=['id'])
avg_feat = avg_feat.groupby('id').first()

print(avg_feat.head())

Add stress labels averaged per user:

In [None]:
# Mean stress level per participant: first reduce to one label per (id, day),
# then average those daily labels per id.
# NOTE(review): this reuses the name `labels`, replacing the per-day label frame
# built for the daily features.
labels = ae.__data__[['id', 'day','daily_stress_level']].reset_index(drop=True)\
    .astype({'day': 'datetime64[ns]', 'daily_stress_level': 'int64'}).set_index(['id','day'])\
    .groupby(['id','day']).first().groupby('id').mean().astype({'daily_stress_level':'float64'})


In [None]:
# add label to each feature vector
# NOTE(review): avg_feat is overwritten in place, so re-running this cell merges
# the label a second time (pandas then suffixes the duplicate column).
avg_feat = avg_feat.merge(labels, on=['id'])
# fill in NaNs with the mean of the feature
# NOTE(review): the means are computed over all participants before any
# train/test split, so the later cross-validation sees imputed values that
# include test-fold information.
avg_feat = avg_feat.fillna(avg_feat.mean(axis=0))

print(avg_feat.head())

## Feature correlation/importance/selection/...

In [None]:
# NOTE(review): these imports would normally live in the import cell at the top.
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between all averaged features and the stress label.
correlation = avg_feat.corr()
# The figure is created first so that the title and the heatmap are drawn on it.
f ,ax = plt.subplots(figsize = (18, 14))
plt.title('Correlation of Numeric Features and Daily Stress Level',y=1,size=16)
sns.heatmap(correlation, square= True, annot=True, cmap='coolwarm')

correlations with daily stress level:

In [None]:
# Single-column heatmap: correlation of every feature with the stress label,
# sorted from most positive to most negative. The colour scale is capped at 0.3.
plt.figure(figsize=(6,6))
sns.heatmap(correlation[['daily_stress_level']].sort_values(by=['daily_stress_level'],ascending=False),
            cmap='coolwarm',
            vmax=0.3,
            annot=True);
#plt.savefig('correlations.png', format='png', dpi=1000, bbox_inches='tight')


Scatterplot of most correlated feature with daily stress level

In [None]:
# NOTE(review): 'avg_daily_notifications_chat' is presumably the name the mobiledna
# library gives to the averaged chat-notification series — verify it exists in avg_feat.
sns.scatterplot(data=avg_feat, x=avg_feat['daily_stress_level'], y=avg_feat['avg_daily_notifications_chat'])

### F-test for each feature
Univariate linear regression tests returning F-statistic.
Explanation [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html)


In [None]:
from sklearn.feature_selection import f_regression, SequentialFeatureSelector
# Split the target from the features. pop() removes the column from avg_feat in
# place, so from here on avg_feat holds features only.
# NOTE(review): re-running this cell raises KeyError because the column is
# already gone, and `labels` changes from a DataFrame to a Series here.
labels = avg_feat.pop('daily_stress_level')
feature_names = avg_feat.columns.tolist()

In [None]:
# Univariate F-statistic of each feature against the stress label; the second
# return value of f_regression is discarded.
f_statistic,_ = f_regression(avg_feat, labels)

# (feature name, F) pairs sorted from highest to lowest F
f_statistics = sorted(zip(feature_names, f_statistic), key=lambda x:x[1], reverse=True)
print(f_statistics)

### Standardize feature set

In [None]:
# z-score every feature column: subtract the column mean, divide by the column
# (sample) standard deviation
feature_means = avg_feat.mean()
feature_stds = avg_feat.std()
res_normalized = (avg_feat - feature_means) / feature_stds
print(res_normalized.head())

### Fit regression models

In [None]:
# build regression: standardized features -> mean daily stress level
X = res_normalized
y = labels

# Cross-validation settings shared by all models. They used to be repeated
# verbatim in every cross_validate call.
cv_folds = 5
cv_scoring = ['explained_variance', 'r2', 'neg_mean_absolute_error',
              'neg_root_mean_squared_error']

# set-up models; the dummy regressor (always predicts the training mean) is the baseline
dum = DummyRegressor(strategy='mean')
en = ElasticNet()
las = Lasso()
svr = SVR()
# fixed seed so the random-forest scores are reproducible across runs
rfr = RandomForestRegressor(n_estimators=100, random_state=42)

# An XGBoost cross-validation (xgboost.cv on a DMatrix built from X and y) was
# tried here as well but is disabled because it takes too long.

# fit and cross-validate
dum_results = cross_validate(dum, X, y, cv=cv_folds, scoring=cv_scoring)
en_results = cross_validate(en, X, y, cv=cv_folds, scoring=cv_scoring)
las_results = cross_validate(las, X, y, cv=cv_folds, scoring=cv_scoring)
svr_results = cross_validate(svr, X, y, cv=cv_folds, scoring=cv_scoring)
rfr_results = cross_validate(rfr, X, y, cv=cv_folds, scoring=cv_scoring)

Rank all models from lowest to highest RMSE

In [None]:
# scikit-learn reports RMSE negated ("higher is better"), so the sign is flipped
# back below to obtain the actual RMSE.
metric = 'test_neg_root_mean_squared_error'

# Named cv_results_by_model rather than `dict`, which shadowed the Python builtin
# for the rest of the kernel session.
cv_results_by_model = {
    'dummy': dum_results,
    'en': en_results,
    'lasso': las_results,
    'svr': svr_results,
    'rfr': rfr_results,
}

# Mean RMSE over all folds per model. Previously only the first fold ([0]) was
# used, which made the ranking depend on a single split.
mean_rmse = {
    model_name: round(-cv_result[metric].mean(), 3)
    for model_name, cv_result in cv_results_by_model.items()
}

# lowest RMSE first = best model first
ranked = sorted(mean_rmse, key=mean_rmse.get)
print(f"models ranked from better to worse: {ranked} (RMSE as metric)")

# plot results: one box per model with the per-fold RMSE (sign flipped so the
# axis shows the real, positive RMSE that the title promises)
plt.boxplot([-cv_result[metric] for cv_result in cv_results_by_model.values()],
            labels=['dummy', 'elasticnet', 'lasso', 'svr', 'rfr'])

plt.title('RMSE on test set for different regressors')
plt.ylabel('RMSE')
plt.savefig('./RMSE_boxplots.png', format='png', dpi=1000)


In [None]:
# Marker printed once a full top-to-bottom run has reached the last cell.
print("FINISHED PROGRAM")
