In [1]:
import datetime as dt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from tqdm import tqdm

In [2]:
def aggregate_individual_data_per_day(raw=pd.DataFrame, to_average=None):
    """
    Aggregate the raw long-format readings of one individual into one row per calendar day.

    Every variable becomes a column. Values are summed (as absolute values) unless the variable
    name is listed in to_average; in that case the daily mean is stored and the daily sample
    standard deviation is written to '<variable>_std'.

    Parameters
    ----------
    INPUT
    raw : df
        DataFrame with the raw data; the columns 'time', 'variable' and 'value' are read.
        The caller's frame is not modified.
    to_average : array-like, optional
        names of the variables to average instead of sum. None means nothing is averaged.

    RETURNS
    processed : df
        DataFrame with one row per day in chronological order. Columns are 'time', 'weekday',
        one column per variable and the '_std' columns. Days without a mood value or with a
        screen total of 0 are left out.
    """
    averaged = set() if to_average is None else set(to_average)

    raw = raw.copy()
    # keep the calendar day only (midnight timestamps) so that readings group per day
    raw['time'] = pd.to_datetime(raw['time']).dt.normalize()

    variables = ['mood', 'circumplex.arousal', 'circumplex.valence',
                 'activity', 'screen', 'call', 'sms', 'appCat.builtin',
                 'appCat.communication', 'appCat.entertainment', 'appCat.finance',
                 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
                 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather']
    # these four '_std' columns are always part of the result; more are added when other
    # variables are averaged as well
    std_columns = ['mood_std', 'circumplex.arousal_std', 'circumplex.valence_std',
                   'activity_std']
    std_columns += [name + '_std' for name in variables
                    if name in averaged and name + '_std' not in std_columns]

    # build one record per day and create the frame once; groupby yields the days sorted
    records = []
    for day, day_rows in raw.groupby('time'):
        record = {'time': day}
        # only real variables are iterated: the '_std' columns must never be treated as
        # variables, otherwise the std just computed is overwritten by an empty sum (0)
        for name in variables:
            values = day_rows.loc[day_rows['variable'] == name, 'value']
            if name in averaged:
                record[name] = values.mean()
                record[name + '_std'] = values.std()
            else:
                record[name] = values.abs().sum()
        records.append(record)

    processed = pd.DataFrame(records, columns=['time'] + variables + std_columns)
    # guarantees a datetime column even when there were no readings at all
    processed['time'] = pd.to_datetime(processed['time'])

    # drop days without a mood value or without any screen time
    keep = processed['mood'].notna() & (processed['screen'] != 0)
    processed = processed.loc[keep].reset_index(drop=True)

    processed.insert(1, 'weekday', processed['time'].dt.weekday)

    return processed

def aggregate_individual_data_per_reading(raw=pd.DataFrame, to_average=None):
    """
    Aggregate the raw long-format readings of one individual into one row per mood reading.

    Each row is labelled with the time of a mood reading. Variables listed in to_average are
    averaged over the readings taken at exactly that time; all other variables are summed
    (as absolute values) over the interval since the previous mood reading.

    Parameters
    ----------
    INPUT
    raw : df
        DataFrame with the raw data; the columns 'time', 'variable' and 'value' are read.
        The caller's frame is not modified.
    to_average : array-like, optional
        names of the variables to average instead of sum. None means nothing is averaged.

    RETURNS
    processed : df
        DataFrame with one row per mood reading in chronological order. Columns are 'time',
        'weekday' and one column per variable. Rows without a mood value or with a screen
        total of 0 are left out. Where the screen total exceeds the seconds elapsed since
        the previous kept row, 'screen' and all later columns are set to NaN.
    """
    averaged = set() if to_average is None else set(to_average)

    raw = raw.copy()
    raw['time'] = pd.to_datetime(raw['time'])
    # sort_values returns a new frame; it has to be assigned, otherwise consecutive mood
    # readings do not delimit real intervals when the input is not already ordered
    raw = raw.sort_values('time', kind='stable')

    variables = ['mood', 'circumplex.arousal', 'circumplex.valence',
                 'activity', 'screen', 'call', 'sms', 'appCat.builtin',
                 'appCat.communication', 'appCat.entertainment', 'appCat.finance',
                 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
                 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather']

    mood_times = raw.loc[raw['variable'] == 'mood', 'time']
    # prepend a date in the past so that the first mood reading also closes an interval
    measure_times = pd.concat(
        [pd.to_datetime(pd.Series(['2014-01-01 00:00:00.000'])), mood_times],
        ignore_index=True)

    # split the readings per variable once instead of re-filtering the whole frame in the loop
    readings = {name: raw.loc[raw['variable'] == name, ['time', 'value']] for name in variables}

    records = []
    for start, end in zip(measure_times.iloc[:-1], measure_times.iloc[1:]):
        record = {'time': end}
        # every variable is visited, including the last one ('appCat.weather')
        for name in variables:
            times = readings[name]['time']
            values = readings[name]['value']
            if name in averaged:
                # read at the row's own timestamp, so the value belongs to the reading the
                # row is labelled with; mean() collapses the match(es) to a scalar
                record[name] = values[times == end].mean()
            else:
                record[name] = values[(times > start) & (times <= end)].abs().sum()
        records.append(record)

    processed = pd.DataFrame(records, columns=['time'] + variables)
    # fixed dtypes keep the steps below working even when there are no mood readings
    processed['time'] = pd.to_datetime(processed['time'])
    processed[variables] = processed[variables].astype(float)

    # delete all data points without a mood value or with a screen total of 0
    keep = processed['mood'].notna() & (processed['screen'] != 0)
    processed = processed.loc[keep].reset_index(drop=True)

    # check that screen time does not exceed the time interval to the previous kept row;
    # total_seconds() is used because Timedelta.seconds ignores whole days
    elapsed = processed['time'].diff().dt.total_seconds()
    usage_columns = variables[variables.index('screen'):]
    processed.loc[elapsed < processed['screen'], usage_columns] = np.nan

    processed.insert(1, 'weekday', processed['time'].dt.weekday)

    return processed

def aggregate_per_day_from_measurement(raw=pd.DataFrame, to_average=None):
    """
    Aggregate data that is already aggregated per measurement into one row per calendar day.

    Every column other than 'time' and 'weekday' is summed (as absolute values) per day unless
    its name is listed in to_average, in which case the daily mean is stored.

    Parameters
    ----------
    INPUT
    raw : df
        DataFrame with the aggregated data per measurement; needs a 'time' column, all other
        columns except 'weekday' are treated as values. The caller's frame is not modified.
    to_average : array-like, optional
        names of the columns to average instead of sum. None means nothing is averaged.

    RETURNS
    processed : df
        DataFrame with one row per day in chronological order and the same columns as raw
        ('weekday' is appended when raw has no such column). 'time' holds the day as a
        midnight timestamp and 'weekday' is recomputed from it.
    """
    averaged = set() if to_average is None else set(to_average)

    raw = raw.copy()
    # keep the calendar day only (midnight timestamps) so that measurements group per day
    raw['time'] = pd.to_datetime(raw['time']).dt.normalize()

    columns = list(raw.columns)
    if 'weekday' not in columns:
        columns.append('weekday')
    # select the value columns by name rather than by position, so the last column is
    # aggregated too and the column order of the input does not matter
    value_columns = [name for name in raw.columns if name not in ('time', 'weekday')]

    # build one record per day and create the frame once; groupby yields the days sorted
    records = []
    for day, day_rows in raw.groupby('time'):
        record = {'time': day, 'weekday': day.weekday()}
        for name in value_columns:
            if name in averaged:
                record[name] = day_rows[name].mean()
            else:
                # NaN entries are skipped by sum(), so a day with only NaN values totals 0
                record[name] = day_rows[name].abs().sum()
        records.append(record)

    processed = pd.DataFrame(records, columns=columns)
    # guarantees a datetime column even when raw has no rows
    processed['time'] = pd.to_datetime(processed['time'])

    return processed

In [3]:
# load the raw long-format readings: one row per reading with id, time, variable and value
# (the file also carries a saved index column that shows up as 'Unnamed: 0')
raw_data = pd.read_csv('data/dataset_mood_smartphone.csv')

In [4]:
# remove the index column that was saved with the CSV; errors='ignore' keeps this cell
# re-runnable once the column is already gone
raw_data = raw_data.drop(columns='Unnamed: 0', errors='ignore')
raw_data.head()

Unnamed: 0,id,time,variable,value
0,AS14.01,2014-02-26 13:00:00.000,mood,6.0
1,AS14.01,2014-02-26 15:00:00.000,mood,6.0
2,AS14.01,2014-02-26 18:00:00.000,mood,6.0
3,AS14.01,2014-02-26 21:00:00.000,mood,7.0
4,AS14.01,2014-02-27 09:00:00.000,mood,6.0


In [5]:
# parse the timestamp strings once so that the .dt accessor can be used in the cells below
raw_data['time'] = pd.to_datetime(raw_data['time'])

In [6]:
# delete lonely values followed by a long gap at the start of a series:
# every (participant id, calendar day) pair listed here is removed completely
lonely_days = [
    ('AS14.01', dt.date(2014, 2, 26)),
    ('AS14.01', dt.date(2014, 2, 27)),
    ('AS14.12', dt.date(2014, 3, 15)),
]
reading_dates = raw_data['time'].dt.date
is_lonely = pd.Series(False, index=raw_data.index)
for participant, day in lonely_days:
    is_lonely |= (raw_data['id'] == participant) & (reading_dates == day)
raw_data.drop(raw_data.loc[is_lonely].index, inplace=True)

# afterwards discard every row that still has a missing entry
raw_data.dropna(inplace=True)

In [7]:
# collect the distinct participant ids; they stay strings such as 'AS14.01' (no integer cast is done here)
ids = raw_data['id'].unique()

### per day

In [8]:
# # specify variables to average
# to_average = np.array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity'])

# # process df for each individual
# processed_dfs = []
# for i in tqdm(ids):
#     raw_df = raw_data.loc[raw_data['id'] == i].copy()
#     processed_df = aggregate_individual_data_per_day(raw_df, to_average)
#     processed_dfs.append(processed_df)
#     processed_df.to_csv('aggregated_individual_data/'+str(i)+'_aggregated_per_day.csv')

### per measurement

In [9]:
# # specify variables to average
# to_average = np.array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity'])

# # process df for each individual
# processed_dfs_measure = []
# for i in tqdm(ids):
#     raw_df = raw_data.loc[raw_data['id'] == i].copy()
#     processed_df = aggregate_individual_data_per_reading(raw_df, to_average)
#     processed_dfs_measure.append(processed_df)
#     processed_df.to_csv('aggregated_individual_data_per_measurement/'+str(i)+'_aggregated_per_measurement.csv')

### per measurement and day

In [10]:
# specify variables to average
to_average = np.array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity'])

# make sure the output folders exist, otherwise to_csv fails on a fresh checkout
measurement_dir = Path('data/aggregated_individual_data_per_measurement')
day_dir = Path('data/aggregated_individual_data')
for out_dir in (measurement_dir, day_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

processed_dfs_measure = []
processed_dfs_day = []

# for every individual: aggregate per mood reading first, then roll those rows up per day
for i in tqdm(ids):
    raw_df = raw_data.loc[raw_data['id'] == i].copy()
    per_measurement_df = aggregate_individual_data_per_reading(raw_df, to_average)
    processed_dfs_measure.append(per_measurement_df)
    per_measurement_df.to_csv(measurement_dir / (str(i) + '_aggregated_per_measurement.csv'))

    # processed_df keeps its original meaning after the loop: the last per-day frame
    processed_df = aggregate_per_day_from_measurement(per_measurement_df, to_average)
    processed_dfs_day.append(processed_df)
    processed_df.to_csv(day_dir / (str(i) + '_aggregated_per_day.csv'))

100%|██████████| 27/27 [01:03<00:00,  2.37s/it]
