In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf

# Load the raw records; the steps below use the 'id', 'time', 'variable'
# and 'value' columns of this file.
df = pd.read_csv('dataset_mood_smartphone-2.csv')

# Parse the timestamp once, then split it into a calendar date and a time of day.
timestamps = pd.to_datetime(df['time'])
df['date'] = timestamps.dt.date
df['time'] = timestamps.dt.time

# Collapse records that share the same id / date / time / variable by summing them.
grouped_df = (
    df.groupby(['id', 'date', 'time', 'variable'])
    .sum()
    .reset_index()
)

# Long -> wide: one row per (id, date, time), one column per variable.
# pivot_table is left on its default aggregation.
df = grouped_df.pivot_table(
    index=['id', 'date', 'time'],
    columns='variable',
    values='value',
)

# Per-variable daily aggregation rule. The key order below fixes the column
# order of the daily frame, so the two lists must stay in this sequence.
mean_variables = [
    'mood',
    'circumplex.arousal',
    'circumplex.valence',
    'activity',
]
sum_variables = [
    'screen',
    'call',
    'sms',
    'appCat.builtin',
    'appCat.communication',
    'appCat.entertainment',
    'appCat.finance',
    'appCat.game',
    'appCat.office',
    'appCat.other',
    'appCat.social',
    'appCat.travel',
    'appCat.unknown',
    'appCat.weather',
    'appCat.utilities',
]
agg_dict = {
    **dict.fromkeys(mean_variables, 'mean'),
    **dict.fromkeys(sum_variables, 'sum'),
}

# One row per (date, id): averaged variables use the mean, the rest are summed.
# NOTE(review): 'sum' returns 0 rather than NaN for a day on which a variable
# was never recorded -- confirm that is the intended encoding of "no data".
df = df.groupby(['date', 'id']).agg(agg_dict).reset_index()

# One-hot encode the participant id and swap the indicator columns in for 'id'.
one_hot_df = pd.get_dummies(df['id'])
df = pd.concat([df, one_hot_df], axis=1).drop('id', axis=1)

df


Unnamed: 0,date,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,...,AS14.24,AS14.25,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33
0,2014-02-17,,,,,0.000000,2.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0,0
1,2014-02-17,,,,,0.000000,4.0,2.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0,0
2,2014-02-17,,,,,0.000000,0.0,1.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0,0
3,2014-02-17,,,,,0.000000,5.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0,0
4,2014-02-17,,,,,0.000000,0.0,4.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,7.0,0.0,0.80,0.152343,2371.588999,11.0,0.0,1749.277,958.821,...,1,0,0,0,0,0,0,0,0,0
1969,2014-06-06,7.5,0.5,0.25,0.189476,2733.332000,16.0,0.0,945.898,1710.924,...,1,0,0,0,0,0,0,0,0,0
1970,2014-06-07,8.0,-1.0,0.00,0.081893,5530.377999,0.0,2.0,5027.498,735.965,...,1,0,0,0,0,0,0,0,0,0
1971,2014-06-08,6.5,-1.5,1.00,0.079510,1131.197999,0.0,0.0,351.389,375.515,...,1,0,0,0,0,0,0,0,0,0


In [67]:
#df['MA'] = df['mood'].ewm(span=10).mean()

def MA_on_missing_values(dataframe, column, n):
    '''
    Fill the missing values of a column with its exponentially weighted moving average.

    The EWMA (``span=n``) of ``column`` is stored in an 'MA' column (created, or
    overwritten if it already exists), and every NaN in ``column`` is replaced by
    the 'MA' value of the same row. The frame that is passed in is modified in
    place and is also returned.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to impute.
    column : str
        Name of the column whose NaN values are filled.
    n : int
        Span passed to ``Series.ewm``.

    Returns
    -------
    pd.DataFrame
        The same ``dataframe`` object, with 'MA' added and ``column`` filled.

    Notes
    -----
    The average runs over the rows in their current order; no grouping is applied.
    Rows that precede the first observed value have a NaN average and stay NaN.
    '''
    # Compute the average from the column as it is *before* any value is filled.
    dataframe['MA'] = dataframe[column].ewm(span=n).mean()

    # Vectorised fill on the frame that was passed in. The previous row loop
    # wrote to the global `df`, which broke for any other frame.
    dataframe[column] = dataframe[column].fillna(dataframe['MA'])

    return dataframe

# Impute missing 'mood' values on df itself (this also writes the 'MA' column).
# The returned frame is not kept; the call is made for its effect on df.
MA_on_missing_values(df, 'mood', 10)

# Keep only the rows whose one-hot indicator column 'AS14.01' equals 1.
is_as14_01 = df['AS14.01'] == 1
selected_df = df[is_as14_01]
selected_df.head(60)
            



Unnamed: 0,date,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,...,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33,weighted_moving_avg_mood,MA
0,2014-02-17,,,,,0.0,2.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
23,2014-02-18,,,,,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
44,2014-02-19,,,,,0.0,7.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
64,2014-02-20,,,,,0.0,2.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
82,2014-02-21,,,,,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
102,2014-02-22,,,,,0.0,2.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
159,2014-02-25,,,,,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,
177,2014-02-26,6.25,-0.25,0.75,,0.0,1.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,6.25
196,2014-02-27,6.333333,0.0,0.333333,,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,6.26543
217,2014-02-28,6.331532,,,,0.0,4.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,,6.330573


In [42]:
# Columns holding the per-category app usage totals.
app_times = [
    'appCat.builtin',
    'appCat.communication',
    'appCat.entertainment',
    'appCat.finance',
    'appCat.game',
    'appCat.office',
    'appCat.other',
    'appCat.social',
    'appCat.travel',
    'appCat.unknown',
    'appCat.utilities',
    'appCat.weather',
]

# Row-wise total over all app categories, stored as 'sum_screentime'.
df['sum_screentime'] = df.loc[:, app_times].sum(axis='columns')

# Difference between the recorded 'screen' value and the summed app time;
# it is negative whenever the app categories add up to more than 'screen'.
df['overdue_time'] = df['screen'].sub(df['sum_screentime'])

# Show everything except the last 51 rows.
df.head(-51)

Unnamed: 0,date,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,...,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33,sum_screentime,overdue_time
0,2014-02-17,,,,,0.000000,2.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
1,2014-02-17,,,,,0.000000,4.0,2.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
2,2014-02-17,,,,,0.000000,0.0,1.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
3,2014-02-17,,,,,0.000000,5.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
4,2014-02-17,,,,,0.000000,0.0,4.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1917,2014-05-16,7.4,0.6,0.8,0.107087,9174.194000,2.0,0.0,2902.587,4533.666,...,1,0,0,0,0,0,0,0,10469.791,-1295.597000
1918,2014-05-16,7.0,0.4,0.8,0.051781,5389.559000,11.0,0.0,2212.464,541.080,...,0,0,0,0,0,0,0,1,6274.341,-884.782000
1919,2014-05-17,7.6,-0.4,0.8,0.030052,5039.252001,1.0,0.0,2311.960,3430.633,...,0,0,0,0,0,0,0,0,6060.445,-1021.192999
1920,2014-05-17,7.4,0.4,0.4,0.133427,4837.187001,3.0,0.0,1027.730,1690.919,...,1,0,0,0,0,0,0,0,5265.258,-428.070999
