In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statsmodels.graphics.tsaplots import plot_acf

# Load the raw long-format measurements
df = pd.read_csv('dataset_mood_smartphone-2.csv')

# Parse the timestamp column once, then split it into a calendar date and a clock time
timestamps = pd.to_datetime(df['time'])
df['date'] = timestamps.dt.date
df['time'] = timestamps.dt.time

# Sum rows that share the same id / date / time / variable
grouped_df = df.groupby(['id', 'date', 'time', 'variable']).sum().reset_index()

# Wide layout: one row per id / date / time, one column per variable
df = grouped_df.pivot_table(values='value', index=['id', 'date', 'time'], columns='variable')

# Daily aggregation rule per variable: the first group is averaged over the day,
# the second group is summed over the day
mean_variables = ['mood', 'circumplex.arousal', 'circumplex.valence', 'activity']
sum_variables = [
    'screen', 'call', 'sms',
    'appCat.builtin', 'appCat.communication', 'appCat.entertainment',
    'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other',
    'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.weather',
    'appCat.utilities',
]
agg_dict = {
    **dict.fromkeys(mean_variables, 'mean'),
    **dict.fromkeys(sum_variables, 'sum'),
}

# One row per date and id, aggregated according to agg_dict
df = df.groupby(['date', 'id']).agg(agg_dict).reset_index()



Removal of values 

- Remove the leading rows of each participant for which the mood is still NaN, i.e. the days before regular mood measurements start
- Remove values that fall outside the valid range of their variable (this should actually be done on the raw measurements, before summing and taking the mean)
- Set all NaN values of the application variables to 0 (we can check later if there is a better way)

In [22]:
# Function that removes the starting nan values
def remove_starting_nan(column_name, dataframe, n_param):
    '''
    Trim, per id, the rows that precede the first stable run of measurements.

    params:
        column_name -> name of the column to inspect, as a string
        dataframe -> DataFrame with an 'id' column; rows are assumed to be in
                     chronological order within each id (TODO confirm at the caller)
        n_param -> int, number of consecutive non-NaN values that marks the
                   start of real measurements (values below 1 are treated as 1)

    returns:
        A new DataFrame holding, for every id, the rows from the first value of
        the first run of n_param consecutive non-NaN entries onwards. Original
        index labels are kept. Ids that never reach such a run are dropped
        entirely. The input dataframe is not modified.
    '''
    required_run = max(int(n_param), 1)
    kept_parts = []

    # sort=False keeps the ids in order of first appearance
    for _, person_df in dataframe.groupby('id', sort=False):
        # notna() is used instead of a float type check: NaN is itself a float,
        # so a type check would count missing values as measurements
        is_measured = person_df[column_name].notna().to_numpy()

        run_length = 0
        for position, measured in enumerate(is_measured):
            run_length = run_length + 1 if measured else 0
            if run_length == required_run:
                # Cut at the start of the run (a position, not a NaN count), so
                # NaNs that occur after an isolated early value are handled too
                run_start = position - required_run + 1
                kept_parts.append(person_df.iloc[run_start:])
                break

    if not kept_parts:
        # Nothing qualified (or empty input): keep the column layout
        return dataframe.iloc[0:0].copy()

    # Single concat instead of growing a frame inside the loop
    # (DataFrame.append no longer exists in pandas 2.x)
    return pd.concat(kept_parts)
    
# Trim every id's rows up to the start of its mood measurements
df = remove_starting_nan(column_name='mood', dataframe=df, n_param=4)

# Number of mood entries that are still missing after trimming
num_nan_values = df['mood'].isnull().sum()
print(num_nan_values)

df


40


variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
645,2014-03-21,AS14.01,6.20,0.2,0.20,0.134050,17978.907000,6.0,0.0,3139.218,...,1007.456,49.544,0.0,172.206,239.751,4508.500,915.445,0.000,0.000,598.754
670,2014-03-22,AS14.01,6.40,0.6,0.40,0.236880,6142.161000,3.0,1.0,731.429,...,93.324,21.076,0.0,0.000,98.143,439.632,37.305,0.000,0.000,117.621
691,2014-03-23,AS14.01,6.80,0.2,0.80,0.142741,6773.832001,0.0,0.0,1286.246,...,94.346,43.403,0.0,0.000,72.823,900.839,0.000,0.000,30.386,30.086
714,2014-03-24,AS14.01,6.00,0.8,0.00,0.078961,15047.351001,10.0,0.0,866.956,...,976.971,34.106,0.0,3.010,66.558,3223.626,419.805,0.000,0.000,178.732
739,2014-03-25,AS14.01,6.75,0.5,0.50,0.098374,21475.354999,0.0,1.0,1032.768,...,68.206,43.054,0.0,0.000,178.819,1919.471,0.000,235.223,0.000,222.893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.00,0.0,0.80,0.152343,2371.588999,11.0,0.0,1749.277,...,0.000,0.000,0.0,0.000,47.360,171.964,225.737,0.000,0.000,0.000
1969,2014-06-06,AS14.24,7.50,0.5,0.25,0.189476,2733.332000,16.0,0.0,945.898,...,0.000,0.000,0.0,0.000,36.414,0.000,0.000,0.000,0.000,2.072
1970,2014-06-07,AS14.24,8.00,-1.0,0.00,0.081893,5530.377999,0.0,2.0,5027.498,...,0.000,0.000,0.0,0.000,6.214,106.850,0.000,0.000,0.000,0.000
1971,2014-06-08,AS14.24,6.50,-1.5,1.00,0.079510,1131.197999,0.0,0.0,351.389,...,0.000,0.000,0.0,0.000,18.069,190.760,0.000,0.000,0.000,0.000


In [16]:
# # Perform one-hot encoding on the persons
# one_hot_df = pd.get_dummies(df['id'])

# # Concatenate the one-hot encoded columns to the original DataFrame
# df = pd.concat([df, one_hot_df], axis=1)

# # Drop the original categorical column
# df.drop('id', axis=1, inplace=True)

# df

In [23]:
# Remove instances that are not in the range of the depicted column
# ATTENTION should be performed before merging the data!!
def range_removal(lower, upper, column_name, dataframe=None):
    '''
    Drop the rows whose value in column_name lies outside [lower, upper].

    params:
        lower -> smallest allowed value (inclusive)
        upper -> largest allowed value (inclusive)
        column_name -> name of the column to check, as a string
        dataframe -> DataFrame to filter; when omitted, the notebook-level `df`
                     is used, which keeps existing three-argument calls working

    returns:
        A new DataFrame with the rows whose value is within the range or NaN.
        The input frame is not modified.
    '''
    if dataframe is None:
        # Backward-compatible fallback to the notebook-level frame
        dataframe = df

    column = dataframe[column_name]

    # Rows with a NaN value are kept on purpose; to drop them as well, remove
    # the `| column.isna()` part of the mask
    keep_mask = column.between(lower, upper) | column.isna()

    # Explicit copy so that the result can be modified later without
    # chained-assignment warnings
    filtered_df = dataframe[keep_mask].copy()

    return filtered_df

# (lower, upper, column) bounds; every pass filters the result of the previous one
valid_ranges = [
    (1, 10, 'mood'),
    (-2, 2, 'circumplex.arousal'),
    (-2, 2, 'circumplex.valence'),
    (0, 1, 'activity'),
]
for range_lower, range_upper, range_column in valid_ranges:
    df = range_removal(range_lower, range_upper, range_column)

df




variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities
645,2014-03-21,AS14.01,6.20,0.2,0.20,0.134050,17978.907000,6.0,0.0,3139.218,...,1007.456,49.544,0.0,172.206,239.751,4508.500,915.445,0.000,0.000,598.754
670,2014-03-22,AS14.01,6.40,0.6,0.40,0.236880,6142.161000,3.0,1.0,731.429,...,93.324,21.076,0.0,0.000,98.143,439.632,37.305,0.000,0.000,117.621
691,2014-03-23,AS14.01,6.80,0.2,0.80,0.142741,6773.832001,0.0,0.0,1286.246,...,94.346,43.403,0.0,0.000,72.823,900.839,0.000,0.000,30.386,30.086
714,2014-03-24,AS14.01,6.00,0.8,0.00,0.078961,15047.351001,10.0,0.0,866.956,...,976.971,34.106,0.0,3.010,66.558,3223.626,419.805,0.000,0.000,178.732
739,2014-03-25,AS14.01,6.75,0.5,0.50,0.098374,21475.354999,0.0,1.0,1032.768,...,68.206,43.054,0.0,0.000,178.819,1919.471,0.000,235.223,0.000,222.893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.00,0.0,0.80,0.152343,2371.588999,11.0,0.0,1749.277,...,0.000,0.000,0.0,0.000,47.360,171.964,225.737,0.000,0.000,0.000
1969,2014-06-06,AS14.24,7.50,0.5,0.25,0.189476,2733.332000,16.0,0.0,945.898,...,0.000,0.000,0.0,0.000,36.414,0.000,0.000,0.000,0.000,2.072
1970,2014-06-07,AS14.24,8.00,-1.0,0.00,0.081893,5530.377999,0.0,2.0,5027.498,...,0.000,0.000,0.0,0.000,6.214,106.850,0.000,0.000,0.000,0.000
1971,2014-06-08,AS14.24,6.50,-1.5,1.00,0.079510,1131.197999,0.0,0.0,351.389,...,0.000,0.000,0.0,0.000,18.069,190.760,0.000,0.000,0.000,0.000


In [24]:
import pandas as pd

def MA_on_missing_values(dataframe, column, n):
    '''
    Fill the NaN entries of `column` with a per-'id' exponentially weighted
    moving average of that column.

    params:
        dataframe -> DataFrame with an 'id' column and the column to fill; it is
                     modified in place
        column -> name of the column whose NaN values are filled, as a string
        n -> span passed to the exponentially weighted window

    returns:
        The same dataframe object. NaN values in `column` are replaced by the
        average stored in the helper column 'MA'. 'MA' is (re)written on every
        call, so it holds the averages of the most recently processed column.
        Entries for which no average is available stay NaN.
    '''
    # Exponentially weighted mean computed separately for every id; dropping the
    # group level of the result restores the original row labels for alignment
    per_id_average = dataframe.groupby('id')[column].ewm(span=n).mean()
    dataframe['MA'] = per_id_average.reset_index(level=0, drop=True)

    # Only missing entries take the average; observed values are left untouched
    dataframe[column] = dataframe[column].fillna(dataframe['MA'])

    return dataframe

# Fill the gaps of every averaged variable, one column after the other (span of 10)
for gap_column in ['mood', 'circumplex.arousal', 'circumplex.valence', 'activity']:
    selected_df = MA_on_missing_values(df, gap_column, 10)

selected_df


variable,date,id,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.weather,appCat.utilities,MA
645,2014-03-21,AS14.01,6.20000,0.200000,0.200000,0.134050,17978.907000,6.0,0.0,3139.218,...,49.544,0.0,172.206,239.751,4508.500,915.445,0.000,0.000,598.754,0.134050
670,2014-03-22,AS14.01,6.40000,0.600000,0.400000,0.236880,6142.161000,3.0,1.0,731.429,...,21.076,0.0,0.000,98.143,439.632,37.305,0.000,0.000,117.621,0.190607
691,2014-03-23,AS14.01,6.80000,0.200000,0.800000,0.142741,6773.832001,0.0,0.0,1286.246,...,43.403,0.0,0.000,72.823,900.839,0.000,0.000,30.386,30.086,0.171365
714,2014-03-24,AS14.01,6.00000,0.800000,0.000000,0.078961,15047.351001,10.0,0.0,866.956,...,34.106,0.0,3.010,66.558,3223.626,419.805,0.000,0.000,178.732,0.140922
739,2014-03-25,AS14.01,6.75000,0.500000,0.500000,0.098374,21475.354999,0.0,1.0,1032.768,...,43.054,0.0,0.000,178.819,1919.471,0.000,235.223,0.000,222.893,0.128708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1968,2014-06-05,AS14.24,7.00000,0.000000,0.800000,0.152343,2371.588999,11.0,0.0,1749.277,...,0.000,0.0,0.000,47.360,171.964,225.737,0.000,0.000,0.000,0.079928
1969,2014-06-06,AS14.24,7.50000,0.500000,0.250000,0.189476,2733.332000,16.0,0.0,945.898,...,0.000,0.0,0.000,36.414,0.000,0.000,0.000,0.000,2.072,0.099847
1970,2014-06-07,AS14.24,8.00000,-1.000000,0.000000,0.081893,5530.377999,0.0,2.0,5027.498,...,0.000,0.0,0.000,6.214,106.850,0.000,0.000,0.000,0.000,0.096582
1971,2014-06-08,AS14.24,6.50000,-1.500000,1.000000,0.079510,1131.197999,0.0,0.0,351.389,...,0.000,0.0,0.000,18.069,190.760,0.000,0.000,0.000,0.000,0.093478


In [25]:
selected_df

# Write the frame to a CSV file; index=False leaves the row labels out of the file
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

In [120]:
# Number of missing entries in every column
nan_count = df.isnull().sum()
print(nan_count)

date                    0
mood                    0
circumplex.arousal      0
circumplex.valence      0
activity                0
screen                  0
call                    0
sms                     0
appCat.builtin          0
appCat.communication    0
appCat.entertainment    0
appCat.finance          0
appCat.game             0
appCat.office           0
appCat.other            0
appCat.social           0
appCat.travel           0
appCat.unknown          0
appCat.weather          0
appCat.utilities        0
AS14.01                 0
AS14.02                 0
AS14.03                 0
AS14.05                 0
AS14.06                 0
AS14.07                 0
AS14.08                 0
AS14.09                 0
AS14.12                 0
AS14.13                 0
AS14.14                 0
AS14.15                 0
AS14.16                 0
AS14.17                 0
AS14.19                 0
AS14.20                 0
AS14.23                 0
AS14.24                 0
AS14.25     

In [42]:
# Names of all per-category app usage columns
app_times = [
    'appCat.' + category
    for category in (
        'builtin', 'communication', 'entertainment', 'finance', 'game', 'office',
        'other', 'social', 'travel', 'unknown', 'utilities', 'weather',
    )
]

# Row-wise total over the app category columns, stored as 'sum_screentime'
df['sum_screentime'] = df.loc[:, app_times].sum(axis='columns')

# 'overdue_time' is the 'screen' value minus that total
df['overdue_time'] = df['screen'].sub(df['sum_screentime'])

df.head(-51)

Unnamed: 0,date,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,appCat.communication,...,AS14.26,AS14.27,AS14.28,AS14.29,AS14.30,AS14.31,AS14.32,AS14.33,sum_screentime,overdue_time
0,2014-02-17,,,,,0.000000,2.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
1,2014-02-17,,,,,0.000000,4.0,2.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
2,2014-02-17,,,,,0.000000,0.0,1.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
3,2014-02-17,,,,,0.000000,5.0,0.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
4,2014-02-17,,,,,0.000000,0.0,4.0,0.000,0.000,...,0,0,0,0,0,0,0,0,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1917,2014-05-16,7.4,0.6,0.8,0.107087,9174.194000,2.0,0.0,2902.587,4533.666,...,1,0,0,0,0,0,0,0,10469.791,-1295.597000
1918,2014-05-16,7.0,0.4,0.8,0.051781,5389.559000,11.0,0.0,2212.464,541.080,...,0,0,0,0,0,0,0,1,6274.341,-884.782000
1919,2014-05-17,7.6,-0.4,0.8,0.030052,5039.252001,1.0,0.0,2311.960,3430.633,...,0,0,0,0,0,0,0,0,6060.445,-1021.192999
1920,2014-05-17,7.4,0.4,0.4,0.133427,4837.187001,3.0,0.0,1027.730,1690.919,...,1,0,0,0,0,0,0,0,5265.258,-428.070999
