In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statsmodels.graphics.tsaplots import plot_acf

# Read in the data
df = pd.read_csv('Datasets/dataset_mood_smartphone.csv')

df

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.000
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.000
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.000
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.000
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.000
...,...,...,...,...,...
376907,2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
376908,2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
376909,2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
376910,2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033


In [32]:
df['date'] = pd.to_datetime(df['time']).dt.date
df['time'] = pd.to_datetime(df['time']).dt.time

grouped_df = df.groupby(['id', 'date', 'time', 'variable']).sum().reset_index()

#create dataframe of values per id per date per time
df = grouped_df.pivot_table(index=['id', 'date', 'time'], columns='variable', values='value')
df

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms
id,date,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AS14.01,2014-02-17,12:04:42.394000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-17,18:28:25.520000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-18,09:29:51.257000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-19,14:43:30.575000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-19,17:29:10.378000,,,,,,,,,,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AS14.33,2014-05-30,22:32:05.016000,,,,,,,,,,,,6.019,,,,,,,
AS14.33,2014-05-30,22:32:11.049000,,3.122,,,,,,,,,,,,,,,,,
AS14.33,2014-05-30,22:32:14.240000,,1.003,,,,,,,,,,,,,,,,,
AS14.33,2014-05-30,22:32:15.246000,,4.134,,,,,,,,,,,,,,,,,


In [28]:
# Remove instances that are not in the range of the depicted column
# ATTENTION should be performed before merging the data!!
def range_removal(lower, upper, column_name, keep_nan=True):

    column = df[column_name]

    if keep_nan:
        # If we want to keep the NaN values use this
        filtered_df = df[column.between(lower, upper) | column.isna()]
    else:
        # If we want to remove the NaN values use this
        filtered_df = df[(column >= lower) & (column <= upper)]

    return filtered_df

df = range_removal(1, 10, 'mood')
df = range_removal(-2, 2, 'circumplex.arousal')
df = range_removal(-2, 2, 'circumplex.valence')
df = range_removal(0, 1, 'activity')

df

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms
id,date,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AS14.01,2014-02-17,12:04:42.394000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-17,18:28:25.520000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-18,09:29:51.257000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-19,14:43:30.575000,,,,,,,,,,,,,,1.0,,,,,
AS14.01,2014-02-19,17:29:10.378000,,,,,,,,,,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AS14.33,2014-05-30,22:32:05.016000,,,,,,,,,,,,6.019,,,,,,,
AS14.33,2014-05-30,22:32:11.049000,,3.122,,,,,,,,,,,,,,,,,
AS14.33,2014-05-30,22:32:14.240000,,1.003,,,,,,,,,,,,,,,,,
AS14.33,2014-05-30,22:32:15.246000,,4.134,,,,,,,,,,,,,,,,,


In [33]:
df['my_column'] = df.loc[~(df['appCat.builtin'] < 0), 'appCat.builtin']

df

Unnamed: 0_level_0,Unnamed: 1_level_0,variable,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,call,circumplex.arousal,circumplex.valence,mood,screen,sms,my_column
id,date,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AS14.01,2014-02-17,12:04:42.394000,,,,,,,,,,,,,,1.0,,,,,,
AS14.01,2014-02-17,18:28:25.520000,,,,,,,,,,,,,,1.0,,,,,,
AS14.01,2014-02-18,09:29:51.257000,,,,,,,,,,,,,,1.0,,,,,,
AS14.01,2014-02-19,14:43:30.575000,,,,,,,,,,,,,,1.0,,,,,,
AS14.01,2014-02-19,17:29:10.378000,,,,,,,,,,,,,,1.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AS14.33,2014-05-30,22:32:05.016000,,,,,,,,,,,,6.019,,,,,,,,
AS14.33,2014-05-30,22:32:11.049000,,3.122,,,,,,,,,,,,,,,,,,3.122
AS14.33,2014-05-30,22:32:14.240000,,1.003,,,,,,,,,,,,,,,,,,1.003
AS14.33,2014-05-30,22:32:15.246000,,4.134,,,,,,,,,,,,,,,,,,4.134


In [37]:
# Remove the negative instances for the applications
def remove_negative_values(columns, df):
    for column in columns:
        # Keep the NaN values and replace the negative values with 0
        df[column] = df.loc[~(df[column] < 0), column]

    return df

df = remove_negative_values(['appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'screen', 'call', 'sms'], df)

df

0.001


In [None]:
#find outliers
def find_outliers(series):
    outliers = []
    std = np.std(series)
    threshold = 3 * std
    mean = np.mean(series)
        
    for y in series:
        value = (y - mean)
        if np.abs(value) > threshold:
            outliers.append(y)

    return outliers
    
    #print number of outliers per attribute
    for col in df.columns:
        print(col, ': ', len(find_outliers(df[col])))