# Preprocessing of dataset

In [13]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt

#### Reading the file

In [14]:
cwd = os.getcwd()
dataset = pd.read_csv("dataset_mood_smartphone.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


#### The id field contains many different users. Our goal is aggregate all data per user per day, and then sum each 5 days or so into a single row

In [15]:
# First we remove the timestamps, as we only want the dates for when trying to find unique dates
# temp_dataset = dataset

# for i in range(len(temp_dataset)):
#     temp_dataset["time"][i] = temp_dataset["time"][i][0:10]

temp_dataset = pd.read_csv("Time_edited_dataset.csv")
# temp_dataset = temp_dataset.sort_values(by=['id', 'time', 'variable'])
# temp_dataset.to_csv("Time_edited_dataset.csv")

In [16]:
temp_dataset.head()

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
0,0,AS14.01,2014-02-17,call,1.0
1,1,AS14.01,2014-02-17,call,1.0
2,2,AS14.01,2014-02-18,call,1.0
3,3,AS14.01,2014-02-19,call,1.0
4,4,AS14.01,2014-02-19,call,1.0


In [8]:
def preprocessing(dataset, remove_empty_mood, normalize_time_data, drop_cols, fill_missing_cols_mean):
    data = dataset
    id_list = []
    time_list = []
    mood_list = []
    circumplex_arousal_list = []
    circumplex_valence_list = []
    activity_list = []
    screen_list = []
    call_list = []
    sms_list = []
    appCat_builtin_list = []
    appCat_communication_list = []
    appCat_entertainment_list = []
    appCat_finance_list = []
    appCat_game_list = []
    appCat_office_list = []
    appCat_other_list = []
    appCat_social_list = []
    appCat_travel_list = []
    appCat_unknown_list = []
    appCat_utilities_list = []
    appCat_weather_list = []

    for users in pd.unique(data["id"]):
        
        print(f"Working on user {users}")
        user_data = data.loc[data["id"] == users]
        
        


        for day in pd.unique(user_data["time"]):
            id_list.append(users)
            time_list.append(day)

            try:
                mood_list.append(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "mood"]["value"].mean())
            except:
                mood_list.append(np.nan)
            
            try:
                circumplex_arousal_list.append(round(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "circumplex.arousal"]["value"].mean()))
            except:
                circumplex_arousal_list.append(np.nan)

            try:
                circumplex_valence_list.append(round(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "circumplex.valence"]["value"].mean()))
            except:
                circumplex_valence_list.append(np.nan)

            try:
                activity_list.append(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "activity"]["value"].mean())
            except:
                activity_list.append(np.nan)

            try:
                screen_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "screen"]["value"]))
            except:
                screen_list.append(np.nan)

            try:
                call_list.append(int(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "call"]["value"])))
            except:
                call_list.append(np.nan)

            try:
                sms_list.append(int(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "sms"]["value"])))
            except:
                sms_list.append(np.nan)

            try:
                appCat_builtin_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.builtin"]["value"]))
            except:
                appCat_builtin_list.append(np.nan)

            try:
                appCat_communication_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.communication"]["value"]))
            except:
                appCat_communication_list.append(np.nan)

            try:
                appCat_entertainment_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.entertainment"]["value"]))
            except:
                appCat_entertainment_list.append(np.nan)

            try:
                appCat_finance_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.finance"]["value"]))
            except:
                appCat_finance_list.append(np.nan)
            
            try:
                appCat_game_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.game"]["value"]))
            except:
                appCat_game_list.append(np.nan)

            try:
                appCat_office_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.office"]["value"]))
            except:
                appCat_office_list.append(np.nan)

            try:
                appCat_other_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.other"]["value"]))
            except:
                appCat_other_list.append(np.nan)

            try:
                appCat_social_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.social"]["value"]))
            except:
                appCat_social_list.append(np.nan)

            try:
                appCat_travel_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.travel"]["value"]))
            except:
                appCat_travel_list.append(np.nan)

            try:
                appCat_unknown_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.builtin"]["value"]))
            except:
                appCat_unknown_list.append(np.nan)

            try:
                appCat_utilities_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.utilities"]["value"]))
            except:
                appCat_utilities_list.append(np.nan)

            try:
                appCat_weather_list.append(sum(user_data.loc[user_data["time"] == day].loc[user_data["variable"] == "appCat.weather"]["value"]))
            except:
                appCat_weather_list.append(np.nan)

    
            
       
        

    if normalize_time_data == True:
        screen_list = [i / max(screen_list) for i in screen_list]
        appCat_builtin_list = [i / max(appCat_builtin_list) for i in appCat_builtin_list]
        appCat_communication_list = [i / max(appCat_communication_list) for i in appCat_communication_list]
        appCat_entertainment_list = [i / max(appCat_entertainment_list) for i in appCat_entertainment_list]
        appCat_finance_list = [i / max(appCat_finance_list) for i in appCat_finance_list]
        appCat_game_list = [i / max(appCat_game_list) for i in appCat_game_list]
        appCat_office_list = [i / max(appCat_office_list) for i in appCat_office_list]
        appCat_other_list = [i / max(appCat_other_list) for i in appCat_other_list]
        appCat_social_list = [i / max(appCat_social_list) for i in appCat_social_list]
        appCat_travel_list = [i / max(appCat_travel_list) for i in appCat_travel_list]
        appCat_unknown_list = [i / max(appCat_unknown_list) for i in appCat_unknown_list]
        appCat_utilities_list = [i / max(appCat_utilities_list) for i in appCat_utilities_list]
        appCat_weather_list = [i / max(appCat_weather_list) for i in appCat_weather_list]
        



    final_dataframe = pd.DataFrame({"id" : id_list, 
                                    "time": time_list, 
                                    "mood" : mood_list, 
                                    "circumplex.arousal" : circumplex_arousal_list, 
                                    "circumplex.valence" : circumplex_valence_list, 
                                    "activity": activity_list,
                                    "screen" : screen_list,
                                    "call" : call_list,
                                    "sms" : sms_list,
                                    "appCat.builtin" : appCat_builtin_list,
                                    "appCat.communication" : appCat_communication_list,
                                    "appCat.entertainment" : appCat_entertainment_list,
                                    "appCat.finance" : appCat_finance_list,
                                    "appCat.game" : appCat_game_list,
                                    "appCat.office" : appCat_office_list,
                                    "appCat.other" : appCat_other_list,
                                    "appCat.social" : appCat_social_list,
                                    "appCat.travel" : appCat_travel_list,
                                    "appCat.unknown" : appCat_unknown_list,
                                    "appCat.utilities" : appCat_utilities_list,
                                    "appCat.weather" : appCat_weather_list
                                }) 

    if remove_empty_mood == True:
        final_dataframe = final_dataframe.dropna(subset=['mood'])

    for i in range(len(drop_cols)):
        final_dataframe = final_dataframe.drop(drop_cols[i], inplace=True, axis=1) 
    
    for i in range(len(fill_missing_cols_mean)):
        final_dataframe = final_dataframe[i].fillna(final_dataframe[i].mean())

    return final_dataframe





In [9]:
# (dataset=temp_dataset, remove_empty_mood=False, normalize_time_data = False, drop_cols = [], fill_missing_cols_mean = [])
test = preprocessing(dataset=temp_dataset, remove_empty_mood=True)
test.to_csv(f'{cwd}/datasets/without_empty_mood_values.csv')

Working on user AS14.01
Working on user AS14.02
Working on user AS14.03
Working on user AS14.05
Working on user AS14.06
Working on user AS14.07
Working on user AS14.08
Working on user AS14.09
Working on user AS14.12
Working on user AS14.13
Working on user AS14.14
Working on user AS14.15
Working on user AS14.16
Working on user AS14.17
Working on user AS14.19
Working on user AS14.20
Working on user AS14.23
Working on user AS14.24
Working on user AS14.25
Working on user AS14.26
Working on user AS14.27
Working on user AS14.28
Working on user AS14.29
Working on user AS14.30
Working on user AS14.31
Working on user AS14.32
Working on user AS14.33


In [10]:
test.head()

Unnamed: 0,id,time,mood,circumplex.arousal,circumplex.valence,activity,screen,call,sms,appCat.builtin,...,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,appCat.social,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather
7,AS14.01,2014-02-26,6.25,0.0,1.0,,0.0,1,2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,AS14.01,2014-02-27,6.333333,0.0,0.0,,0.0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,AS14.01,2014-03-21,6.2,0.0,0.0,0.13405,17978.907,6,0,3139.218,...,1007.456,49.544,0.0,172.206,239.751,4508.5,915.445,3139.218,598.754,0.0
27,AS14.01,2014-03-22,6.4,1.0,0.0,0.23688,6142.161,3,1,731.429,...,93.324,21.076,0.0,0.0,98.143,439.632,37.305,731.429,117.621,0.0
28,AS14.01,2014-03-23,6.8,0.0,1.0,0.142741,6773.832001,0,0,1286.246,...,94.346,43.403,0.0,0.0,72.823,900.839,0.0,1286.246,30.086,30.386


# Analysis of our custom dataset

In [None]:
pl