# Notebook for actual cleaning

includes:
- Removing records (outliers)
- Adjusting (mis-scaled values)
- Imputing (missing values)
- etc

In [40]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data import and transform

In [None]:
# Read in the data
input_df = pd.read_csv('Datasets/dataset_mood_smartphone.csv')

df = input_df.copy()

#split date and time
df['hour'] = pd.to_datetime(df['time']).dt.time
df['date'] = pd.to_datetime(df['time']).dt.date

#round hour to miliseconds
df['hour'] = df['hour'].apply(lambda x: x.replace(microsecond=0))
df.tail()

In [34]:
#reorder columns
print(df.columns)
df = df[['id', 'date', 'hour', 'variable', 'value']]

df.tail()

Index(['Unnamed: 0', 'id', 'time', 'variable', 'value', 'hour', 'date'], dtype='object')


Unnamed: 0,id,date,hour,variable,value
376907,AS14.30,2014-04-11,07:51:16,appCat.weather,8.032
376908,AS14.30,2014-04-19,11:00:32,appCat.weather,3.008
376909,AS14.30,2014-04-26,10:19:07,appCat.weather,7.026
376910,AS14.30,2014-04-27,00:44:48,appCat.weather,23.033
376911,AS14.32,2014-04-07,18:25:14,appCat.weather,22.431


In [35]:
#unique variable values
df['variable'].unique()

array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity',
       'screen', 'call', 'sms', 'appCat.builtin', 'appCat.communication',
       'appCat.entertainment', 'appCat.finance', 'appCat.game',
       'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel',
       'appCat.unknown', 'appCat.utilities', 'appCat.weather'],
      dtype=object)

## Investigation

It seems from the previous used notebooks that the pivoted df has around 20.000 less records than the initial.
Next section was to find out why

Answer:
Since the pivoted df joins all records with the same id, date, time combination, records of 'valence', 'mood' and 'arousal' are merged into a single record (these are recorded at the same moment: whole hours). 
However, this is not for all situations: see final two instances of the printed df down below. It seems that summing these values in the aggregation makes most sense.

In [39]:
#cant aggreagate yet, as there apear to be instances with dubplicate "user, date, hour" combinations

def find_duplicate_combinations(df):
    # Group the rows by 'id', 'date', and 'hour'
    grouped = df.groupby(['id', 'date', 'hour'])
    
    # Get the groups with more than one row
    groups_with_duplicates = grouped.filter(lambda x: len(x) > 1)
    
    # Get the duplicated combinations of 'id', 'date', and 'hour'
    duplicated_combinations = groups_with_duplicates[['id', 'date', 'hour']].drop_duplicates()
    
    # Get the rows corresponding to the duplicated combinations
    duplicated_rows = pd.merge(duplicated_combinations, df, on=['id', 'date', 'hour'])
    
    return duplicated_rows

find_duplicate_combinations(df)

Unnamed: 0,id,date,hour,variable,value
0,AS14.01,2014-02-26,13:00:00,mood,6.000
1,AS14.01,2014-02-26,13:00:00,circumplex.arousal,-1.000
2,AS14.01,2014-02-26,13:00:00,circumplex.valence,0.000
3,AS14.01,2014-02-26,15:00:00,mood,6.000
4,AS14.01,2014-02-26,15:00:00,circumplex.arousal,-1.000
...,...,...,...,...,...
207487,AS14.33,2014-05-04,04:13:12,appCat.builtin,1.018
207488,AS14.25,2014-04-28,11:54:00,appCat.builtin,0.836
207489,AS14.25,2014-04-28,11:54:00,appCat.communication,38.142
207490,AS14.28,2014-04-09,11:11:42,appCat.builtin,0.325


## Aggregate data
Lets try some different methods, and see if the df's differ. If so, which one is better?

In [37]:
def transform_dataframe(df):
    # Pivot the dataframe to create a new column for each unique value in the 'variable' column
    pivoted_df = df.pivot(index=['id', 'date', 'hour'], columns='variable', values='value')
    
    # Reset the index to make the 'id', 'date', and 'hour' columns regular columns
    transformed_df = pivoted_df.reset_index()
    
    return transformed_df

test = transform_dataframe(df)
test

ValueError: Index contains duplicate entries, cannot reshape

In [19]:
#round to hour
df['hour'] = df['hour'].apply(lambda x: x.replace(minute=0, second=0))

df.tail()

Unnamed: 0.1,Unnamed: 0,id,time,variable,value,hour,date
376907,2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032,07:00:00,2014-04-11
376908,2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008,11:00:00,2014-04-19
376909,2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026,10:00:00,2014-04-26
376910,2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033,00:00:00,2014-04-27
376911,2784435,AS14.32,2014-04-07 18:25:14.036,appCat.weather,22.431,18:00:00,2014-04-07


In [4]:
# round time to nearest hour and group by time, person_id, and variable
df['time_hour'] = df['time'].dt.round('H')
grouped_df = df.groupby(['time_hour', 'id', 'variable']).sum().reset_index()

# use pivot_table to create columns for each variable
pivoted_df = grouped_df.pivot_table(index=['time_hour', 'id'], columns='variable', values='value')

pivoted_df

AttributeError: Can only use .dt accessor with datetimelike values