In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 200)

In [3]:
df = pd.read_csv('clean.csv', index_col=0)
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')

In [4]:
# get date
df['date'] = df['time'].dt.date

# get day
df['day'] = df['time'].dt.weekday

# get month
df['month'] = df['time'].dt.month

df

Unnamed: 0,id,time,variable,value,date,day,month
0,AS14.01,2014-02-26 13:00:00.000,mood,6.000,2014-02-26,2,2
1,AS14.01,2014-02-26 15:00:00.000,mood,6.000,2014-02-26,2,2
2,AS14.01,2014-02-26 18:00:00.000,mood,6.000,2014-02-26,2,2
3,AS14.01,2014-02-26 21:00:00.000,mood,7.000,2014-02-26,2,2
4,AS14.01,2014-02-27 09:00:00.000,mood,6.000,2014-02-27,3,2
...,...,...,...,...,...,...,...
370770,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032,2014-04-11,4,4
370771,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008,2014-04-19,5,4
370772,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026,2014-04-26,5,4
370773,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033,2014-04-27,6,4


In [5]:
aggregations = {'mood':'mean', 
                'circumplex.arousal':'mean', 
                'circumplex.valence':'mean', 
                'activity':'mean',
                'screen':'sum',
                'call':'sum',
                'sms':'sum',
                'appCat.builtin':'sum',
                'appCat.communication':'sum',
                'appCat.entertainment':'sum',
                'appCat.finance':'sum',
                'appCat.game':'sum',
                'appCat.office':'sum',
                'appCat.other':'sum',
                'appCat.social':'sum',
                'appCat.travel':'sum',
                'appCat.unknown':'sum',
                'appCat.utilities':'sum',
                'appCat.weather':'sum',}

df_agg = pd.DataFrame(columns=['id', 'date'])

for variable, aggregation in aggregations.items():
    df_new = df[df['variable']==variable].groupby(['id', 'date']).value.agg([aggregation])
    df_new = df_new.rename(columns={aggregation:f'{variable}_{aggregation}'}).reset_index()
    df_agg = pd.merge(df_agg, df_new, on=['id', 'date'], how='outer')

df_dates = df[['date', 'day', 'month']].drop_duplicates()
df_agg = pd.merge(df_agg, df_dates, on=['date'], how='left')

df_agg = df_agg.sort_values(['id', 'date'])

df_agg.to_csv('dataset_aggregated.csv')
df_agg

Unnamed: 0,id,date,mood_mean,circumplex.arousal_mean,circumplex.valence_mean,activity_mean,screen_sum,call_sum,sms_sum,appCat.builtin_sum,...,appCat.game_sum,appCat.office_sum,appCat.other_sum,appCat.social_sum,appCat.travel_sum,appCat.unknown_sum,appCat.utilities_sum,appCat.weather_sum,day,month
1309,AS14.01,2014-02-17,,,,,,2.0,,,...,,,,,,,,,0,2
1310,AS14.01,2014-02-18,,,,,,1.0,,,...,,,,,,,,,1,2
1311,AS14.01,2014-02-19,,,,,,7.0,2.0,,...,,,,,,,,,2,2
1312,AS14.01,2014-02-20,,,,,,2.0,3.0,,...,,,,,,,,,3,2
1886,AS14.01,2014-02-21,,,,,,,1.0,,...,,,,,,,,,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,AS14.33,2014-05-27,6.20,1.000000,0.75,0.012704,3344.624001,1.0,2.0,726.391,...,,,142.686,2010.364,,,56.173,,1,5
1260,AS14.33,2014-05-28,7.75,1.333333,1.20,0.103301,6757.439998,10.0,1.0,2551.046,...,,357.909,529.946,4446.281,,,30.666,,2,5
1261,AS14.33,2014-05-29,7.00,,1.00,0.149544,560.093000,5.0,1.0,400.034,...,,,29.202,1789.922,0.939,,3.199,,3,5
1262,AS14.33,2014-05-30,6.80,0.500000,0.50,0.142046,8649.867999,4.0,,3044.030,...,,,52.610,3166.409,1052.648,8.072,232.825,,4,5


In [7]:
df_agg[df_agg['screen_sum'].isna() & ~df_agg['appCat.social_sum'].isna()]

Unnamed: 0,id,date,mood_mean,circumplex.arousal_mean,circumplex.valence_mean,activity_mean,screen_sum,call_sum,sms_sum,appCat.builtin_sum,...,appCat.game_sum,appCat.office_sum,appCat.other_sum,appCat.social_sum,appCat.travel_sum,appCat.unknown_sum,appCat.utilities_sum,appCat.weather_sum,day,month
230,AS14.06,2014-05-08,7.0,,1.0,0.0,,,,,...,,,,63.596,,,,,3,5
