In [51]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [52]:
# Columns whose missing values are replaced by 0 before scaling.
zero_fill_cols = [
    'screen_sum', 'call_sum', 'sms_sum',
    'appCat.builtin_sum', 'appCat.communication_sum', 'appCat.entertainment_sum',
    'appCat.finance_sum', 'appCat.game_sum', 'appCat.office_sum',
    'appCat.other_sum', 'appCat.social_sum', 'appCat.travel_sum',
    'appCat.unknown_sum', 'appCat.utilities_sum', 'appCat.weather_sum',
    'day', 'month',
]

# Input features that are min-max scaled together (order defines the scaler's feature order).
feature_cols = [
    'circumplex.arousal_median', 'circumplex.valence_median', 'activity_mean',
] + zero_fill_cols

# Load the aggregated per-day data; the first CSV column is used as the index.
data_agg = pd.read_csv('dataset_aggregated.csv', index_col=0)
data_agg['date'] = pd.to_datetime(data_agg['date'])
data_agg[zero_fill_cols] = data_agg[zero_fill_cols].fillna(0)

# Scale the input features; the scaler is fitted on the full frame.
scaler = MinMaxScaler()
data_agg[feature_cols] = scaler.fit_transform(data_agg[feature_cols])

# The target gets its own scaler so predictions can be mapped back with
# scaler_target.inverse_transform later on.
scaler_target = MinMaxScaler()
data_agg[['mood_mean']] = scaler_target.fit_transform(data_agg[['mood_mean']])

data_agg.head(50)

Unnamed: 0,id,date,mood_mean,circumplex.arousal_median,circumplex.valence_median,activity_mean,screen_sum,call_sum,sms_sum,appCat.builtin_sum,...,appCat.game_sum,appCat.office_sum,appCat.other_sum,appCat.social_sum,appCat.travel_sum,appCat.unknown_sum,appCat.utilities_sum,appCat.weather_sum,day,month
1309,AS14.01,2014-02-17,,,,,0.0,0.064516,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1310,AS14.01,2014-02-18,,,,,0.0,0.032258,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
1311,AS14.01,2014-02-19,,,,,0.0,0.225806,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0
1312,AS14.01,2014-02-20,,,,,0.0,0.064516,0.176471,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
1886,AS14.01,2014-02-21,,,,,0.0,0.0,0.058824,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0
1313,AS14.01,2014-02-22,,,,,0.0,0.064516,0.058824,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.0
1314,AS14.01,2014-02-25,,,,,0.0,0.096774,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
0,AS14.01,2014-02-26,0.3125,0.25,0.5,,0.0,0.032258,0.117647,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0
1,AS14.01,2014-02-27,0.333333,0.5,0.0,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
1315,AS14.01,2014-02-28,,,,,0.0,0.129032,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.0


### Create datasets

In [53]:
# Build sliding-window samples per participant ID.
# For every target day i, the input X is the `window` preceding days (all columns
# from position 2 onward, i.e. everything after date/id) and the target Y is the
# (scaled) value in column position 2 on day i.
IDs = data_agg.id.unique()

data_in = []
data_out = []
data_out_class = []
meta_data = []

window = 5
n_values = 10   # number of one-hot classes; class k is stored at index k-1
min_run = 4     # required number of consecutive days with a mood value

# One-hot lookup table, built once instead of once per sample.
one_hot = np.eye(n_values)

count_nan_X = 0
count_nan_Y = 0
skipped_ids = []  # participants without `min_run` consecutive mood values

for ID in IDs:

    data_id = data_agg[data_agg['id'] == ID]

    # make sure all consecutive dates are in dataset
    start = data_id['date'].min()
    end = data_id['date'].max()
    dates = pd.DataFrame({'date': pd.date_range(start=start, end=end), 'id': ID})

    # Left-merge onto the full calendar: days without a record become all-NaN rows.
    data_id = pd.merge(dates, data_id, on=['date', 'id'], how='left')

    # Find the first run of `min_run` consecutive non-null mood values.
    # The rolling sum equals `min_run` exactly at the last position of such a run.
    has_mood = data_id['mood_mean'].notna().astype(int)
    run_ends = np.flatnonzero((has_mood.rolling(min_run).sum() == min_run).to_numpy())
    if run_ends.size == 0:
        # No usable starting point for this participant; previously this case
        # produced a NaN index and a TypeError in the slice below.
        skipped_ids.append(ID)
        continue

    # Drop everything before the start of that first run. Copy so the
    # interpolation below writes to an independent frame, not a view.
    first_run_start = run_ends[0] - (min_run - 1)
    data_id = data_id.iloc[first_run_start:].copy()

    # try to fill na values using interpolation (gaps of at most 3 rows, both directions)
    data_id.iloc[:, 2:] = data_id.iloc[:, 2:].interpolate(method='linear', limit=3, axis=0, limit_direction='both')

    for i in range(window, len(data_id)):

        meta = np.array(data_id.iloc[i, 0:2])          # [date, id] of the target day
        X = np.array(data_id.iloc[i - window:i, 2:])   # (window, n_features)

        # target for numerical prediction
        Y = data_id.iloc[i, 2]

        # skip data if Y is nan
        if np.isnan(Y):
            count_nan_Y += 1
            continue

        # skip data if nan values in X
        if (np.isnan(X)).any():
            count_nan_X += 1
            continue

        # target for classification: un-scale, round to the nearest integer and one-hot encode.
        # Extract the scalar explicitly; int() on a (1, 1) array is deprecated in NumPy.
        mood_unscaled = scaler_target.inverse_transform([[Y]])[0, 0]
        Y_int = int(np.rint(mood_unscaled))
        # NOTE(review): assumes 1 <= Y_int <= n_values; a value of 0 would wrap to the last class.
        Y_class = one_hot[Y_int - 1]

        data_in.append(X)
        data_out.append(Y)
        data_out_class.append(Y_class)
        meta_data.append(meta)

data_in = np.array(data_in)
data_out = np.array(data_out)
data_out_class = np.array(data_out_class)
meta_data = np.array(meta_data)

print(data_in.shape)
print(data_out.shape)
print('Rows deleted because of nan values in target: ', count_nan_Y)
print('Rows deleted because of nan values in input: ', count_nan_X)
if skipped_ids:
    print('IDs skipped because no 4 consecutive mood values were found: ', skipped_ids)


(1106, 5, 21)
(1106,)
Rows deleted because of nan values in target:  1
Rows deleted because of nan values in input:  65


In [54]:
# Compare the (id, date) pairs that ended up as samples with the rows of
# data_agg that have a mood value, using an outer merge with an indicator column.
df_meta_data = pd.DataFrame({'id': meta_data[:, 1], 'date': meta_data[:, 0]})
data_agg_with_mood = data_agg[data_agg['mood_mean'].notna()]
df_all = data_agg_with_mood.merge(df_meta_data, on=['id', 'date'], how='outer', indicator=True)

merge_origin = df_all['_merge']
n_only_in_raw = int((merge_origin == 'left_only').sum())       # had a mood value, not a sample
n_only_in_samples = int((merge_origin == 'right_only').sum())  # sample without a raw mood value
print('Number of rows deleted that had a mood value: ', n_only_in_raw)
print('Number of rows added with interpolated mood value: ', n_only_in_samples)

Number of rows deleted that had a mood value:  195
Number of rows added with interpolated mood value:  37


In [55]:
def _train_val_test_split(*arrays):
    """Split every array into train / validation / test parts.

    First holds out 10% as the test set, then splits the remainder so that the
    validation part is 0.1/0.9 of it. Both steps shuffle with random_state=42,
    so calls with arrays of the same length yield the same partition.

    Returns three tuples (train_parts, val_parts, test_parts), each ordered
    like the input arrays.
    """
    outer = train_test_split(*arrays, test_size=0.1, shuffle=True, random_state=42)
    # train_test_split returns [a_train, a_test, b_train, b_test, ...]
    remainder_parts, test_parts = outer[0::2], outer[1::2]
    inner = train_test_split(*remainder_parts, test_size=0.1/0.9, shuffle=True, random_state=42)
    return tuple(inner[0::2]), tuple(inner[1::2]), tuple(test_parts)


indices = np.arange(len(data_out))

# make dataset for numerical prediction (sample indices are split along for later lookups)
(
    (X_train_num, Y_train_num, idx_train),
    (X_val_num, Y_val_num, idx_val),
    (X_test_num, Y_test_num, idx_test),
) = _train_val_test_split(data_in, data_out, indices)

# make dataset for classification
(
    (X_train_class, Y_train_class),
    (X_val_class, Y_val_class),
    (X_test_class, Y_test_class),
) = _train_val_test_split(data_in, data_out_class)

print('Size train set: ', X_train_class.shape)
print('Size validation set: ', X_val_class.shape)
print('Size test set: ', X_test_class.shape)

Size train set:  (884, 5, 21)
Size validation set:  (111, 5, 21)
Size test set:  (111, 5, 21)


In [56]:
# Same outer-merge check as for the full sample set, restricted to the test samples.
test_meta = df_meta_data.iloc[idx_test]
df_all = data_agg_with_mood.merge(test_meta, on=['id', 'date'], how='outer', indicator=True)
n_interpolated_test = int((df_all['_merge'] == 'right_only').sum())
print('Number of rows added with interpolated mood value in test set: ', n_interpolated_test)

Number of rows added with interpolated mood value in test set:  4


In [57]:
# Show one training target in both encodings: un-scaled numeric value and one-hot class.
example_scaled = Y_train_num[1]
example_unscaled = scaler_target.inverse_transform([[example_scaled]])
print('Target in numerical prediction dataset: ', example_unscaled)
print('Target in classification dataset: ', Y_train_class[1])

Target in numerical prediction dataset:  [[8.8]]
Target in classification dataset:  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [58]:
# Count NaN entries left in the training inputs.
nan_total = np.isnan(X_train_num).sum()
print('Number of nans in input data: ', nan_total)

Number of nans in input data:  0


In [59]:
# Sanity check: look up the first training sample in the un-split arrays and
# compare its neighbours in data_out with the mood column of its input window.
print('Sanity check')
print()
random_idx = idx_train[0]
sample_date, sample_id = meta_data[random_idx]
print('Index: ', random_idx)
print('date, ID, mood: ', [sample_date, sample_id, data_out[random_idx]])
# NOTE: data_out[random_idx - k] is the k-th previous *sample*; it is the k-th previous
# day only if no window was skipped in between and the sample belongs to the same ID.
print('Mood on previous day: ', data_out[random_idx - 1])
print('Mood on second to last day: ', data_out[random_idx - 2])
print()
first_window = X_train_num[0]
print('Moods in input data: ', first_window[:, 0])
print('Mood in target data: ', Y_train_num[0])

Sanity check

Index:  647
date, ID, mood:  [Timestamp('2014-04-01 00:00:00'), 'AS14.20', 0.3125]
Mood on previous day:  0.5
Mood on second to last day:  0.625

Moods in input data:  [0.5   0.5   0.6   0.625 0.5  ]
Mood in target data:  0.3125


In [10]:
# aggregate dataset for randomforest and regression:
# collapse every (window, n_features) sample to its per-feature mean over the window.
agg_X_train = X_train_num.mean(axis=1)
agg_X_test = X_test_num.mean(axis=1)
agg_X_val = X_val_num.mean(axis=1)

meta_train = meta_data[idx_train]
meta_test = meta_data[idx_test]
meta_val = meta_data[idx_val]


def _aggregated_frame(agg_X, meta, Y_num, Y_class):
    """Assemble one aggregated split as a DataFrame.

    Parameters
    ----------
    agg_X : array of window-averaged features, one row per sample.
    meta : array whose column 0 holds the date and column 1 the id of each sample.
    Y_num : 1-D array of scaled numeric targets.
    Y_class : 2-D one-hot array of class targets.

    Returns
    -------
    DataFrame with the feature columns (named after data_agg.columns[2:]),
    'id', 'date', 'target_num' (un-scaled target) and 'target_class' (1-based class).
    """
    frame = pd.DataFrame(agg_X, columns=data_agg.columns[2:])
    frame['id'] = meta[:, 1]
    frame['date'] = meta[:, 0]
    # scaler_target was fitted on a single feature, so pass the targets as a column
    # vector (n_samples, 1) instead of relying on broadcasting of a (1, n_samples) row.
    frame['target_num'] = scaler_target.inverse_transform(np.asarray(Y_num).reshape(-1, 1)).ravel()
    # one-hot index k-1 encodes class k
    frame['target_class'] = Y_class.argmax(axis=1) + 1
    return frame


split_parts = {
    'train': (agg_X_train, meta_train, Y_train_num, Y_train_class),
    'test': (agg_X_test, meta_test, Y_test_num, Y_test_class),
    'val': (agg_X_val, meta_val, Y_val_num, Y_val_class),
}

# Write one CSV per split; `df` ends up holding the validation frame, shown below.
for split_name, parts in split_parts.items():
    df = _aggregated_frame(*parts)
    df.to_csv(f'aggregated_{split_name}_set_{window}.csv')

df

Unnamed: 0,mood_mean,circumplex.arousal_median,circumplex.valence_median,activity_mean,screen_sum,call_sum,sms_sum,appCat.builtin_sum,appCat.communication_sum,appCat.entertainment_sum,...,appCat.travel_sum,appCat.unknown_sum,appCat.utilities_sum,appCat.weather_sum,day,month,id,date,target_num,target_class
0,0.680357,0.535714,0.285714,0.159668,0.283186,0.188940,0.126050,0.173422,0.161694,0.222079,...,0.040175,0.000000,0.001837,0.010563,0.5,0.500000,AS14.32,2014-04-12,8.600000,9
1,0.496429,0.500000,0.428571,0.178370,0.038080,0.138249,0.176471,0.021190,0.017588,0.027491,...,0.000000,0.000000,0.000000,0.000000,0.5,0.500000,AS14.16,2014-05-01,7.000000,7
2,0.364286,0.642857,0.464286,0.090174,0.034864,0.055300,0.184874,0.022795,0.014571,0.030180,...,0.000000,0.000000,0.000000,0.000000,0.5,0.250000,AS14.16,2014-03-28,6.500000,6
3,0.437500,0.160714,0.339286,0.491458,0.286113,0.179724,0.042017,0.230524,0.151646,0.112692,...,0.126206,0.000000,0.000717,0.000000,0.5,0.500000,AS14.02,2014-04-14,7.333333,7
4,0.577679,0.142857,0.428571,0.326350,0.289848,0.036866,0.000000,0.065566,0.165634,0.233329,...,0.053031,0.008196,0.000000,0.003344,0.5,0.714286,AS14.06,2014-05-07,6.750000,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0.307143,0.107143,0.142857,0.163934,0.104669,0.023041,0.025210,0.008364,0.050329,0.168645,...,0.081812,0.020139,0.004930,0.000000,0.5,0.500000,AS14.09,2014-04-17,7.000000,7
102,0.532143,0.339286,0.500000,0.334695,0.424206,0.064516,0.008403,0.114647,0.252751,0.391719,...,0.071001,0.000000,0.000000,0.013250,0.5,0.500000,AS14.06,2014-04-19,6.800000,7
103,0.739881,0.571429,0.571429,0.156096,0.399854,0.009217,0.084034,0.042258,0.179196,0.044844,...,0.054670,0.000000,0.000321,0.000000,0.5,0.500000,AS14.29,2014-04-17,7.600000,8
104,0.480357,0.500000,0.357143,0.059993,0.027003,0.009217,0.008403,0.052055,0.001837,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.5,0.500000,AS14.14,2014-04-18,6.500000,6
