In [1]:
import pandas as pd
import numpy as np
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
from statsmodels.tools.sm_exceptions import ValueWarning
warnings.simplefilter('ignore', ValueWarning)
np.random.seed(42)

## Preprocessing
* TODO: do the same NA handling as Oleg and Agoston for consistency..,


In [2]:
df = pd.read_csv("data/mood_aggregated.csv")

# We only care about mood for ARIMA, so we drop other variables
df = df[df.variable == 'mood']

# We keep the id, timestamp, and value for mood
df = df[['id', 'time', 'value']]

# We change the time strings to datetime objjects
df['time'] = pd.to_datetime(df['time'])

# Setting the index as the time
df.set_index('time', inplace=True)
df.index = pd.to_datetime(df.index)

# We rename the value column to mood since it only contains values for mood now
df = df.rename(columns={'value': 'mood'})

## Separating the data
* First we will separate for each participant
* Then we will do a train/test split

In [3]:
# We split the data for each participant
participants = {participant: df[df.id == participant].drop(['id'], axis=1) for participant in df.id.unique()}

# Maps each participants to a train and test set
train_test_dict = {}
for participant in participants:
    # cutoff = int(len(participants[participant]['mood']) * 0.8)
    cutoff = len(participants[participant]['mood']) -1
    train = participants[participant]['mood'][:cutoff]
    test = participants[participant]['mood'][cutoff:]
    
#     time_index_train = participants[participant]['time'][:cutoff]
#     time_index_test = participants[participant]['time'][cutoff:]
    
    train_test_dict[participant] = {'train': train, 'test': test, 'cutoff': cutoff}
#     train_test_dict[participant] = {'train': train, 'test': test, 'time_index_train': time_index_train, 'time_index_test': time_index_test}

## Finding the models with the optimal parameters

In [4]:
y = train_test_dict['AS14.01']['train']

# Maps each participant to a fitted model
model_dict = {}
for participant in participants:
    train = train_test_dict[participant]['train']
    
    model = pm.auto_arima(train, start_p=1, start_q=1,
                          test='adf',       # use adftest to find optimal 'd'
                          max_p=5, max_q=5, # maximum p and q
                          m=1,              # frequency of series
                          d=None,           # let model determine 'd'
                          seasonal=False,   # No Seasonality (Might have to check for this shit?)
                          start_P=0, 
                          D=0, 
                          trace=False,
                          error_action='ignore',  
                          suppress_warnings=True, 
                          stepwise=True)
    model_dict[participant] = model

## Using the statsmodels package ARIMA function

In [5]:
final_models = {}
fitted_models = {}
for participant in model_dict.keys():
    order = model_dict[participant].order
    endog = train_test_dict[participant]['train']
    model = ARIMA(endog=endog, order=order)
    
    final_models[participant] = model
    
    fitted_models[participant] = model.fit()



## Predicting with each model

In [6]:
prediction_dict = {}
for participant in participants:
    # Get the correct model
    model = fitted_models[participant]
    
    # Get the real labels and cutoff
    true_labels = train_test_dict[participant]['test']
    cutoff = train_test_dict[participant]['cutoff']
    
    # Predict
    predictions = model.predict(start=cutoff, end=cutoff + len(true_labels)-1)
    
    assert len(predictions) == len(true_labels)
    
    # Save the predictions
    prediction_dict[participant] = {'true_labels': true_labels, 'predictions': predictions}

In [29]:
for participant in prediction_dict:
    true_value = prediction_dict[participant]['true_labels']
    prediction = prediction_dict[participant]['predictions']
    
    MSE = mean_absolute_error(true_value, prediction)
    RMSE = mean_squared_error(true_value, prediction)
    
    print(f"{participant}     MSE = {MSE:.2f}     RMSE = {RMSE:.2f}     True_Value = {true_value[0]:.2f}       Predicted = {list(prediction)[0]:.2f}")

AS14.01     MSE = 0.22     RMSE = 0.05     True_Value = 8.00       Predicted = 7.78
AS14.02     MSE = 2.89     RMSE = 8.32     True_Value = 9.00       Predicted = 6.11
AS14.03     MSE = 0.29     RMSE = 0.08     True_Value = 7.67       Predicted = 7.38
AS14.05     MSE = 0.58     RMSE = 0.34     True_Value = 6.33       Predicted = 6.92
AS14.06     MSE = 0.19     RMSE = 0.03     True_Value = 7.00       Predicted = 7.19
AS14.07     MSE = 0.39     RMSE = 0.15     True_Value = 5.50       Predicted = 5.89
AS14.08     MSE = 0.08     RMSE = 0.01     True_Value = 6.67       Predicted = 6.75
AS14.09     MSE = 1.14     RMSE = 1.30     True_Value = 6.00       Predicted = 7.14
AS14.12     MSE = 0.62     RMSE = 0.38     True_Value = 6.00       Predicted = 5.38
AS14.13     MSE = 0.23     RMSE = 0.05     True_Value = 8.00       Predicted = 7.77
AS14.14     MSE = 0.13     RMSE = 0.02     True_Value = 7.00       Predicted = 6.87
AS14.15     MSE = 0.05     RMSE = 0.00     True_Value = 7.00       Predicted