In [34]:
import pandas as pd
import numpy as np
import pmdarima as pm
from sklearn.model_selection import train_test_split
# Seed NumPy's legacy global random number generator so that anything drawing
# from np.random produces the same values on every run of the notebook.
np.random.seed(42)

## Preprocessing
* TODO: apply the same NA handling as Oleg and Agoston, for consistency.


In [6]:
# Read the raw measurements and reduce them to what the ARIMA models use:
#   * only the rows whose variable is 'mood' (other variables are dropped),
#   * only the participant id, the timestamp and the recorded value,
#   * with 'value' renamed to 'mood', since mood is all it holds from here on.
df = (
    pd.read_csv("data/mood.csv")
    .loc[lambda frame: frame.variable == 'mood', ['id', 'time', 'value']]
    .rename(columns={'value': 'mood'})
)

## Separating the data
* First we separate the data per participant
* Then we do a train/test split for each participant (the first 80% of the rows for training, the remaining 20% for testing)

In [15]:
# Fraction of each participant's rows (taken from the start of the series, in
# row order) that goes into the training set; the remainder is the test set.
TRAIN_FRACTION = 0.8

# We split the data for each participant in a single pass over the frame.
# sort=False keeps participants in order of first appearance, i.e. the same
# key order that iterating over df.id.unique() would give.
participants = {
    participant: group.drop(columns=['id'])
    for participant, group in df.groupby('id', sort=False)
}

# Maps each participant to a train and test set
train_test_dict = {}
for participant, frame in participants.items():
    mood = frame['mood']
    cutoff = int(len(mood) * TRAIN_FRACTION)

    # .iloc makes it explicit that the split is by row position: the Series
    # still carries the integer index labels of the original CSV rows, and
    # those labels must not be mistaken for positions.
    train_test_dict[participant] = {
        'train': mood.iloc[:cutoff],
        'test': mood.iloc[cutoff:],
    }

## Finding the models with the optimal parameters

In [44]:
# Maps each participant to a fitted model.
# One ARIMA model is selected and fitted per participant, using only that
# participant's training series.
model_dict = {}
for participant, split in train_test_dict.items():
    model_dict[participant] = pm.auto_arima(
        split['train'],
        start_p=1, start_q=1,
        test='adf',              # use adftest to find optimal 'd'
        max_p=5, max_q=5,        # maximum p and q
        m=1,                     # frequency of series
        d=None,                  # let model determine 'd'
        seasonal=False,          # no seasonality (TODO: check whether the series are actually seasonal)
        start_P=0,               # seasonal setting; presumably unused while seasonal=False — TODO confirm
        D=0,                     # seasonal setting; presumably unused while seasonal=False — TODO confirm
        trace=False,
        error_action='ignore',   # NOTE(review): fit failures during the search are not reported
        suppress_warnings=True,
        stepwise=True,
    )

# Report the information criteria and the selected (p, d, q) order for every
# participant, in sorted participant order. The ':.2f' format spec already
# rounds to two decimals, so no separate rounding step is needed.
for participant in sorted(model_dict):
    model = model_dict[participant]
    print(f"{participant}\t\tAIC: {model.aic():.2f}\tBIC: {model.bic():.2f}\tParameters: {model.order}")

AS14.01		AIC: 417.75	BIC: 430.45	Parameters: (1, 0, 1)
AS14.02		AIC: 407.19	BIC: 418.57	Parameters: (2, 0, 0)
AS14.03		AIC: 377.36	BIC: 390.04	Parameters: (1, 0, 1)
AS14.05		AIC: 443.08	BIC: 452.84	Parameters: (0, 1, 2)
AS14.06		AIC: 485.18	BIC: 494.44	Parameters: (0, 0, 1)
AS14.07		AIC: 606.30	BIC: 618.42	Parameters: (1, 0, 1)
AS14.08		AIC: 725.30	BIC: 735.73	Parameters: (1, 0, 0)
AS14.09		AIC: 380.68	BIC: 395.96	Parameters: (2, 0, 1)
AS14.12		AIC: 342.45	BIC: 351.44	Parameters: (0, 0, 1)
AS14.13		AIC: 578.24	BIC: 591.34	Parameters: (1, 0, 1)
AS14.14		AIC: 352.75	BIC: 358.48	Parameters: (0, 1, 1)
AS14.15		AIC: 288.95	BIC: 309.17	Parameters: (2, 0, 2)
AS14.16		AIC: 472.43	BIC: 485.29	Parameters: (1, 0, 1)
AS14.17		AIC: 351.49	BIC: 361.22	Parameters: (1, 0, 0)
AS14.19		AIC: 500.74	BIC: 516.60	Parameters: (1, 0, 2)
AS14.20		AIC: 358.14	BIC: 367.38	Parameters: (1, 0, 0)
AS14.23		AIC: 283.31	BIC: 294.69	Parameters: (2, 0, 0)
AS14.24		AIC: 485.86	BIC: 498.84	Parameters: (1, 0, 1)
AS14.25		A