# Part 2: Time Series Modeling

In this notebook, you will implement functions to extract features from time series data and build ARIMA models.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from pathlib import Path
import os

# Set seaborn style
sns.set()

## 1. Feature Extraction

Implement the `extract_time_series_features` function to calculate rolling window features.

In [3]:
def _rolling_signal_features(frame, signals, window_size):
    """Compute rolling statistics for each signal of one contiguous recording.

    For every name in ``signals`` the rolling mean, standard deviation,
    minimum, maximum and lag-1 autocorrelation are computed over
    ``window_size`` rows (``min_periods=1``). The result keeps ``frame``'s index.
    """

    def lag1_autocorr(window):
        # A single sample has no lagged partner, so the value is undefined.
        return window.autocorr(lag=1) if len(window) > 1 else np.nan

    columns = {}
    for signal in signals:
        rolling = frame[signal].rolling(window=window_size, min_periods=1)
        columns[f'{signal}_mean'] = rolling.mean()
        # std is NaN for the first row of a recording (one sample only).
        columns[f'{signal}_std'] = rolling.std()
        columns[f'{signal}_min'] = rolling.min()
        columns[f'{signal}_max'] = rolling.max()
        # raw=False hands each window over as a Series so .autocorr is available.
        columns[f'{signal}_autocorr_lag1'] = rolling.apply(lag1_autocorr, raw=False)

    return pd.DataFrame(columns, index=frame.index)


def extract_time_series_features(data, window_size=60):
    """Extract rolling window features from time series data.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data. Must contain ``heart_rate``, ``eda``
        and ``temperature`` columns (matched case-insensitively). Optional
        ``timestamp``, ``subject_id`` and ``session`` columns are carried
        through to the output. The input frame is not modified.
    window_size : int
        Size of the rolling window, in number of consecutive rows (samples).
        This equals a duration only if the data is evenly sampled.

    Returns
    -------
    pd.DataFrame
        One row per input row, ordered by ``timestamp`` when that column is
        present. Metadata columns (those that exist) come first, followed by
        ``<signal>_mean``, ``_std``, ``_min``, ``_max`` and ``_autocorr_lag1``
        for each signal. All column names are lower-case.

    Raises
    ------
    ValueError
        If any of the three required signal columns is missing.

    Notes
    -----
    When ``subject_id`` and/or ``session`` are present, windows are computed
    separately within each (subject, session) group so that statistics never
    mix samples from different recordings.
    """
    # Lower-case the column names on a new object; assigning to data.columns
    # directly would silently rename the caller's DataFrame as well.
    data = data.set_axis(data.columns.str.lower(), axis=1)

    signals = ['heart_rate', 'eda', 'temperature']

    if not all(sig in data.columns for sig in signals):
        raise ValueError("Input data must contain heart_rate, eda, and temperature columns (case-insensitive).")

    # timestamp is optional metadata: sort by it when available, otherwise
    # keep the given row order. mergesort is stable, so rows that share a
    # timestamp keep their relative order and the output is deterministic.
    if 'timestamp' in data.columns:
        data = data.sort_values('timestamp', kind='mergesort')
    data = data.reset_index(drop=True)

    # Assumes samples within a recording are evenly spaced or already resampled.
    group_cols = [col for col in ('subject_id', 'session') if col in data.columns]
    if group_cols and len(data) > 0:
        # dropna=False keeps rows whose group key is missing (as their own group)
        # so the output always has one row per input row.
        pieces = [
            _rolling_signal_features(group, signals, window_size)
            for _, group in data.groupby(group_cols, sort=False, dropna=False)
        ]
        # Each piece keeps its original row labels; sorting restores time order.
        features_df = pd.concat(pieces).sort_index()
    else:
        features_df = _rolling_signal_features(data, signals, window_size)

    # Attach subject_id, session, timestamp if they exist (aligned on the index).
    meta_cols = ['timestamp', 'subject_id', 'session']
    for col in meta_cols:
        if col in data.columns:
            features_df[col] = data[col]

    # Reorder columns to put meta information first.
    present_meta = [col for col in meta_cols if col in features_df.columns]
    feature_cols = [col for col in features_df.columns if col not in meta_cols]
    return features_df[present_meta + feature_cols]


## 2. ARIMA Modeling

Implement the `build_arima_model` function to fit ARIMA models and generate diagnostic plots.

In [None]:
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import os
from pathlib import Path

def build_arima_model(series, order=(1, 1, 1), output_dir='plots', subject_id='S1', session='Midterm 1', signal_name='heart_rate'):
    """Fit an ARIMA model to the time series and generate diagnostic plots.

    Two PNG files are written to ``output_dir``:
    ``<subject>_<session>_<signal>_arima_fit.png`` (observed vs. fitted) and
    ``<subject>_<session>_<signal>_arima_residuals.png``.

    Parameters
    ----------
    series : pd.Series
        Time series data to model. NaN values are dropped before fitting.
    order : tuple
        (p,d,q) order of the ARIMA model.
    output_dir : str
        Directory to save diagnostic plots. Created if it does not exist;
        a leading ``~`` is expanded.
    subject_id : str
        Subject ID, e.g., 'S1'.
    session : str
        Session name, e.g., 'Midterm 1'.
    signal_name : str
        Signal name, e.g., 'heart_rate'.

    Returns
    -------
    statsmodels.tsa.arima.model.ARIMAResults
        Fitted ARIMA model.

    Raises
    ------
    ValueError
        If ``series`` contains no non-NaN observations.
    """

    def _filename_part(value):
        # Spaces become underscores (as before); path separators are replaced
        # too so a label cannot redirect the plot outside ``output_dir``.
        text = str(value).replace(' ', '_')
        for separator in ('/', '\\'):
            text = text.replace(separator, '_')
        return text

    # Drop NaNs and fail early with a clear message instead of an obscure
    # error from inside the model fitting code.
    series = series.dropna()
    if series.empty:
        raise ValueError("Cannot fit ARIMA model: series has no non-NaN observations.")

    # Create output directory
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Fit ARIMA model
    model_fit = ARIMA(series, order=order).fit()

    # 2. Create file name prefix
    file_prefix = (
        f"{_filename_part(subject_id)}_{_filename_part(session)}_"
        f"{_filename_part(signal_name)}_arima"
    )

    # NOTE(review): with differencing (d > 0) the first d fitted values are
    # initialisation artefacts (in practice ~0, making the residual equal to
    # the raw observation), which squashes the rest of the plot. They are
    # left out of the plots only; the returned model is untouched.
    d = int(order[1])
    skip = d if len(series) > d else 0
    observed = series.iloc[skip:]
    fitted = model_fit.fittedvalues.iloc[skip:]
    residuals = model_fit.resid.iloc[skip:]

    # --- (1) Model fit plot ---
    fit_plot_path = output_dir / f"{file_prefix}_fit.png"
    fig_fit, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(observed.index, observed, label='Observed', color='blue')
        ax.plot(observed.index, fitted, label='Fitted', color='red')
        ax.set_title(f'ARIMA Model Fit: {subject_id} {session} {signal_name}')
        ax.set_xlabel('Time')
        ax.set_ylabel(signal_name.capitalize())
        ax.legend()
        ax.grid(True)
        fig_fit.savefig(fit_plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig_fit)

    # --- (2) Residuals plot ---
    resid_plot_path = output_dir / f"{file_prefix}_residuals.png"
    fig_resid, ax = plt.subplots(figsize=(12, 6))
    try:
        ax.plot(observed.index, residuals, label='Residuals', color='purple')
        ax.axhline(0, linestyle='--', color='black')
        ax.set_title(f'ARIMA Model Residuals: {subject_id} {session} {signal_name}')
        ax.set_xlabel('Time')
        ax.set_ylabel('Residuals')
        ax.legend()
        ax.grid(True)
        fig_resid.savefig(resid_plot_path)
    finally:
        plt.close(fig_resid)

    print(f"Saved plots: {fit_plot_path}, {resid_plot_path}")

    return model_fit

from pathlib import Path

# Location of the raw data files.
# NOTE(review): this is a machine-specific path under the author's home
# directory; adjust it here (one place) when running elsewhere.
DATA_DIR = '~/Documents/4-it-s-about-time-kanting6/data'

# NOTE(review): load_data and preprocess_data are not defined in this
# notebook; they must already be in scope (presumably from the Part 1
# notebook/module - confirm) for a fresh "Restart & Run All" to succeed.
data = preprocess_data(load_data(DATA_DIR))

# Pick a subject and session
subject_id = 'S1'
session = 'Midterm 1'
signal_name = 'heart_rate'

# Subset the data
subset = data[(data['subject_id'] == subject_id) & (data['session'] == session)]

# Fail with a readable message rather than an obscure fitting error when the
# chosen subject/session combination is absent from the data.
if subset.empty:
    raise ValueError(f"No rows found for subject_id={subject_id!r}, session={session!r}.")

# Build ARIMA model for heart rate
model_fit = build_arima_model(
    series=subset.set_index('timestamp')[signal_name],
    order=(1, 1, 1),
    output_dir='plots',
    subject_id=subject_id,
    session=session,
    signal_name=signal_name
)
