# Part 2: Time Series Modeling

In this notebook, you will implement functions to extract features from time series data and build ARIMA models.

In [5]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from pathlib import Path
import os

# Set style for plots
plt.style.use('seaborn-v0_8')
%matplotlib inline

## 1. Feature Extraction

Implement the `extract_time_series_features` function to calculate rolling window features.

In [None]:
def extract_time_series_features(data, window_size=60, signals=None):
    """Extract rolling window features from time series data.

    For every session in ``data`` and every requested signal, a centered
    rolling window of ``window_size`` rows is applied and five statistics are
    computed per window: mean, standard deviation, minimum, maximum and lag-1
    autocorrelation. Rows where any feature is NaN are dropped; this removes
    the incomplete windows at the start and end of each session as well as
    windows in which a statistic is undefined.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data. It is grouped by ``session`` and must
        contain one column per requested signal.
    window_size : int
        Size of the rolling window in rows (samples). This corresponds to
        seconds only if the data holds one row per second.
    signals : sequence of str, optional
        Names of the signal columns to featurize. Defaults to
        ``['heart_rate', 'eda', 'temperature']``.

    Returns
    -------
    pd.DataFrame
        One row per fully populated window. The first column holds the
        original row labels (named ``index`` when the input index is unnamed),
        followed by ``<signal>_mean``, ``<signal>_std``, ``<signal>_min``,
        ``<signal>_max`` and ``<signal>_autocorr_lag1`` for each signal, and
        finally ``session``. The frame is empty (but has these columns) when
        ``data`` contains no sessions.
    """
    if signals is None:
        signals = ['heart_rate', 'eda', 'temperature']
    signals = list(signals)

    stat_names = ('mean', 'std', 'min', 'max', 'autocorr_lag1')
    feature_columns = [f"{signal}_{stat}" for signal in signals for stat in stat_names]

    def _lag1_autocorr(window):
        # A single observation has no lag-1 pair to correlate.
        return window.autocorr(lag=1) if len(window) > 1 else np.nan

    per_session = []
    # Windows are computed inside each session so they never straddle two sessions.
    for session, group in data.groupby('session'):
        session_features = {}
        for signal in signals:
            # Build the rolling view once and reuse it for every statistic.
            windows = group[signal].rolling(window=window_size, center=True)
            session_features[f"{signal}_mean"] = windows.mean()
            session_features[f"{signal}_std"] = windows.std()
            session_features[f"{signal}_min"] = windows.min()
            session_features[f"{signal}_max"] = windows.max()
            # raw=False hands each window over as a Series, which autocorr() needs.
            session_features[f"{signal}_autocorr_lag1"] = windows.apply(
                _lag1_autocorr, raw=False
            )

        session_df = pd.DataFrame(session_features, index=group.index)
        session_df['session'] = session
        per_session.append(session_df)

    if per_session:
        features_df = pd.concat(per_session)
    else:
        # pd.concat raises on an empty list; return an empty frame with the
        # same schema instead so callers can handle "no data" uniformly.
        features_df = pd.DataFrame(
            columns=feature_columns + ['session'], index=data.index[:0]
        )

    return features_df.dropna().reset_index()


## 2. ARIMA Modeling

Implement the `build_arima_model` function to fit ARIMA models and generate diagnostic plots.

In [None]:
def build_arima_model(series, order=(1,1,1), output_dir='plots',
                      subject=None, session=None, signal=None,
                      forecast_steps=100):
    """Fit an ARIMA model to the time series and generate diagnostic plots.

    Three PNG files are written to ``output_dir``: the fitted values against
    the actual series (``..._arima_fit.png``), the residuals
    (``..._arima_residuals.png``) and an out-of-sample forecast with its
    confidence band (``..._arima_forecast.png``). The filename prefix is
    ``<subject>_<session>_<signal>``; labels that cannot be resolved are
    left out of the prefix.

    Parameters
    ----------
    series : pd.Series
        Time series data to model
    order : tuple
        (p,d,q) order of the ARIMA model
    output_dir : str
        Directory to save diagnostic plots
    subject, session, signal : str, optional
        Labels used to name the plot files. When a label is not passed, a
        module-level variable of the same name is used if one exists (this is
        how earlier versions of this function obtained the labels); ``signal``
        additionally falls back to ``series.name``.
    forecast_steps : int
        Number of steps to forecast beyond the end of ``series``.

    Returns
    -------
    statsmodels.tsa.arima.model.ARIMAResults
        Fitted ARIMA model
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the filename labels. The function used to read `subject`,
    # `session` and `signal` straight from the notebook namespace, which
    # raised NameError on a fresh kernel; explicit arguments now win and the
    # globals are only a backward-compatible fallback.
    legacy_namespace = globals()
    labels = []
    for key, value in (('subject', subject), ('session', session), ('signal', signal)):
        if value is None:
            value = legacy_namespace.get(key)
        if value is None and key == 'signal':
            value = getattr(series, 'name', None)
        if value is not None:
            labels.append(str(value))

    def _plot_path(kind):
        # e.g. plots/S1_midterm_1_heart_rate_arima_fit.png
        return os.path.join(output_dir, '_'.join(labels + [f'arima_{kind}.png']))

    def _finish(fig, ax, title, ylabel, kind):
        # Shared decoration + save + close so figures do not pile up in memory.
        ax.set_title(title)
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.legend()
        fig.tight_layout()
        fig.savefig(_plot_path(kind))
        plt.close(fig)

    model_fit = ARIMA(series, order=order).fit()

    # Model fit: actual series against in-sample fitted values
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(series, label='Actual')
    ax.plot(model_fit.fittedvalues, label='ARIMA Fit', alpha=0.7)
    _finish(fig, ax, 'ARIMA Model Fit', 'Value', 'fit')

    # Residuals around a zero reference line
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(model_fit.resid, label='Residuals')
    ax.axhline(0, color='red', linestyle='--')
    _finish(fig, ax, 'ARIMA Model Residuals', 'Residual', 'residuals')

    # Out-of-sample forecast
    forecast = model_fit.get_forecast(steps=forecast_steps)
    predicted = forecast.predicted_mean
    conf_int = forecast.conf_int()

    last_label = series.index[-1]
    if isinstance(last_label, (int, np.integer)):
        # Continue the integer labels of the input so the forecast is drawn
        # directly after the observed data.
        forecast_index = pd.RangeIndex(start=last_label + 1,
                                       stop=last_label + 1 + forecast_steps)
    else:
        # Non-integer labels (e.g. timestamps) cannot be extended with "+ 1";
        # use the index attached to the forecast itself.
        forecast_index = predicted.index

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(series, label='Original')
    ax.plot(model_fit.fittedvalues, label='ARIMA Fit', alpha=0.7)
    ax.plot(forecast_index, predicted, label='Forecast', color='yellow')
    ax.fill_between(forecast_index,
                    conf_int.iloc[:, 0],
                    conf_int.iloc[:, 1],
                    color='green', alpha=0.2, linestyle='--')
    _finish(fig, ax, 'ARIMA Forecast', 'Value', 'forecast')

    return model_fit

In [43]:
# 'S1_Midterm 1_heart_rate_arima_fit.png', 'S1_Midterm 1_heart_rate_arima_residuals.png'
# Load each subject's preprocessed recording and derive the rolling-window
# features (60-row windows) for it.
preprocessed_df_S1 = pd.read_csv("data/processed/S1_processed.csv")
features_df_S1 = extract_time_series_features(preprocessed_df_S1, window_size=60)

preprocessed_df_S9 = pd.read_csv("data/processed/S9_processed.csv")
features_df_S9 = extract_time_series_features(preprocessed_df_S9, window_size=60)

# Only the last expression of a cell is rendered automatically, so the S1
# frame needs an explicit display() to show up alongside the S9 frame.
display(features_df_S1)
features_df_S9

Unnamed: 0,index,heart_rate_mean,heart_rate_std,heart_rate_min,heart_rate_max,heart_rate_autocorr_lag1,eda_mean,eda_std,eda_min,eda_max,eda_autocorr_lag1,temperature_mean,temperature_std,temperature_min,temperature_max,temperature_autocorr_lag1,session
0,24581,99.564000,6.147692,70.50,104.00,0.823000,0.016786,0.003163,0.000000,0.019220,0.719941,21.916667,0.127062,21.77,22.17,0.955060,Final
1,24582,99.738500,6.048276,70.50,104.00,0.930817,0.017063,0.002269,0.001281,0.019220,0.278687,21.917667,0.126120,21.77,22.17,0.954365,Final
2,24583,100.265667,4.680417,76.25,104.00,0.915683,0.017341,0.000928,0.014095,0.019220,-0.396989,21.918667,0.125164,21.77,22.17,0.953642,Final
3,24584,100.591333,4.048424,76.25,104.00,0.977599,0.017383,0.000830,0.015376,0.019220,-0.516116,21.919667,0.124192,21.77,22.17,0.952889,Final
4,24585,101.027167,2.491912,86.40,104.00,0.942867,0.017405,0.000827,0.015376,0.019220,-0.516116,21.920667,0.123204,21.77,22.17,0.952105,Final
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38565,24517,158.633000,6.964547,148.42,169.32,0.990462,0.276644,0.007750,0.249845,0.287001,0.825275,32.422667,0.016037,32.39,32.45,0.907297,midterm_2
38566,24518,158.714000,6.935824,148.42,169.32,0.990269,0.276986,0.007573,0.249845,0.287001,0.824380,32.423000,0.015977,32.39,32.45,0.906584,midterm_2
38567,24519,158.772333,6.912064,148.42,169.32,0.990033,0.276623,0.008432,0.247282,0.287001,0.676437,32.423333,0.015909,32.39,32.45,0.917953,midterm_2
38568,24520,158.803167,6.897247,148.42,169.32,0.989692,0.276046,0.009920,0.235751,0.287001,0.754078,32.424667,0.016413,32.39,32.47,0.863735,midterm_2


In [46]:
# `subject`, `session` and `signal` label the PNG files that
# build_arima_model saves, so they are (re)assigned before each call.
subject = 'S1'
signal = 'eda_autocorr_lag1'
session = 'midterm_1'
# Single .loc selection instead of chained mask-then-column indexing.
series_S1 = features_df_S1.loc[features_df_S1['session'] == session, signal]
fitted_model_S1 = build_arima_model(series_S1, order=(1, 1, 1), output_dir='plots')

subject = 'S9'
signal = 'temperature_autocorr_lag1'
session = 'Final'
series_S9 = features_df_S9.loc[features_df_S9['session'] == session, signal]
fitted_model_S9 = build_arima_model(series_S9, order=(1, 1, 1), output_dir='plots')

# Each fit keeps its own name so the S1 model is not lost; `fitted_model`
# still refers to the most recent fit, as before.
fitted_model = fitted_model_S9
fitted_model.summary()



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


0,1,2,3
Dep. Variable:,temperature_autocorr_lag1,No. Observations:,14137.0
Model:,"ARIMA(1, 1, 1)",Log Likelihood,33387.014
Date:,"Tue, 06 May 2025",AIC,-66768.028
Time:,15:20:47,BIC,-66745.358
Sample:,0,HQIC,-66760.485
,- 14137,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.1874,0.009,20.817,0.000,0.170,0.205
ma.L1,-0.4445,0.009,-50.501,0.000,-0.462,-0.427
sigma2,0.0005,7.29e-07,712.826,0.000,0.001,0.001

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,15940676.83
Prob(Q):,0.96,Prob(JB):,0.0
Heteroskedasticity (H):,0.65,Skew:,-3.19
Prob(H) (two-sided):,0.0,Kurtosis:,167.39
