# Part 2: Time Series Modeling

In this notebook, you will implement functions to extract features from time series data and build ARIMA models.

In [2]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from pathlib import Path
import os
# Set style for plots
plt.style.use("seaborn-v0_8")
%matplotlib inline
# matplotlib (3.6+) deprecated the seaborn style names it ships, so plt.style.use('seaborn') kept failing with:
# FileNotFoundError: [Errno 2] No such file or directory: 'seaborn'
# Stack Overflow suggested using the renamed 'seaborn-v0_8' style instead, and it works:
# https://stackoverflow.com/questions/74716259/the-seaborn-styles-shipped-by-matplotlib-are-deprecated-since-3-6

## 1. Feature Extraction

Implement the `extract_time_series_features` function to calculate rolling window features.

In [3]:
def extract_time_series_features(data, window_size=60):
    """Extract rolling window features from time series data.

    For every (session, subject_id) group, a trailing time-based window of
    ``window_size`` seconds is moved over the EDA, heart-rate and temperature
    signals, and the mean, standard deviation, minimum, maximum and lag-1
    autocorrelation of each window are computed.

    Parameters
    ----------
    data : pd.DataFrame
        Preprocessed physiological data. Must contain the columns
        ``session``, ``subject_id``, ``eda``, ``heart_rate`` and
        ``temperature`` and be indexed by timestamp (a datetime-like index),
        because the rolling window is given as a time offset.
    window_size : int
        Size of the rolling window in seconds

    Returns
    -------
    pd.DataFrame
        One row per observation of each group, with the columns
        ``{eda,hr,temp}_{mean,std,min,max,autocorr}`` followed by
        ``subject_id``, ``session`` and ``timestamp``. When ``data`` yields no
        groups, an empty frame with those columns is returned.

    Notes
    -----
    The autocorrelation is reported as 0 for windows holding fewer than three
    observations: lag-1 autocorrelation correlates consecutive pairs, and a
    correlation over fewer than two pairs is undefined (it previously came
    back as NaN together with numpy RuntimeWarnings).
    """
    # (column in `data`, prefix of the generated feature names), in output order.
    signals = (('eda', 'eda'), ('heart_rate', 'hr'), ('temperature', 'temp'))
    statistics = ('mean', 'std', 'min', 'max', 'autocorr')
    # Three samples give the two lagged pairs a correlation needs.
    min_autocorr_samples = 3

    def autocorr(x):
        if len(x) < min_autocorr_samples:
            return 0
        return x.autocorr(lag=1)

    features = []

    for (session, subject_id), group in data.groupby(['session', 'subject_id']):
        # Time-offset windows require a monotonic index; a stable sort keeps
        # the relative order of rows that share a timestamp.
        if not group.index.is_monotonic_increasing:
            group = group.sort_index(kind='mergesort')

        columns = {}
        for column, prefix in signals:
            window = group[column].rolling(f'{window_size}s', min_periods=1)
            columns[f'{prefix}_mean'] = window.mean()
            columns[f'{prefix}_std'] = window.std()
            columns[f'{prefix}_min'] = window.min()
            columns[f'{prefix}_max'] = window.max()
            columns[f'{prefix}_autocorr'] = window.apply(autocorr)

        feature_df = pd.DataFrame(columns)
        feature_df['subject_id'] = subject_id
        feature_df['session'] = session
        feature_df['timestamp'] = feature_df.index

        features.append(feature_df.reset_index(drop=True))

    if not features:
        # pd.concat refuses an empty list, so build the empty result explicitly.
        feature_names = [f'{prefix}_{stat}' for _, prefix in signals for stat in statistics]
        return pd.DataFrame(columns=feature_names + ['subject_id', 'session', 'timestamp'])

    return pd.concat(features, ignore_index=True)

In [7]:
def load_processed_data(processed_dir='data/processed'):
    """Load and concatenate all processed data CSVs.

    Parameters
    ----------
    processed_dir : str
        Directory whose ``.csv`` files are read. Each file must have a
        ``timestamp`` column, which is parsed to datetimes.

    Returns
    -------
    pd.DataFrame
        The rows of every CSV stacked under a fresh integer index, with files
        taken in sorted file-name order.

    Raises
    ------
    ValueError
        If ``processed_dir`` contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sort so the concatenated
    # row order is reproducible across runs and machines.
    csv_names = sorted(name for name in os.listdir(processed_dir) if name.endswith('.csv'))
    if not csv_names:
        raise ValueError(f"No .csv files found in {processed_dir!r}")

    frames = []
    for name in csv_names:
        frame = pd.read_csv(os.path.join(processed_dir, name))
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Load every processed CSV and index the combined frame by timestamp; the
# time-based rolling windows used for feature extraction need that index.
processed_data = load_processed_data().set_index('timestamp')

# Rolling-window features over 60-second windows.
features_df = extract_time_series_features(data=processed_data, window_size=60)

# Preview every 100th feature row beneath a divider line.
print("\n\n\n","------"*15)
print(features_df.iloc[::100].head())

  c = cov(x, y, rowvar, dtype=dtype)
  c = cov(x, y, rowvar, dtype=dtype)
  c = cov(x, y, rowvar, dtype=dtype)





 ------------------------------------------------------------------------------------------
     eda_mean   eda_std   eda_min   eda_max  eda_autocorr     hr_mean  \
0    0.010463       NaN  0.010463  0.010463      0.000000         NaN   
100  0.008027  0.000284  0.007624  0.009129      0.631460  100.397104   
200  0.007654  0.000184  0.007271  0.008072      0.191001  107.084808   
300  0.007574  0.000190  0.006983  0.008008     -0.113827   86.100800   
400  0.007727  0.000254  0.007336  0.008937     -0.063293  109.156142   

       hr_std     hr_min    hr_max  hr_autocorr  temp_mean  temp_std  \
0         NaN        NaN       NaN          NaN  21.936667       NaN   
100  3.558109   93.93700  106.3705     0.999440  22.311733  0.121036   
200  2.646762  100.52975  109.5330     0.999258  22.215800  0.042333   
300  4.374770   81.22975   95.2535     0.997687  22.344067  0.024652   
400  4.104146  103.41250  116.2540     0.999867  21.935267  0.515855   

      temp_min   temp_max  temp_a

## 2. ARIMA Modeling

Implement the `build_arima_model` function to fit ARIMA models and generate diagnostic plots.

In [15]:
from statsmodels.tsa.arima.model import ARIMA

def _save_arima_plot(lines, ylabel, output_dir, filename):
    """Draw ``lines`` on a new figure, title it with ``filename`` and save it in ``output_dir``.

    ``lines`` is a list of ``(x, y, style)`` tuples; when ``x`` is None, ``y``
    is plotted against its own index.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        for x, y, style in lines:
            if x is None:
                ax.plot(y, **style)
            else:
                ax.plot(x, y, **style)
        ax.set_title(filename)
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.legend()
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, filename))
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)


def build_arima_model(series, order=(1,1,1), output_dir='plots', subject_id='S1', session='Midterm 1', signal='heart_rate', forecast_steps=50):
    """Fit an ARIMA model to the time series and generate diagnostic plots.

    Three PNG files are written to ``output_dir``, each named
    ``{subject_id}_{session}_{signal}_arima_{fit,residuals,forecast}.png``.

    Parameters
    ----------
    series : pd.Series
        Time series data to model
    order : tuple
        (p,d,q) order of the ARIMA model
    output_dir : str
        Directory to save diagnostic plots (created if missing)
    subject_id : str
        Subject identifier (e.g., 'S1')
    session : str
        Session identifier (e.g., 'Midterm 1')
    signal : str
        Physiological signal being modeled (e.g., 'heart_rate')
    forecast_steps : int
        Number of out-of-sample steps shown in the forecast plot

    Returns
    -------
    statsmodels.tsa.arima.model.ARIMAResults
        Fitted ARIMA model
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    stem = f'{subject_id}_{session}_{signal}_arima'

    # 1. Fit ARIMA model
    fitted_model = ARIMA(series, order=order).fit()

    # 2. Diagnostic plots
    # NOTE(review): with d > 0 the first fitted value / residual is presumably
    # dominated by the model's initialization rather than the data; consider
    # dropping the first `d` points from these two plots — confirm first.
    _save_arima_plot(
        [
            (None, fitted_model.fittedvalues, dict(color='blue', label='Fitted values')),
            (None, series, dict(color='gray', alpha=0.5, label='Original series')),
        ],
        ylabel='Value',
        output_dir=output_dir,
        filename=f'{stem}_fit.png',
    )
    _save_arima_plot(
        [(None, fitted_model.resid, dict(color='red', label='Residuals'))],
        ylabel='Residuals',
        output_dir=output_dir,
        filename=f'{stem}_residuals.png',
    )

    # 3. Forecast plot. Both lines are drawn against observation position:
    # plotting the series against its own index and the forecast against
    # integer positions would put them on incompatible x-axes whenever the
    # series index is not a plain 0..n-1 range (e.g. timestamps).
    forecast = fitted_model.forecast(steps=forecast_steps)
    n_obs = len(series)
    observed_x = np.arange(n_obs)
    forecast_x = np.arange(n_obs, n_obs + len(forecast))
    _save_arima_plot(
        [
            (observed_x, np.asarray(series), dict(label='Original Series')),
            (forecast_x, np.asarray(forecast), dict(label='Forecast', color='green')),
        ],
        ylabel='Value',
        output_dir=output_dir,
        filename=f'{stem}_forecast.png',
    )

    # 4. Return the fitted ARIMA model
    return fitted_model

In [16]:
# Parameters shared by both runs
session = 'Midterm 1'
signal = 'heart_rate'

# Fit one ARIMA model per subject, each on that subject's own recording.
# Previously the heart rate of every subject and session was concatenated
# into a single series and fitted twice under the labels 'S1' and 'S2'.
# NOTE(review): assumes `subject_id` / `session` are stored in processed_data
# as exactly these labels — confirm against the processed CSVs.
arima_models = {}
for subject_id in ('S1', 'S2'):
    in_recording = (processed_data['subject_id'] == subject_id) & (processed_data['session'] == session)
    signal_series = processed_data.loc[in_recording, signal].dropna()
    if signal_series.empty:
        raise ValueError(f'No {signal} data for subject {subject_id!r}, session {session!r}')

    # Run the ARIMA model and generate the plots for this subject
    arima_models[subject_id] = build_arima_model(signal_series, order=(1, 1, 1), output_dir='plots', subject_id=subject_id, session=session, signal=signal)

# Keep the most recently fitted model available under its original name
model = arima_models[subject_id]

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
