# Part 2: Time Series Modeling

In this notebook, you will implement functions to extract features from time series data and build ARIMA models.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from pathlib import Path
import os

# Set style for plots
plt.style.use('seaborn')
%matplotlib inline

In [13]:
import pandas as pd

def extract_time_series_features(data_path, window_size=60):
    
    def autocorr(x):
        return x.autocorr(lag=1) if len(x) > 1 else np.nan

    data = data_path.sort_values('timestamp')

    results = data[['timestamp', 'subject_id', 'session']].copy()

    cols = ['heart_rate', 'eda', 'temperature']

    for col in cols:
        results[f'{col}_mean'] = data[col].rolling(window=window_size).mean()
        results[f'{col}_sd'] = data[col].rolling(window=window_size).std()
        results[f'{col}_min'] = data[col].rolling(window=window_size).min()
        results[f'{col}_max'] = data[col].rolling(window=window_size).max()
        results[f'{col}_autocor'] = data[col].rolling(window=window_size).apply(autocorr)
        
    return results

data_list = pd.DataFrame()
for i in range(1, 11):
    df = pd.read_csv(f'data/processed/S{i}_processed.csv')
    data_list = pd.concat([data_list, df])

features_df = extract_time_series_features(data_list, window_size=60)
print(features_df)
#features_df = extract_time_series_features(data_path='data/processed', window_size=60)


                 timestamp subject_id    session  heart_rate_mean  \
15216  2018-10-13 12:55:21         S5  Midterm 1              NaN   
15217  2018-10-13 12:55:22         S5  Midterm 1              NaN   
15218  2018-10-13 12:55:23         S5  Midterm 1              NaN   
15219  2018-10-13 12:55:24         S5  Midterm 1              NaN   
15220  2018-10-13 12:55:25         S5  Midterm 1              NaN   
...                    ...        ...        ...              ...   
25817  2018-12-05 23:39:11         S3      Final       103.219333   
25818  2018-12-05 23:39:12         S3      Final       103.072167   
25819  2018-12-05 23:39:13         S3      Final       102.932333   
25820  2018-12-05 23:39:14         S3      Final       102.769333   
25821  2018-12-05 23:39:15         S3      Final       102.611000   

       heart_rate_sd  heart_rate_min  heart_rate_max  heart_rate_autocor  \
15216            NaN             NaN             NaN                 NaN   
15217            Na

In [22]:
import os
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

def build_arima_model(series, order=(1,1,1), output_dir='plots'):
    """
    Fits ARIMA model to the series and generates diagnostic plots.
    """
    os.makedirs(output_dir, exist_ok=True) # make directory if not currently exists

    model = ARIMA(series, order=order)
    fitted_model = model.fit()

    # actual vs. fitted
    plt.figure(figsize=(10, 4)) # create figure
    plt.plot(series, label='Actual') #plot actual
    plt.plot(fitted_model.fittedvalues, label='Fitted', alpha=0.7) # plot fitted model
    plt.title('ARIMA Fit')
    plt.legend()
    fit_plot_name = os.path.join(output_dir, f"{series.name}_arima_fit.png")
    plt.savefig(fit_plot_name) # save figure to output path
    plt.close()

    # Residuals plot
    residuals = fitted_model.resid # find residuals
    plt.figure(figsize=(10, 4)) # create figure
    plt.plot(residuals)
    plt.axhline(0, linestyle='--', color='gray')
    plt.title('ARIMA Residuals')
    residual_plot_name = os.path.join(output_dir, f"{series.name}_arima_residuals.png")
    plt.savefig(residual_plot_name) # save figure to output path
    plt.close()

    return fitted_model

import pandas as pd

df = pd.read_csv('data/processed/S1_processed.csv')
df = df.sort_values('timestamp')

heart_rate_series = df['heart_rate'].dropna()
fitted_model = build_arima_model(
    series=heart_rate_series,
    order=(1, 1, 1),
    output_dir='plots'
)

print(fitted_model.summary())

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:             heart_rate   No. Observations:                44355
Model:                 ARIMA(1, 1, 1)   Log Likelihood              -29435.164
Date:                Fri, 02 May 2025   AIC                          58876.329
Time:                        10:06:01   BIC                          58902.428
Sample:                             0   HQIC                         58884.550
                              - 44355                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9619      0.001   1819.344      0.000       0.961       0.963
ma.L1         -0.7578      0.001  -1070.913      0.000      -0.759      -0.756
sigma2         0.2208   8.04e-05   2747.208      0.0