In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import os
import re
import numpy as np
import pickle

from scipy.interpolate import CubicSpline, PchipInterpolator, Akima1DInterpolator

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
def load_timeseries(fname, *, data_dir='data'):
    '''
    Load a timeseries CSV file into a dataframe with a parsed 'Date' column.

    Parameters
    ----------
    fname : str
        Name of the CSV file, relative to data_dir.
    data_dir : str, optional
        Directory that holds the CSV file. Defaults to 'data' (relative to
        the current working directory).

    Returns
    -------
    pandas.DataFrame
        The contents of the file, with the 'Date' column converted to
        datetimes by pd.to_datetime.

    Raises
    ------
    KeyError
        If the file has no 'Date' column.
    '''
    ts = pd.read_csv(os.path.join(data_dir, fname))
    ts['Date'] = pd.to_datetime(ts['Date'])

    return ts

def interpolate_cubicspline(ts, *, step_size='ME', date_col='Date', value_col='Average City MPG'):
    '''
    This function will perform a cubic spline interpolation on a given timeseries dataframe
    and, optionally, given a new-stepsize.
    e.g. 'D' for daily, 'ME' for monthly, ...
    The default (if not specified) new-stepsize is monthly ('ME')

    Parameters
    ----------
    ts : pandas.DataFrame
        Dataframe holding the date column and the value column. Rows may be in
        any order; they are sorted by date before fitting.
    step_size : str, optional
        Pandas frequency string for the new, finer date grid.
    date_col : str, optional
        Name of the date column (default 'Date').
    value_col : str, optional
        Name of the column to interpolate (default 'Average City MPG').

    Returns
    -------
    pandas.DataFrame
        Two columns (date_col, value_col) sampled at step_size between the
        earliest and latest date of ts.

    Raises
    ------
    ValueError
        Raised by pandas/scipy when ts has no valid dates or contains
        duplicate dates (the spline needs strictly increasing x values).
    '''

    # The spline needs strictly increasing x values, so order the rows by date
    dates = pd.DatetimeIndex(ts[date_col])
    order = dates.argsort()
    dates = dates[order]
    values = ts[value_col].to_numpy(dtype=float)[order]

    # Create the smaller dates for interpolation
    interp_dates = pd.date_range(start=dates.min(), end=dates.max(), freq=step_size)

    # Express every date as float days since the first observation, so the
    # interpolator gets a plain numeric axis instead of relying on an
    # implicit datetime -> float cast
    origin = dates.min()
    one_day = pd.Timedelta(days=1)
    known_x = ((dates - origin) / one_day).to_numpy(dtype=float)
    interp_x = ((interp_dates - origin) / one_day).to_numpy(dtype=float)

    # Fit the cubic spline and evaluate it on the new date grid
    cs = CubicSpline(known_x, values)
    interp_values = cs(interp_x)

    interp_ts = pd.DataFrame({date_col: interp_dates, value_col: interp_values})

    return interp_ts

def interpolate_akima1d(ts, *, step_size='ME', date_col='Date', value_col='Average City MPG'):
    '''
    This function will perform an Akima1d interpolation on a given timeseries dataframe
    and, optionally, given a new-stepsize.
    e.g. 'D' for daily, 'ME' for monthly, ...
    The default (if not specified) new-stepsize is monthly ('ME')

    Why use Akima1d instead of cubic spline? Cubic splines can sometimes
    'over-oscillate' between the known points. The Akima method still passes
    through every known point but tends to produce fewer of those wiggles.

    Parameters
    ----------
    ts : pandas.DataFrame
        Dataframe holding the date column and the value column. Rows may be in
        any order; they are sorted by date before fitting.
    step_size : str, optional
        Pandas frequency string for the new, finer date grid.
    date_col : str, optional
        Name of the date column (default 'Date').
    value_col : str, optional
        Name of the column to interpolate (default 'Average City MPG').

    Returns
    -------
    pandas.DataFrame
        Two columns (date_col, value_col) sampled at step_size between the
        earliest and latest date of ts.

    Raises
    ------
    ValueError
        Raised by pandas/scipy when ts has no valid dates or contains
        duplicate dates (the interpolator needs strictly increasing x values).
    '''

    # The interpolator needs strictly increasing x values, so order the rows by date
    dates = pd.DatetimeIndex(ts[date_col])
    order = dates.argsort()
    dates = dates[order]
    values = ts[value_col].to_numpy(dtype=float)[order]

    # Create the smaller dates for interpolation
    interp_dates = pd.date_range(start=dates.min(), end=dates.max(), freq=step_size)

    # Express every date as float days since the first observation, so the
    # interpolator gets a plain numeric axis instead of relying on an
    # implicit datetime -> float cast
    origin = dates.min()
    one_day = pd.Timedelta(days=1)
    known_x = ((dates - origin) / one_day).to_numpy(dtype=float)
    interp_x = ((interp_dates - origin) / one_day).to_numpy(dtype=float)

    # Fit the Akima interpolator and evaluate it on the new date grid
    akima = Akima1DInterpolator(known_x, values)
    interp_values = akima(interp_x)

    interp_ts = pd.DataFrame({date_col: interp_dates, value_col: interp_values})

    return interp_ts


def interpolate_pchip(ts, *, step_size='ME', date_col='Date', value_col='Average City MPG'):
    '''
    This function will perform a PCHIP interpolation on a given timeseries dataframe
    and, optionally, given a new-stepsize.
    e.g. 'D' for daily, 'ME' for monthly, ...
    The default (if not specified) new-stepsize is monthly ('ME')

    Why use PCHIP instead of cubic spline? Cubic splines can sometimes
    'over-oscillate' between the known points. PCHIP preserves the
    monotonicity of the data and does not overshoot; the trade-off is that
    only its first derivative is guaranteed to be continuous.

    Parameters
    ----------
    ts : pandas.DataFrame
        Dataframe holding the date column and the value column. Rows may be in
        any order; they are sorted by date before fitting.
    step_size : str, optional
        Pandas frequency string for the new, finer date grid.
    date_col : str, optional
        Name of the date column (default 'Date').
    value_col : str, optional
        Name of the column to interpolate (default 'Average City MPG').

    Returns
    -------
    pandas.DataFrame
        Two columns (date_col, value_col) sampled at step_size between the
        earliest and latest date of ts.

    Raises
    ------
    ValueError
        Raised by pandas/scipy when ts has no valid dates or contains
        duplicate dates (the interpolator needs strictly increasing x values).
    '''

    # The interpolator needs strictly increasing x values, so order the rows by date
    dates = pd.DatetimeIndex(ts[date_col])
    order = dates.argsort()
    dates = dates[order]
    values = ts[value_col].to_numpy(dtype=float)[order]

    # Create the smaller dates for interpolation
    interp_dates = pd.date_range(start=dates.min(), end=dates.max(), freq=step_size)

    # Express every date as float days since the first observation, so the
    # interpolator gets a plain numeric axis instead of relying on an
    # implicit datetime -> float cast
    origin = dates.min()
    one_day = pd.Timedelta(days=1)
    known_x = ((dates - origin) / one_day).to_numpy(dtype=float)
    interp_x = ((interp_dates - origin) / one_day).to_numpy(dtype=float)

    # Fit the PCHIP interpolator and evaluate it on the new date grid
    pchip = PchipInterpolator(known_x, values)
    interp_values = pchip(interp_x)

    interp_ts = pd.DataFrame({date_col: interp_dates, value_col: interp_values})

    return interp_ts
    

def test_stationarity(timeseries):
    '''
    Run the augmented Dickey-Fuller test (lag chosen by AIC) on a series
    and return the results as a labelled pandas Series.

    Prints a header line, then returns a Series holding the test statistic,
    the p-value, the number of lags used, the number of observations used
    and one "Critical Value (<level>)" entry per critical value reported by
    adfuller.
    '''
    print("Dickey-Fuller test results:")
    adf_result = adfuller(timeseries, autolag="AIC")

    # The first four entries of the result are scalars; label them
    labels = ["Test Statistic", "p-value", "#Lags Used", "Number of Observations Used"]
    summary = pd.Series(adf_result[0:4], index=labels)

    # The fifth entry maps each significance level to its critical value
    critical_values = adf_result[4]
    for level in critical_values:
        summary["Critical Value (%s)" % level] = critical_values[level]

    return summary

def forecast_test(ts, *, train_size=0.8, seasonal=False, m=1, information_criterion='aic'):
    '''
    Fit an auto_arima model on the first part of a timeseries and forecast
    the remaining part, so the forecast can be compared with the held-out data.

    Parameters
    ----------
    ts : pandas.DataFrame
        Timeseries with a 'Date' column and an 'Average City MPG' column,
        ordered chronologically.
    train_size : float, optional
        Fraction of the rows (taken from the start) used for training.
        Defaults to 0.8, i.e. an 80/20 train/test split.
    seasonal, m, information_criterion
        Passed straight through to auto_arima.

    Returns
    -------
    tuple
        (model_, train_ts, test_ts, forecast_ts) where model_ is the fitted
        auto_arima model, train_ts is the training slice, and test_ts /
        forecast_ts are the held-out rows and the forecast for those same
        dates. Both test_ts and forecast_ts start with the last training
        row, which lets plotted lines join up with the training curve.

    Raises
    ------
    ValueError
        If the split would leave the training set or the test set empty.
    '''
    # Split chronologically; keep the fraction and the row count in separate names
    n_train = int(len(ts) * train_size)
    if not 0 < n_train < len(ts):
        raise ValueError(
            f"train_size={train_size!r} splits {len(ts)} rows into {n_train} training rows; "
            "both the training set and the test set must be non-empty"
        )
    train_ts, test_ts = ts[:n_train], ts[n_train:]

    model_ = auto_arima(train_ts['Average City MPG'], trace=True, seasonal=seasonal, m=m,
                        information_criterion=information_criterion)

    # Forecast one value per held-out row
    forecast_steps = len(test_ts)
    forecast_values = model_.predict(n_periods=forecast_steps)

    # Label the forecasts with the held-out rows' own dates and index. This
    # avoids depending on pd.infer_freq (None for irregular dates), on 'Date'
    # being the first column, and on what index predict() attaches to its output.
    forecast_ts = pd.DataFrame(
        {'Date': test_ts['Date'].to_numpy(), 'Average City MPG': np.asarray(forecast_values)},
        index=test_ts.index,
    )

    # Append the last time date of the training set to the test and forecast set.
    # This helps with plotting
    last_train_row = train_ts.iloc[[-1]]
    test_ts = pd.concat([last_train_row, test_ts])
    forecast_ts = pd.concat([last_train_row, forecast_ts])

    return model_, train_ts, test_ts, forecast_ts
