In [1]:
import warnings
warnings.simplefilter(action='ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from utils import calculate_CI

In [4]:
# --- Hospitalisation numbers -------------------------------------------------
# NOTE(review): 84357 is presumably Germany's population in thousands, so that
# dividing by (84357 / 100) yields an incidence per 100,000 — confirm source/year.
POPULATION_IN_THOUSANDS = 84357

hosp = (
    pd.read_csv("../data/COVID_hosp.csv")
    # Keep only the rows for geography 'DE' and the two columns used below
    .loc[lambda raw: raw['geography'] == 'DE', ['date', 'total']]
    .rename(columns={'total': 'hosp'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date', drop=True)
)
# Calculate incidence per 100,000 (result is a Series named 'hosp', indexed by date)
hosp = hosp['hosp'] / (POPULATION_IN_THOUSANDS / 100)

# --- Waste water data ---------------------------------------------------------
# The interpolated (LOESS) curve is used; rows without a prediction are dropped,
# which also determines the time period for which waste water data is present.
virus = (
    pd.read_excel("../data/amelag_aggregierte_kurve.xlsx")[["datum", "loess_vorhersage"]]
    .dropna()
    .rename(columns={'datum': 'date', 'loess_vorhersage': 'virus'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date', drop=True)
)

# Inner join on the date index: only dates present in both sources survive
hosp_virus = pd.merge(virus, hosp, on='date', how='inner')

# Drop the first row. NOTE(review): the original comment attributes this to a day
# lost "when differentiating"; no differencing happens in this cell — confirm it
# is still needed.
# `.copy()` makes `df` an independent frame so the column assignment below does not
# write into a slice of `hosp_virus` (chained-assignment / SettingWithCopy pattern).
df = hosp_virus.iloc[1:].copy()

# Lag the waste water signal by 7 rows: each row now pairs `hosp` at position t with
# `virus` from position t-7 (7 days, assuming one row per day — TODO confirm).
df['virus'] = df['virus'].shift(periods=7)
# The first 7 rows have no lagged virus value any more; remove them
df = df.dropna(how='any')

In [5]:
def split_dataframe(df, method, context_length=None, forecast_length=7,
                    max_context_length=70, shift=7):
    """
    Split a dataframe into (context, forecast) chunks with a rolling window.

    Every chunk consists of `context_length` consecutive rows followed directly by
    `forecast_length` rows. The window is moved forward by `shift` rows per chunk.
    The first forecast window always starts at row `max_context_length`, so that
    the forecast windows cover the same rows regardless of the context length in
    use. A chunk whose forecast window would run past the end of `df` is not
    produced.

    Parameters:
    - df: pandas DataFrame, the dataframe to be split
    - method: str, forecasting method the chunks are meant for; only used to pick
      the default context length ("autoarima" -> 70 rows, anything else -> 7 rows)
    - context_length: int or None, length of the context window; None (default)
      derives it from `method` as described above
    - forecast_length: int, the length of the forecast window (default 7)
    - max_context_length: int, the largest context length that has to be
      supported; fixes where the first forecast window starts (default 70)
    - shift: int, the number of rows by which the chunk window moves forward (default 7)

    Returns:
    - chunks: list of tuples, each tuple contains a context window and its
      corresponding forecast window (both are `df.iloc` slices)

    Raises:
    - ValueError: if a length/shift is not positive, or if `context_length`
      exceeds `max_context_length` (the first window would start before row 0)
    """
    if context_length is None:
        context_length = 70 if method == "autoarima" else 7

    if context_length < 1 or forecast_length < 1 or shift < 1:
        raise ValueError("context_length, forecast_length and shift must be positive")
    if context_length > max_context_length:
        raise ValueError(
            f"context_length ({context_length}) must not exceed "
            f"max_context_length ({max_context_length})"
        )

    # First context window is placed so that it ends exactly at max_context_length
    first_start = max_context_length - context_length
    # Last start position for which context + forecast still fit into df
    last_start = len(df) - context_length - forecast_length

    chunks = []
    for context_start in range(first_start, last_start + 1, shift):
        context_end = context_start + context_length
        forecast_end = context_end + forecast_length
        chunks.append((df.iloc[context_start:context_end],
                       df.iloc[context_end:forecast_end]))

    return chunks

In [8]:
def _forecast_autoarima(context_window, forecast_window, ar):
    """Fit auto-ARIMA on the context 'hosp' values and forecast the forecast window.

    With `ar` true only the 'hosp' history is used; otherwise the 'virus' column is
    passed as exogenous regressor for both fitting and prediction.
    """
    horizon = len(forecast_window)
    if ar:
        model = auto_arima(context_window['hosp'], seasonal=False, trace=False)
        return model.predict(n_periods=horizon)

    model = auto_arima(context_window['hosp'], seasonal=False,
                       X=pd.DataFrame(context_window['virus']), trace=False)
    return model.predict(n_periods=horizon, X=pd.DataFrame(forecast_window['virus']))


def _forecast_trend(model, context_window, forecast_window, ar):
    """Fit `model` as a trend regression on the context window and extrapolate.

    With `ar` true the only feature is the running row number (0..n-1 for the
    context, continuing for the forecast). Otherwise the features are a running day
    counter (1..n, continuing for the forecast) plus the 'virus' column.
    """
    n_context = len(context_window)
    n_forecast = len(forecast_window)

    if ar:
        model.fit(np.arange(n_context).reshape(-1, 1), context_window['hosp'])
        future_steps = np.arange(n_context, n_context + n_forecast)
        return model.predict(future_steps.reshape(-1, 1))

    # NOTE(review): 'days' and 'virus' are not put on a common scale, so a penalised
    # model (Ridge) shrinks the two coefficients very differently — confirm intended.
    X_train = pd.DataFrame({
        'days': np.arange(1, n_context + 1),
        'virus': context_window['virus'].to_numpy(),
    })
    X_test = pd.DataFrame({
        'days': np.arange(n_context + 1, n_context + n_forecast + 1),
        'virus': forecast_window['virus'].to_numpy(),
    })
    model.fit(X_train, context_window['hosp'])
    return model.predict(X_test)


def train_forecast_evaluate(train_chunks, method, log=False, ar=True):
    """
    Train a model per chunk, forecast, and evaluate MAPE and MAE for each chunk.

    Parameters:
    - train_chunks: list of tuples, each tuple contains a context window and its
      corresponding forecast window (DataFrames with 'hosp' and 'virus' columns)
    - method: str, method to use for forecasting ('autoarima', 'lin_reg' or 'ridge')
    - log: bool, if True the models are fitted on natural-log transformed windows
      and the forecast is transformed back with exp before evaluation (default False)
    - ar: bool, if True only the 'hosp' history is used; if False the 'virus'
      column is added as an additional regressor (default True)

    Returns:
    - tuple (average_mape, CI_mape, mape_list, average_mae, CI_mae, mae_list):
      the mean MAPE (in percent) and mean MAE rounded to 2 decimals, the values
      returned by `calculate_CI` for each metric, and the per-chunk metric lists

    Raises:
    - ValueError: if `method` is not one of the supported methods
    """
    if method not in ('autoarima', 'lin_reg', 'ridge'):
        # Without this check `forecast` would be unbound (or stale from the
        # previous chunk) further down.
        raise ValueError(f"Unknown method {method!r}; expected 'autoarima', 'lin_reg' or 'ridge'")

    mape_list = []
    mae_list = []

    for context_window, forecast_window in train_chunks:
        # Evaluate against the untransformed observations; this avoids a needless
        # log -> exp round trip of the actual values.
        actual = forecast_window['hosp']

        if log:
            context_window = np.log(context_window)
            forecast_window = np.log(forecast_window)

        if method == 'autoarima':
            forecast = _forecast_autoarima(context_window, forecast_window, ar)
        elif method == 'lin_reg':
            forecast = _forecast_trend(LinearRegression(), context_window, forecast_window, ar)
        else:  # 'ridge'
            forecast = _forecast_trend(Ridge(alpha=5), context_window, forecast_window, ar)

        if log:
            # Back to the original scale before computing the error metrics
            forecast = np.exp(forecast)

        # MAPE is reported in percent
        mape_list.append(mean_absolute_percentage_error(actual, forecast) * 100)
        mae_list.append(mean_absolute_error(actual, forecast))

    CI_mape = calculate_CI(mape_list)
    CI_mae = calculate_CI(mae_list)
    # Calculate average MAPE / MAE over all chunks
    average_mape = np.round(np.mean(mape_list), 2)
    average_mae = np.round(np.mean(mae_list), 2)
    print("Average MAPE using", method, ":", average_mape, "| CI:", CI_mape,
          "Average MAE using", method, ":", average_mae, "| CI:", CI_mae)

    return average_mape, CI_mape, mape_list, average_mae, CI_mae, mae_list


In [9]:
# Run every forecasting method once without (ar=True) and once with (ar=False) the
# waste water regressor, and write per-chunk metrics plus a one-row summary to CSV.
for met in ["lin_reg", "ridge", "autoarima"]:
    for setting in [True, False]:
        method, ar, log = met, setting, True

        output_name_metrics = f"../output/{method}_{setting}_metrics.csv"
        output_name_summary = f"../output/{method}_{setting}_summary.csv"

        train_chunks = split_dataframe(df, method=method)
        print(method, "ar=", setting)
        (average_mape, CI_mape, mape_list,
         average_mae, CI_mae, mae_list) = train_forecast_evaluate(
            train_chunks, method=method, log=log, ar=ar
        )

        median_mape = np.round(np.median(mape_list), 2)
        median_mae = np.round(np.median(mae_list), 2)

        # One row per chunk
        metrics = pd.DataFrame({"MAPE": mape_list, "MAE": mae_list})

        # Single-row overview; the CI values are the lower/upper entries of calculate_CI's result
        summary_row = {
            "Mean_MAPE": average_mape,
            "Median_MAPE": median_mape,
            "CI_low_MAPE": CI_mape[0],
            "CI_up_MAPE": CI_mape[1],
            "Mean_MAE": average_mae,
            "Median_MAE": median_mae,
            "CI_low_MAE": CI_mae[0],
            "CI_up_MAE": CI_mae[1],
        }
        summary = pd.DataFrame(summary_row, index=[0])

        metrics.to_csv(output_name_metrics, index=False)
        summary.to_csv(output_name_summary, index=False)




lin_reg ar= True
Average MAPE using lin_reg : 8.51 | CI: ['7.05', '9.97'] Average MAE using lin_reg : 0.57 | CI: ['0.40', '0.74']
lin_reg ar= False
Average MAPE using lin_reg : 11.85 | CI: ['8.74', '14.96'] Average MAE using lin_reg : 0.71 | CI: ['0.49', '0.92']
ridge ar= True
Average MAPE using ridge : 8.1 | CI: ['6.77', '9.43'] Average MAE using ridge : 0.54 | CI: ['0.39', '0.68']
ridge ar= False
Average MAPE using ridge : 8.1 | CI: ['6.77', '9.43'] Average MAE using ridge : 0.54 | CI: ['0.39', '0.68']
autoarima ar= True


KeyboardInterrupt: 