## Calculates squared error metrics including overall and weekly analysis

We use the root mean square error (RMSE) to assess the accuracy of a forecast. Let $t_0$ be the first day of a forecast and $f_{win}$ be the forecast length. Then we define

* Root mean square error (RMSE)
$$
RMSE = \sqrt{\frac{\sum_{i=t_0}^{t_0+f_{win}-1} (\text{trap}_i - \hat{y}_i)^2}{f_{win}}}
$$



where $\text{trap}_i$ is the trap count and $\hat{y}_i$ is the smoothed neural network prediction for day $i$. We report average metric values, averaging over slightly different scales, as described below.

Start by importing necessary libraries and turning off warnings

In [None]:
import os, sys, importlib, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
from scipy.stats import nbinom as nbinom  
import itertools
sys.path.append( os.path.abspath(os.path.join('..')) )


import utils.forecast as forecast
import utils.utils as gen_utils

import warnings
warnings.filterwarnings('ignore')

# Load the JSON config that maps logical names to file and directory paths.
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open for the lifetime of the kernel.
with open('../fpaths_config.json') as f:
    paths = json.load(f)

# Directories holding the '<site>_<weather>_{raw,smoothed}_weekly_predictions.csv' inputs
raw_data_path = paths['raw_data']
smoothed_data_path = paths['smoothed_data']
# Tab-separated output file for the overall mean RMSE results
metric_fil = paths["mean_metric_fil"]
# Directory receiving the per-site '*amt_metrics.csv' files
amt_metrics_path = paths['amt_metrics']
# Tab-separated output files for the scaling-ratio tables (forecasts / baselines)
scaling_rto_fil = paths["scaling_rto_fil"]
baseline_scaling_rto_fil = paths["baseline_scaling_rto_fil"]

## Overall mean metric

Here we compute the average RMSE across all forecasts. We test combinations of scaler windows and forecast windows.

In [None]:
importlib.reload(forecast)

# Start a fresh results file containing only the header row.
# Presumably forecast.avg_metric_analysis appends one row per window
# combination to this file -- confirm in utils/forecast.py.
with open(metric_fil, 'w') as f:
    f.write('Site\tWeather\tScaler_win\tForecast_win\tTrap_smoothing_flag\tRMSE_mu\tRMSE_sig\n')

# Test site and airport weather compared to raw and smoothed trap
# Neural network predictions are always smoothed
weather_flags = ['site', 'airport']
smoothing_flags = [False, True]

sites = ['Arboleda', 'Playa', 'La_Margarita', 'Villodas']

# Every (scaler window, forecast window) pair is evaluated for every site.
# These do not depend on the loop variables, so they are built once.
scaler_wins = [4, 8, 13, 17, 21, 26]
forecast_wins = [3, 4, 8, 17, 21, 30, 34, 52, 80, 150]
scaler_forecast_wins = list(itertools.product(scaler_wins, forecast_wins))

for weather_flag, smoothing_flag in itertools.product(weather_flags, smoothing_flags):
    for site in sites:
        # The smoothing flag selects which prediction file (and directory) is read
        if smoothing_flag:
            nn_fil_name = '{}/{}_{}_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site,weather_flag)
        else:
            nn_fil_name = '{}/{}_{}_raw_weekly_predictions.csv'.format(raw_data_path,site,weather_flag)

        print(site, weather_flag, smoothing_flag)
        site_info = [site, weather_flag, smoothing_flag]
        site_nn_data = gen_utils.load_csv(nn_fil_name)
        site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

        forecast.avg_metric_analysis(site_info, site_nn_data, scaler_forecast_wins, metric_fil)

Next we define a baseline forecast by scaling the neural network predictions to the mean of the site trap data for either 80 or 150 weeks, depending on location. We then compute the metrics for this baseline forecast and average over the 6 years.

In [None]:
## Metrics for baseline forecast
from sklearn.metrics import mean_squared_error as mse

importlib.reload(forecast)

weather_flags = ['site', 'airport']
smoothing_flags = [False, True]

sites = ['Arboleda', 'Playa', 'La_Margarita', 'Villodas']

for weather_flag, smoothing_flag in itertools.product(weather_flags, smoothing_flags):
    for site in sites:
        if smoothing_flag:
            nn_fil_name = '{}/{}_{}_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site,weather_flag)
        else:
            nn_fil_name = '{}/{}_{}_raw_weekly_predictions.csv'.format(raw_data_path,site,weather_flag)

        site_nn_data = gen_utils.load_csv(nn_fil_name)
        site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

        # Label written to the results file; kept separate from the loop
        # variable so `site` is not rebound mid-iteration.
        baseline_site = '{}_baseline'.format(site)

        if 'la_margarita' in site.lower():
            #In MoLS the baseline is ~150weeks of trap data for La Margarita
            f_win = 150
        else:
            #And ~80weeks of trap data for Villodas, Arboleda, and Playa
            f_win = 80

        # One RMSE per sliding window of f_win consecutive rows.
        # NOTE(review): the range stops one short of the last full window
        # (start index len - f_win); kept as in the original -- confirm intent.
        rmses = []
        for i in range(0, len(site_nn_data) - f_win):
            # Copy only the window rather than deep-copying the whole frame
            # on every iteration; the source frame is still left untouched.
            subset = site_nn_data.iloc[i:i+f_win, :].copy()

            scaler = forecast.scale_rto(subset)
            scaled_nn_preds = scaler*subset['Neural Network']

            trap = subset.Ref.values
            nn = scaled_nn_preds.values

            # Take the root explicitly: the `squared=False` keyword is
            # deprecated and removed in recent scikit-learn releases.
            rmses.append(np.sqrt(mse(trap, nn)))

        if not rmses:
            # Not enough rows for a single window: the statistics below
            # are undefined and nanargmin would raise.
            print('{}: no more than {} rows available, skipping baseline metrics'.format(baseline_site, f_win))
            continue

        rmses = np.asarray(rmses)
        avg_rmse = np.average(rmses)
        sig_rmse = np.std(rmses)

        # Index of the window whose RMSE is closest to the 25% quantile
        quant = np.quantile(rmses, .25)
        first_quant = np.nanargmin(np.abs(rmses - quant))
        print(baseline_site, weather_flag, smoothing_flag, first_quant)

        # Baseline rows use 0 in the Scaler_win column
        with open(metric_fil, 'a') as f:
            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(baseline_site, weather_flag, 0, f_win,
                                                          smoothing_flag, avg_rmse, sig_rmse))
    

## Amt metric analysis

Now we want to see how the quality of forecasts changes over time. We create forecasts using a 13 week scaling window and a 52 week forecasting window. Then we compute the average metric for each 4 weeks' worth of predictions (i.e. weeks 1-4 after $t_0$, weeks 5-8 after $t_0$, ...). We report these results for all $t_0$ values.

In [None]:
importlib.reload(forecast)

# Test airport weather compared to true weather and impact of Fourier smoothing
# Neural network predictions are always smoothed
weather_flags = ['site', 'airport']
smoothing_flags = [False, True]

sites = ['Arboleda', 'Playa', 'La_Margarita', 'Villodas']

# A single scaler/forecast window pair is analysed in this cell
scaler_wins = [13]
forecast_wins = [52]
scaler_forecast_wins = list(itertools.product(scaler_wins, forecast_wins))

# Creates missing parent directories too and tolerates an existing directory
os.makedirs(amt_metrics_path, exist_ok=True)

#Chunk of time to calculate metrics (i.e. 1 wk,...len(weekly) wks)
amt = 4

# Header: fixed columns followed by one RMSE column per chunk, labelled by the
# first week of the chunk (1, 1 + amt, ...). It is the same for every file.
header = 'Site\tWeather\tScaler_win\tForecast_win\tTrap_smoothing_flag\tt_0\tTotal_RMSE\t'
header += ''.join('amt{}_RMSE\t'.format(i) for i in range(1, forecast_wins[0], amt))

for weather_flag, smoothing_flag in itertools.product(weather_flags, smoothing_flags):
    for site in sites:
        # A dedicated name is used for the per-site output file so the
        # config-level `metric_fil` (the overall mean-metric file) is not
        # overwritten for cells that are re-run afterwards.
        if smoothing_flag:
            nn_fil_name = '{}/{}_{}_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site,weather_flag)
            amt_metric_fil = '{}/{}_{}_smoothed_{}amt_metrics.csv'.format(amt_metrics_path,site,weather_flag,amt)
        else:
            nn_fil_name = '{}/{}_{}_raw_weekly_predictions.csv'.format(raw_data_path,site,weather_flag)
            amt_metric_fil = '{}/{}_{}_raw_{}amt_metrics.csv'.format(amt_metrics_path,site,weather_flag,amt)

        print(site, weather_flag, smoothing_flag)
        site_info = [site, weather_flag, smoothing_flag]

        # Only write the header when the file does not exist yet; an existing
        # file is left as it is.
        if not os.path.exists(amt_metric_fil):
            with open(amt_metric_fil, 'w') as f:
                f.write('{}\n'.format(header))

        # Load the predictions once (the original loaded the same CSV twice)
        site_nn_data = gen_utils.load_csv(nn_fil_name)
        site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

        forecast.t0_metric_analysis(site_info, site_nn_data, scaler_forecast_wins, amt_metric_fil, amt)

## Scaling rto over time

Forecasts

In [None]:
importlib.reload(forecast)

# Test site and airport weather compared to raw and smoothed trap
# Neural network predictions are always smoothed
weather_flags = ['site', 'airport']
smoothing_flags = [False, True]

sites = ['Arboleda', 'Playa', 'La_Margarita', 'Villodas']

scaler_wins = [13]

# Leading columns of the output table, in this order
result_columns = ['Site', 'Weather', 'Smoothing', 'Scaler_win', 't_0', 'Scaler_rto']

# Per-site frames are collected and concatenated once at the end instead of
# growing a DataFrame inside the loop.
scaler_frames = []

for weather_flag, smoothing_flag in itertools.product(weather_flags, smoothing_flags):
    for site in sites:
        if smoothing_flag:
            nn_fil_name = '{}/{}_{}_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site,weather_flag)
        else:
            nn_fil_name = '{}/{}_{}_raw_weekly_predictions.csv'.format(raw_data_path,site,weather_flag)

        print(site, weather_flag, smoothing_flag)
        site_nn_data = gen_utils.load_csv(nn_fil_name)
        site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

        scaler_df = forecast.scaler_analysis(site_nn_data, scaler_wins)
        # Record the window that was actually used. The original wrote a
        # hard-coded 90 here, which does not match scaler_wins.
        # NOTE(review): confirm nothing downstream filters on Scaler_win == 90.
        scaler_frames.append(
            scaler_df.assign(Site=site, Weather=weather_flag, Smoothing=smoothing_flag,
                             Scaler_win=scaler_wins[0])
        )

results = pd.concat(scaler_frames)
# Put the expected columns first (created empty if absent), then any extras
extra_columns = [col for col in results.columns if col not in result_columns]
results = results.reindex(columns=result_columns + extra_columns)

results.to_csv(scaling_rto_fil, sep='\t', index=False)


Baselines

In [None]:
importlib.reload(forecast)

# Test site and airport weather; only smoothed trap data is used for baselines
# Neural network predictions are always smoothed
weather_flags = ['site', 'airport']
smoothing_flags = [True]

sites = ['Arboleda', 'Playa', 'La_Margarita', 'Villodas']

# Leading columns of the output table, in this order
result_columns = ['Site', 'Weather', 'Smoothing', 'Scaler_win', 't_0', 'Scaler_rto']

# Per-site frames are collected and concatenated once at the end instead of
# growing a DataFrame inside the loop.
scaler_frames = []

for weather_flag, smoothing_flag in itertools.product(weather_flags, smoothing_flags):
    for site in sites:
        if smoothing_flag:
            nn_fil_name = '{}/{}_{}_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site,weather_flag)
        else:
            nn_fil_name = '{}/{}_{}_raw_weekly_predictions.csv'.format(raw_data_path,site,weather_flag)

        if 'la_margarita' in site.lower():
            #In MoLS the baseline is ~150weeks of trap data for La Margarita
            scaler_wins = [150]
        else:
            #And ~80weeks of trap data for Villodas, Arboleda, and Playa
            scaler_wins = [80]

        print(site, weather_flag, smoothing_flag)
        site_nn_data = gen_utils.load_csv(nn_fil_name)
        site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

        scaler_df = forecast.scaler_analysis(site_nn_data, scaler_wins)
        # Record the site-specific baseline window (80 or 150). The original
        # wrote a hard-coded 90 for every site, which matches neither.
        scaler_frames.append(
            scaler_df.assign(Site=site, Weather=weather_flag, Smoothing=smoothing_flag,
                             Scaler_win=scaler_wins[0])
        )

results = pd.concat(scaler_frames)
# Put the expected columns first (created empty if absent), then any extras
extra_columns = [col for col in results.columns if col not in result_columns]
results = results.reindex(columns=result_columns + extra_columns)

results.to_csv(baseline_scaling_rto_fil, sep='\t', index=False)

## Timing

In [None]:
importlib.reload(forecast)

# Single example forecast: one site, one scaling window, one forecast window
site = 'Arboleda'
scaler_win = 13
forecast_win = 52

nn_fil_name = '{}/{}_site_smoothed_weekly_predictions.csv'.format(smoothed_data_path,site)

site_nn_data = gen_utils.load_csv(nn_fil_name)
# Keep only the first scaler_win + forecast_win rows. The explicit copy makes
# the column assignment below act on an independent frame instead of a slice
# of the loaded data (which raises a SettingWithCopyWarning that the blanket
# warnings filter would hide).
site_nn_data = site_nn_data.iloc[:scaler_win + forecast_win, :].copy()
site_nn_data.Datetime = pd.to_datetime(site_nn_data.Datetime)

# NOTE(review): 0.68 is presumably the interval level for the lower/upper
# quantiles -- confirm against forecast.main_analysis.
l_quant, u_quant, ns, mean_p, scaled_nn_preds = forecast.main_analysis(scaler_win, forecast_win, 0.68, site_nn_data)

# Scaled predictions over the whole span; the quantile curves are aligned
# with the last len(l_quant) dates.
plt.figure()
plt.plot(site_nn_data.Datetime, scaled_nn_preds)
plt.plot(site_nn_data.Datetime.iloc[-len(l_quant):], l_quant)
plt.plot(site_nn_data.Datetime.iloc[-len(l_quant):], u_quant)
plt.show()