# Poisson Regression Testing (v1.0)

Casey A Graff

November 3rd, 2017

Now working with the updated $X_{cluster}$, whose row format is $[t, c, y(t), \mathrm{weather}(t+k), y(t+k)]$.

In [None]:
# Absolute path of the fire_prediction checkout on the author's machine.
REP_DIR = "/home/cagraff/Documents/dev/fire_prediction/"
# Source and data trees sit directly beneath the repository root.
SRC_DIR, DATA_DIR = REP_DIR + 'src/', REP_DIR + 'data/'

# Load system-wide packages
import os
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import defaultdict
import datetime as dt
# Default size applied to every figure created after this point.
plt.rcParams['figure.figsize'] = [15,7]
# IPython magic (not plain Python): draw figures inline in the notebook.
%matplotlib inline

# Load project packages
# The working directory is switched to the source tree before the project
# imports below; presumably they resolve relative to the cwd -- confirm
# before reordering these lines.
os.chdir(SRC_DIR)
from features.loaders import load_integrated_df
import models.poisson_regression as pr
import models.linear_regression as lr
import models.evaluation as ev
from models import metrics
from helper import date_util as du

In [None]:
# Make the modules under src/features importable by bare name.
# NOTE(review): presumably required so the pickled frames can be unpickled -- confirm.
sys.path.append(SRC_DIR + 'features')

# Integrated fire/weather data sets; the two files differ only in the
# "1k" / "2k" part of the file name.
path_1k = os.path.join(DATA_DIR, 'interim/integrated/fire_weather/fire_weather_integrated_gfs_4_modis_5km_10days_1400_1k_alaska_2007-2016.pkl')
path_2k = os.path.join(DATA_DIR, 'interim/integrated/fire_weather/fire_weather_integrated_gfs_4_modis_5km_10days_1400_2k_alaska_2007-2016.pkl')

int_5km_10days_14_1k_df = load_integrated_df(path_1k)
int_5km_10days_14_2k_df = load_integrated_df(path_2k)

In [None]:
# Preview the leading five entries of the 1k data set (rows, assuming a pandas DataFrame).
int_5km_10days_14_1k_df[:5]

In [None]:
# Re-import the project model and evaluation modules so that edits made on
# disk since the first import are picked up without restarting the kernel.
for _module in (pr, lr, ev):
    reload(_module)

def train(Xs, t_k_arr, leave_one_out=True, years=None):
    """Evaluate every Poisson regression variant for each forecast offset.

    For each ``t_k`` in ``t_k_arr`` the data set ``Xs[t_k]`` is copied, its
    weather covariates are standardized, a ``year`` column is derived from
    ``date_local`` and the autoregressive column is added for offset
    ``t_k + 1``.  Each model variant is then evaluated on that frame.

    Args:
        Xs: Indexable collection of data frames, looked up by ``t_k``.
            Each frame must provide the columns ``temperature``,
            ``humidity``, ``wind``, ``rain`` and ``date_local``.
        t_k_arr: Iterable of forecast offsets ``t_k``.
        leave_one_out: If true, evaluate with ``ev.cross_validation_years``;
            otherwise with ``ev.leave_none_out``.
        years: Optional collection of years to keep.  ``None`` or an empty
            collection keeps every year.

    Returns:
        ``defaultdict(list)`` keyed by ``'baseline'``, ``'auto'``,
        ``'temp_humid'``, ``'all'``, ``'no_humid'`` and ``'temp'``.  Each
        list holds one entry per ``t_k``: for ``'baseline'`` the pair
        ``(num_det, num_det_prev)``, for the models the evaluation results
        concatenated along axis 1.
    """
    weather_covariates = ('temperature', 'humidity', 'wind', 'rain')

    # (results key, covariates used in addition to the autoregressive term).
    model_specs = (
        ('auto', ()),
        ('temp_humid', ('temperature', 'humidity')),
        ('all', ('temperature', 'humidity', 'wind', 'rain')),
        ('no_humid', ('temperature', 'wind', 'rain')),
        ('temp', ('temperature',)),
    )
    # Disabled experiments (lr.LinearRegressionModel instead of the Poisson model):
    #   ('auto_linear', ())
    #   ('linear_all', ('temperature', 'humidity', 'wind', 'rain'))

    evaluate = ev.cross_validation_years if leave_one_out else ev.leave_none_out
    filter_years = years is not None and len(years) > 0

    results_dict = defaultdict(list)
    for t_k in t_k_arr:
        # Choose X; copy so the caller's frame is left untouched.
        X = Xs[t_k].copy()

        # Standardize weather to zero mean and unit standard deviation.
        # (Dividing by the variance, as before, does not give unit scale.)
        for cov in weather_covariates:
            X[cov] = (X[cov] - np.mean(X[cov])) / np.std(X[cov])

        # A list (not map()) so the result is the same on Python 2 and 3.
        X = X.assign(year=[d.year for d in X.date_local])

        # Filter years
        if filter_years:
            X = X[X.year.isin(years)]

        print('Starting t_k=%d' % t_k)

        # Disabled: filter out predicting before fire started
        #   legit_series = pd.Series(index=X.index)
        #   for clust in X.cluster_id.unique():
        #       clust_df = X[X.cluster_id==clust]
        #       legit_day = np.min(clust_df.date_local) + du.INC_ONE_DAY * (t_k+1)
        #       legit_series[clust_df[clust_df.date_local>=legit_day].index]=1
        #   X_legit = X[legit_series==1]

        X_t = pr.PoissonRegressionModel(t_k, []).add_autoregressive_col(X, t_k + 1)

        results_dict['baseline'].append((X_t.num_det, X_t.num_det_prev))

        for key, covariates in model_specs:
            # Hand each model its own list, as the original literals did.
            prm = pr.PoissonRegressionModel(t_k=t_k, covariates=list(covariates))
            # The evaluator also returns years; bind them to a separate name
            # so the ``years`` argument is not overwritten for the next t_k.
            results, _evaluated_years = evaluate(prm, X_t)
            results_dict[key].append(np.concatenate(results, axis=1))

    return results_dict
        
def plot_training(results, t_k_arr, metric=metrics.mean_absolute_error):
    """Plot ``metric`` against forecast day for every result series.

    Args:
        results: Mapping as returned by ``train``; each value is a list with
            one argument tuple for ``metric`` per entry of ``t_k_arr``.
        t_k_arr: Array of forecast offsets; the x axis shows ``t_k_arr + 1``.
        metric: Callable applied as ``metric(*entry)``; its ``__name__``
            labels the y axis.
    """
    forecast_days = t_k_arr + 1

    # (results key, matplotlib format string, legend label), in draw order.
    series = (
        ('baseline', "kv--", "Baseline"),
        ('auto', "gs--", "Autoregression"),
        ('temp_humid', "r^--", "Temp/hum"),
        ('all', "bo--", "All weather"),
        ('no_humid', "co--", "All (except humid)"),
        ('temp', "yo--", "Temp Only"),
        # Disabled linear-model series:
        # ('auto_linear', "cs--", 'Linear Autoregression'),
        # ('linear_all', "yo--", 'Linear All'),
    )
    for key, fmt, label in series:
        scores = [metric(*entry) for entry in results[key]]
        plt.plot(forecast_days, scores, fmt, label=label, linewidth=2)

    plt.rcParams.update({'font.size': 14})
    # Anchor the legend just outside the upper-right corner of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.xlabel("Day of forecast (k)")
    plt.xticks(forecast_days)
    plt.ylabel(metric.__name__)
    
def print_summary(X, cov_list, t_k):
    """Cross-validate one Poisson model and print its error and fit summary.

    Args:
        X: Data frame providing the columns named in ``cov_list`` plus
            ``date_local``.  It is not modified; a copy is standardized.
        cov_list: Names of the covariate columns to standardize and to pass
            to the model.
        t_k: Forecast offset; the autoregressive column is added for
            ``t_k + 1``.

    Prints the mean absolute error over the concatenated cross-validation
    results, followed by ``prm.fit_result.summary()``.
    NOTE(review): ``fit_result`` is presumably the fit from the last
    cross-validation fold -- confirm against ``ev.cross_validation_years``.
    """
    # Copy first: assigning into X directly would standardize the caller's
    # frame in place, so a second call would re-standardize the same data.
    X = X.copy()

    # Standardize weather to zero mean and unit standard deviation.
    # (Dividing by the variance, as before, does not give unit scale.)
    for cov in cov_list:
        X[cov] = (X[cov] - np.mean(X[cov])) / np.std(X[cov])

    # A list (not map()) so the result is the same on Python 2 and 3.
    X = X.assign(year=[d.year for d in X.date_local])
    X_t = pr.PoissonRegressionModel(t_k, []).add_autoregressive_col(X, t_k + 1)

    prm = pr.PoissonRegressionModel(t_k=t_k, covariates=cov_list)
    # The evaluator also returns the years; they are not needed here.
    results, _evaluated_years = ev.cross_validation_years(prm, X_t)
    print('Mean Abs. Error: %f' % metrics.mean_absolute_error(*np.concatenate(results, axis=1)))
    print(prm.fit_result.summary())

## Comparing Clusterings on Ten Years of Data (out of sample)

In [None]:
# Forecast offsets to evaluate; only t_k = 0 here.
t_k_arr = np.arange(1)
# NOTE(review): train() looks frames up as Xs[t_k], so with t_k_arr == [0]
# only the 1k frame appears to be used -- confirm this is intended.
Xs = [int_5km_10days_14_1k_df, int_5km_10days_14_2k_df]

results = train(Xs, t_k_arr)

# One figure per metric: mean absolute error first, then RMSE.
plot_training(results, t_k_arr, metric=metrics.mean_absolute_error)
plt.show()
plot_training(results, t_k_arr, metric=metrics.root_mean_squared_error)