# Global Model Testing (v3.1)

Casey A Graff

November 30th, 2017

This version of the notebook uses the updated model evaluation pipeline.

In [None]:
# === IMPORTS ===

REP_DIR = "/home/cagraff/Documents/dev/fire_prediction/"
SRC_DIR = REP_DIR + 'src/'
DATA_DIR = REP_DIR + 'data/'

# --- System imports
import os
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt
from collections import defaultdict
import tabulate
import xarray as xr

# --- Package imports
os.chdir(SRC_DIR)
import evaluation.metrics as metrics
import evaluation.evaluate_model as evm
import visualization.mapping as vmap
import visualization.stats as vstat

import helper.geometry as geo
import helper.loaders as load
import helper.date_util as du
import helper.weather as weather

from models import linear_regression as lr
from models import poisson_regression as pr
from models import poisson_regression_grid as prg
from models import quantile_regression as qr
from models import grid_predictor as gp
from models import bias_grid as bg
from models import bias_poisson_weather_grid as bpwg
from models import active_ignition_grid as aig


# Setup mpl
plt.rcParams['figure.figsize'] = [7,5]
%matplotlib inline

In [None]:
# === DATA LOADING ===

# Forecast horizons (days ahead) used throughout the notebook: 1 .. T_K_MAX.
T_K_MAX = 5
T_K_ARR = np.arange(1, T_K_MAX + 1)

# --- Data files
ignition_cube_src = os.path.join(
    DATA_DIR, 'interim/modis/fire_cube/fire_ignition_cube_modis_alaska_2007-2016.pkl')
detection_cube_src = os.path.join(
    DATA_DIR, 'interim/modis/fire_cube/fire_detection_cube_modis_alaska_2007-2016.pkl')
weather_proc_region_src = os.path.join(
    DATA_DIR, 'interim/gfs/weather_proc/weather_proc_gfs_4_alaska_2007-2016.pkl')

# One integrated fire/weather file per horizon; %d is filled with the horizon k.
integrated_cluster_df_fmt = 'interim/integrated/fire_weather/fire_weather_integrated_gfs_4_modis_5km_10days_1400_%dk_alaska_2007-2016.pkl'
integrated_cluster_df_src_list = [
    os.path.join(DATA_DIR, integrated_cluster_df_fmt % k) for k in T_K_ARR
]

# --- Load data
X_ignition_c, Y_detection_c = evm.setup_ignition_data(ignition_cube_src, detection_cube_src)
X_ignition_c.name = 'num_ig'
Y_detection_c.name = 'num_det'
weather_proc_region = load.load_pickle(weather_proc_region_src)

# X_active_df[i] holds the data loaded for horizon T_K_ARR[i] (i.e. k = i + 1).
X_active_df = [evm.setup_active_fire_data(f_src) for f_src in integrated_cluster_df_src_list]

In [None]:
# Non-grid baselines
def only_zero_model():
    """Return an ``aig.ActiveIgnitionGridModel`` built with ``None`` for both arguments.

    The evaluation loops in this notebook record its results under 'Only_Zero'.
    """
    return aig.ActiveIgnitionGridModel(None, None)

def no_ignition_model_poisson(covariates):
    """Build an ``aig.ActiveIgnitionGridModel`` with a Poisson component only.

    covariates -- forwarded unchanged to ``pr.PoissonRegressionModel``.

    The regression model is wrapped in ``gp.GridPredictorModel`` and passed as
    the first constructor argument; the second argument is ``None``.
    """
    regression = pr.PoissonRegressionModel(covariates)
    return aig.ActiveIgnitionGridModel(gp.GridPredictorModel(regression), None)

def bias_model_poisson(covariates):
    """Build an ``aig.ActiveIgnitionGridModel`` from a Poisson component and a bias grid.

    covariates -- forwarded unchanged to ``pr.PoissonRegressionModel``.

    First constructor argument: the regression model wrapped in
    ``gp.GridPredictorModel``. Second argument: a fresh ``bg.BiasGridModel()``.
    """
    regression = pr.PoissonRegressionModel(covariates)
    first_component = gp.GridPredictorModel(regression)
    return aig.ActiveIgnitionGridModel(first_component, bg.BiasGridModel())

def bias_weather_model_poisson(covariates):
    """Build an ``aig.ActiveIgnitionGridModel`` whose two components share ``covariates``.

    covariates -- forwarded unchanged to both ``pr.PoissonRegressionModel`` and
                  ``bpwg.BiasPoissonWeatherGridModel``.

    First constructor argument: the regression model wrapped in
    ``gp.GridPredictorModel``. Second argument: the bias/weather grid model.
    """
    regression = pr.PoissonRegressionModel(covariates)
    first_component = gp.GridPredictorModel(regression)
    second_component = bpwg.BiasPoissonWeatherGridModel(covariates)
    return aig.ActiveIgnitionGridModel(first_component, second_component)

In [None]:
# Grid models
def no_ignition_grid_model_poisson(covariates):
    """Build an ``aig.ActiveIgnitionGridModel`` around a grid Poisson regression.

    covariates -- forwarded unchanged to ``prg.PoissonRegressionGridModel``.

    The grid model is passed directly (no ``gp.GridPredictorModel`` wrapper) as
    the first constructor argument; the second argument is ``None``.
    """
    return aig.ActiveIgnitionGridModel(prg.PoissonRegressionGridModel(covariates), None)

In [None]:
def test_model(model_func):
    """Evaluate the zero baseline and ``model_func`` at every horizon in ``T_K_ARR``.

    model_func -- callable that takes a covariate-name list and returns a model
                  to hand to ``evm.evaluate_model``.

    Returns a ``defaultdict(list)`` keyed by 'Only_Zero', 'None', 'Temp/Humid'
    and 'All'. Each list holds one ``evm.evaluate_model`` result per horizon,
    in ``T_K_ARR`` order.
    """
    covariate_sets = (
        ('None', ()),
        ('Temp/Humid', ('temperature', 'humidity')),
        ('All', ('temperature', 'humidity', 'wind', 'rain')),
    )

    results_all = defaultdict(list)
    for t_k in T_K_ARR:
        # X_active_df is 0-indexed while horizons start at 1.
        # Test baselines
        baseline = evm.evaluate_model(
            only_zero_model(), X_active_df[t_k - 1], X_ignition_c, Y_detection_c, t_k)
        results_all['Only_Zero'].append(baseline)

        # Test model with different covariates
        print('T_k=%d' % t_k)
        for name, cov in covariate_sets:
            # A fresh list per call, so a model never shares covariates with another.
            model = model_func(list(cov))
            outcome = evm.evaluate_model(
                model, X_active_df[t_k - 1], X_ignition_c, Y_detection_c, t_k)
            results_all[name].append(outcome)

    return results_all

def flat(x):
    """Return the ``.flatten()`` of every element of ``x``, in order."""
    return [arr.flatten() for arr in x]

def plot_results(results):
    """Show one figure per metric with a line per entry of ``results``.

    results -- mapping from label to a list of results, one per value of
               ``T_K_ARR``. Each result is flattened with ``flat`` and
               unpacked into the metric function.

    Metrics drawn: ``metrics.mean_absolute_error`` and then
    ``metrics.root_mean_squared_error``. ``plt.show()`` is called after each.
    """
    for metric in (metrics.mean_absolute_error, metrics.root_mean_squared_error):
        for label in results:
            scores = [metric(*flat(res)) for res in results[label]]
            plt.plot(T_K_ARR, scores, "s--", label=label, linewidth=2)

        # Legend is anchored outside the axes, to the right.
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.xlabel("Day of forecast (k)")
        plt.xticks(T_K_ARR)
        plt.ylabel(metric.__name__)
        plt.show()
        
def plot_results_grid(results_list):
    """Draw a grid of subplots: one row per metric, one column per result set.

    results_list -- sequence of ``(results, title)`` pairs, where ``results``
                    maps a label to a list of per-horizon results.

    Each line is plotted against 1..len(series). A single legend is added
    after all subplots are drawn, so it attaches to the last active axes.
    """
    metric_fns = [metrics.mean_absolute_error, metrics.root_mean_squared_error]
    n_rows = len(metric_fns)
    n_cols = len(results_list)

    for col, (results, title) in enumerate(results_list):
        for row, metric in enumerate(metric_fns):
            # Subplot indices are 1-based and run row-major.
            ax = plt.subplot(n_rows, n_cols, (row * n_cols) + col + 1)
            ax.set_title(title)
            for label in results:
                series = results[label]
                scores = [metric(*flat(res)) for res in series]
                plt.plot(range(1, len(series) + 1), scores, "s--", label=label, linewidth=2)
            plt.xlabel("Day of forecast (k)")
            plt.xticks(T_K_ARR)
            plt.ylabel(metric.__name__)

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    
def print_results_table(results_list):
    """Print one table per metric for every ``(results, title)`` pair.

    results_list -- sequence of ``(results, title)`` pairs, where ``results``
                    maps a label to a list of per-horizon results.

    Each table row is the label followed by the metric value of every result,
    rounded to 5 decimals. Rows are rendered with ``tabulate.tabulate``.
    """
    metric_fns = (metrics.mean_absolute_error, metrics.root_mean_squared_error)
    for results, title in results_list:
        print(title)
        print('=====================')
        for metric in metric_fns:
            print(metric.__name__)
            rows = []
            for label in results:
                scores = [round(metric(*flat(res)), 5) for res in results[label]]
                rows.append([label] + scores)
            print(tabulate.tabulate(rows))
                

In [None]:
# Number of previous days that fill_missing_value() searches for a NaN-free
# slice before it falls back to the mean over the whole last axis.
fill_n_days = 5

def get_date_index(weather_data, target_datetime):
    """Return the index of the entry of ``weather_data.dates`` closest to ``target_datetime``.

    weather_data    -- object exposing ``dates``, an indexable sequence sorted
                       in ascending order (required by ``np.searchsorted``).
    target_datetime -- value that can be compared with and subtracted from the
                       entries of ``dates``; the difference must provide
                       ``total_seconds()``.

    When two neighbouring dates are equally close, the later one wins. A
    target after the last date maps to the last index; indexing one past the
    end previously raised ``IndexError`` in that case.
    """
    dates = weather_data.dates
    date_ind = np.searchsorted(dates, target_datetime, side='left')

    # searchsorted returns len(dates) when the target is after every entry.
    last_ind = len(dates) - 1
    if date_ind > last_ind:
        return max(last_ind, 0)

    # Check if left or right element is closer
    if date_ind != 0:
        dist_left = abs((dates[date_ind - 1] - target_datetime).total_seconds())
        dist_curr = abs((dates[date_ind] - target_datetime).total_seconds())

        if dist_left < dist_curr:
            date_ind = date_ind - 1

    return date_ind

def get_weather_variables(vals,weather_data, target_datetime, covariates):
    """Append, for every covariate, the slice nearest to ``target_datetime`` to ``vals``.

    vals            -- mapping from covariate name to a list; mutated in place.
    weather_data    -- indexable by covariate name, each item exposing
                       ``.values``; also passed to ``get_date_index``.
    target_datetime -- moment whose nearest date index is looked up.
    covariates      -- iterable of covariate names to extract.

    The slice taken is ``values[:, :, date_ind]``. If it contains any NaN it
    is replaced by the result of ``fill_missing_value``. Returns ``None``.
    """
    # Get date index
    date_ind = get_date_index(weather_data, target_datetime)

    for key in covariates:
        data = weather_data[key].values
        snapshot = data[:, :, date_ind]

        if np.isnan(snapshot).any():
            snapshot = fill_missing_value(data, date_ind)

        vals[key].append(snapshot)
    
def fill_missing_value(data, date_ind, n_days=None):
    """
    Return a replacement for the slice ``data[:, :, date_ind]``.

    Tries the closest previous day in the range [1, n_days] and returns the
    first slice that contains no NaN at all. The search stops early when it
    would step before index 0.

    If no NaN-free slice is found, returns the NaN-ignoring mean over the
    last axis (one value per position of the first two axes).

    data     -- 3-D array indexed as ``data[:, :, date_index]``.
    date_ind -- index along the last axis of the slice being replaced.
    n_days   -- how many previous days to try; ``None`` (the default) uses
                the module-level ``fill_n_days``.
    """
    if n_days is None:
        n_days = fill_n_days

    for day_offset in range(1, n_days + 1):
        new_date_ind = date_ind - day_offset

        # A negative index would wrap around to the end of the series.
        if new_date_ind < 0:
            break

        candidate = data[:, :, new_date_ind]
        if not np.isnan(candidate).any():
            return candidate

    return np.nanmean(data, axis=2)

In [None]:
def test_model2(model_func):
    """Evaluate ``model_func`` at every horizon in ``T_K_ARR`` using the weather regions.

    model_func -- callable that takes a covariate-name list and returns a model
                  to hand to ``evm.evaluate_model``.

    Differs from ``test_model`` in that ``X_ignition_weather_r[t_k]`` is passed
    where ``test_model`` passes ``X_ignition_c``, and no 'Only_Zero' baseline
    is recorded.

    Returns a ``defaultdict(list)`` keyed by 'None', 'Temp/Humid' and 'All'.

    NOTE(review): the cell that fills ``X_ignition_weather_r`` only builds
    horizons 1 and 2, so iterating all of ``T_K_ARR`` here would raise
    ``KeyError`` at the third horizon -- confirm before use.
    """
    covariate_sets = (
        ('None', ()),
        ('Temp/Humid', ('temperature', 'humidity')),
        ('All', ('temperature', 'humidity', 'wind', 'rain')),
    )

    results_all = defaultdict(list)
    for t_k in T_K_ARR:
        # Test model with different covariates
        print('T_k=%d' % t_k)
        for name, cov in covariate_sets:
            # A fresh list per call, so a model never shares covariates with another.
            model = model_func(list(cov))
            outcome = evm.evaluate_model(
                model, X_active_df[t_k - 1], X_ignition_weather_r[t_k], Y_detection_c, t_k)
            results_all[name].append(outcome)

    return results_all

In [None]:
def plot_predictions(result,title):
    """Scatter ``result[1]`` against ``result[0]`` on linear and log(x+1) scales.

    result -- indexable whose items 0 and 1 expose ``.flatten()``. Item 0 goes
              on the x axis (labelled 'Actual Counts'), item 1 on the y axis
              (labelled 'Predicted Counts').
    title  -- figure-level title.

    Prints the two flattened shapes. Each panel also gets a red y = x
    reference line.
    """
    fig = plt.figure()
    fig.suptitle(title)

    # Left panel: raw counts.
    left_ax = plt.subplot(121)
    actual = np.array(result[0].flatten())
    predicted = np.array(result[1].flatten())

    print('%s %s' % (actual.shape, predicted.shape))
    left_ax.plot(actual, predicted, 'o', markeredgecolor='none', markersize=2)
    plt.title('Predicted vs. Actual Counts')
    plt.xlabel('Actual Counts')
    plt.ylabel('Predicted Counts')
    diagonal = range(int(np.max(result[0])))
    plt.plot(diagonal, diagonal, 'r')

    # Right panel: log(x + 1) of both series.
    plt.subplot(122)
    log_actual = np.log(actual + 1)
    log_predicted = np.log(predicted + 1)
    plt.plot(log_actual, log_predicted, 'o', markeredgecolor='none', markersize=2)
    plt.title('Predicted vs. Actual Counts (log+1)')
    plt.xlabel('Actual Counts')
    plt.ylabel('Predicted Counts')
    log_upper = np.max(log_actual)
    plt.plot([0, log_upper], [0, log_upper], 'r')

In [None]:
# Build one weather.WeatherRegion per forecast horizon t_k and store it in
# X_ignition_weather_r[t_k]. Each region receives the four weather covariates
# sampled t_k days ahead, plus 'num_ig_target', 'num_det' and 'num_det_target'.
# NOTE(review): only horizons 1 and 2 are built (the T_K_ARR loop is commented
# out), while test_model2 indexes this dict for every value of T_K_ARR.
X_ignition_weather_r = {}
for t_k in [1,2]:#T_K_ARR:
    # covariate name -> list of slices, one appended per date below
    vals = defaultdict(list)
    for date in Y_detection_c.dates:
        # Hour of day used for the lookup.
        # NOTE(review): this rebinds the notebook-global name `time`, which
        # another cell calls as a function.
        time = 14
        date += du.INC_ONE_DAY * t_k # For row t, store weather(t+k)
        target_datetime = dt.datetime.combine(date, dt.time(time, 0, 0, tzinfo=du.TrulyLocalTzInfo(153, du.round_to_nearest_quarter_hour)))

        # Appends one slice per covariate to vals (mutated in place).
        get_weather_variables(vals, weather_proc_region, target_datetime, ['temperature','humidity','wind','rain'])

    X_ignition_weather_r[t_k] = weather.WeatherRegion('ignition')
    for k,v in vals.iteritems():
        # Stack the per-date slices, then move the stacked (date) axis from
        # position 0 to the last position.
        vals[k] = np.rollaxis(np.array(v), 0, 3)  
        cube = weather.WeatherCube(k, vals[k], None, dates=Y_detection_c.dates)
        X_ignition_weather_r[t_k].add_cube(cube)
        
    # Add ignitions (shifted)
    # Pad t_k zero slices at the end of axis 2 and drop the first t_k, so that
    # index t holds the original value at index t + t_k.
    ignitions = X_ignition_c.values
    shape = np.shape(ignitions)[:2]+(t_k,)
    ignitions = np.concatenate((ignitions, np.zeros(shape)), axis=2)
    ignitions = ignitions[:,:,t_k:]

    cube = weather.WeatherCube('num_ig_target', ignitions, None, dates=Y_detection_c.dates)
    X_ignition_weather_r[t_k].add_cube(cube)
    
    # Add detections
    det = Y_detection_c.values

    cube = weather.WeatherCube('num_det', det, None, dates=Y_detection_c.dates)
    X_ignition_weather_r[t_k].add_cube(cube)
    
    # Add detections (shifted)
    # Same zero-pad-and-drop shift as applied to the ignitions above.
    det_target = Y_detection_c.values
    shape = np.shape(det_target)[:2]+(t_k,)
    det_target = np.concatenate((det_target, np.zeros(shape)), axis=2)
    det_target = det_target[:,:,t_k:]

    cube = weather.WeatherCube('num_det_target', det_target, None, dates=Y_detection_c.dates)
    X_ignition_weather_r[t_k].add_cube(cube)
    
    # Progress marker: this horizon is finished.
    print 'T_k=%d' % t_k

In [None]:
def test_model3(model_func):
    """Evaluate the zero baseline and a grid model for horizons 1 and 2.

    model_func -- callable that takes a covariate-name list and returns a model
                  to hand to ``evm.evaluate_model_grid``.

    The baseline goes through ``evm.evaluate_model`` with ``X_active_df`` and
    ``X_ignition_c``. The models under test go through
    ``evm.evaluate_model_grid``, which receives ``X_ignition_weather_r[t_k]``
    as both its second and third argument.

    Returns a ``defaultdict(list)`` keyed by 'Only_Zero', 'None', 'Temp/Humid'
    and 'All', with one result per horizon.
    """
    covariate_sets = (
        ('None', ()),
        ('Temp/Humid', ('temperature', 'humidity')),
        ('All', ('temperature', 'humidity', 'wind', 'rain')),
    )

    results_all = defaultdict(list)
    for t_k in (1, 2):  # restricted to the first two horizons (was T_K_ARR)
        baseline = evm.evaluate_model(
            only_zero_model(), X_active_df[t_k - 1], X_ignition_c, Y_detection_c, t_k)
        results_all['Only_Zero'].append(baseline)

        # Test model with different covariates
        print('T_k=%d' % t_k)
        for name, cov in covariate_sets:
            # A fresh list per call, so a model never shares covariates with another.
            model = model_func(list(cov))
            outcome = evm.evaluate_model_grid(
                model, X_ignition_weather_r[t_k], X_ignition_weather_r[t_k], Y_detection_c, t_k)
            results_all[name].append(outcome)

    return results_all

In [None]:
from scipy.stats import gaussian_kde
def plot_kde(data,upper=100, data2=None):
    """Plot a Gaussian kernel-density estimate of ``data`` (and of ``data2`` if given).

    data  -- sample passed to ``gaussian_kde``; its curve is labelled '1'.
    upper -- densities are evaluated on ``np.arange(0., upper, .05)``.
    data2 -- optional second sample; its curve is labelled '2'.

    A legend is placed outside the axes, to the right.
    """
    samples = [('1', data)]
    if data2 is not None:
        samples.append(('2', data2))

    x = np.arange(0., upper, .05)
    for label, sample in samples:
        plt.plot(x, gaussian_kde(sample)(x), label=label)

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Reload the grid-regression, evaluation and cross-validation modules, then
# run the grid model without an ignition component through test_model3.
reload(prg)
reload(evm)
import evaluation.cross_validation as cv
reload(cv)
results_no_ig_grid = test_model3(no_ignition_grid_model_poisson)

In [None]:
# Plot pred vs actual
# Wider figure so the two side-by-side panels of plot_predictions fit.
plt.rcParams['figure.figsize'] = [15, 6]
# 'All' covariates, first entry of the result list (t_k=1 in test_model3).
plot_predictions(results_no_ig_grid['All'][0], 'Poisson Regression (k=1)')

In [None]:
# Same statements as the earlier reload cell: reload the modules and
# recompute results_no_ig_grid, overwriting the previous value.
reload(prg)
reload(evm)
import evaluation.cross_validation as cv
reload(cv)
results_no_ig_grid = test_model3(no_ignition_grid_model_poisson)

In [None]:
# Side-by-side metric curves: non-grid vs. grid Poisson model.
# NOTE(review): results_no_ig is assigned in a later cell of this file, so
# that cell has to be run before this one.
plt.rcParams['figure.figsize'] = [15, 10]
plot_results_grid([(results_no_ig, 'Poisson (No Ign.)'), (results_no_ig_grid, 'Poisson Grid (No Ign.)')])

In [None]:
# Fraction of values below 1 in element [1] of the first 'All' result
# (the element plot_predictions labels as predicted counts): grid model
# first, then the non-grid model. The trailing comma keeps both on one line.
print np.sum(results_no_ig_grid['All'][0][1].flatten()<1)/(1.*len(results_no_ig_grid['All'][0][1].flatten())),
print np.sum(results_no_ig['All'][0][1].flatten()<1)/(1.*len(results_no_ig['All'][0][1].flatten()))

# Mean of the same element for the grid and the non-grid model.
print np.mean(results_no_ig_grid['All'][0][1]), np.mean(results_no_ig['All'][0][1])

In [None]:
# Baselines
# Run the model built by no_ignition_model_poisson through test_model
# (every horizon in T_K_ARR, three covariate sets plus 'Only_Zero').
results_no_ig = test_model(no_ignition_model_poisson)
#results_ig_weather = test_model2(bias_weather_model_poisson)

In [None]:
# Kernel-density comparison of the two elements of the first 'All' result of
# the grid model (element 0 as `actual`, element 1 as `pred`).
actual = results_no_ig_grid['All'][0][0].flatten()
pred = results_no_ig_grid['All'][0][1].flatten()
ind = range(len(actual))
# Draw a random subsample without replacement. The size is capped at the
# population size because np.random.choice raises otherwise.
ind_samp = np.random.choice(ind, min(10000, len(ind)), replace=False)
# Use the subsample: the sample was previously drawn but the full index list
# was passed instead.
plot_kde(actual[ind_samp], 2, pred[ind_samp])

In [None]:
# Kernel-density comparison of the two elements of the first 'All' result of
# the non-grid model (element 0 as `actual`, element 1 as `pred`).
actual = results_no_ig['All'][0][0].flatten()
pred = results_no_ig['All'][0][1].flatten()
ind = range(len(actual))
# Draw a random subsample without replacement. The size is capped at the
# population size because np.random.choice raises otherwise.
ind_samp = np.random.choice(ind, min(10000, len(ind)), replace=False)
# Use the subsample: the sample was previously drawn but the full index list
# was passed instead.
plot_kde(actual[ind_samp], 2, pred[ind_samp])

In [None]:
# Plot comparison graph
# NOTE(review): both entries are the same results object with the same
# title, so the two columns will be identical -- confirm the intended
# second entry.
plot_results_grid([(results_no_ig, 'Poisson (No Ign.)'), (results_no_ig, 'Poisson (No Ign.)')])

In [None]:
# Plot pred vs actual
# Wider figure so the two side-by-side panels of plot_predictions fit.
plt.rcParams['figure.figsize'] = [15, 6]
# 'All' covariates, first entry of the result list (first value of T_K_ARR).
plot_predictions(results_no_ig['All'][0], 'Poisson Regression (k=1)')

In [None]:
# Tabulate MAE / RMSE per horizon for both result sets.
# NOTE(review): results_ig_bias is not assigned anywhere in the visible part
# of this notebook -- confirm where it is produced.
print_results_table([(results_no_ig, 'Poisson (No Ign.)'), (results_ig_bias, 'Poisson (Ign. Bias)')])

In [None]:
# Pull the raw detection array and its dates, and build coordinate vectors
# from the weather region's bounding box.
data = Y_detection_c.values
dates = Y_detection_c.dates
lat_min,lat_max,lon_min,lon_max = weather_proc_region.bounding_box.get()
# 0.5-step grids. The "+.5" on the stop value is presumably there so the
# upper bound itself is included -- TODO confirm against the grid resolution.
lats = np.arange(lat_min,lat_max+.5,.5)
lons = np.arange(lon_min,lon_max+.5,.5)
print data.shape, dates.shape

In [None]:
# pandas is imported in this cell because pd.to_datetime is used below, and
# the notebook's only other pandas import sits in a later cell.
import pandas as pd

# Combine detections and temperature into one xarray Dataset with dimensions
# (y, x, time). 'lat' and 'lon' are attached as coordinates on y and x.
# NOTE(review): weather_r is built in a later cell of this file, so that cell
# has to be run first.
ds = xr.Dataset({'num_det': (('y', 'x', 'time'), data), 'temp': (('y','x','time'), weather_r['temperature'].values)},
                {'time': pd.to_datetime(dates), 'lat': (['y'], lats), 'lon': (['x'], lons)})

In [None]:
# Display the Dataset as the cell output.
ds

In [None]:
# Plot 'temp' at index 0 of the last axis (declared as 'time' when ds was built).
ds['temp'][:,:,0].plot()

In [None]:
# Time how long it takes to select two years from ds and concatenate them
# along 'time'.
# timeit.default_timer is used under its own name because `time` was never
# imported as a function in this notebook and is rebound to an integer by the
# weather-building cells.
from timeit import default_timer as timer

start = timer()
a = ds.sel(time='2007')
b = ds.sel(time='2008')
c = xr.concat((a,b), dim='time')
print(timer() - start)

In [None]:
# Mean of the boolean "at least one detection" mask, grouped by calendar
# month of the time coordinate, converted to pandas and plotted.
# NOTE(review): `fires` receives the return value of .plot(), not the
# grouped data; which dimensions .mean() reduces over depends on the xarray
# version -- confirm.
fires = (ds['num_det']>0).groupby('time.month').mean().to_pandas().plot()

In [None]:
# Scratch cell: date handling and year-based selection on ds.
import pandas as pd
date_range = pd.date_range('2007-5-14', '2007-5-21', tz='UTC')
# NOTE(review): DatetimeIndex.to_datetime() is presumably unavailable in
# newer pandas releases -- confirm against the installed version.
print dates[0],date_range.to_datetime()[0]
# Shape of the membership mask for the years 2008 and 2012.
print np.in1d(ds['time.year'], [2008,2012]).shape
# Select every time step whose year is not 2007 (shown as the cell output).
ds.sel(time=ds.time.dt.year!=2007)

In [None]:
# Build weather_r: a weather.WeatherRegion holding the four weather
# covariates sampled one day after each date in Y_detection_c.dates.
# NOTE(review): this initial dict is never used; weather_r is rebound to a
# WeatherRegion below.
weather_r = {}
# covariate name -> list of slices, one appended per date below
vals = defaultdict(list)
for date in Y_detection_c.dates:
    # Hour of day used for the lookup.
    # NOTE(review): this rebinds the notebook-global name `time`, which
    # another cell calls as a function.
    time = 14
    date += du.INC_ONE_DAY # For row t, store weather(t+k); the shift here is a single day
    target_datetime = dt.datetime.combine(date, dt.time(time, 0, 0, tzinfo=du.TrulyLocalTzInfo(153, du.round_to_nearest_quarter_hour)))

    # Appends one slice per covariate to vals (mutated in place).
    get_weather_variables(vals, weather_proc_region, target_datetime, ['temperature','humidity','wind','rain'])

weather_r = weather.WeatherRegion('ignition')
for k,v in vals.iteritems():
    # Stack the per-date slices, then move the stacked (date) axis from
    # position 0 to the last position.
    vals[k] = np.rollaxis(np.array(v), 0, 3)  
    cube = weather.WeatherCube(k, vals[k], None, dates=Y_detection_c.dates)
    weather_r.add_cube(cube)