# GFS Open Question Exploration (v2.0)
Casey A Graff

August 11th, 2017

**Now using re-fetched GFS data.**

In [None]:
# Absolute path of the repository checkout on the author's machine; this is
# hard-coded, so it must be edited before running the notebook elsewhere.
REP_DIR = "/home/cagraff/Documents/dev/fire_prediction/"
# Source and data directories, both derived from REP_DIR.
SRC_DIR = REP_DIR + 'src/'
DATA_DIR = REP_DIR + 'data/'

# Load system-wide packages
import os
import sys
import numpy as np
from tabulate import tabulate
import datetime as dt
import pytz
from matplotlib import pyplot as plt
%matplotlib inline

# Load project packages
os.chdir(SRC_DIR)
from features.loaders import load_gfs_weather
from helper import date_util as du
from visualization.mapping import make_map
from visualization.stats import calc_mean, calc_cor
from helper.geometry import latlonrange

In [None]:
# Load the pickled GFS weather container used by the rest of the notebook.
# The helper directory is appended to the import path before loading
# (presumably so the pickle can resolve helper modules -- TODO confirm).
sys.path.append(SRC_DIR + 'helper')
gfs_pkl = os.path.join(DATA_DIR, 'interim/gfs/weather/weather_gfs_alaska_2007-2016.pkl')
gfs = load_gfs_weather(gfs_pkl)

In [None]:
# Show the keys of gfs.cubes (the entries that can be looked up as gfs[<key>]
# in the cells below -- presumably one per weather variable).
print gfs.cubes.keys()

## Missing files
How many files are missing and is there a temporal pattern to the missing days?

### Number of missing files per year

In [None]:
years = range(2007, 2017)

missing = []
for year in years:
    sel = gfs['temperature'].filter_dates(du.DatetimeMeasurement(dt.datetime(year,1,1, tzinfo=pytz.UTC)), du.DatetimeMeasurement(dt.datetime(year, 12, 31, tzinfo=pytz.UTC)))
    
    files_missing = len([v for v in sel.values[0,0,:] if np.isnan(v)])
    files_present = sel.shape[2] - files_missing
        
    missing.append((year, files_present, files_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))

### Number of missing files per year (within fire season)

In [None]:
years = range(2007, 2017)
season = ((5,14), (8,31))

print 'Fire Season:', season[0], 'to', season[1], '\n'

# Missing per year
missing = []

for year in years:
    sel = gfs['temperature'].filter_dates(du.DatetimeMeasurement(dt.datetime(year, season[0][0], season[0][1], tzinfo=pytz.UTC)), du.DatetimeMeasurement(dt.datetime(year, season[1][0], season[1][1], tzinfo=pytz.UTC)))
    
    files_missing = len([v for v in sel.values[0,0,:] if np.isnan(v)])
    files_present = sel.shape[2] - files_missing
        
    missing.append((year, files_present, files_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))

### Number of missing days per month

In [None]:
years = range(2007, 2017)
months = range(1, 13)

MONTH_IND = 0
PRESENT_IND = 1
MISSING_IND = 2

missing = np.zeros((12,3), dtype=np.int32)
missing[:, MONTH_IND] = months
# TODO: Missing a few days        
for year in years:
    for month in months:
        month_num_days = du.days_per_month(month, du.is_leap_year(year))
        sel = gfs['temperature'].filter_dates(du.DatetimeMeasurement(dt.datetime(year, month, 1, tzinfo=pytz.UTC)), du.DatetimeMeasurement(dt.datetime(year, month, month_num_days, tzinfo=pytz.UTC)))
        total += sel.shape[2]
        files_missing = len([v for v in sel.values[0,0,:] if np.isnan(v)])
        files_present = sel.shape[2] - files_missing
        
        missing[month-1,PRESENT_IND] += files_present
        missing[month-1,MISSING_IND] += files_missing


# Missing total
present_total = np.sum(missing[:, PRESENT_IND])
missing_total = np.sum(missing[:, MISSING_IND])
missing = list(missing)
missing.append(['Total', present_total, missing_total])

print tabulate(missing, headers=['Month', 'Present', 'Missing'])


## Print Missing Files

In [None]:
years = range(2011, 2011)
grib_file_fmt = "gfsanl_4_%s%.2d%.2d_%.2d%.2d_%.3d.grb2"
# TODO: Update
missing_files = []
for year in years:
    all_dates = [d for d in du.daterange(dt.datetime(year,1,1, tzinfo=pytz.UTC), dt.datetime(year+1, 1, 1, tzinfo=pytz.UTC), increment=dt.timedelta(hours=6))]

    for date in all_dates:
        offsets_found = gfs['temperature'].get_attribute_for_date('offsets', date)
        
        gribs_found = [grib_file_fmt % (year, date.month, date.day, date.hour, date.minute, offset.seconds/3600) for offset in offsets_found]
        gribs_expected = [grib_file_fmt % (year, date.month, date.day, date.hour, date.minute, offset) for offset in (0, 3, 6)]

        missing_files += list(set(gribs_expected).difference(set(gribs_found)))

print len(missing_files), missing_files

## Spatial Correlation of Measurements

Do adjacent pixels have a high correlation between measurements? If there is sufficient variability, it may be useful to perform linear interpolation between neighboring cells when calculating the weather variables for a fire event.

In [None]:
# Selection shared by the plotting cells below: the variable to plot, the
# date to sample, and the index taken along the last axis of the per-date values.
data_types = ['total_precipitation','u_wind_component', 'v_wind_component', 'temperature', 'humidity']
DATA_TYPE = data_types[3]
DATE_SEL = dt.datetime(2009, 3, 5, 18, tzinfo=pytz.UTC)
OFFSET_SEL = 2

plt.rcParams['figure.figsize'] = [10,15]

# Scatter one marker per grid point returned by latlonrange (called with
# .5, .5) over a shaded-relief base map of the data's bounding box.
mp = make_map(gfs[DATA_TYPE].bounding_box)
mp.shadedrelief()

latlon = list(latlonrange(gfs[DATA_TYPE].bounding_box, .5, .5))
lats, lons = zip(*latlon)

_ = mp.scatter(lons, lats, 30, latlon=True, marker='o', color='b')
# Title typo fixed ("Meaurement" -> "Measurement").
_ = plt.title('GFS Measurement Points')

In [None]:
# NOTE(review): `time` is imported here but not used in this cell.
from time import time

# Filled-contour plot of DATA_TYPE for DATE_SEL, taking index OFFSET_SEL
# along the last axis of the values returned for that date.
cube = gfs[DATA_TYPE]
mp = make_map(cube.bounding_box)
mp.shadedrelief()

values_for_date = cube.get_values_for_date(DATE_SEL)
values = values_for_date[:, :, OFFSET_SEL]
lats, lons = cube.bounding_box.make_grid()

cs = mp.contourf(lons, lats, values, latlon=True, alpha=.6)
cbar = mp.colorbar(cs, location='bottom', pad="5%")

_ = plt.title('%s at %s' % (DATA_TYPE, DATE_SEL))

In [None]:
# Filled-contour plot of the per-cell mean of DATA_TYPE, as computed by
# calc_mean over the full values array.
cube = gfs[DATA_TYPE]
data = cube.values

mp = make_map(cube.bounding_box)
mp.shadedrelief()

values = calc_mean(data, data.shape)
lats, lons = cube.bounding_box.make_grid()

cs = mp.contourf(lons, lats, values, latlon=True, alpha=.6)
cbar = mp.colorbar(cs, location='bottom', pad="5%")
#cbar.set_label('Kelvin (degrees)')

_ = plt.title('Mean %s' % DATA_TYPE)

In [None]:
# Calculate correlation for left neighbor
cor = calc_cor(gfs[DATA_TYPE].values, gfs[DATA_TYPE].values.shape, (0, 0, 0), (1, 0, -1))

mp = make_map(gfs[DATA_TYPE].bounding_box)
mp.shadedrelief()

values = cor
lats, lons = gfs[DATA_TYPE].bounding_box.make_grid()

cs = mp.contourf(lons[:,1:], lats[:,1:] ,values[:,1:], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Left Correlation of %s' % DATA_TYPE)

print 'Min=%f, Max=%f, Mean=%f' % (np.min(values), np.max(values), np.mean(values))

In [None]:
# Calculate correlation for top neighbor
cor = calc_cor(gfs[DATA_TYPE].values, gfs[DATA_TYPE].values.shape, (1, 0, -1), (0, 0, 0))

mp = make_map(gfs[DATA_TYPE].bounding_box)
mp.shadedrelief()

values = cor
lats, lons = gfs[DATA_TYPE].bounding_box.make_grid()

cs = mp.contourf(lons[1:, :], lats[1:, :] ,values[1:, :], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Top Correlation of %s' % DATA_TYPE)

print 'Min=%f, Max=%f, Mean=%f' % (np.min(values), np.max(values), np.mean(values))

In [None]:
# Calculate correlation for bottom neighbor
cor = calc_cor(gfs[DATA_TYPE].values, gfs[DATA_TYPE].values.shape, (0, -1, 1), (0, 0, 0))

mp = make_map(gfs[DATA_TYPE].bounding_box)
mp.shadedrelief()

values = cor
lats, lons = gfs[DATA_TYPE].bounding_box.make_grid()

cs = mp.contourf(lons[:-1, :], lats[:-1, :] ,values[:-1, :], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Bottom Correlation of %s' % DATA_TYPE)

print 'Min=%f, Max=%f, Mean=%f' % (np.min(values), np.max(values), np.mean(values))

## Exploring +0, +3, +6 Offset Files

### Are the +3 and +6 offsets for instantaneous variables different than +0?

In [None]:
DATA_TYPE = 'humidity'
lat = 0
lon = 0
all_equals = []
for i in range(0, gfs.shape[2]-2, 3):
    equals_3_offset = gfs[DATA_TYPE].values[lat,lon,i] == gfs[DATA_TYPE].values[lat,lon,i+1]
    equals_6_offset = gfs[DATA_TYPE].values[lat,lon,i] == gfs[DATA_TYPE].values[lat,lon,i+2]
    equals_both = equals_3_offset and equals_6_offset
    one_is_nan = np.isnan(gfs[DATA_TYPE].values[lat,lon,i]) or np.isnan(gfs[DATA_TYPE].values[lat,lon,i+1]) or np.isnan(gfs[DATA_TYPE].values[lat,lon,i+2])
    if not one_is_nan:
        all_equals.append(equals_both)

print DATA_TYPE, all(all_equals), np.mean(all_equals)

**Instantaneous variables at +0, +3 and +6 are not always equal.**

### Are the +6 value and the +0 value from the next time stamp equal for instantaneous variables?

In [None]:
DATA_TYPE = 'humidity'
lat = 0
lon = 0
all_equals = []
for i in range(3, gfs.shape[2], 3):
    is_equal = gfs[DATA_TYPE].values[lat,lon,i] == gfs[DATA_TYPE].values[lat,lon,i-1]
    one_is_nan = np.isnan(gfs[DATA_TYPE].values[lat,lon,i]) or np.isnan(gfs[DATA_TYPE].values[lat,lon,i-1])
    if not one_is_nan:
        all_equals.append(is_equal)
        
    #print gfs.dates[i].get().hour, gfs.dates[i].get_offset(), gfs[DATA_TYPE].values[lat,lon,i], gfs.dates[i-1].get().hour,gfs.dates[i-1].get_offset(), gfs[DATA_TYPE].values[lat,lon,i-1]

print DATA_TYPE, all(all_equals), np.mean(all_equals)

**No, +6 from current time and +0 from next six hour interval are usually not equal.**

### Is +3 precipitation always less than or equal to +6?

In [None]:
DATA_TYPE = 'total_precipitation'
lat = 0
lon = 0
all_equals = []
for i in range(1, gfs.shape[2]-1, 3):
    is_lt_or_equal = gfs[DATA_TYPE].values[lat,lon,i] <= gfs[DATA_TYPE].values[lat,lon,i+1]
    one_is_nan = np.isnan(gfs[DATA_TYPE].values[lat,lon,i]) or np.isnan(gfs[DATA_TYPE].values[lat,lon,i+1])
    if not one_is_nan:
        all_equals.append(is_lt_or_equal)
        #if not is_lt_or_equal: print gfs.dates[i].get(), gfs.dates[i].get_offset(), gfs[DATA_TYPE].values[lat,lon,i], gfs.dates[i+1].get(),gfs.dates[i+1].get_offset(), gfs[DATA_TYPE].values[lat,lon,i+1]

print DATA_TYPE, all(all_equals), np.mean(all_equals)

**In the vast majority of cases +3 rain is less than or equal to +6, but not always.**

Perhaps the outliers are caused by measurement or model error.

## Daily Temperature Timings
Exploring the change in temperature within a day.

In [None]:
gfs_proc = load_gfs_weather(os.path.join(DATA_DIR, 'interim/gfs/weather_proc/weather_proc_gfs_alaska_2007-2016.pkl'))

In [None]:
date = dt.datetime(2010,7,22,0,0, tzinfo=pytz.UTC) # 2010, 7, 15; 2010 7 22
lat, lon = 61.2, -149.9 # anchorage
from features import fire_weather_integration as fwi

lat_ind, lon_ind = fwi.FireWeatherIntegration(None).get_latlon_index(gfs_proc, lat, lon)
day = fwi.FireWeatherIntegration(None).get_date_index(gfs_proc, date)

val1 = np.mean(gfs_proc['temperature'].values[lat_ind,lon_ind,day+0])
val2 = np.mean(gfs_proc['temperature'].values[lat_ind,lon_ind,day+1])
val3 = np.mean(gfs_proc['temperature'].values[lat_ind,lon_ind,day+2])
val4 = np.mean(gfs_proc['temperature'].values[lat_ind,lon_ind,day+3])
date1 = gfs_proc['temperature'].dates[day+0].astimezone(du.TrulyLocalTzInfo(lon, du.round_to_nearest_quarter_hour))
date2 = gfs_proc['temperature'].dates[day+1].astimezone(du.TrulyLocalTzInfo(lon, du.round_to_nearest_quarter_hour))
date3 = gfs_proc['temperature'].dates[day+2].astimezone(du.TrulyLocalTzInfo(lon, du.round_to_nearest_quarter_hour))
date4 = gfs_proc['temperature'].dates[day+3].astimezone(du.TrulyLocalTzInfo(lon, du.round_to_nearest_quarter_hour))

import pytz
print date1, val1
print date2, val2
print date3, val3
print date4, val4
