# GFS Open Question Exploration (v2.0)
Casey A Graff

August 11th, 2017

**Now using re-fetched gfs data.**

In [None]:
# Root of the local repository checkout; the source and data directories
# are derived from it below.
REP_DIR = "/home/cagraff/Documents/dev/fire_prediction/"
SRC_DIR = ''.join([REP_DIR, 'src/'])
DATA_DIR = ''.join([REP_DIR, 'data/'])

# Load system-wide packages
import os
import sys
import numpy as np
from tabulate import tabulate
from scipy.stats.stats import pearsonr
import datetime as dt
import pytz
from mpl_toolkits.basemap import Basemap
from matplotlib import pyplot as plt
%matplotlib inline

# Load project packages
os.chdir(SRC_DIR)
from features.loaders import load_gfs_weather
from features.helper.daymonth import monthday2day, day2monthday
from features.helper import date_util as du
from data.grib import latlonrange

In [None]:
# Load data
# Put the features package directory on the import path first
# (presumably needed by the loader to resolve its modules -- TODO confirm).
sys.path.append(SRC_DIR + 'features')
gfs_path = os.path.join(DATA_DIR, 'interim/gfs/weather/alaska_2007-2011.pkl')
gfs = load_gfs_weather(gfs_path)

In [None]:
# List the keys of the loaded `cubes` container (presumably the variable
# names, such as 'temperature', used to index `gfs` in the cells below).
print gfs.cubes.keys()

## Missing files
How many files are missing and is there a temporal pattern to the missing days?

### Number of missing files per year

In [None]:
years = range(2007, 2012)

missing = []
for year in years:
    all_dates = [d for d in du.daterange(dt.datetime(year,1,1, tzinfo=pytz.UTC), dt.datetime(year+1, 1, 1, tzinfo=pytz.UTC), increment=dt.timedelta(hours=6))]

    files_present = 0
    files_missing = 0
    for date in all_dates:
        vals = gfs['temperature'].get_attribute_for_date('offsets', date)
        
        files_present += len(vals)
        files_missing += 3-len(vals)
        
    missing.append((year, files_present, files_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))

### Number of missing files per year (within fire season)

In [None]:
years = range(2007, 2012)
season = ((5,14), (8,31))

print 'Fire Season:', season[0], 'to', season[1], '\n'

# Missing per year
missing = []
for year in years:
    all_dates = [d for d in du.daterange(dt.datetime(year, season[0][0], season[0][1], tzinfo=pytz.UTC),
                                         dt.datetime(year, season[1][0], season[1][1], tzinfo=pytz.UTC) + du.inc_one_day, increment=dt.timedelta(hours=6))]

    files_present = 0
    files_missing = 0
    for date in all_dates:
        vals = gfs['temperature'].get_attribute_for_date('offsets', date)
        
        files_present += len(vals)
        files_missing += 3-len(vals)
        
    missing.append((year, files_present, files_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))

### Number of missing files per month

In [None]:
years = range(2007, 2012)
months = range(1, 13)

MONTH_IND = 0
PRESENT_IND = 1
MISSING_IND = 2

missing = np.zeros((12,3), dtype=np.int32)
missing[:, MONTH_IND] = months
        
for year in years:
    for month in months:
        month_num_days = du.days_per_month(month, du.is_leap_year(year))
        all_dates = [d for d in du.daterange(dt.datetime(year, month, 1, tzinfo=pytz.UTC),
                                         dt.datetime(year, month, month_num_days, tzinfo=pytz.UTC) + du.inc_one_day, increment=dt.timedelta(hours=6))]

        files_present = 0
        files_missing = 0
        for date in all_dates:
            vals = gfs['temperature'].get_attribute_for_date('offsets', date)

            files_present += len(vals)
            files_missing += 3-len(vals)
        
        missing[month-1,PRESENT_IND] += files_present
        missing[month-1,MISSING_IND] += files_missing


# Missing total
present_total = np.sum(missing[:, PRESENT_IND])
missing_total = np.sum(missing[:, MISSING_IND])
missing = list(missing)
missing.append(['Total', present_total, missing_total])

print tabulate(missing, headers=['Month', 'Present', 'Missing'])


## Print Missing Files

In [None]:
years = range(2011, 2012)
grib_file_fmt = "gfsanl_4_%s%.2d%.2d_%.2d%.2d_%.3d.grb2"

missing_files = []
for year in years:
    all_dates = [d for d in du.daterange(dt.datetime(year,1,1, tzinfo=pytz.UTC), dt.datetime(year+1, 1, 1, tzinfo=pytz.UTC), increment=dt.timedelta(hours=6))]

    for date in all_dates:
        offsets_found = gfs['temperature'].get_attribute_for_date('offsets', date)
        
        gribs_found = [grib_file_fmt % (year, date.month, date.day, date.hour, date.minute, offset.seconds/3600) for offset in offsets_found]
        gribs_expected = [grib_file_fmt % (year, date.month, date.day, date.hour, date.minute, offset) for offset in (0, 3, 6)]

        missing_files += list(set(gribs_expected).difference(set(gribs_found)))

print len(missing_files), missing_files

## Spatial Correlation of Measurements

Do adjacent pixels have a high correlation between their measurements? If there is sufficient variability between neighbors, it may be useful to perform linear interpolation between neighboring cells when calculating the weather variables for a fire event.

In [None]:
# Default figure size used by the map plots in this section.
plt.rcParams['figure.figsize'] = [10,15]

def make_map():
    """Draw and return a Mercator basemap over the temperature cube's bounding box.

    Reads the bounding box of the global `gfs['temperature']`, prints the
    latitude and longitude extents, draws coastlines, and overlays two grids
    of parallels and meridians: a coarse one (step 2) with labels and a fine
    one (step 0.5) without.

    Returns the Basemap instance so callers can plot on top of it.
    """
    lat_min, lat_max, lon_min, lon_max = gfs['temperature'].bounding_box.get()

    print (lat_min, lat_max), (lon_min, lon_max)

    mp = Basemap(projection="merc",
                 llcrnrlat=lat_min, llcrnrlon=lon_min,
                 urcrnrlat=lat_max, urcrnrlon=lon_max,
                 resolution='i')

    mp.drawcoastlines()
    #mp.drawlsmask()

    unlabelled = [False, False, False, False]

    # (step, labels) pairs: the coarse lines carry labels, the fine ones do not.
    for step, labels in ((2, [False, True, False, False]), (.5, unlabelled)):
        mp.drawparallels(np.arange(lat_min, lat_max, step), labels=labels)

    for step, labels in ((2, [False, False, False, True]), (.5, unlabelled)):
        mp.drawmeridians(np.arange(lon_min, lon_max, step), labels=labels)

    return mp

mp = make_map()

# Enumerate the grid points inside the temperature cube's bounding box
# (presumably (lat, lon) pairs at a 0.5 step in each direction -- confirm
# against data.grib.latlonrange) and mark each one on the map.
latlon = list(latlonrange(gfs['temperature'].bounding_box, .5, .5))
lats, lons = zip(*latlon)
_ = mp.scatter(lons, lats, 30, latlon=True, marker='o', color='b')
# Title spelling corrected (was 'Meaurement').
_ = plt.title('GFS Measurement Points')

In [None]:
# Variable, time step and offset index selected for the plots below.
data_types = ['total_precipitation','u_wind_component', 'v_wind_component', 'temperature', 'humidity']
DATA_TYPE = data_types[0]
DATE_SEL = dt.datetime(2009, 3, 5, 18, tzinfo=pytz.UTC)
OFFSET_SEL = 2

plt.rcParams['figure.figsize'] = [10,15]

mp = make_map()
mp.shadedrelief()

# Slice out the selected offset (third axis) of the values stored for the date.
values = gfs[DATA_TYPE].get_values_for_date(DATE_SEL)[:, :, OFFSET_SEL]
num_lats, num_lons = values.shape[:2]
lons, lats = mp.makegrid(num_lons, num_lats)

# Override makegrid's latitudes with a fixed descending axis (71 down to 55
# in 0.5 steps), repeated once per longitude column.
lat_axis = np.arange(71, 54.5, -.5)
lats = np.tile(lat_axis, (lons.shape[1], 1)).T

cs = mp.contourf(lons, lats, values, latlon=True, alpha=.6)
cbar = mp.colorbar(cs, location='bottom', pad="5%")
#cbar.set_label('Kelvin (degrees)')

_ = plt.title('%s at %s' % (DATA_TYPE, DATE_SEL))

In [None]:
def get_mean(data_type):
    """Return the NaN-ignoring mean of a GFS variable at every grid cell.

    data_type -- key used to look the variable up in the global `gfs`.

    The result is a float array shaped like the first two axes of the
    variable's `values`. Each entry is the mean of all non-NaN entries
    stored at that cell.
    """
    cube = gfs[data_type].values
    grid_shape = cube.shape[:2]
    mean = np.zeros(grid_shape)

    for cell in np.ndindex(*grid_shape):
        samples = cube[cell]
        # Drop NaN samples before averaging.
        mean[cell] = samples[~np.isnan(samples)].mean()

    return mean


mp = make_map()
mp.shadedrelief()

values = get_mean(DATA_TYPE)
num_lats, num_lons = values.shape[:2]
lons, lats = mp.makegrid(num_lons, num_lats)

# Override makegrid's latitudes with a fixed descending axis (71 down to 55
# in 0.5 steps), repeated once per longitude column.
lat_axis = np.arange(71, 54.5, -.5)
lats = np.tile(lat_axis, (lons.shape[1], 1)).T

cs = mp.contourf(lons, lats, values, latlon=True, alpha=.6)
cbar = mp.colorbar(cs, location='bottom', pad="5%")
#cbar.set_label('Kelvin (degrees)')

_ = plt.title('Mean %s' % DATA_TYPE)

In [None]:
def calc_cor(lat_off_tup, lon_off_tup, data_type):
    """Correlate every grid cell's series with that of an offset neighbor.

    lat_off_tup -- (min_off, max_off, off) for the first (latitude) axis:
                   rows min_off .. size + max_off - 1 are visited and each is
                   compared with the row at index + off.
    lon_off_tup -- the same triple for the second (longitude) axis.
    data_type   -- key used to look the variable up in the global `gfs`.

    Returns an array shaped like the first two axes of the variable's
    `values`, holding the Pearson correlation coefficient for each visited
    cell. Cells outside the visited range are left at 0. A cell with fewer
    than two paired samples is set to NaN.
    """
    min_lat_off, max_lat_off, lat_off = lat_off_tup
    min_lon_off, max_lon_off, lon_off = lon_off_tup

    values = gfs[data_type].values
    shape = values.shape
    cor = np.zeros(shape[:2])

    for lat in range(min_lat_off, shape[0] + max_lat_off):
        for lon in range(min_lon_off, shape[1] + max_lon_off):
            v = values[lat, lon]
            v_off = values[lat+lat_off, lon+lon_off]

            # Keep only positions where BOTH series have data. Filtering each
            # series on its own NaNs would shift samples against each other
            # (or leave unequal lengths), so the correlation would no longer
            # pair matching observations.
            paired = np.logical_not(np.logical_or(np.isnan(v), np.isnan(v_off)))

            if np.count_nonzero(paired) < 2:
                # A correlation is undefined for fewer than two pairs.
                cor[lat, lon] = np.nan
                continue

            cor[lat, lon] = pearsonr(v[paired], v_off[paired])[0]

    return cor
                
# Calculate correlation for left neighbor
cor = calc_cor((0, 0, 0), (1, 0, -1), DATA_TYPE)


mp = make_map()
mp.shadedrelief()

values = cor
num_lats, num_lons = values.shape[0], values.shape[1]
lons, lats = mp.makegrid(num_lons, num_lats)
lats = np.transpose(np.tile(np.arange(71,54.5, -.5), (lons.shape[1],1) ))

cs = mp.contourf(lons[:,1:], lats[:,1:] ,values[:,1:], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Left Correlation of %s' % DATA_TYPE)

print 'Mean Correlation %f' % np.mean(values)

In [None]:
# Calculate correlation for top neighbor
cor = calc_cor((1, 0, -1), (0, 0, 0), DATA_TYPE)

mp = make_map()
mp.shadedrelief()

values = cor
num_lats, num_lons = values.shape[0], values.shape[1]
lons, lats = mp.makegrid(num_lons, num_lats)
lats = np.transpose(np.tile(np.arange(71,54.5, -.5), (lons.shape[1],1) ))

cs = mp.contourf(lons[1:, :], lats[1:, :] ,values[1:, :], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Top Correlation of %s' % DATA_TYPE)

print 'Mean Correlation %f' % np.mean(values)

In [None]:
# Calculate correlation for bottom neighbor
cor = calc_cor((0, -1, 1), (0, 0, 0), DATA_TYPE)

mp = make_map()
mp.shadedrelief()

values = cor
num_lats, num_lons = values.shape[0], values.shape[1]
lons, lats = mp.makegrid(num_lons, num_lats)
lats = np.transpose(np.tile(np.arange(71,54.5, -.5), (lons.shape[1],1) ))

cs = mp.contourf(lons[:-1, :], lats[:-1, :] ,values[:-1, :], latlon=True, alpha=.6)
cbar = mp.colorbar(cs,location='bottom',pad="5%")
cbar.set_label('Correlation (0 to 1)')

_ = plt.title('Bottom Correlation of %s' % DATA_TYPE)

print 'Mean Correlation %f' % np.mean(values)