# GFS Open Question Exploration (v1.0)
Casey A Graff

August 11th, 2017

In [None]:
# Root of the repository checkout. The trailing slash is kept so that
# sub-directory names can be appended directly.
REP_DIR = "/home/cagraff/Documents/dev/fire_prediction/"

# Project source code and data directories, both located under REP_DIR
SRC_DIR = ''.join([REP_DIR, 'src/'])
DATA_DIR = ''.join([REP_DIR, 'data/'])

# Load system-wide packages
import os
import numpy as np
from tabulate import tabulate
from scipy.stats.stats import pearsonr
%matplotlib inline

# Load project packages
os.chdir(SRC_DIR)
from features.loaders import load_gfs_df
from features.helper.daymonth import monthday2day, day2monthday

In [None]:
# Load data
gfs = load_gfs_df(os.path.join(DATA_DIR, 'archived/weather/gfs_ak_dict.pkl'))
temp = gfs['temp']
days = gfs['days']
print gfs.keys()

## Missing days
How many days are missing and is there a temporal pattern to the missing days?

### Number of missing days per year

In [None]:
# Years covered by the analysis: 2007 through 2016 (the range end is exclusive)
years = range(2007, 2017)
num_years = len(years)

# Length of a non-leap year; the extra leap day is added separately where needed
days_per_year = 365

def is_leap_year(year):
    """Return True if `year` is a leap year in the Gregorian calendar.

    A year is a leap year when it is divisible by 4, except for century
    years, which must also be divisible by 400. So 2000 is a leap year,
    while 1900 and 2100 are not.

    The result is a bool, so it can also be used arithmetically
    (True == 1), e.g. ``365 + is_leap_year(year)``.
    """
    return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

# Missing per year
missing = []
for year in years:
    sel = [x for x in days if x[0]==year]
    
    days_present = len(sel)
    days_missing = (days_per_year + is_leap_year(year)) - days_present
    
    
    missing.append((year, days_present, days_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))

  

Clearly, some years have substantially more missing days than others. The clearest difference is that 2007 has more than double the missing days of any other year.

### Number of missing days per year (in fire season)

In [None]:
fire_season = (133,242)
print 'Fire Season:', day2monthday(133), 'to', day2monthday(242), '\n'
days_per_season = fire_season[1] - fire_season[0] + 1

# Missing per year
missing = []
for year in years:
    sel = [x for x in days if x[0]==year and (monthday2day(x[1], x[2], is_leap_year(x[0])) in range(fire_season[0], fire_season[1]+1))]
    
    days_present = len(sel)
    days_missing = days_per_season - days_present
    
    missing.append((year, days_present, days_missing))

# Missing total
missing.append(('Total', sum([x[1] for x in missing]), sum([x[2] for x in missing])))
print tabulate(missing, headers=['Year', 'Present', 'Missing'])

print '\nPercentage missing is {}%'.format(missing[-1][2]/(.01*missing[-1][1]))


Even when constrained to the fire season there is still a significant portion of the data missing (8%).

### Number of missing days per month

In [None]:
# Calendar months, numbered 1 through 12
months = range(1, 13)

# Column positions within the per-month summary table
MONTH_IND, PRESENT_IND, MISSING_IND = 0, 1, 2

def days_per_month(month, is_leap):
    """Return the number of days in a calendar month.

    month   -- month number, where 1 is January and 12 is December
    is_leap -- truthy if the month belongs to a leap year (February then
               has 29 days instead of 28)
    """
    february = 29 if is_leap else 28
    lengths = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    return lengths[month - 1]

missing = np.zeros((12,3), dtype=np.int32)
missing[:, MONTH_IND] = months

# Missing per month
for year in years:
    for month in months:
        sel = [x for x in days if x[0]==year and x[1]==month]

        days_present = len(sel)
        days_missing = days_per_month(month, is_leap_year(year)) - days_present


        missing[month-1,PRESENT_IND] += days_present
        missing[month-1,MISSING_IND] += days_missing
        



# Missing total
present_total = np.sum(missing[:, PRESENT_IND])
missing_total = np.sum(missing[:, MISSING_IND])
missing = list(missing)
missing.append(['Total', present_total, missing_total])

print tabulate(missing, headers=['Month', 'Present', 'Missing'])


There doesn't seem to be a significant difference in the number of missing days when grouped by month.

### List of Missing Days

In [None]:
def generate_month(year, month):
    """Return every date of the given month as (year, month, day) tuples.

    Days run from 1 up to the month's length, which accounts for leap
    years through is_leap_year(year).
    """
    last_day = days_per_month(month, is_leap_year(year))
    return [(year, month, day) for day in range(1, last_day + 1)]

missing_days = []

for year in years:
    for month in months:
        days_in_month = days_per_month(month, is_leap_year(year))
        sel = [x for x in days if x[0]==year and x[1]==month]
        if len(sel) < days_in_month:
            missing_days += set(generate_month(year, month)).difference(set(sel))
            
missing_days.sort()
print missing_days

## Spatial Correlation of Measurements

Do adjacent pixels have a high correlation between measurements? If there is sufficient variability, it may be useful to perform linear interpolation between neighboring cells when calculating the weather variables for a fire event.

In [None]:
def calc_cor(lat_off_tup, lon_off_tup):
    """Tabulate correlation statistics between grid cells and one neighbor.

    Each argument is a 3-tuple (start, stop_adjust, offset) for one grid axis:
    the loop index runs over range(start, axis_length + stop_adjust), and the
    cell at that index is compared with the cell at index + offset.
    lat_off_tup applies to the first axis and lon_off_tup to the second.
    Axis lengths are taken from the shape of the global `temp`.

    For each of 'temp', 'humidity', 'rain' and 'wind' in the global `gfs`,
    the Pearson correlation coefficient is computed for every cell/neighbor
    pair. Returns a string table (built with tabulate) holding the mean,
    standard deviation, minimum and maximum of those coefficients.
    """
    lat_start, lat_stop_adj, d_lat = lat_off_tup
    lon_start, lon_stop_adj, d_lon = lon_off_tup
    grid_shape = np.shape(temp)

    rows = []
    for name in ['temp', 'humidity', 'rain', 'wind']:
        field = gfs[name]
        # pearsonr returns (coefficient, p-value); only the coefficient is kept
        coefs = [
            pearsonr(field[i, j], field[i + d_lat, j + d_lon])[0]
            for i in range(lat_start, grid_shape[0] + lat_stop_adj)
            for j in range(lon_start, grid_shape[1] + lon_stop_adj)
        ]
        rows.append((name, np.mean(coefs), np.std(coefs), np.min(coefs), np.max(coefs)))

    return tabulate(rows, headers=['Data', 'Mean', 'Std Dev', 'Min', 'Max'])

# Calculate correlation for left/right neighbor
print 'Left/Right'
print calc_cor((0, 0, 0), (1, 0, -1))

# Calculate correlation for top/bottom neighbor
print '\nTop/Bottom'
print calc_cor((1, 0, -1), (0, 0, 0))

# Calculate correlation for up-left/down-right neighbor
print '\nUp-left/Down-right'
print calc_cor((1, 0, -1), (1, 0, -1))

# Calculate correlation for down-left/up-right neighbor
print '\nDown-left/Up-right'
print calc_cor((0, -1, 1), (1, 0, -1))

Interestingly, the left/right correlation for humidity/rain/wind is much higher than for any other comparison. Could this be due to west/east wind channels that smooth weather horizontally?