In [None]:
import calendar
import cPickle
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels as sm
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
# NOTE(review): hard-coded absolute path to one user's checkout. The relative
# 'data/...' and 'pics/...' paths used by later cells resolve against this
# working directory, and the project-local imports presumably do too.
os.chdir('/Users/zbutler/research/fire_prediction')
# Project-local regression helpers, referred to as `pr` throughout the notebook.
import prediction.poisson_regression as pr
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# First, load our dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
# Binary mode reads the pickle bytes untranslated on every platform.
with open('data/global_df.pkl', 'rb') as fpkl:
    global_df = cPickle.load(fpkl)
# Keep only days 134-242 of each year. The explicit copy lets later cells assign
# into summer_df without pandas raising SettingWithCopyWarning.
in_summer = (global_df.dayofyear >= 134) & (global_df.dayofyear <= 242)
summer_df = global_df[in_summer].copy()
summer_df[0:10]

In [None]:
# Convert into processing-friendly DataFrame
reload(pr)
autocorrs = [1,3,5,10]
print summer_df.loc[lambda x: (x.year==2010) & (x.dayofyear==134), 'n_det']
X, y, y_dates = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd', 'humidity', 'dayofyear', 'year'], normalize=[1,1,1,0,0], log_counts=True, autocorr_windows=autocorrs)
# Add our prediction target as a column for easier model prototyping
X.loc[:,'y'] = y
X1_train, y1_train, X1_test, y1_test = pr.train_test_split(X,y)
X2_train, y2_train, X2_test, y2_test = pr.train_test_split(X,y)
print X.loc[0:10]

In [None]:
reload(pr)
# Now run various versions of the model 
covars = ['t', 'v', 'h', 'tv', '+tv', 'th', '+th', 'vh', '+vh', 'tvh']
res = smf.glm('y ~ n_det_1', data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
y1_hat_base = res.predict(X1_test)
print pearsonr(y1_hat_base, y1_test)
#y_hat_base = X1_test.n_det_1
print "Method\tTrain LL\tMSE\tMedianSE\tMeanAbsErr\tRobustMSE"
print "Base\t%.2f\t" % (res.llf),
eval_metrics = ['MSE', 'MedianSE', 'MeanAbsErr', 'RobustMSE']
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y1_test, y_hat_base, metric=met),
print ""
for covar in covars:
    form = 'y ~ n_det_1 + '
    int_flag = 0
    for i,letter in enumerate(covar):
        if letter == "+":
            int_flag = 1
            continue
        if letter == 't':
            form += 'normtemp'
        elif letter == 'v':
            form += 'normvpd'
        elif letter == 'h':
            form += 'normhumidity'
        if i != len(covar) - 1:
            if int_flag:
                form += ' * '
            else:
                form += ' + '
    res = smf.glm(form, data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
    y1_hat = res.predict(X1_test)
    print pearsonr(y1_hat, y1_test)
    print "%s\t%.2f\t" % (covar, res.llf),
    for met in eval_metrics:
        print "%.2f\t" % pr.evaluate_glm(y1_test, y1_hat, metric=met),
    print ""


In [None]:
# Now, similar expt but with different autocorrs
y_hat_base = X1_test.n_det_1
print "Method\tTrain LL\tMSE\tMedianSE\tMeanAbsErr\tRobustMSE"
print "Base\tN/A\t",
eval_metrics = ['MSE', 'MedianSE', 'MeanAbsErr', 'RobustMSE']
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y1_test, y_hat_base, metric=met),
print ""
for cor in autocorrs:
    form = 'y ~ n_det_%d + normtemp + normvpd' % cor
    res = smf.glm(form, data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
    y1_hat = res.predict(X1_test)
    print "auto %d\t%.2f\t" % (cor, res.llf),
    for met in eval_metrics:
        print "%.2f\t" % pr.evaluate_glm(y1_test, y1_hat, metric=met),
    print ""
    #res = smf.glm(form, data=X2_train, family=sm.genmod.families.family.Poisson()).fit()
    #print "(2) Form: %s Train LL: %f Train BIC: %f " % (form, res.llf, res.bic)
    #y2_hat = res.predict(X2_test)
    #print "\tMSE: %f, Log MSE: %f" % (pr.evaluate_glm(y2_test, y2_hat), pr.evaluate_glm(y2_test, y2_hat, log=True))
#res = smf.glm("y ~ n_det_5 + normtemp + normvpd + normhumidity", data=X2_train, family=sm.genmod.families.family.Poisson()).fit()
#y2_hat = res.predict(X2_test)
#print np.mean((y2_test - y2_hat)**2)

In [None]:
from scipy.stats import pearsonr
res = smf.glm("y ~ n_det_3 + normtemp + normvpd + normhumidity", data=X2_train, family=sm.genmod.families.family.Poisson()).fit()
print res.params
y2_hat = res.predict(X2_test)
print np.mean((y2_test - y2_hat)**2)
res = smf.glm("y ~ n_det_5 + normtemp + normvpd + normhumidity", data=X2_train, family=sm.genmod.families.family.Poisson()).fit()
print res.params
y2_hat = res.predict(X2_test)
print np.mean((y2_test - y2_hat)**2)

In [None]:
# Assign the row labelled 27 to label 0 of summer_df, then print that row.
# NOTE(review): this mutates summer_df in place, so every later cell that reads
# summer_df sees the change. It looks like a leftover debugging step -- confirm
# it is intended before relying on downstream results.
summer_df.loc[0] = summer_df.loc[27]
print summer_df.loc[0]

In [None]:
# Split the regression frame into one DataFrame per year, keyed by year.
years = range(2010,2017)
annual_dfs = dict([(year, X[X.year==year]) for year in years])

In [None]:
params_dict = dict()
for year in years:
    glm = smf.glm('y ~ n_det_1 + normvpd', data=annual_dfs[year], family=sm.genmod.families.family.Poisson()).fit()
    params_dict[year] = glm.params
print glm.params

In [None]:
# Collect each year's fitted coefficients plus two summaries of the raw counts.
dets = [params_dict[year].n_det_1 for year in years]
vpds = [params_dict[year].normvpd for year in years]
intercepts = [params_dict[year].Intercept for year in years]
# Nothing is appended to these two in this cell; they stay empty.
temps = []
hums = []
mean_dets = [np.mean(summer_df[summer_df.year==year].n_det) for year in years]
nnzs = [np.sum(summer_df[summer_df.year==year].n_det > 0) for year in years]

# Figure 1: intercept per year.
plt.plot(years, intercepts)
plt.title('Intercepts by year')
plt.savefig('pics/intercepts_by_year.png')
plt.show()

# Figure 2: 3x2 grid; positions 322 and 324 are left without a plot.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12,10))
panels = [
    (321, dets, "# detections parameter by year"),
    (323, vpds, "vpd parameter by year"),
    (325, mean_dets, "mean detections by year"),
    (326, nnzs, "number of nonzeros per year"),
]
for position, series, panel_title in panels:
    plt.subplot(position)
    plt.plot(years, series)
    plt.title(panel_title)
plt.savefig("pics/other_params_by_year.png")
plt.show()

In [None]:
Xnl, ynl, ynl_dates = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd', 'humidity', 'dayofyear', 'year'], normalize=[1,1,1,0,0], log_counts=False, autocorr_windows=autocorrs)
for cor in autocorrs:
    X.loc[:,'n_det_%d_nl' %cor] = Xnl['n_det_%d' %cor]
# Add our prediction target as a column for easier model prototyping
X.loc[:,'y'] = y
X1_train, y1_train, X1_test, y1_test = pr.train_test_split(X,y)
X2_train, y2_train, X2_test, y2_test = pr.train_test_split(X,y)
print X.iloc[0:10]

In [None]:
res = smf.glm(form, data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
y1_hat = res.predict(X1_test)
print y1_hat.shape
y_hat_base = X1_test.n_det_1_nl
print "MSE base: " + str(pr.evaluate_glm(y1_test, y_hat_base))
for cor in autocorrs:
    form = 'y ~ n_det_%d + normtemp + normvpd + normhumidity' % cor
    res = smf.glm(form, data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
    print "(log) Form: %s Train LL: %f Train BIC: %f " % (form, res.llf, res.bic)
    y1_hat = res.predict(X1_test)
    print "\tMSE: %f, Log MSE: %f" % (pr.evaluate_glm(y1_test, y1_hat), pr.evaluate_glm(y1_test, y1_hat, log=True))
    
    form = 'y ~ n_det_%d_nl + normtemp + normvpd + normhumidity' % cor
    res = smf.glm(form, data=X1_train, family=sm.genmod.families.family.Poisson()).fit()
    print "(nolog) Form: %s Train LL: %f Train BIC: %f " % (form, res.llf, res.bic)
    y1_hat = res.predict(X1_test)
    print "\tMSE: %f, Log MSE: %f" % (pr.evaluate_glm(y1_test, y1_hat), pr.evaluate_glm(y1_test, y1_hat, log=True))

res = smf.glm("y ~ n_det_5 + normtemp + normvpd + normhumidity", data=X2_train, family=sm.genmod.families.family.Poisson()).fit()
y2_hat = res.predict(X2_test)
print np.mean((y2_test - y2_hat))**2

In [None]:
# Now, try binary model
reload(pr)
X, y, nz_X, nz_y, bin_y = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd', 'humidity', 'dayofyear', 'year'], normalize=[1,1,1,0,0], log_counts=True, autocorr_windows=autocorrs, ignore_nans=True, return_alt_ys=True)
res_nz = smf.glm("y ~ n_det_1 + normvpd", data=nz_X, family=sm.genmod.families.family.Poisson()).fit()
res_z = smf.glm("bin_y ~ n_det_1 + normvpd", data=X, family=sm.genmod.families.family.Binomial()).fit()
print res_nz.summary()
print res_z.summary()



In [None]:
print np.sum(bin_y)
print len(bin_y)

In [None]:
reload(pr)
Xun, yun, yun_dates = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd', 'humidity', 'dayofyear', 'year'], normalize=[2,2,2,0,0], log_counts=True, autocorr_windows=autocorrs)
print Xun.iloc[0:5]
# Add our prediction target as a column for easier model prototyping
Xun.loc[:,'humidity'] /= 100.
Xun_train, yun_train, Xun_test, yun_test = pr.train_test_split(Xun,yun)
# Now run various versions of the model 
covars = ['t', 'v', 'h', 'tv', 'th', 'vh', '+tv', '+th', '+vh', 'tvh']
for covar in covars:
    form = 'y ~ n_det_1 + '
    int_flag = 0
    for i,letter in enumerate(covar):
        if letter == "+":
            int_flag = 1
            continue
        if letter == 't':
            form += 'temp'
        elif letter == 'v':
            form += 'vpd'
        elif letter == 'h':
            form += 'humidity'
        if i != len(covar) - 1:
            if int_flag:
                form += ' * '
            else:
                form += ' + '
    res = smf.glm(form, data=Xun_train, family=sm.genmod.families.family.Poisson()).fit()
    print "(unnorm) Form: %s Train LL: %f Train BIC: %f " % (form, res.llf, res.bic)
    y1_hat = res.predict(Xun_test)
    print "\tMSE: %f, Log MSE: %f" % (pr.evaluate_glm(yun_test, y1_hat), pr.evaluate_glm(yun_test, y1_hat, log=True))
    
    form = 'y ~ n_det_1 + '
    int_flag = 0
    for i,letter in enumerate(covar):
        if letter == "+":
            int_flag = 1
            continue
        if letter == 't':
            form += 'normtemp'
        elif letter == 'v':
            form += 'normvpd'
        elif letter == 'h':
            form += 'normhumidity'
        if i != len(covar) - 1:
            if int_flag:
                form += ' * '
            else:
                form += ' + '
    res = smf.glm(form, data=Xun_train, family=sm.genmod.families.family.Poisson()).fit()
    print "(norm) Form: %s Train LL: %f Train BIC: %f " % (form, res.llf, res.bic)
    y1_hat = res.predict(Xun_test)
    print "\tMSE: %f, Log MSE: %f" % (pr.evaluate_glm(yun_test, y1_hat), pr.evaluate_glm(yun_test, y1_hat, log=True))

In [None]:
# Now look at where loss is coming from
res = smf.glm('y ~ n_det_1 + normtemp + normvpd + normhumidity', data=Xun_train, family=sm.genmod.families.family.Poisson()).fit()
y1_hat = res.predict(Xun_test)
plt.plot((y1_hat - yun_test)**2, 'r+')
plt.ylabel('squared error')
plt.show()
plt.plot(np.log(y1_hat+1), np.log(yun_test+1), 'b+')
plt.xlabel('fitted values')
plt.ylabel('actual values')
plt.show()
print pearsonr(np.log(y1_hat+1), np.log(yun_test+1))
print pearsonr(y1_hat, yun_test)
plt.plot((np.log(y1_hat+1) - np.log(yun_test+1))**2, 'r+')
plt.ylabel('squared log error')
plt.show()

In [None]:
# Detection counts in ascending order, followed by the rows with more than 1000 detections.
sorted_counts = np.sort(summer_df.n_det)
plt.plot(sorted_counts, 'b+')
plt.show()
summer_df[summer_df.n_det > 1000]

In [None]:
# Plot predictions against observations by day: train on every year except
# `testyear`, then predict `testyear`.
testyear = 2011
X_train = X[X.year != testyear]
y_train = X[X.year != testyear].y
X_test = X[X.year == testyear]
y_test = X[X.year == testyear].y
res = smf.glm('y ~ n_det_1 + normhumidity', data=X_train, family=sm.genmod.families.family.Poisson()).fit()
y_hat = res.predict(X_test)

# Red dashed = predicted, blue dashed = observed.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,8))
# Top panel: original scale.
ax1 = plt.subplot(211)
plt.plot(X_test.dayofyear, y_hat, 'r--')
plt.plot(X_test.dayofyear, y_test, 'b--')
plt.xlabel("Day of year")
plt.ylabel("Number of fires")
# Bottom panel: the same series as log(x + 1), labelled as such.
ax2 = plt.subplot(212, sharex=ax1)
plt.plot(X_test.dayofyear, np.log(y_hat+1), 'r--')
plt.plot(X_test.dayofyear, np.log(y_test+1), 'b--')
plt.xlabel("Day of year")
plt.ylabel("log(Number of fires + 1)")
plt.show()

In [None]:
# Throw in lightning! Only for the three years we have it though
with open("data/lightning/ground_lightning.pkl") as fpkl:
    ground_l = cPickle.load(fpkl)
print "Loaded %d ground lightning strikes" % len(ground_l)
ground_l.iloc[0:10]

In [None]:
# Aggregate per day and make a new training dataset for 2012-2015 with lightning included
from util.daymonth import monthday2day, increment_day
X_1215 = X[(X.year < 2016) & (X.year > 2011)]
X_1215['lightning'] = pd.Series(np.zeros(len(X_1215)), index=X_1215.index)
day = 1
month = 1
year = 2012
strikes_per_day = []
while year < 2016:
    n_strikes = len(ground_l[(ground_l.day == day) & (ground_l.month == month) & (ground_l.year == year)])
    dayofyear = monthday2day(month, day, leapyear=(year % 4))
    if len(X_1215[(X_1215.dayofyear == dayofyear) & (X_1215.year == year)]):
        X_1215.lightning[(X_1215.dayofyear == dayofyear) & (X_1215.year == year)] = n_strikes
    year, month, day = increment_day(year, month, day)
X_1215['loglightning'] = np.log(X_1215.lightning + 1)
print X_1215.iloc[0:10]

In [None]:
# Hold out 2013: train on the other years of X_1215, test on 2013.
testyear = 2013
held_out = X_1215.year == testyear
X_train = X_1215[~held_out]
y_train = X_train.y
X_test = X_1215[held_out]
y_test = X_test.y
lightning_formula = 'y ~ n_det_1 + normhumidity + loglightning'
res = smf.glm(lightning_formula, data=X_train, family=sm.genmod.families.family.Poisson()).fit()
res.summary()

In [None]:
res0 = smf.glm('y ~ n_det_1', data=X_train, family=sm.genmod.families.family.Poisson()).fit()
res1 = smf.glm('y ~ n_det_1 + normhumidity', data=X_train, family=sm.genmod.families.family.Poisson()).fit()
res2 = smf.glm('y ~ n_det_1 + normhumidity + lightning', data=X_train, family=sm.genmod.families.family.Poisson()).fit()
res3 = smf.glm('y ~ n_det_1 + normhumidity + loglightning', data=X_train, family=sm.genmod.families.family.Poisson()).fit()
y_hat0 = res0.predict(X_test)
y_hat1 = res1.predict(X_test)
y_hat2 = res2.predict(X_test)
y_hat3 = res3.predict(X_test)
print "Method\tTrain LL\tMSE\tMedianSE\tMeanAbsErr\tRobustMSE"
print "no covars\t%.2f\t" % (res0.llf),
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y_test, y_hat0, metric=met),
print ""
print "no lightning\t%.2f\t" % (res1.llf),
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y_test, y_hat1, metric=met),
print ""
print "lightning\t%.2f\t" % (res2.llf),
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y_test, y_hat2, metric=met),
print ""
print "log lightning\t%.2f\t" % (res3.llf),
for met in eval_metrics:
    print "%.2f\t" % pr.evaluate_glm(y_test, y_hat3, metric=met),
print ""

In [None]:
# Predicted (red dashed) vs observed (blue dashed) by day of year.
# Rows 1-2 use y_hat2, rows 3-4 use y_hat1; each pair is drawn on the original
# scale and then as log(x + 1). All panels share the first panel's x axis.
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(10,16))
panel_specs = [
    (411, y_hat2, False),
    (412, y_hat2, True),
    (413, y_hat1, False),
    (414, y_hat1, True),
]
panel_axes = []
for position, prediction, log_scale in panel_specs:
    if panel_axes:
        panel_axis = plt.subplot(position, sharex=panel_axes[0])
    else:
        panel_axis = plt.subplot(position)
    panel_axes.append(panel_axis)
    if log_scale:
        fitted, actual = np.log(prediction+1), np.log(y_test+1)
        # Label the transformed panels as such instead of as raw counts.
        y_label = "log(Number of fires + 1)"
    else:
        fitted, actual = prediction, y_test
        y_label = "Number of fires"
    plt.plot(X_test.dayofyear, fitted, 'r--')
    plt.plot(X_test.dayofyear, actual, 'b--')
    plt.xlabel("Day of year")
    plt.ylabel(y_label)
ax1, ax2, ax3, ax4 = panel_axes
plt.show()

In [None]:
from util.daymonth import day2monthday
print day2monthday(134)
print day2monthday(242)