In [None]:
# Notebook-wide dependencies and working-directory setup.
# NOTE(review): `sm` is bound to the bare `statsmodels` package here and is
# re-bound to `statsmodels.api` by a later cell; the cells that call
# `sm.add_constant` / `sm.GLM` presumably rely on that second import -- confirm.
import statsmodels as sm
import numpy as np
# cPickle is the Python 2 name of the C pickle implementation.
import cPickle
import matplotlib.pyplot as plt
import pandas as pd
import os
# Every relative path used below ('data/...', 'pics/...') is resolved against
# this hard-coded, machine-specific project root.
os.chdir('/Users/zbutler/research/fire_prediction')
# Project-local regression helpers, used by the later `pr.*` cells.
import prediction.poisson_regression as pr
%matplotlib inline

In [None]:
# First, load our dataset.
# The file is opened in binary mode: a pickle is a byte stream, and text mode
# can corrupt it on platforms that translate line endings.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# come from a trusted source.
with open('data/global_df.pkl', 'rb') as fpkl:
    global_df = cPickle.load(fpkl)
# Show the first ten rows as the cell output.
global_df[0:10]

In [None]:
# some basic plots: log(n_det + 1) against day of year for 2013,
# restricted to 150 < dayofyear < 200
in_2013 = global_df.year == 2013
in_window = (global_df.dayofyear > 150) & (global_df.dayofyear < 200)
annual_df = global_df[in_2013 & in_window]
plt.plot(annual_df.dayofyear, np.log(annual_df.n_det + 1), 'r.')
plt.title('number of detections per day')
plt.show()

In [None]:
# let's try making a prediction dataset. i guess this should be in carpentry/, i'll throw it there later
def create_dataset(df, normalize_feats=True):
    """Build a (features, target) pair that pairs each row with a later n_det.

    For every year in ``df.year`` the rows whose ``dayofyear`` equals that
    year's maximum are dropped from the features, and the rows whose
    ``dayofyear`` equals that year's minimum are dropped from the target, so
    that feature row ``i`` lines up with target entry ``i``.

    NOTE(review): the alignment is purely positional. It presumably expects
    each year to hold one row per day, sorted by ``dayofyear`` with no gaps --
    confirm against the code that builds ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``dayofyear`` and ``n_det`` columns. Every
        column of ``df`` is kept as a feature.
    normalize_feats : bool, optional
        When True, each feature column is shifted by its mean and divided by
        its standard deviation (pandas default, ``ddof=1``). A column whose
        standard deviation is exactly zero is only centred, so it becomes all
        zeros instead of all NaN.

    Returns
    -------
    X : pandas.DataFrame
        Feature rows, keeping the index of ``df``.
    y : numpy.ndarray
        Float array of the paired ``n_det`` values.
    """
    feature_frames = []
    # Seed with an empty float array so y is float even when df has no rows.
    target_chunks = [np.zeros((0))]
    for year in df.year.unique():
        annual_df = df[df.year == year]
        max_day = np.max(annual_df.dayofyear)
        min_day = np.min(annual_df.dayofyear)
        feature_frames.append(annual_df[annual_df.dayofyear != max_day])
        target_chunks.append(np.array(annual_df.n_det[annual_df.dayofyear != min_day]))

    # Concatenate once instead of re-copying the accumulated frame every year.
    X = pd.concat(feature_frames) if feature_frames else pd.DataFrame()
    y = np.concatenate(target_chunks)

    if normalize_feats:
        X_std = X.std()
        # Dividing by a zero std would fill the column with NaN, and a NaN in
        # any column makes a GLM fitted with missing='drop' discard the row.
        X_std = X_std.where(X_std != 0, 1.0)
        X = (X - X.mean()) / X_std
    return X, y

# Keep only the modelling columns, then build the feature/target pair.
feature_columns = ['dayofyear', 'humidity', 'n_clusters', 'n_det', 'temp', 'vpd', 'year']
feat_df = global_df.loc[:, feature_columns]
X, y = create_dataset(feat_df)
# Sanity check: the first feature row and the target paired with it.
# (Parenthesised single-argument print behaves the same under Python 2.)
print(X.iloc[0])
print(y[0])

In [None]:
# Fit a Poisson GLM of y on every column of X plus an added constant column.
# This import also re-binds `sm` to statsmodels.api for the rest of the notebook.
import statsmodels.api as sm
X_const = sm.add_constant(X)
poisson_family = sm.genmod.families.family.Poisson()
# missing='drop' is passed so rows with missing values are dropped by statsmodels.
glm = sm.GLM(y, X_const, family=poisson_family, missing='drop')
glm_res = glm.fit()

In [None]:
# Print the global_df column names found at these positions.
# NOTE(review): these are positional picks from global_df, not the columns of
# the design matrix given to the GLM -- confirm they are the intended ones.
column_positions = [2,4,5,6,7]
print(global_df.columns[column_positions])
# Display the fitted model's summary as the cell output.
glm_res.summary()


In [None]:
# Train/test split
def _stack_year_pairs(df, years, feat_cols):
    """Stack, over ``years``, features (last day dropped) with n_det (first day dropped)."""
    feature_frames = []
    # Seed with empty float arrays so the results are float even for no years.
    later_counts = [np.zeros((0))]
    same_row_counts = [np.zeros((0))]
    for year in years:
        annual_df = df[df.year == year]
        not_last_day = annual_df.dayofyear != np.max(annual_df.dayofyear)
        not_first_day = annual_df.dayofyear != np.min(annual_df.dayofyear)
        feature_frames.append(annual_df.loc[not_last_day, feat_cols])
        later_counts.append(np.array(annual_df.n_det[not_first_day]))
        same_row_counts.append(np.array(annual_df.n_det[not_last_day]))
    # Concatenate once instead of re-copying the accumulated frame every year.
    X = pd.concat(feature_frames) if feature_frames else pd.DataFrame()
    return X, np.concatenate(later_counts), np.concatenate(same_row_counts)


def train_test_split(df, years_in_test=1, normalize_feats=True, feat_cols=['dayofyear', 'n_det', 'vpd'], seed=None):
    """Split ``df`` by whole years into train and test design matrices.

    The distinct values of ``df.year`` are randomly permuted; the first
    ``years_in_test`` of them form the test set and the rest the training set.
    Within each year, feature rows exclude the year's maximum ``dayofyear``
    and targets exclude the year's minimum ``dayofyear``, so feature row ``i``
    is paired with target entry ``i``.

    NOTE(review): the pairing is positional and presumably expects one row per
    day, sorted by ``dayofyear`` -- confirm against the code that builds ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``dayofyear``, ``n_det`` and every ``feat_cols``
        column.
    years_in_test : int, optional
        Number of years held out for testing.
    normalize_feats : bool, optional
        When True, both feature matrices are standardised with the TRAINING
        mean and standard deviation. A training column whose standard
        deviation is exactly zero is only centred rather than turned into NaN.
    feat_cols : list of str, optional
        Feature columns to keep. The default list is never mutated.
    seed : int or None, optional
        When given, the year permutation is drawn from a private
        ``numpy.random.RandomState(seed)`` so the split is reproducible. When
        None (default) the global NumPy RNG is used, as before.

    Returns
    -------
    X_train, y_train, X_test, y_test, y_hat_base
        Design matrices with a ``const`` column added, the target arrays, and
        ``y_hat_base``: the ``n_det`` values of the test feature rows
        themselves, used by the notebook as a baseline prediction.
    """
    years = df.year.unique()
    if seed is None:
        perm = np.random.permutation(years)
    else:
        perm = np.random.RandomState(seed).permutation(years)
    test_years = perm[0:years_in_test]
    train_years = perm[years_in_test:]
    print("Train years: " + str(train_years))
    print("Test years: " + str(test_years))

    X_train, y_train, _ = _stack_year_pairs(df, train_years, feat_cols)
    X_test, y_test, y_hat_base = _stack_year_pairs(df, test_years, feat_cols)

    if normalize_feats:
        X_mean = X_train.mean()
        X_std = X_train.std()
        # Avoid 0/0 -> NaN for constant training columns.
        X_std = X_std.where(X_std != 0, 1.0)
        X_train = (X_train - X_mean) / X_std
        X_test = (X_test - X_mean) / X_std

    # has_constant='add' always adds the intercept column. The default ('skip')
    # omits it when some column is already constant, which can leave the train
    # and test matrices with different columns.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    return X_train, y_train, X_test, y_test, y_hat_base

In [None]:
# train_test_split picks its test years with np.random.permutation. Without
# reseeding, the two calls below draw different permutations, so the full model
# and the n_det-only model would be trained and tested on different years and
# their errors would not be comparable. Reseeding before each call gives both
# models the same split and makes the cell reproducible.
SPLIT_SEED = 0
np.random.seed(SPLIT_SEED)
X_train, y_train, X_test, y_test, y_hat_base = train_test_split(global_df)
np.random.seed(SPLIT_SEED)
X_train_2, y_train_2, X_test_2, y_test_2, y_hat_base_2 = train_test_split(global_df, normalize_feats=False, feat_cols=['n_det'])
print("Shapes: " + str([X_train.shape, y_train.shape, X_test.shape, y_test.shape]))

In [None]:
# Fit one Poisson GLM per split: the default-feature model first, then the
# n_det-only model, printing each summary.
poisson_cls = sm.genmod.families.family.Poisson

glm = sm.GLM(y_train, X_train, family=poisson_cls(), missing='drop')
glm_res = glm.fit()
print(glm_res.summary())

glm_2 = sm.GLM(y_train_2, X_train_2, family=poisson_cls(), missing='drop')
glm_res_2 = glm_2.fit()
print(glm_res_2.summary())

In [None]:
def evaluate_glm(y, y_hat, ignore_nans=True):
    """Return the mean squared error between ``y`` and ``y_hat``.

    Parameters
    ----------
    y : array-like
        Observed values.
    y_hat : array-like
        Predicted values, same length as ``y``.
    ignore_nans : bool, optional
        When True, positions where ``y_hat`` is NaN are removed from both
        inputs before averaging, and the number of removed positions is
        printed as ``skipped <n>``.

    Returns
    -------
    float
        ``mean((y - y_hat) ** 2)`` over the positions that were kept.
    """
    # Work on plain arrays so masking and subtraction are positional even when
    # a pandas Series is passed in.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    if ignore_nans:
        non_nans = np.logical_not(np.isnan(y_hat))
        # Count BEFORE filtering; counting afterwards always gives zero.
        n_skipped = len(y_hat) - np.sum(non_nans)
        y = y[non_nans]
        y_hat = y_hat[non_nans]
        print("skipped %d" % n_skipped)
    return np.mean((y - y_hat)**2)
# Predict with both fitted models, then report each mean squared error.
y_hat = glm_res.predict(X_test)
y_hat_train = glm_res.predict(X_train)
y_hat_2 = glm_res_2.predict(X_test_2)

# (label, observed, predicted) in the order the lines are reported.
mse_cases = [
    ("MSE training: ", y_train, y_hat_train),
    ("MSE model full: ", y_test, y_hat),
    ("MSE model auto: ", y_test_2, y_hat_2),
    ("MSE base: ", y_test, y_hat_base),
    ("MSE zeros: ", y_test, np.zeros(len(y_test))),
]
for label, observed, predicted in mse_cases:
    print(label + str(evaluate_glm(observed, predicted)))
# Baseline MSE again, computed directly without NaN filtering.
print(np.mean((y_test - y_hat_base)**2))

In [None]:
# Summarise the non-NaN test predictions, then scatter both the model's and the
# baseline's predictions against the observed test values.
finite_mask = np.logical_not(np.isnan(y_hat))
y_hat_nn = y_hat[finite_mask]
print("mean: " + str(np.mean(y_hat_nn)))
print("max: " + str(np.max(y_hat_nn)))
print("min: " + str(np.min(y_hat_nn)))
y_test_nn = y_test[finite_mask]

plt.scatter(y_test_nn, y_hat_nn)
plt.xlabel('y_test')
plt.ylabel('y_hat')
plt.show()
plt.close()

plt.scatter(y_test, y_hat_base)
plt.xlabel('y_test')
plt.ylabel('y_hat baseline')
plt.show()

In [None]:
# Now lets plot some covariates
# X_train only holds the feature columns chosen for the most recent split. The
# default split uses ['dayofyear', 'n_det', 'vpd'], so `temp` and `humidity`
# may be absent. A missing covariate is reported and skipped instead of
# raising AttributeError half-way through the cell.
for column, axis_label in [('temp', 'Temp'), ('humidity', 'Humidity'), ('vpd', 'VPD')]:
    if column not in X_train.columns:
        print("skipping %s: not a column of X_train" % column)
        continue
    plt.scatter(X_train[column], y_train)
    plt.xlabel(axis_label)
    plt.ylabel('y')
    plt.show()
    plt.close()

In [None]:
# The plot Jim wanted to see
# One five-panel figure per year: detections, non-zero detection days,
# temperature, humidity and VPD against day of year. The axes returned by
# plt.subplots are used directly (no second round of plt.subplot calls), all
# five panels share the x axis, and each figure is closed after it has been
# saved and shown so figures do not accumulate across the loop.
for year in range(2010, 2017):
    annual_fires = global_df[global_df.year == year]
    days = annual_fires.dayofyear
    fig, axes = plt.subplots(nrows=5, ncols=1, sharex=True, figsize=(12,10))

    axes[0].plot(days, annual_fires.n_det)
    axes[0].set_title('Number of detections')

    axes[1].plot(days, annual_fires.n_det != 0, 'rs')
    axes[1].set_title('Non-zero detection days')

    axes[2].plot(days, annual_fires.temp)
    axes[2].set_title('Temperature')

    axes[3].plot(days, annual_fires.humidity)
    axes[3].set_title('Humidity')

    axes[4].plot(days, annual_fires.vpd)
    axes[4].set_title('VPD')

    fig.tight_layout()
    # Save before showing: showing may release the figure.
    fig.savefig('pics/covar_subplot_%d.png' % year)
    plt.show()
    plt.close(fig)

In [None]:
# Print what the project helper returns for day-of-year 242.
from util.daymonth import day2monthday
print(day2monthday(242))

In [None]:
# Plot counts on top of each other for each year
# Each year gets its own line style from col_arr and a legend entry, so the
# overlaid curves can be told apart; the axes are labelled.
col_arr = ['r-', 'r--', 'b-', 'b--', 'y-', 'y--', 'k-', 'k--', 'g-']
for i,year in enumerate(range(2010, 2017)):
    annual_fires = global_df[global_df.year == year]
    plt.plot(annual_fires.dayofyear, annual_fires.n_det, col_arr[i], label=str(year))
plt.xlabel('day of year')
plt.ylabel('number of detections')
plt.legend()
plt.show()

In [None]:
# Total detections, and how many fall before day 134 or after day 242.
total_det = np.sum(global_df.n_det)
early_det = np.sum(global_df[global_df.dayofyear<134].n_det)
late_det = np.sum(global_df[global_df.dayofyear>242].n_det)
print("%d fires" % total_det)
print("%d before 134, %d after 242" % (early_det, late_det))

In [None]:
# Keep only days 134 through 242, then repeat the split / fit / evaluate cycle
# on un-normalised features and scatter each covariate against the target.
in_season = (global_df.dayofyear >= 134) & (global_df.dayofyear <= 242)
summer_df = global_df[in_season]
X_train, y_train, X_test, y_test, y_hat_base = train_test_split(
    summer_df,
    feat_cols=['n_det', 'temp', 'vpd', 'humidity'],
    normalize_feats=False
)
print("Shapes: " + str([X_train.shape, y_train.shape, X_test.shape, y_test.shape]))

glm = sm.GLM(y_train, X_train, family=sm.genmod.families.family.Poisson(), missing='drop')
glm_res = glm.fit()
y_hat = glm_res.predict(X_test)
print("MSE model full: " + str(evaluate_glm(y_test, y_hat)))
print("MSE base: " + str(evaluate_glm(y_test, y_hat_base)))
print("MSE zeros: " + str(evaluate_glm(y_test, np.zeros(len(y_test)))))

# One scatter per covariate, in the same order as before.
for covariate, axis_label in [('temp', 'Temp'), ('humidity', 'Humidity'), ('vpd', 'VPD')]:
    plt.scatter(getattr(X_train, covariate), y_train)
    plt.xlabel(axis_label)
    plt.ylabel('y')
    plt.show()
    plt.close()

In [None]:
# Correlation between the 2014 summer detection series and itself shifted by
# 0..n_delays-1 positions, first on raw counts and then on log(count + 1).
n_delays = 7
det_arr = np.array(summer_df[summer_df.year==2014].n_det)

series_variants = [
    (lambda counts: counts, 'Autoregressive coefs normal'),
    (lambda counts: np.log(counts+1), 'Autoregressive coefs log'),
]
for transform, plot_title in series_variants:
    auto_arr = np.zeros(n_delays)
    for delay in range(n_delays):
        leading = transform(det_arr[0:len(det_arr)-delay])
        lagging = transform(det_arr[delay:])
        auto_arr[delay] = np.corrcoef(np.array([leading, lagging]))[0,1]
    plt.plot(auto_arr, 'rs')
    plt.title(plot_title)
    plt.show()

In [None]:
# Build the regression inputs with the project helper (temp and vpd covariates,
# log counts, window of 5), then split them with the helper's own splitter.
X, y, y_dates = pr.get_regression_df(
    summer_df,
    covar_cols=['temp', 'vpd'],
    log_counts=True,
    autocorr_window=5
)
(X_train, y_train, y_dates_train,
 X_test, y_test, y_dates_test) = pr.train_test_split(X, y, y_dates)

In [None]:
# Fit a model on the training split with the project helper.
# NOTE(review): presumably a fitted statsmodels GLM result, given the
# .summary() / .predict() calls made on it -- confirm in pr.get_glm.
glm_res = pr.get_glm(X_train, y_train)
# Display the fitted model's summary as the cell output.
glm_res.summary()

In [None]:
# Score the model, the exp(n_det) baseline and an all-zeros predictor, first on
# every test row with log=True and then on the rows whose target is non-zero.
y_hat = glm_res.predict(X_test)
base_hat = np.exp(X_test.loc[:,'n_det'])
print("MSE model (log): " + str(pr.evaluate_glm(y_test, y_hat, log=True)))
print("MSE base (log): " + str(pr.evaluate_glm(y_test, base_hat, log=True)))
print("MSE zeros (log): " + str(pr.evaluate_glm(y_test, np.zeros(len(y_test)), log=True)))

nzs = y_test != 0
y_test_nz = y_test[nzs]
print("MSE model (nz): " + str(pr.evaluate_glm(y_test_nz, y_hat[nzs])))
print("MSE base (nz): " + str(pr.evaluate_glm(y_test_nz, base_hat[nzs])))
print("MSE zeros (nz): " + str(pr.evaluate_glm(y_test_nz, np.zeros(np.sum(nzs)))))

# Predicted against observed, both as log(value + 1).
plt.plot(np.log(y_test+1), np.log(y_hat+1), 'r+')
plt.xlabel('y_test')
plt.ylabel('y_hat')
plt.show()

plt.plot(y_test - y_hat, 'r+')
plt.title("residuals")
plt.show()

# Baseline feature against log(observed + 1).
plt.plot(np.log(y_test+1), X_test.loc[:,'n_det'], 'r+')
plt.xlabel('y_test')
plt.ylabel('y_base')
plt.show()

In [None]:
# Compare four model configurations on one shared train/test partition.
reload(pr)
X1, y1, y1_dates = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd'], normalize=[1,1], log_counts=True, autocorr_window=5)
X2, y2, y2_dates = pr.get_regression_df(summer_df, covar_cols=['vpd'], normalize=[1,1], log_counts=False, autocorr_window=5)
X3, y3, y3_dates = pr.get_regression_df(summer_df, covar_cols=[], normalize=[], log_counts=True, autocorr_window=1)
X4, y4, y4_dates = pr.get_regression_df(summer_df, covar_cols=['temp', 'vpd'], normalize=[1,1], log_counts=False, autocorr_window=1)
# Split model 1's OWN data (X1, y1). The previous code split the stale X, y
# left over from an earlier cell, so "model 1" was never fitted on the
# normalised X1 built above.
X1_train, y1_train, X1_test, y1_test = pr.train_test_split(X1,y1)
# Reuse model 1's row partition for the other three models.
idx = np.concatenate((X1_train.index, X1_test.index))
X2_train, y2_train, X2_test, y2_test = pr.train_test_split(X2,y2,idx=idx)
X3_train, y3_train, X3_test, y3_test = pr.train_test_split(X3,y3,idx=idx)
X4_train, y4_train, X4_test, y4_test = pr.train_test_split(X4,y4,idx=idx)
glm_res1 = pr.get_glm(X1_train, y1_train)
glm_res2 = pr.get_glm(X2_train, y2_train)
glm_res3 = pr.get_glm(X3_train, y3_train)
glm_res4 = pr.get_glm(X4_train, y4_train)
y1_hat = glm_res1.predict(X1_test)
y2_hat = glm_res2.predict(X2_test)
y3_hat = glm_res3.predict(X3_test)
y4_hat = glm_res4.predict(X4_test)
# NOTE(review): every model is scored against y1_test; this presumably relies
# on the targets being identical across the four configurations -- confirm.
print("MSE model1: " + str(pr.evaluate_glm(y1_test, y1_hat)))
print("MSE model2: " + str(pr.evaluate_glm(y1_test, y2_hat)))
print("MSE model3: " + str(pr.evaluate_glm(y1_test, y3_hat)))
print("MSE model4: " + str(pr.evaluate_glm(y1_test, y4_hat)))
print("MSE base (log): " + str(pr.evaluate_glm(y1_test, X3_test.loc[:,'n_det'])))
print("MSE base (not log): " + str(pr.evaluate_glm(y1_test, np.exp(X3_test.loc[:,'n_det']))))
print("MSE zeros: " + str(pr.evaluate_glm(y1_test, np.zeros(len(y1_test)))))

In [None]:
# Fit a Poisson GLM through the formula interface, regressing n_det on the
# normvpd column of X1, and display its summary.
import statsmodels.formula.api as smf
poisson_family = sm.genmod.families.family.Poisson()
glm = smf.glm('n_det ~ normvpd', data=X1, family=poisson_family)
res = glm.fit()
res.summary()


In [None]:
# Display the `params` attribute of the fit result from the formula-based GLM.
res.params

In [None]:
# Scratch cell: print each character of the string on its own line.
for bro in 'bro':
    print(bro)

In [None]:
# Scratch cell: the trailing comma on a Python 2 print statement suppresses the
# newline, so the second print continues on the same output line.
print "yolo %d" % 4,
print "swag"