In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import cPickle
import os
os.chdir('/Users/zbutler/research/fire_prediction')
from data import data
from prediction.cluster_regression import ClusterRegression
from prediction.poisson_regression import evaluate_glm
%matplotlib inline

In [None]:
# Load the cluster feature table (clust_thresh=5) and preview its first five rows.
clust_feat_df = data.load_clust_feat_df(clust_thresh=5)
clust_feat_df.head(5)

# Initial model

In [None]:
# Wrap the feature table in a ClusterRegression and preview the frame it holds.
# NOTE(review): the meaning of the positional arguments (5, 'unused', 10) is not
# visible in this notebook -- check prediction/cluster_regression.py.
cr = ClusterRegression(clust_feat_df, 5, 'unused', 10)
cr.clust_df.head(5)

In [None]:
def all_except_year(year, rng=(2007,2016)):
    """Return the years of an inclusive range with one year left out.

    Args:
        year: The year to exclude. If it is not inside the range, the full
            range is returned.
        rng: Indexable pair ``(first, last)``; both endpoints are included.

    Returns:
        A list of the integers ``rng[0]..rng[1]`` in ascending order, minus ``year``.
    """
    first, last = rng[0], rng[1]
    kept = []
    for candidate in range(first, last + 1):
        if candidate != year:
            kept.append(candidate)
    return kept
# Fit once on every year from 2007 through 2016 and show the stored fit summary.
# NOTE(review): the second positional argument is presumably n_autoreg (other
# cells pass it by keyword right after the years) -- confirm.
all_years = list(range(2007, 2017))
cr.fit(all_years, 1)
cr.fit_res.summary()

In [None]:
# Rain versus detection count: table size, Pearson correlation on rows that have
# a rain value, and a scatter plot of the raw columns.
# The mask is built with `~` rather than `True - mask`: boolean subtraction was
# deprecated in NumPy and raises TypeError in current releases.
has_rain = ~np.isnan(clust_feat_df.rain)
print("Len of feat df: " + str(len(clust_feat_df)))
print("corr: " + str(pearsonr(clust_feat_df.rain[has_rain], clust_feat_df.n_det[has_rain])))
plt.scatter(np.array(clust_feat_df.rain), np.array(clust_feat_df.n_det))
plt.xlabel('rain')
plt.ylabel('fire detections in this cluster')
plt.show()

In [None]:
# Leave-one-year-out comparison of five predictors, using the `cr` built in an
# earlier cell. For each held-out year the models are fit on the other years and
# scored on the held-out rows that have a temperature reading.
# Each *_res dict maps a metric name to a list with one score per held-out year
# (2007..2016, in order).
eval_metrics = ["MSE", "MedianSE"]
baseline_res = {met: [] for met in eval_metrics}
memory_1_res = {met: [] for met in eval_metrics}
memory_5_res = {met: [] for met in eval_metrics}
weather_1_res = {met: [] for met in eval_metrics}
weather_5_res = {met: [] for met in eval_metrics}

for year in range(2007, 2017):
    years = all_except_year(year)
    test_df = clust_feat_df[clust_feat_df.year==year]
    y = test_df.n_det
    # Only `temp` is checked for NaN.
    # NOTE(review): assumes the other weather columns are missing on the same rows -- confirm.
    non_nan_inds = np.logical_not(np.isnan(test_df.temp))
    non_nan_pos = np.where(non_nan_inds)[0]
    y_non_nan = y[non_nan_inds]
    # Baseline prediction is simply the autoreg_1 column.
    base_fit = test_df.autoreg_1
    memory_1 = cr.fit(years, n_autoreg=1, weather_vars=[])
    memory_5 = cr.fit(years, n_autoreg=5, weather_vars=[])
    weather_1 = cr.fit(years, n_autoreg=1, weather_vars=['temp','humidity','wind','rain'])
    weather_5 = cr.fit(years, n_autoreg=5, weather_vars=['temp','humidity','wind','rain'])

    # Predict once per year instead of once per metric.
    # Memory-model predictions are subset to the non-NaN positions; weather-model
    # predictions are used as returned.
    # NOTE(review): this presumes predict() on the weather models already skips
    # rows with missing weather (another cell assigns such predictions straight
    # into the non-NaN positions) -- confirm for the statsmodels version in use.
    scored = [
        (baseline_res, base_fit[non_nan_inds]),
        (memory_1_res, np.array(memory_1.predict(test_df))[non_nan_pos]),
        (memory_5_res, np.array(memory_5.predict(test_df))[non_nan_pos]),
        (weather_1_res, weather_1.predict(test_df)),
        (weather_5_res, weather_5.predict(test_df)),
    ]
    for met in eval_metrics:
        for res, pred in scored:
            res[met].append(evaluate_glm(y_non_nan, pred, metric=met))
    print("done with year %d" % year)

In [None]:
# Plot the MSE of each model for the last four held-out years (2013-2016).
years = np.arange(2013, 2017)
mse_series = [
    (baseline_res, 'bs', "Baseline"),
    (memory_1_res, 'rs', "Memory(1)"),
    (memory_5_res, 'ro', "Memory(5)"),
    (weather_1_res, 'gs', "Weather(1)"),
    (weather_5_res, 'go', "Weather(5)"),
]
for res, marker, label in mse_series:
    plt.plot(years, res["MSE"][-4:], marker, label=label)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("Year")
plt.ylabel("MSE")
# Turn off the offset notation on the x axis so the year ticks print as full values.
ax = plt.gca()
x_formatter = ax.get_xaxis().get_major_formatter()
x_formatter.set_useOffset(False)
plt.show()

# Debugging initial model

In [None]:
# Hold out one year, fit the one-lag weather model on the rest, and scatter the
# observed counts against the predictions for the held-out year.
myyear = 2015
years = all_except_year(myyear)
weather_1 = cr.fit(
    years,
    n_autoreg=1,
    weather_vars=['temp','humidity','wind','rain'],
)
test_df = clust_feat_df[clust_feat_df.year == myyear]
y = test_df.n_det
# Rows with a temperature reading; the scatter pairs these with the predictions.
non_nan_inds = ~np.isnan(test_df.temp)
# Positional version of the mask; not used in this cell.
stupid = np.where(non_nan_inds)[0]
y_hat = weather_1.predict(test_df)
plt.scatter(y[non_nan_inds], y_hat)
plt.xlabel("y")
plt.ylabel("yhat")
plt.show()

In [None]:
# Single-cluster trace: observed counts against the weather model and the
# memory-only model, ordered by day of year.
# NOTE(review): relies on `years` and `test_df` left behind by the previous cell.
cr = ClusterRegression(clust_feat_df, 5, 'unused', 10)
weather_1 = cr.fit(years, n_autoreg=1, weather_vars=['temp','humidity','wind','rain'])
memory_1 = cr.fit(years, n_autoreg=1, weather_vars=[])
clust = 890
# DataFrame.sort(col) is deprecated (and removed in later pandas); sort_values is
# the documented replacement and returns the same ascending ordering.
clust_df = test_df[test_df.cluster==clust].sort_values('dayofyear')
y_clust = clust_df.n_det
non_nan_days = np.logical_not(np.isnan(clust_df.temp))
stupid = np.where(non_nan_days)[0]
# Days without a temperature reading keep a weather prediction of zero; the
# model's predictions are written into the remaining positions.
y_hat_clust = np.zeros(len(y_clust))
y_hat_clust[stupid] = weather_1.predict(clust_df)
y_hat_mem = memory_1.predict(clust_df)
days = clust_df.dayofyear
plt.plot(days, y_clust, 'r--', label='y')
plt.plot(days, y_hat_clust, 'b--', label='yhat')
plt.plot(days, y_hat_mem, 'g--', label='memory')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
weather_1.summary()

In [None]:
# why are these models learning no autoregressive component?
# Scatter log(1 + n_det) against log(1 + autoreg_1), then print Pearson
# correlations of n_det with autoreg_1 and with each weather column.
log_n_det = np.log(clust_feat_df.n_det + 1)
log_autoreg = np.log(clust_feat_df.autoreg_1 + 1)
plt.scatter(log_n_det, log_autoreg)
plt.xlabel("Number of detections today")
plt.ylabel("Number of detections yesterday")
plt.show()

nans = np.isnan(clust_feat_df.temp)
has_temp = ~nans
print("autoreg correlation: " + str(pearsonr(clust_feat_df.n_det, clust_feat_df.autoreg_1)))
# The weather correlations only use rows where temp is present.
n_det_with_temp = clust_feat_df.n_det[has_temp]
for weather_var in ["temp", "humidity", "wind", "rain"]:
    weather_with_temp = clust_feat_df[weather_var][has_temp]
    print(weather_var + " correlation: " + str(pearsonr(n_det_with_temp, weather_with_temp)))

In [None]:
# Same autoreg correlation, computed on the frame held by the regression object.
reg_df = cr.clust_df
print(reg_df.head(5))
print("autoreg correlation: " + str(pearsonr(reg_df.n_det, reg_df.autoreg_1)))


In [None]:
# Check stats for every year: refit the memory-only model with each year held
# out in turn and print the fitted autoreg_1 coefficient.
for nonyear in range(2007, 2017):
    training_years = all_except_year(nonyear)
    ft = cr.fit(training_years, 1, weather_vars=[])
    autoreg_coef = ft.params["autoreg_1"]
    print("%d: %f" % (nonyear, autoreg_coef))



In [None]:
# Plot coefficients and pearson correlations for each feature and each year.
# First drop the autoreg_2 .. autoreg_10 columns from the shared feature table
# (in place), keeping only autoreg_1.
higher_lag_cols = ["autoreg_%d" % lag for lag in range(2, 11)]
for name in higher_lag_cols:
    if name in clust_feat_df.columns:
        del clust_feat_df[name]

my_cols = ["n_det", "autoreg_1", "temp", "humidity", "wind", "rain"]
width = .35       # the width of the bars
ind = np.arange(len(my_cols))
for year in range(2007, 2017):
    annual_df = clust_feat_df[clust_feat_df.year == year]
    non_nans = annual_df[~np.isnan(annual_df.rain)]
    # Blue bars: correlation of every column (n_det itself included) with n_det.
    corrs = [pearsonr(non_nans.n_det, non_nans[col])[0] for col in my_cols]
    fig, ax = plt.subplots(figsize=(13,6))
    plt.title(str(year))
    rects1 = ax.bar(ind, corrs, width, color='b')
    # Red bars: coefficients of a model fit on this year only; n_det has no
    # coefficient, so its slot is filled with 0.
    ft = cr.fit([year], 1)
    ft_dict = dict(ft.params)
    ft_dict['n_det'] = 0.
    params = [ft_dict[col] for col in my_cols]
    rects2 = ax.bar(ind + width, params, width, color='r')
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(my_cols)
    plt.show()

In [None]:
# staggered rain correlation
# For each delay t in 0..4, add a column rain_del_t holding the rain value of the
# same cluster t days earlier, then correlate each delayed column with n_det.
# Within a cluster only the first row of each dayofyear receives a value. A
# missing earlier day, or a NaN rain value on it, is recorded as 0.
rain_delays = list(range(0, 5))
delayed_rain = dict((t, dict()) for t in rain_delays)
for clust, clust_df in clust_feat_df.groupby('cluster', sort=False):
    first_per_day = clust_df.drop_duplicates(subset='dayofyear')
    # One lookup table per cluster replaces a frame scan per (day, delay).
    rain_by_day = dict(zip(first_per_day.dayofyear, first_per_day.rain))
    for name, day in zip(first_per_day.index, first_per_day.dayofyear):
        for t in rain_delays:
            rain_val = rain_by_day.get(day - t, 0.)
            if np.isnan(rain_val):
                rain_val = 0.
            delayed_rain[t][name] = rain_val
for t in rain_delays:
    clust_feat_df["rain_del_%d" % t] = pd.Series(delayed_rain[t])

print(clust_feat_df.iloc[0:10].rain_del_1)
# pearsonr returns (r, p-value); `corrs` keeps both for later inspection.
corrs = [pearsonr(clust_feat_df.n_det, clust_feat_df["rain_del_%d" % t]) for t in rain_delays]
# Plot only the correlation coefficient. Passing the tuples straight to plot()
# would draw the p-values as a second line under the same axis label.
plt.plot(rain_delays, [r for r, p_val in corrs], 'b--')
plt.xlabel("Rain delay")
plt.ylabel("Pearson correlation")
plt.show()

In [None]:
# Show the delayed-rain correlation results computed in the previous cell.
print(corrs)

In [None]:
# plot n_det and autoreg_1 for each cluster
# One figure per cluster, ordered by day of year. After each figure the user is
# prompted; an answer starting with 'q' stops the loop.
for clust in clust_feat_df.cluster.unique():
    # DataFrame.sort(col) is deprecated (and removed in later pandas); sort_values
    # is the documented replacement.
    clust_df = clust_feat_df[clust_feat_df.cluster==clust].sort_values('dayofyear')
    plt.plot(clust_df.dayofyear, clust_df.n_det, 'b--', label="n_det")
    plt.plot(clust_df.dayofyear, clust_df.autoreg_1, 'r--', label="autoreg_1")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
    boop = raw_input('Press enter to continue or q to quit: ')
    plt.close()
    if boop.startswith('q'):
        break

In [None]:
# Correlation between binary indicators: is n_det above the threshold, and is
# autoreg_1 above the same threshold. The threshold is the mean of n_det (the
# code takes the mean, not the median).
mean = np.mean(clust_feat_df.n_det)
over_under_n_det = clust_feat_df.n_det.gt(mean)
over_under_autoreg = clust_feat_df.autoreg_1.gt(mean)
binary_corr = pearsonr(over_under_n_det, over_under_autoreg)
print("binary corr: " + str(binary_corr))
print("mean: " + str(mean))

In [None]:
# Fit on all years with an empty third argument, print the coefficients, and plot
# the detection counts in ascending order.
all_years = list(range(2007, 2017))
ft = cr.fit(all_years, 1, [])
print(ft.params)
sorted_n_det = np.sort(clust_feat_df.n_det)
plt.plot(sorted_n_det)

In [None]:
# Refit the memory-only model on a copy of the features with n_det capped at 10.
clipped_feat_df = clust_feat_df.copy()
# Use .loc for the assignment. The chained form `df[mask].n_det = 10` writes to a
# temporary copy and leaves the frame unchanged.
clipped_feat_df.loc[clipped_feat_df.n_det > 10, 'n_det'] = 10
# NOTE(review): only n_det is clipped here. Any autoreg_* columns copied from
# clust_feat_df still hold unclipped values unless ClusterRegression rebuilds
# them -- confirm.
clipped_CR = ClusterRegression(clipped_feat_df,5,'unused',2)
clipped_ft = clipped_CR.fit(range(2007,2017),1,[])
print(clipped_ft.params)

In [None]:
# Fit the memory-only model on the global table, using the year as the cluster
# id, then plot the 2013 series of n_det, autoreg_1 and autoreg_2.
# Open the pickle in binary mode inside a context manager so the handle is closed.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The path is relative to the working directory set by os.chdir in the first cell.
with open("data/global_df.pkl", "rb") as pkl_file:
    global_df = cPickle.load(pkl_file)
global_df['cluster'] = global_df['year']
global_df['alt_cluster'] = np.nan
global_cr = ClusterRegression(global_df, 5, 'unused', 2)
global_ft = global_cr.fit(range(2007,2017),1,[])
print(global_ft.params)
annual_df = global_df[global_df.year==2013]
# NOTE(review): autoreg_2 is not created in this cell; presumably the pickle
# already has it or ClusterRegression adds it to global_df -- confirm.
plt.plot(annual_df.dayofyear, annual_df.n_det,'r--')
plt.plot(annual_df.dayofyear, annual_df.autoreg_1,'b--')
plt.plot(annual_df.dayofyear, annual_df.autoreg_2,'g--')
plt.show()

In [None]:
# Fit a Poisson GLM of n_det on autoreg_1 directly with statsmodels' formula
# API, after rebuilding the autoreg columns with the year as the cluster id.
from prediction.cluster_regression import add_autoreg
global_df['cluster'] = global_df['year']
global_df = add_autoreg(global_df, 2)
poisson_family = sm.genmod.families.family.Poisson()
glm_model = smf.glm("n_det ~ autoreg_1", data=global_df, family=poisson_family)
ft = glm_model.fit()
ft.params

In [None]:
# Poisson GLM of n_det on log(1 + autoreg_1) as the only regressor; no constant
# column is added to the design here.
log_autoreg = np.log(global_df.autoreg_1 + 1)
poisson_family = sm.genmod.families.family.Poisson()
stupid_glm = sm.GLM(global_df.n_det, log_autoreg, family=poisson_family).fit()
stupid_glm.params

In [None]:
# Reload the module so the latest source is used, rebuild the regression object,
# and refit on all years with 1 as the second fit argument.
import prediction.cluster_regression as CR
reload(CR)
cr = CR.ClusterRegression(clust_feat_df, 5, 'unused', 5)
all_years = list(range(2007, 2017))
ft = cr.fit(all_years, 1, [])
ft.params

In [None]:
# Same as the previous cell but with 5 as the second fit argument: reload the
# module, rebuild the regression object, and refit on all years.
import prediction.cluster_regression as CR
reload(CR)
cr = CR.ClusterRegression(clust_feat_df, 5, 'unused', 5)
all_years = list(range(2007, 2017))
ft = cr.fit(all_years, 5, [])
ft.params