In [4]:
import math
import torch
import pyro
import pyro.distributions as dist
import pyro.poutine as poutine
from pyro.contrib.examples.bart import load_bart_od
from pyro.contrib.forecast import ForecastingModel, Forecaster, eval_crps
from pyro.infer.reparam import LinearHMMReparam, StableReparam, SymmetricStableReparam
from pyro.ops.tensor_utils import periodic_repeat
from pyro.ops.stats import quantile
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
assert pyro.__version__.startswith('1.9.1')
pyro.set_rng_seed(20200305)


In [8]:
att_df = pd.read_csv('../processed_data/attendance_w_team_perf.csv')

In [9]:
home_df = att_df.loc[(att_df['team_name'] == 'cardinals') & (att_df['location'] == 'home'),:]

In [11]:
home_df

Unnamed: 0,team,team_name,opponent_team,opponent_team_name,location,year,total,home,away,week,...,stadium,home_team,capacity,capacity_rate,Sb_winners,Sb_winners_prev_years,Playoff_prev_years,Bad_streak,Medium_streak,Good_streak
1,Arizona,cardinals,Dallas,cowboys,home,2000,893926,387475,506451,2,...,State Farm Stadium,Arizona Cardinals,63400,0.960475,False,False,False,True,False,False
3,Arizona,cardinals,Green Bay,packers,home,2000,893926,387475,506451,4,...,State Farm Stadium,Arizona Cardinals,63400,0.882996,False,False,False,True,False,False
5,Arizona,cardinals,Cleveland,browns,home,2000,893926,387475,506451,6,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False
6,Arizona,cardinals,Philadelphia,eagles,home,2000,893926,387475,506451,7,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False
8,Arizona,cardinals,New Orleans,saints,home,2000,893926,387475,506451,9,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10307,Arizona,cardinals,Atlanta,falcons,home,2019,1000509,490586,509923,6,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False
10310,Arizona,cardinals,San Francisco,49ers,home,2019,1000509,490586,509923,9,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False
10314,Arizona,cardinals,Los Angeles,rams,home,2019,1000509,490586,509923,13,...,State Farm Stadium,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False
10315,Arizona,cardinals,Pittsburgh,steelers,home,2019,1000509,490586,509923,14,...,State Farm Stadium,Arizona Cardinals,63400,0.992486,False,False,False,True,False,False


In [17]:
yearly_attendance_average_df = home_df.groupby(["home_team","year"])["weekly_attendance"].mean().reset_index().rename(columns={"weekly_attendance": "yearly_attendance_average"})
yearly_attendance_average_df["lagged_yearly_attendance_average"] = yearly_attendance_average_df.groupby("home_team")["yearly_attendance_average"].shift(1).fillna(0) # This fillna we can have 100% efficiency

home_df = home_df.merge(
    yearly_attendance_average_df[["home_team", "year", "lagged_yearly_attendance_average"]],
    on=["home_team", "year"],
    how="left"
).rename(columns={"lagged_yearly_attendance_average": "last_year_avg_attendance"})

In [18]:
home_df

Unnamed: 0,team,team_name,opponent_team,opponent_team_name,location,year,total,home,away,week,...,home_team,capacity,capacity_rate,Sb_winners,Sb_winners_prev_years,Playoff_prev_years,Bad_streak,Medium_streak,Good_streak,last_year_avg_attendance
0,Arizona,cardinals,Dallas,cowboys,home,2000,893926,387475,506451,2,...,Arizona Cardinals,63400,0.960475,False,False,False,True,False,False,0.000
1,Arizona,cardinals,Green Bay,packers,home,2000,893926,387475,506451,4,...,Arizona Cardinals,63400,0.882996,False,False,False,True,False,False,0.000
2,Arizona,cardinals,Cleveland,browns,home,2000,893926,387475,506451,6,...,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False,0.000
3,Arizona,cardinals,Philadelphia,eagles,home,2000,893926,387475,506451,7,...,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False,0.000
4,Arizona,cardinals,New Orleans,saints,home,2000,893926,387475,506451,9,...,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,Arizona,cardinals,Atlanta,falcons,home,2019,1000509,490586,509923,6,...,Arizona Cardinals,63400,1.000000,False,False,False,True,False,False,62013.875
156,Arizona,cardinals,San Francisco,49ers,home,2019,1000509,490586,509923,9,...,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False,62013.875
157,Arizona,cardinals,Los Angeles,rams,home,2019,1000509,490586,509923,13,...,Arizona Cardinals,63400,1.000000,False,False,False,False,True,False,62013.875
158,Arizona,cardinals,Pittsburgh,steelers,home,2019,1000509,490586,509923,14,...,Arizona Cardinals,63400,0.992486,False,False,False,True,False,False,62013.875


In [19]:
train_df = home_df.loc[(home_df['year'] > 2000) & (home_df['year'] < 2017)]

In [23]:
class Model1(ForecastingModel):
    def __init__(self, cov_dim, hidden_dim=3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.cov_dim = cov_dim

        # Define a neural net to generate transition matrices from covariates
        self.trans_net = torch.nn.Sequential(
            torch.nn.Linear(cov_dim, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, hidden_dim * hidden_dim)
        )

    def model(self, zero_data, covariates):
        duration = zero_data.size(-2)
        hidden_dim = self.hidden_dim
        last_year_avg_attendance = covariates[..., -1]
        other_covariates = covariates[..., :-1]

        # Mean function (could also depend on covariates)
        # prediction = torch.zeros(zero_data.shape).unsqueeze(-1)  # or a periodic_repeat()
        prediction = last_year_avg_attendance.unsqueeze(-1).expand_as(zero_data)

        # Priors
        init_dist = dist.Normal(0, 10).expand([hidden_dim]).to_event(1)
        trans_scale = pyro.sample("trans_scale", dist.LogNormal(-0.5 * math.log(24), 1))
        trans_dist = dist.Normal(0, trans_scale).expand([hidden_dim]).to_event(1)
        obs_scale = pyro.sample("obs_scale", dist.LogNormal(-2, 1))
        obs_dist = dist.Normal(0, obs_scale).expand([1]).to_event(1)

        # Use the NN to generate transition matrices per time step
        trans_logits = self.trans_net(other_covariates)  # [duration, hidden_dim * hidden_dim]
        trans_matrix_raw = trans_logits.view(duration, hidden_dim, hidden_dim)

        # Apply a nonlinearity to ensure stability (e.g. tanh scaled to keep values bounded)
        trans_matrix = 0.99 * torch.tanh(trans_matrix_raw)

        # Build GaussianHMM
        noise_dist = dist.GaussianHMM(
            initial_distribution=init_dist,
            transition_matrix=trans_matrix,
            transition_distribution=trans_dist,
            observation_matrix=torch.eye(hidden_dim, 1),  # obs_dim = 1
            observation_distribution=obs_dist,
            duration=duration
        )

        self.predict(noise_dist, prediction)


In [None]:
%%time
pyro.set_rng_seed(1)
pyro.clear_param_store()
covariates = train_df[['Sb_winners', 'Sb_winners_prev_years', 'Playoff_prev_years','Bad_streak', 'Medium_streak', 'last_year_avg_attendance']].astype(int)
covariates_tensor = torch.tensor(covariates.values, dtype=torch.float)

y = home_df.loc[train_df.index, 'weekly_attendance']
y_tensor = torch.tensor(y.values, dtype=torch.float).unsqueeze(-1)

forecaster = Forecaster(Model1(cov_dim=5, hidden_dim=3), y_tensor, covariates_tensor, learning_rate=0.1, num_steps=500)
for name, value in forecaster.guide.median().items():
    if value.numel() == 1:
        print("{} = {:0.4g}".format(name, value.item()))

torch.Size([128, 6])
128
128


TypeError: GaussianHMM.__init__() got an unexpected keyword argument 'initial_distribution'