In [2]:
import pandas as pd
import numpy as np

## first generate the dataset necessary for this model

In [3]:
qb_df = pd.read_csv('./dataset/QB_stats_and_fantasy_pts.csv',index_col=0)
def_df = pd.read_csv('./def_stats_and_fantasy_pts.csv',index_col=0)
game_df = pd.read_csv('./dataset/all_game_data.csv',index_col=0)

In [7]:
def find_opp_team(game_eid, team_known):
    teams = game_df[game_df.game_eid==game_eid].filter(axis=1,regex='team')
    if teams['home_team'].iloc[0] == team_known:
        return teams['away_team'].iloc[0]
    else:
        return teams['home_team'].iloc[0]
def generate_temporal_qb_data_for_week(qb_player_id,week,season):
    qb_current_time_slice = qb_df[ (qb_df.player_id== qb_player_id) & (qb_df.week == week) & (qb_df.season==season)]
    if len(qb_current_time_slice) == 0:
        return pd.DataFrame()
    qb_intertemp_data = qb_current_time_slice.filter(axis=1,regex="^passing*|^fantasy*").copy()
    curr_game = qb_current_time_slice['game_eid'].iloc[0]
    curr_team = qb_current_time_slice['team'].iloc[0]
    opp_team = find_opp_team(curr_game,curr_team)
    opp_sums = def_df[(def_df.game_eid == curr_game) & (def_df.team == opp_team) ].filter(axis=1,items=["defense_int","defense_sk","defense_tkl"]).sum()
    
    for col in opp_sums.index:
        qb_intertemp_data[col] = opp_sums[col]
    return qb_intertemp_data
    ##player ids to test:
    # 00-0035251 	
    # 00-0035146
    # 00-0022924
def query_data_for_season(season):
    all_uq_qbs = qb_df[qb_df.season==season]['player_id'].unique()
    acc = pd.DataFrame()
    for week in range(1,18):
        for uid in all_uq_qbs:
            temp = generate_temporal_qb_data_for_week(uid,week,season)
            temp['week'] = week
            temp['player_id'] = uid
            acc = pd.concat([acc,temp])
    return acc

### Lets create a dataset for one season:

In [8]:
season_qb_data  = query_data_for_season(2009)


In [9]:
int_model_data = season_qb_data[['passing_cmp','defense_int','defense_sk','passing_ints','week']]
#discretize passing_cmp
int_model_data['passing_cmp'] = pd.cut(int_model_data['passing_cmp'],5,labels=['almost no completions','poor completions','median completions','above average completions','fantastic completions'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  int_model_data['passing_cmp'] = pd.cut(int_model_data['passing_cmp'],5,labels=['almost no completions','poor completions','median completions','above average completions','fantastic completions'])


In [114]:
inital_state_data = int_model_data[int_model_data['week']==1]
temporal_state_data =  int_model_data[int_model_data['week']==2]
inital_state_data.columns = [x+'_0'  for x in inital_state_data.columns]
temporal_state_data.columns = [x+'_1'  for x in temporal_state_data.columns]

In [134]:
inital_state_data

Unnamed: 0,passing_cmp_0,defense_int_0,defense_sk_0,passing_ints_0,week_0
0,fantastic completions,0.0,3.0,2.0,1
21316,median completions,1.0,1.0,1.0,1
21716,median completions,0.0,2.0,0.0,1
51645,median completions,1.0,4.0,1.0,1
51661,almost no completions,1.0,4.0,0.0,1
51761,above average completions,0.0,1.0,1.0,1
75786,poor completions,0.0,3.0,0.0,1
75811,almost no completions,0.0,3.0,0.0,1
75932,almost no completions,5.0,5.0,4.0,1
76221,almost no completions,5.0,5.0,1.0,1


### Using Pomegranate for a Dynamic bayesian Network

With Pomegranate, you have to specify the specific edges first, then the data, and then watch the magic happen.

In [135]:
import pomegranate as pom

In [136]:
def get_mle_col(col):
    uq_vals = col.unique()
    acc = []
    for uq in uq_vals:
        acc.append(col[col==uq].shape[0]/col.shape[0])
    return acc


In [158]:
model = pom.BayesianNetwork('2dbn')


for i in inital_state_data.columns:
    mle_init = pom.DiscreteDistribution.from_samples(inital_state_data[i].values)#get_mle_col(inital_state_data[i])
    node = pom.Node(mle_init,name=i)
    model.add_state(node)
for i in temporal_state_data.columns:
    mle_temp = pom.DiscreteDistribution.from_samples(temporal_state_data[i].values)
    node = pom.Node(mle_temp,name=i)
    model.add_state(node) 
[n.name for n in model.states]

['passing_cmp_0',
 'defense_int_0',
 'defense_sk_0',
 'passing_ints_0',
 'week_0',
 'passing_cmp_1',
 'defense_int_1',
 'defense_sk_1',
 'passing_ints_1',
 'week_1']

Adding exponential distribution factors

In [159]:
model.add_edge('passing_cmp_0','passing_ints_0')
model.add_edge('defense_int_0','passing_ints_0')
model.add_edge('defense_sk_0','passing_ints_0')
model.add_edge('defense_sk_0','defense_sk_1')
model.add_edge('defense_int_0','defense_int_1')
model.add_edge('defense_int_1','passing_ints_1')
model.add_edge('defense_sk_1','passing_ints_1')
model.add_edge('passing_cmp_1','passing_ints_1')

model.edges

[('passing_cmp_0', 'passing_ints_0'),
 ('defense_int_0', 'passing_ints_0'),
 ('defense_sk_0', 'passing_ints_0'),
 ('defense_sk_0', 'defense_sk_1'),
 ('defense_int_0', 'defense_int_1'),
 ('defense_int_1', 'passing_ints_1'),
 ('defense_sk_1', 'passing_ints_1'),
 ('passing_cmp_1', 'passing_ints_1')]

In [160]:
model.bake()

KeyError: 'passing_cmp_0'

In [148]:
for idx,state in enumerate(model.states):
    print(state.distribution.data)
    print('---------------------')

AttributeError: 'pomegranate.distributions.DiscreteDistribution.Dis' object has no attribute 'data'

In [481]:
np.power(np.subtract(pred_vector.values, no_dbn_implementation[("passing_ints",1)].values),2).sum()

1678.0

In [91]:
col = inital_state_data['passing_ints']
uq_vals = col.unique()
acc = []
for uq in uq_vals:
    acc.append(col[col==uq].shape[0]/col.shape[0])
sum(acc)

1.0

In [162]:
for i, st in enumerate(model.states):
    print(st.distribution)
    print(st.distribution.keys())

    print('------------')
    print(st.distribution.bake(tuple(st.distribution.keys())))

{
    "class" : "Distribution",
    "dtype" : "str",
    "name" : "DiscreteDistribution",
    "parameters" : [
        {
            "above average completions" : 0.15,
            "almost no completions" : 0.225,
            "fantastic completions" : 0.05,
            "median completions" : 0.375,
            "poor completions" : 0.2
        }
    ],
    "frozen" : false
}
('above average completions', 'almost no completions', 'fantastic completions', 'median completions', 'poor completions')
------------
None
{
    "class" : "Distribution",
    "dtype" : "numpy.float64",
    "name" : "DiscreteDistribution",
    "parameters" : [
        {
            "0.0" : 0.45,
            "1.0" : 0.35,
            "2.0" : 0.075,
            "3.0" : 0.025,
            "4.0" : 0.025,
            "5.0" : 0.075
        }
    ],
    "frozen" : false
}
(0.0, 1.0, 2.0, 3.0, 4.0, 5.0)
------------
None
{
    "class" : "Distribution",
    "dtype" : "numpy.float64",
    "name" : "DiscreteDistribution",
    