In [1]:
import warnings

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns
from IPython.display import Image

# import jupyter_black
# jupyter_black.load()

az.style.use("arviz-darkgrid")
RANDOM_SEED = 8265
np.random.seed(RANDOM_SEED)

np.set_printoptions(2)

# filter out Seaborn UserWarnings
warnings.filterwarnings(action="ignore", category=UserWarning, module=r"seaborn")
# warnings.filterwarnings(action="ignore", category=RuntimeWarning, module=r"scipy")
from warnings import simplefilter, warn

simplefilter("ignore")

In [4]:
import scipy.stats as sps

**GENERATE DATA**

In [43]:
num_teams = 10
num_matches = num_teams*(num_teams-1)
num_venues = 5
num_players = num_teams*11 + 1

In [44]:
toss_won_by_team1_probability = sps.bernoulli(p=0.5)
toss_won_by_team1_outcomes_by_match = toss_won_by_team1_probability.rvs(num_matches)
toss_won_by_team1_outcomes_by_match

array([1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1])

In [45]:
toss_winning_team_bowls_probability = sps.bernoulli(p=0.8)
toss_winning_team_bowls_outcomes = toss_winning_team_bowls_probability.rvs(num_matches)

In [46]:
teams = np.arange(0,num_teams)
team_combinations = np.array(np.meshgrid(teams,teams)).T.reshape(-1,2)
team_combinations_df = pd.DataFrame(team_combinations,
                                    columns = ['team1','team2'])
team_combinations_df = team_combinations_df.loc[team_combinations_df.team1!=team_combinations_df.team2].reset_index(drop=True)

In [47]:
def get_toss_winning_team(row):
    if row['toss_won_by_team1'] == 1:
        return row['team1']
    else:
        return row['team2']

def get_toss_losing_team(row):
    if row['toss_won_by_team1'] == 1:
        return row['team2']
    else:
        return row['team1']
    
def get_innings1_bowling_team(row):
    if row['toss_winning_team_bowls'] == 1:
        return row['toss_winning_team']
    else:
        return row['toss_losing_team']
    
def get_innings1_batting_team(row):
    if row['toss_winning_team_bowls'] == 1:
        return row['toss_losing_team']
    else:
        return row['toss_winning_team']
    
def get_innings2_bowling_team(row):
    if row['toss_winning_team_bowls'] == 1:
        return row['toss_losing_team']
    else:
        return row['toss_winning_team']
    
def get_innings2_batting_team(row):
    if row['toss_winning_team_bowls'] == 1:
        return row['toss_winning_team']
    else:
        return row['toss_losing_team']
    
matches_df = team_combinations_df.copy()
matches_df['match_key'] = np.arange(0,num_matches)
matches_df['toss_won_by_team1'] = toss_won_by_team1_outcomes_by_match
matches_df['toss_winning_team'] = matches_df.apply(get_toss_winning_team,
                                                   axis=1)
matches_df['toss_losing_team'] = matches_df.apply(get_toss_losing_team,
                                                   axis=1)
matches_df['toss_winning_team_bowls'] = toss_winning_team_bowls_outcomes
matches_df['innings1_batting_team'] = matches_df.apply(get_innings1_batting_team,
                                                       axis=1)
matches_df['innings1_bowling_team'] = matches_df.apply(get_innings1_bowling_team,
                                                       axis=1)
matches_df['innings2_batting_team'] = matches_df.apply(get_innings2_batting_team,
                                                       axis=1)
matches_df['innings2_bowling_team'] = matches_df.apply(get_innings2_bowling_team,
                                                       axis=1)

In [48]:
matches_df

Unnamed: 0,team1,team2,match_key,toss_won_by_team1,toss_winning_team,toss_losing_team,toss_winning_team_bowls,innings1_batting_team,innings1_bowling_team,innings2_batting_team,innings2_bowling_team
0,0,1,0,1,0,1,1,1,0,0,1
1,0,2,1,0,2,0,1,0,2,2,0
2,0,3,2,1,0,3,1,3,0,0,3
3,0,4,3,1,0,4,1,4,0,0,4
4,0,5,4,0,5,0,1,0,5,5,0
...,...,...,...,...,...,...,...,...,...,...,...
85,9,4,85,0,4,9,1,9,4,4,9
86,9,5,86,0,5,9,1,9,5,5,9
87,9,6,87,0,6,9,1,9,6,6,9
88,9,7,88,1,9,7,0,9,7,7,9


In [49]:
players_by_team = np.tile(np.arange(12),(num_teams,1))

In [50]:
players_by_team.shape

(10, 12)

In [51]:
p_player_bowls_over = np.array([sps.dirichlet.rvs(np.ones(players_by_team.shape[1]),size=21).T 
                                for team in range(num_teams)])

In [52]:
p_player_bowls_over.shape

(10, 12, 21)

In [88]:
p_player_bowls_over[0,:,12].sum()

1.0

In [83]:
p_batsman_at_position = np.array([sps.dirichlet.rvs(np.ones(players_by_team.shape[1]),size=11) 
                                for team in range(num_teams)])

In [87]:
p_batsman_at_position[0,0,:].sum()

1.0

In [131]:
bowling_outcomes_index = pd.DataFrame({
    'bowling_outcomes_index': ['0', '1-b', '1-oe', '1-nb', '1-w', '2-b', '2-oe', '2-nb', '2-w','3-b', '3-oe', '3-nb', '3-w','4-b', '4-oe', '4-nb', '4-w','5-b', '5-oe', '5-nb', '5-w', '6-b', '6-oe', '6-nb', '6-w', 'W-b', 'W-bc','W-bs','W-dro',
                                                'W-idro','W-others'],
    'runs_scored': [0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,0,0,0,0,0,0],
    'batter_switched': [1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0],
    'is_legal': [1,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,1,1,1,1]
})

In [132]:
bowling_outcomes_index

Unnamed: 0,bowling_outcomes_index,runs_scored,batter_switched,is_legal
0,0,0,1,1
1,1-b,1,1,1
2,1-oe,1,1,1
3,1-nb,1,0,0
4,1-w,1,0,0
5,2-b,2,0,1
6,2-oe,2,0,1
7,2-nb,2,1,0
8,2-w,2,1,0
9,3-b,3,1,1


In [133]:
bowling_outcomes_index['alpha_for_bowling_outcome_index']= sps.norm(0,1).rvs(size=bowling_outcomes_index.shape[0])

In [134]:
beta_by_player_and_bowling_outcome = np.array([[sps.norm(0,1).rvs(size=bowling_outcomes_index.shape[0]) 
                                      for player in range(12)]
                                      for team in range(num_teams)])

In [135]:
beta_by_player_and_bowling_outcome.shape

(10, 12, 31)

In [136]:
from scipy.special import softmax

In [190]:
from collections import Counter
bowling_outcome_by_ball_and_innings_list = []
match_state_by_ball_and_innings_list = []
match_stats_list = []

for match_key, match_df in matches_df.groupby('match_key'):
    match_stats = {}
    for innings in range(2):
        bowler_overs_bowled = Counter()
        total_balls_bowled = 0
        wickets_lost = 0
        current_score = 0
        batters_dismissed = set()
        bowling_team_id = match_df[f'innings{innings+1}_bowling_team'].iloc[0]
        batting_team_id = match_df[f'innings{innings+1}_batting_team'].iloc[0]
        onstrike_batter_rv = sps.multinomial(1,p_batsman_at_position[batting_team_id,0,:]).rvs(1)[0]
        onstrike_batter_id = np.where(onstrike_batter_rv==1)[0][0]
        offstrike_batter_id = onstrike_batter_id
        while offstrike_batter_id == onstrike_batter_id:
            offstrike_batter_rv = sps.multinomial(1,p_batsman_at_position[batting_team_id,1,:]).rvs(1)[0]
            offstrike_batter_id = np.where(offstrike_batter_rv==1)[0][0]
        for over in range(20):
            selected_bowler_has_overs = False
            while not(selected_bowler_has_overs):
                selected_bowler_rv = sps.multinomial(1,p_player_bowls_over[bowling_team_id,:,over]).rvs(1)[0]
                selected_bowler_id = np.where(selected_bowler_rv==1)[0][0]
                if bowler_overs_bowled[selected_bowler_id] < 4:
                    selected_bowler_has_overs = True
            legal_balls_bowled_in_over = 0
            total_balls_bowled_in_over = 0
            while legal_balls_bowled_in_over<6:
                if innings == 0:
                    runs_to_target = -1
                else:
                    runs_to_target = target - current_score
                ball_number_in_over = min(6,total_balls_bowled_in_over)
                match_state_row = {
                    'match_key': match_key,
                    'innings': innings,
                    'over_number': over,
                    'ball_number_in_over': ball_number_in_over,
                    'bowler_id': selected_bowler_id,
                    'batter_id': onstrike_batter_id,
                    'current_score': current_score,
                    'batting_team_id': batting_team_id,
                    'bowling_team_id': bowling_team_id,
                    'wickets_fallen': wickets_lost,
                    'total_balls_bowled': total_balls_bowled,
                    'runs_to_target': runs_to_target
                }
                total_balls_bowled_in_over+=1
                total_balls_bowled +=1
                ball_outcome_mu = bowling_outcomes_index['alpha_for_bowling_outcome_index'].values + beta_by_player_and_bowling_outcome[batting_team_id,onstrike_batter_id,:]
                ball_outcome_p = softmax(ball_outcome_mu)
                ball_outcome_rv = sps.multinomial(1,ball_outcome_p).rvs(1)
                ball_outcome = np.where(ball_outcome_rv==1)[1][0]
                current_score += bowling_outcomes_index.iloc[ball_outcome]['runs_scored']
                bowling_outcomes_row = {
                    'match_key': match_key,
                    'innings': innings,
                    'over_number': over,
                    'ball_number_in_over': ball_number_in_over,
                    'bowling_outcome_index': ball_outcome,
                    'bowler_id': selected_bowler_id
                }
                match_state_by_ball_and_innings_list.append(match_state_row)
                bowling_outcome_by_ball_and_innings_list.append(bowling_outcomes_row)
                if innings == 1 and current_score >= target:
                    break
                if bowling_outcomes_index.iloc[ball_outcome]['is_legal'] == 1:
                    legal_balls_bowled_in_over+=1
                if ball_outcome >= 25:
                    #wicket has fallen, find new batsman
                    wickets_lost += 1
                    if wickets_lost == 10:
                        break
                    next_at_bat = wickets_lost + 1
                    batters_dismissed.add(onstrike_batter_id)
                    while onstrike_batter_id in batters_dismissed:
                        onstrike_batter_rv = sps.multinomial(1,p_batsman_at_position[batting_team_id,next_at_bat,:]).rvs(1)[0]
                        onstrike_batter_id = np.where(onstrike_batter_rv==1)[0][0]
                if bowling_outcomes_index.iloc[ball_outcome]['batter_switched'] == 1:
                    temp = onstrike_batter_id
                    onstrike_batter_id = offstrike_batter_id
                    offstrike_batter_d = temp
            bowler_overs_bowled[selected_bowler_id]+=1
            if wickets_lost == 10 or (innings == 1 and current_score >= target):
                break
        if innings == 0:
            target = current_score + 1
        match_stats[innings] = {'score': current_score, 'wickets_lost': wickets_lost}
    if match_stats[0]['score']>match_stats[1]['score']:
        match_stats['winner'] = match_df['innings1_batting_team'].iloc[0]
    elif match_stats[0]['score'] == match_stats[1]['score']:
        match_stats_winner = 'draw'
    else:
        match_stats['winner'] = match_df['innings2_batting_team'].iloc[0]
    match_stats_list.append(match_stats)
        

In [191]:
match_state_by_ball_and_innings_df = pd.DataFrame(match_state_by_ball_and_innings_list)

In [192]:
match_state_by_ball_and_innings_df.loc[match_state_by_ball_and_innings_df.wickets_fallen>0]

Unnamed: 0,match_key,innings,over_number,ball_number_in_over,bowler_id,batter_id,current_score,batting_team_id,bowling_team_id,wickets_fallen,total_balls_bowled,runs_to_target
5,0,0,0,5,10,3,24,1,0,1,5,-1
6,0,0,0,6,10,8,24,1,0,2,6,-1
7,0,0,1,0,5,9,24,1,0,2,7,-1
8,0,0,1,1,5,9,30,1,0,2,8,-1
9,0,0,1,2,5,9,35,1,0,2,9,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
10286,89,1,3,2,10,1,97,9,8,4,23,21
10287,89,1,3,3,10,1,103,9,8,4,24,15
10288,89,1,3,4,10,1,109,9,8,4,25,9
10289,89,1,3,5,10,1,115,9,8,4,26,3


In [193]:
bowling_outcomes_by_ball_and_innings_df = pd.DataFrame(bowling_outcome_by_ball_and_innings_list)

In [195]:
bowling_outcomes_by_ball_and_innings_df.bowling_outcome_index.value_counts()

21    2174
8     1062
2      688
23     601
16     563
28     510
22     484
29     435
18     427
10     403
11     383
26     246
25     233
15     211
24     193
0      190
1      177
14     141
12     140
9      133
6      133
5      114
13     111
17      98
27      94
4       83
3       69
7       67
19      65
30      56
20       7
Name: bowling_outcome_index, dtype: int64

In [199]:
with pm.Model() as bowling_outcomes_model_by_ball_first_innings:
  batter_id_by_ball_and_innings_data = pm.MutableData('batter_id_by_ball_and_innings_data',
                                                      match_state_by_ball_and_innings_df['batter_id'].values)
  bowling_outcomes_by_ball_and_innings_data = pm.MutableData('bowling_outcomes_by_ball_and_innings_data',
                                                             bowling_outcomes_by_ball_and_innings_df['bowling_outcome_index'].values)
  alpha_bowling_outcome = pm.Normal('alpha_bowling_outcome', 
                                    mu=0, 
                                    sigma=1, 
                                    shape=bowling_outcomes_index.shape[0])
  beta_for_batter_id_and_bowling_outcome = pm.Normal('beta_for_batter_id_and_bowling_outcome', 
                                                     mu=0, 
                                                     sd=1, 
                                                     shape=(batter_id_index.shape[0],
                                                            bowling_outcomes_index.shape[0]))


NameError: name 'batter_id_index' is not defined

In [None]:


  mu_bowling_outcome = pm.Deterministic('mu_bowling_outcome',
                                        at.dot(batter_id_by_ball_and_innings_data['batter_id'],
                                               beta_for_batter_id_and_bowling_outcome) + alpha_bowling_outcome)
  probability_of_bowling_outcome = pm.Deterministic('probability_of_bowling_outcome',
                                                    at.nnet.softmax(mu_bowling_outcome))
  bowling_outcomes_by_ball_and_innings_rv = pm.Categorical('bowling_outcomes_by_ball_and_innings_rv', 
                                                           p = probability_of_bowling_outcome, 
                                                           observed = bowling_outcomes_by_ball_and_innings_data)