## This notebook builds a new data selection framework to build batting_runs inferential_models

In [1]:
import warnings

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import seaborn as sns
from IPython.display import Image

# import jupyter_black
# jupyter_black.load()

az.style.use("arviz-darkgrid")
RANDOM_SEED = 8265
np.random.seed(RANDOM_SEED)

np.set_printoptions(2)

# filter out Seaborn UserWarnings
warnings.filterwarnings(action="ignore", category=UserWarning, module=r"seaborn")
# warnings.filterwarnings(action="ignore", category=RuntimeWarning, module=r"scipy")
from warnings import simplefilter, warn

simplefilter("ignore")

In [2]:
import os
os.getcwd()

'/Users/gireeshramji/PycharmProjects/SouthridgeCorp/player-outcome-predictor/notebooks'

In [3]:
os.chdir("..")
os.getcwd()

'/Users/gireeshramji/PycharmProjects/SouthridgeCorp/player-outcome-predictor'

In [4]:
from utils.config_utils import ConfigUtils
from historical_data.singleton import Helper
from data_selection.data_selection import DataSelection
from rewards_configuration.rewards_configuration import RewardsConfiguration
from simulators.perfect_simulator import PerfectSimulator

In [5]:
import aesara.tensor as at

In [133]:
config_utils = ConfigUtils(".streamlit/config.toml")
helper = Helper(config_utils)
data_selection = DataSelection(helper)
rewards_config = RewardsConfiguration(config_utils)
perfect_simulator = PerfectSimulator(data_selection, rewards_config)

2022-11-16 00:22:43.701 INFO    root: Creating feedback form storage with method local


In [134]:
helper.tournaments.df

Unnamed: 0,key,name,first_match_date,last_match_date
0,t20s,International T20s,2005-02-17,2022-10-05
1,apl,Afghanistan Premier League,2018-10-05,2018-10-21
2,bbl,Big Bash League,2011-12-16,2022-01-28
3,bpl,Bangladesh Premier League,2012-02-11,2022-02-18
4,cpl,Caribbean Premier League,2013-07-30,2022-09-30
5,ctc,CSA T20 Challenge,2012-02-15,2022-02-27
6,ipl,Indian Premier League,2008-04-18,2022-05-29
7,lpl,Lanka Premier League,2020-11-26,2021-12-23
8,psl,Pakistan Super League,2016-02-04,2022-02-27
9,ssm,Super Smash,2013-01-20,2022-01-29


In [135]:
selected_tournament = 'Indian Premier League'
tournament_start_date = helper.tournaments.df.query(f'name == "{selected_tournament}"').first_match_date.iloc[0]
tournament_end_date = helper.tournaments.df.query(f'name == "{selected_tournament}"').last_match_date.iloc[0]

In [136]:
helper.tournaments.set_selected_tournament_names([selected_tournament])
helper.tournaments.set_start_end_dates(tournament_start_date, tournament_end_date, False)

In [137]:
helper.tournaments.get_selected_tournaments()

['ipl']

In [138]:
helper.tournaments.get_start_end_dates(False)

(datetime.date(2008, 4, 18), datetime.date(2022, 5, 29))

In [139]:
all_matches_for_tournament = data_selection.get_selected_matches(False)

In [198]:
all_matches_for_tournament.season.unique()

array(['2009', '2021', '2015', '2007/08', '2011', '2012', '2019', '2022',
       '2016', '2014', '2017', '2013', '2020/21', '2018', '2009/10'],
      dtype=object)

In [140]:
all_bowling_outcomes_for_tournament = perfect_simulator.get_bowling_outcomes_by_ball_and_innings(False)
all_match_states_for_tournament = perfect_simulator.get_match_state_by_ball_and_innings(False)

In [199]:
test_season = '2022'

In [200]:
test_season_matches = all_matches_for_tournament.query(f'season == "{test_season}"')

In [201]:
test_start_date = test_season_matches.date.min()
test_end_date = test_season_matches.date.max()

In [202]:
test_season_venues = test_season_matches.venue.unique().tolist()
test_season_match_keys = test_season_matches.key.unique().tolist()

In [203]:
test_season_match_state_df = all_match_states_for_tournament.query('match_key in @test_season_match_keys')
test_season_bowling_outcomes_df = all_bowling_outcomes_for_tournament.query('match_key in @test_season_match_keys')

In [204]:
test_season_batters = test_season_match_state_df.batter.unique().tolist()
test_season_bowlers = test_season_match_state_df.bowler.unique().tolist()

In [205]:
#Now we know all the batters, bowlers and venues that we need to query for training data.
#So we choose all tournaments and set the training window to precede the test window
helper.tournaments.set_selected_tournament_names(helper.tournaments.df.name.tolist())
helper.tournaments.set_start_end_dates(helper.tournaments.df.first_match_date.min(), 
                                       test_start_date, 
                                       False)


In [206]:
helper.tournaments.get_selected_tournaments()

['t20s',
 'apl',
 'bbl',
 'bpl',
 'cpl',
 'ctc',
 'ipl',
 'lpl',
 'psl',
 'ssm',
 'ntb',
 'msl']

In [207]:
helper.tournaments.get_start_end_dates(False)

(datetime.date(2005, 2, 17), datetime.date(2022, 3, 26))

In [208]:
unqualified_train_bowling_outcomes_df = perfect_simulator.get_bowling_outcomes_by_ball_and_innings(False)
unqualified_train_match_state_df = perfect_simulator.get_match_state_by_ball_and_innings(False)

In [209]:
unqualified_train_bowling_outcomes_df.shape

(1141693, 20)

In [210]:
unqualified_train_match_state_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,batter,bowler,batting_team,total_runs,is_wicket,target_runs,target_overs,batting_featured_player,bowling_featured_player,venue,...,bowler_ZGn1517,bowler_ZHn2712,bowler_ZKn1926,bowler_ZMd766,batter_ZSd2789,bowler_ZSr2027,total_balls_bowled,current_total,wickets_fallen,runs_to_target
match_key,inning,over,ball,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
211028,1,0,1,MTk1695,BLe850,England,0,0,-1,-1,False,False,"The Rose Bowl, Southampton",...,0,0,0,0,0,0,0,0,0,-1
211028,1,0,2,MTk1695,BLe850,England,1,0,-1,-1,False,False,"The Rose Bowl, Southampton",...,0,0,0,0,0,0,1,1,0,-1
211028,1,0,3,GJs1815,BLe850,England,0,0,-1,-1,False,False,"The Rose Bowl, Southampton",...,0,0,0,0,0,0,2,1,0,-1
211028,1,0,4,GJs1815,BLe850,England,0,0,-1,-1,False,False,"The Rose Bowl, Southampton",...,0,0,0,0,0,0,3,1,0,-1
211028,1,0,5,GJs1815,BLe850,England,0,0,-1,-1,False,False,"The Rose Bowl, Southampton",...,0,0,0,0,0,0,4,1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304047,2,17,5,SIr720,DBo344,Kolkata Knight Riders,1,0,132,20,True,True,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,106,125,4,7
1304047,2,17,6,SJn3574,DBo344,Kolkata Knight Riders,1,0,132,20,False,True,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,107,126,4,6
1304047,2,18,1,SJn3574,AMe979,Kolkata Knight Riders,1,0,132,20,False,False,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,108,127,4,5
1304047,2,18,2,SIr720,AMe979,Kolkata Knight Riders,2,0,132,20,True,False,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,109,129,4,3


In [211]:
is_test_season_venue = unqualified_train_match_state_df.venue.isin(test_season_venues)
is_test_season_batter = unqualified_train_match_state_df.batter.isin(test_season_batters)
is_test_season_bowler = unqualified_train_match_state_df.bowler.isin(test_season_bowlers)

In [212]:
train_match_state_df = unqualified_train_match_state_df.loc[is_test_season_batter & is_test_season_bowler & is_test_season_venue]
train_bowling_outcomes_df = unqualified_train_bowling_outcomes_df.loc[is_test_season_batter & is_test_season_bowler & is_test_season_venue]

In [213]:
train_match_state_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,batter,bowler,batting_team,total_runs,is_wicket,target_runs,target_overs,batting_featured_player,bowling_featured_player,venue,...,bowler_ZGn1517,bowler_ZHn2712,bowler_ZKn1926,bowler_ZMd766,batter_ZSd2789,bowler_ZSr2027,total_balls_bowled,current_total,wickets_fallen,runs_to_target
match_key,inning,over,ball,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
336004,1,6,2,RUa1125,PSn3443,Mumbai Indians,1,0,-1,-1,True,False,"Dr DY Patil Sports Academy, Mumbai",...,0,0,0,0,0,0,38,52,2,-1
336004,1,6,4,RUa1125,PSn3443,Mumbai Indians,1,0,-1,-1,True,False,"Dr DY Patil Sports Academy, Mumbai",...,0,0,0,0,0,0,40,54,2,-1
336004,1,8,3,RUa1125,PSn3443,Mumbai Indians,0,0,-1,-1,True,False,"Dr DY Patil Sports Academy, Mumbai",...,0,0,0,0,0,0,51,59,2,-1
336004,1,8,4,RUa1125,PSn3443,Mumbai Indians,2,0,-1,-1,True,False,"Dr DY Patil Sports Academy, Mumbai",...,0,0,0,0,0,0,52,61,2,-1
336004,1,8,6,RUa1125,PSn3443,Mumbai Indians,0,0,-1,-1,True,False,"Dr DY Patil Sports Academy, Mumbai",...,0,0,0,0,0,0,54,62,2,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304047,2,17,5,SIr720,DBo344,Kolkata Knight Riders,1,0,132,20,True,True,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,106,125,4,7
1304047,2,17,6,SJn3574,DBo344,Kolkata Knight Riders,1,0,132,20,False,True,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,107,126,4,6
1304047,2,18,1,SJn3574,AMe979,Kolkata Knight Riders,1,0,132,20,False,False,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,108,127,4,5
1304047,2,18,2,SIr720,AMe979,Kolkata Knight Riders,2,0,132,20,True,False,"Wankhede Stadium, Mumbai",...,0,0,0,0,0,0,109,129,4,3


In [214]:
frequent_players_set = set(test_season_batters + test_season_bowlers)

## Building new features and targets

In [215]:
def batter_featured_id(df):
    c1 = df['batter'].isin(frequent_players_set)
    df.loc[c1,'batter_featured_id'] = df['batter']
    df['batter_featured_id'].fillna('non_frequent_player',inplace=True)
    
def bowler_featured_id(df):
    c1 = df['batter'].isin(frequent_players_set)
    df.loc[c1,'bowler_featured_id'] = df['bowler']
    df['bowler_featured_id'].fillna('non_frequent_player',inplace=True)
    
def is_legal_delivery(df):
    c1 = df.noballs.isna() 
    c2 = df.wides.isna()
    df.loc[c1&c2,'is_legal_delivery'] = True
    df['is_legal_delivery'].fillna(False,inplace=True)
    
def striker_dismissed(df):
    c1 = (df['batter'] == df['player_dismissed'])
    c2 = (df['player_dismissed'].isna()==False)
    df.loc[c1&c2,'striker_dismissed'] = True
    c3 = (df['batter'] != df['player_dismissed'])
    df.loc[c3 & c2,'striker_dismissed'] = False
    
def add_column_to_df(df,
                     column_name):
    column_name(df)

In [216]:
for df in [train_match_state_df,
           test_season_match_state_df]:
    for column in [batter_featured_id,
                   bowler_featured_id]:
        add_column_to_df(df,
                         column)

In [217]:
for df in [train_bowling_outcomes_df,
           test_season_bowling_outcomes_df]:
    for column in [is_legal_delivery,
                   striker_dismissed]:
        add_column_to_df(df,
                         column)

In [218]:
for df in [train_match_state_df,
           test_season_match_state_df,
           train_bowling_outcomes_df,
           test_season_bowling_outcomes_df]:
    df.reset_index(inplace=True)

## Building Categoricals for Indexing

In [219]:
def build_categoricals_for_column(df,column):
    categorical = pd.Categorical(df[column])
    return categorical.categories
    
def get_categorical_column_index_for_df(df,
                            categories,
                            column):
    idx = pd.Categorical(df[column],
                         categories).codes
    return idx

In [220]:
COORDS = {}
train_feature_data = {}
test_feature_data = {}
all_feature_data = {}
for dim in ['batter_featured_id',
            'bowler_featured_id',
            'batting_team',
            'bowling_team',
            'venue',
            'wickets_fallen',
            'over',
            'match_key',
            'inning',
            'ball']:
    categories = build_categoricals_for_column(test_season_match_state_df,
                                               dim)
    COORDS[dim] = categories
    train_feature_data[dim] = get_categorical_column_index_for_df(train_match_state_df,
                                                                  categories,
                                                                  dim)
    test_feature_data[dim] = get_categorical_column_index_for_df(test_season_match_state_df,
                                                              categories,
                                                              dim)

In [221]:
train_outcome_data = {}
test_outcome_data = {}
all_outcome_data = {}
for dim in ['batter_runs',
            'extras',
            'is_legal_delivery',
            'is_wicket',
            'wides',
            'noballs',
            'dismissal_kind',
            'is_direct_runout']:
    categories = build_categoricals_for_column(train_bowling_outcomes_df,
                                               dim)
    COORDS[dim] = categories
    train_outcome_data[dim] = get_categorical_column_index_for_df(train_bowling_outcomes_df,
                                                                  categories,
                                                                  dim)
    test_outcome_data[dim] = get_categorical_column_index_for_df(test_season_bowling_outcomes_df,
                                                              categories,
                                                              dim)

In [222]:
COORDS

{'batter_featured_id': Index(['ABi3523', 'ADp3604', 'AFh613', 'AHKn3689', 'AJh2407', 'AKn1993',
        'AMm964', 'AMr3466', 'ANe1413', 'APl721',
        ...
        'UYv970', 'VAa3474', 'VIr2222', 'VKi607', 'VSr1177', 'WSa3464',
        'WSr946', 'YCl723', 'YDl3467', 'YJl3557'],
       dtype='object', length=174),
 'bowler_featured_id': Index(['ABi3523', 'ADp3604', 'AHKn3689', 'AJh2407', 'AKn1993', 'AMe979',
        'AMm964', 'ANe1413', 'APl721', 'ARl347',
        ...
        'UMk1846', 'UYv970', 'VAa3474', 'VAn3457', 'VIr2222', 'VSr1177',
        'WSr946', 'YCl723', 'YDl3467', 'YJl3557'],
       dtype='object', length=125),
 'batting_team': Index(['Chennai Super Kings', 'Delhi Capitals', 'Gujarat Titans',
        'Kolkata Knight Riders', 'Lucknow Super Giants', 'Mumbai Indians',
        'Punjab Kings', 'Rajasthan Royals', 'Royal Challengers Bangalore',
        'Sunrisers Hyderabad'],
       dtype='object'),
 'bowling_team': Index(['Chennai Super Kings', 'Delhi Capitals', 'Gujarat Tit

In [223]:
train_feature_data

{'batter_featured_id': array([135, 135, 135, ..., 145, 144, 144], dtype=int16),
 'bowler_featured_id': array([80, 80, 80, ...,  5,  5,  5], dtype=int8),
 'batting_team': array([5, 5, 5, ..., 3, 3, 3], dtype=int8),
 'bowling_team': array([-1, -1, -1, ...,  0,  0,  0], dtype=int8),
 'venue': array([1, 1, 1, ..., 5, 5, 5], dtype=int8),
 'wickets_fallen': array([2, 2, 2, ..., 4, 4, 4], dtype=int8),
 'over': array([ 6,  6,  8, ..., 18, 18, 18], dtype=int8),
 'match_key': array([-1, -1, -1, ...,  0,  0,  0], dtype=int8),
 'inning': array([0, 0, 0, ..., 1, 1, 1], dtype=int8),
 'ball': array([1, 3, 2, ..., 0, 1, 2], dtype=int8)}

In [224]:
test_feature_data

{'batter_featured_id': array([118, 118, 118, ..., 141,  30, 141], dtype=int16),
 'bowler_featured_id': array([116, 116, 116, ...,  82,  82,  75], dtype=int8),
 'batting_team': array([0, 0, 0, ..., 2, 2, 2], dtype=int8),
 'bowling_team': array([3, 3, 3, ..., 7, 7, 7], dtype=int8),
 'venue': array([5, 5, 5, ..., 4, 4, 4], dtype=int8),
 'wickets_fallen': array([0, 0, 0, ..., 3, 3, 3], dtype=int8),
 'over': array([ 0,  0,  0, ..., 17, 17, 18], dtype=int8),
 'match_key': array([ 0,  0,  0, ..., 73, 73, 73], dtype=int8),
 'inning': array([0, 0, 0, ..., 1, 1, 1], dtype=int8),
 'ball': array([0, 1, 2, ..., 4, 5, 0], dtype=int8)}

In [225]:
def build_xarray(feature_dict,
                 outcome_dict,
                 index_df):
    feature_df = pd.DataFrame(feature_dict)
    feature_df.index = index_df.index
    outcome_df = pd.DataFrame(outcome_dict)
    outcome_df.index = index_df.index
    combined_df = pd.merge(outcome_df,
                           feature_df,
                           left_index = True,
                           right_index = True)
    return combined_df, combined_df.to_xarray()

In [226]:
train_combined_df,train_combined_xarray = build_xarray(train_feature_data,
                                                         train_outcome_data,
                                                         train_match_state_df)

In [227]:
test_combined_df,test_combined_xarray = build_xarray(test_feature_data,
                                                         test_outcome_data,
                                                         test_season_match_state_df)

In [228]:
train_combined_df.shape

(12612, 18)

In [229]:
test_combined_df.shape

(17912, 18)

In [230]:
test_combined_df.reset_index().query('batter_runs == 4').over.value_counts(normalize=True)

4     0.081188
3     0.074257
5     0.068317
2     0.064851
1     0.059901
0     0.055446
18    0.054455
17    0.051485
16    0.049010
13    0.043564
9     0.043069
7     0.041584
8     0.041089
10    0.040594
19    0.040594
11    0.040099
14    0.039604
15    0.039109
6     0.036139
12    0.035644
Name: over, dtype: float64

In [231]:
train_combined_df.reset_index().query('batter_runs == 4').over.value_counts(normalize=True)

2     0.073401
4     0.065320
16    0.064646
5     0.060606
18    0.059933
3     0.059933
13    0.053872
17    0.053199
14    0.052525
1     0.052525
10    0.049158
11    0.048485
0     0.047811
19    0.042424
15    0.040404
8     0.037037
12    0.036364
7     0.035690
9     0.035017
6     0.031650
Name: over, dtype: float64

In [232]:
test_combined_df.reset_index().query('batter_runs == 5').over.value_counts(normalize=True)

Series([], Name: over, dtype: float64)

In [233]:
train_combined_df.reset_index().query('batter_runs == 5').over.value_counts(normalize=True)

0     0.285714
2     0.285714
14    0.142857
17    0.142857
5     0.142857
Name: over, dtype: float64

In [234]:
train_combined_df.batter_runs.value_counts(normalize=True)

0    0.383682
1    0.377894
4    0.117745
6    0.061212
2    0.057247
3    0.001665
5    0.000555
Name: batter_runs, dtype: float64

In [235]:
test_combined_df.batter_runs.value_counts(normalize=True)

0    0.411289
1    0.356409
4    0.112774
6    0.059290
2    0.057001
3    0.003238
Name: batter_runs, dtype: float64

In [236]:
COORDS['batter_runs']

Int64Index([0, 1, 2, 3, 4, 5, 6], dtype='int64')

In [237]:
test_combined_df.batter_runs.value_counts(normalize=True)

0    0.411289
1    0.356409
4    0.112774
6    0.059290
2    0.057001
3    0.003238
Name: batter_runs, dtype: float64

In [238]:
train_combined_df.reset_index().query('over == 19').batter_runs.value_counts(normalize=True)

0    0.306667
1    0.284444
6    0.146667
4    0.140000
2    0.117778
3    0.004444
Name: batter_runs, dtype: float64

In [239]:
test_combined_df.reset_index().query('over == 19').batter_runs.value_counts(normalize=True)

0    0.412011
1    0.245810
6    0.142458
4    0.114525
2    0.083799
3    0.001397
Name: batter_runs, dtype: float64

In [240]:
train_combined_df.reset_index().inning.value_counts(normalize=True)

 0    0.508088
 1    0.491595
-1    0.000317
Name: inning, dtype: float64

In [241]:
train_combined_df.reset_index().query('batter_runs == 2').inning.value_counts(normalize=True)

 0    0.513850
 1    0.484765
-1    0.001385
Name: inning, dtype: float64

In [242]:
test_combined_df.reset_index().query('batter_runs == 2').inning.value_counts(normalize=True)

0    0.53477
1    0.46523
Name: inning, dtype: float64

In [243]:
test_combined_df.reset_index().query('bowler_featured_id == 30').batter_runs.value_counts(normalize=True)

0    0.445205
1    0.383562
6    0.075342
2    0.047945
4    0.041096
3    0.006849
Name: batter_runs, dtype: float64

In [244]:
test_combined_df.reset_index().query('bowler_featured_id == 3').batter_runs.value_counts(normalize=True)

0    0.410811
1    0.324324
4    0.129730
6    0.070270
2    0.064865
Name: batter_runs, dtype: float64

In [245]:
test_combined_df.reset_index().batter_runs.value_counts(normalize=True)

0    0.411289
1    0.356409
4    0.112774
6    0.059290
2    0.057001
3    0.003238
Name: batter_runs, dtype: float64

In [246]:
test_combined_df.batter_featured_id.value_counts(normalize=True).iloc[:10]

49     0.033274
66     0.026351
140    0.022052
42     0.021606
39     0.021047
141    0.020880
47     0.019819
112    0.019540
30     0.019428
28     0.018982
Name: batter_featured_id, dtype: float64

In [247]:
test_combined_df.bowler_featured_id.value_counts(normalize=True).iloc[:10]

63     0.024118
122    0.023950
82     0.023392
109    0.021606
87     0.021550
66     0.021103
81     0.019317
104    0.019093
83     0.018982
33     0.018870
Name: bowler_featured_id, dtype: float64

In [252]:
def get_batter_runs_odds_ratio_by_player(df,
                             player_id,
                             player_type):
    base_odds = df.batter_runs.value_counts(normalize=True)
    selected_df = df.query(f'{player_type} == {player_id}')
    print(f"Selected {selected_df.shape[0]} balls for {player_type}: {player_id}")
    selected_odds = df.query(f'{player_type} == {player_id}').batter_runs.value_counts(normalize=True)
    odds_ratio = selected_odds/base_odds
    return odds_ratio
    

In [251]:
for batter in test_combined_df.batter_featured_id.value_counts(normalize=True).iloc[:10].index:
   print(f"Train Odds {batter}")
   print(get_batter_runs_odds_ratio_by_player(train_combined_df,
                                              batter,
                                              'batter_featured_id'))
   print(f"Test Odds {batter}")
   print(get_batter_runs_odds_ratio_by_player(test_combined_df,
                                              batter,
                                              'batter_featured_id'))

Train Odds 49
Selected 378 balls for batter_featured_id: 49
0    0.383682
1    0.377894
4    0.117745
6    0.061212
2    0.057247
3    0.001665
5    0.000555
Name: batter_runs, dtype: float64
0    0.362434
1    0.341270
4    0.137566
6    0.087302
2    0.068783
3    0.002646
Name: batter_runs, dtype: float64
0    0.944620
1    0.903083
2    1.201513
3    1.588813
4    1.168339
5         NaN
6    1.426227
Name: batter_runs, dtype: float64
Test Odds 49
Selected 596 balls for batter_featured_id: 49
0    0.411289
1    0.356409
4    0.112774
6    0.059290
2    0.057001
3    0.003238
Name: batter_runs, dtype: float64
0    0.414430
1    0.310403
4    0.140940
6    0.075503
2    0.055369
3    0.003356
Name: batter_runs, dtype: float64
0    1.007637
1    0.870917
4    1.249757
6    1.273461
2    0.971373
3    1.036334
Name: batter_runs, dtype: float64
Train Odds 66
Selected 330 balls for batter_featured_id: 66
0    0.383682
1    0.377894
4    0.117745
6    0.061212
2    0.057247
3    0.001665
5

In [197]:
for bowler in test_combined_df.bowler_featured_id.value_counts(normalize=True).iloc[:10].index:
   print(f"Train Odds {bowler}")
   print(get_batter_runs_odds_ratio_by_player(train_combined_df,
                                              bowler,
                                              'bowler_featured_id'))
   print(f"Test Odds {batter}")
   print(get_batter_runs_odds_ratio_by_player(test_combined_df,
                                              bowler,
                                              'bowler_featured_id'))

Train Odds 77
Selected 54 balls for bowler_featured_id: 77
0    1.448832
1    0.460478
2    0.596296
3         NaN
4    1.331829
5         NaN
6    0.950617
Name: batter_runs, dtype: float64
Test Odds 121
Selected 382 balls for bowler_featured_id: 77
0    1.191515
1    0.784017
2    0.967486
3         NaN
4    1.078510
5         NaN
6    0.841365
Name: batter_runs, dtype: float64
Train Odds 3
Selected 19 balls for bowler_featured_id: 3
0    1.746917
1    0.727071
2         NaN
3         NaN
4         NaN
5         NaN
6         NaN
Name: batter_runs, dtype: float64
Test Odds 121
Selected 358 balls for bowler_featured_id: 3
0    0.953545
1    1.188408
2    1.185285
3    1.266853
4    0.517866
5         NaN
6    0.598513
Name: batter_runs, dtype: float64
Train Odds 65
Selected 0 balls for bowler_featured_id: 65
0   NaN
1   NaN
4   NaN
2   NaN
6   NaN
3   NaN
5   NaN
Name: batter_runs, dtype: float64
Test Odds 121
Selected 327 balls for bowler_featured_id: 65
0    0.830893
1    1.335309
2

In [110]:
test_combined_df.reset_index().query('bowler_featured_id == 77').batter_runs.value_counts(normalize=True)/test_combined_df.reset_index().batter_runs.value_counts(normalize=True)

-1         NaN
 0    1.191515
 1    0.784017
 2    0.967486
 3         NaN
 4    1.078510
 5    0.841365
Name: batter_runs, dtype: float64

In [102]:
train_combined_df.reset_index().query('bowler_featured_id == 3').batter_runs.value_counts(normalize=True)

0    0.75
1    0.25
Name: batter_runs, dtype: float64

In [104]:
train_combined_df.reset_index().query('bowler_featured_id == 3').shape

(16, 18)

In [103]:
test_combined_df.reset_index().query('bowler_featured_id == 77').batter_runs.value_counts(normalize=True)

0    0.513089
1    0.280105
4    0.104712
2    0.070681
5    0.031414
Name: batter_runs, dtype: float64

In [105]:
train_combined_df.reset_index().query('bowler_featured_id == 77').shape

(9, 18)

In [132]:
COORDS['batter_featured_id'][6]

'AGt990'

In [None]:
train