In [49]:
import numpy as np
import pandas as pd
import os
from tqdm.auto import tqdm

In [50]:
# ------------ SET THIS TO THE FORMAT OF THE MATCHES IN YOUR DATA ------------
# Must be one of the keys of `format_to_balls_map` below.
format = 'ODI'

# Number of balls in one innings for each format.
# ODI is 50 overs, T20 is 20 overs; 'Test' is mapped to None.
format_to_balls_map = dict(
    ODI=300,
    T20=120,
    Test=None,
)

# Load the ball-by-ball data; the first CSV column is used as the index.
df = pd.read_csv('data/bbb.csv', index_col=0)
df.head()

Unnamed: 0_level_0,match_id,match_date,dls,gender,venue,innings,bat_team,bowl_team,over,ball,...,non_striker,non_striker_name,bowler,bowler_name,ball_runs_batter,ball_runs_extras,ball_extras_type,ball_runs_total,ball_wicket_type,ball_player_out
ball_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,64814,2002-12-29,False,male,"McLean Park, Napier",1,New Zealand,India,0,1,...,99639abf,NJ Astle,bad31fac,J Srinath,0,1,legbyes,1,,
1,64814,2002-12-29,False,male,"McLean Park, Napier",1,New Zealand,India,0,2,...,eea6b7f1,SP Fleming,bad31fac,J Srinath,0,0,,0,,
2,64814,2002-12-29,False,male,"McLean Park, Napier",1,New Zealand,India,0,3,...,eea6b7f1,SP Fleming,bad31fac,J Srinath,0,0,,0,,
3,64814,2002-12-29,False,male,"McLean Park, Napier",1,New Zealand,India,0,4,...,eea6b7f1,SP Fleming,bad31fac,J Srinath,1,0,,1,,
4,64814,2002-12-29,False,male,"McLean Park, Napier",1,New Zealand,India,0,5,...,99639abf,NJ Astle,bad31fac,J Srinath,0,0,,0,,


In [51]:
# Balls per innings for the configured format; None means the format has no
# entry for a fixed number of balls (the 'Test' key maps to None).
balls_per_innings = format_to_balls_map[format]

# Dismissal kinds that are not credited to the bowler's wicket tally.
non_bowler_dismissals = ['run out', 'retired hurt', 'retired out']

dfs = df.groupby('match_id', group_keys=False)

# One enriched dataframe per match; concatenated in a later cell.
processed_dfs = []

# The loop variable is deliberately NOT called `df`, so the raw frame loaded
# above is not rebound to the last match's group once the loop finishes.
for match_id, match_df in tqdm(dfs, desc="Calculating Match Stats: "):
    # Work on a copy: each group is a slice of the raw frame, and adding
    # columns to a slice can trigger SettingWithCopyWarning.
    match_df = match_df.copy()

    extras_type = match_df['ball_extras_type']
    wicket_type = match_df['ball_wicket_type']

    # --- Batter: running total of runs and balls faced, per innings ---
    match_df['match_batter_total_runs'] = match_df.groupby(['innings', 'batter'])['ball_runs_batter'].cumsum()

    # Wides do not count as a ball faced; every other delivery (including
    # no-balls) does.
    match_df['match_batter_balls_faced'] = (~extras_type.isin(['wides'])).astype(int)
    match_df['match_batter_balls_faced'] = match_df.groupby(['innings', 'batter'])['match_batter_balls_faced'].cumsum()

    # --- Bowler: running runs conceded, legal balls, economy, wickets ---
    # Byes and leg-byes are not charged to the bowler.
    match_df['match_bowler_total_runs'] = np.where(extras_type.isin(['byes', 'legbyes']), 0, match_df['ball_runs_total'])
    match_df['match_bowler_total_runs'] = match_df.groupby(['innings', 'bowler'])['match_bowler_total_runs'].cumsum()

    # Wides and no-balls are not legal deliveries for the bowler.
    match_df['match_bowler_balls_bowled'] = (~extras_type.isin(['wides', 'noballs'])).astype(int)
    match_df['match_bowler_balls_bowled'] = match_df.groupby(['innings', 'bowler'])['match_bowler_balls_bowled'].cumsum()

    # Runs conceded per over. This is inf/NaN while the bowler has not yet
    # bowled a legal ball (division by zero).
    match_df['match_bowler_economy'] = match_df['match_bowler_total_runs'] / (match_df['match_bowler_balls_bowled'] / 6)

    # A wicket is credited to the bowler when a dismissal is recorded and its
    # kind is not in `non_bowler_dismissals`. notna()/isin() are used instead
    # of `x not in [np.NaN, ...]`: `np.NaN` no longer exists in NumPy 2.0, and
    # list membership only matches NaN by object identity.
    # NOTE(review): other dismissal kinds (e.g. 'obstructing the field') may
    # also need excluding - confirm against the values present in the data.
    match_df['match_bowler_wickets_taken'] = (wicket_type.notna() & ~wicket_type.isin(non_bowler_dismissals)).astype(int)
    match_df['match_bowler_wickets_taken'] = match_df.groupby(['innings', 'bowler'])['match_bowler_wickets_taken'].cumsum()

    # --- Team: running score, wickets lost and run rate, per innings ---
    match_df['match_team_total_runs'] = match_df.groupby('innings')['ball_runs_total'].cumsum()
    match_df['match_wickets_taken'] = match_df['ball_player_out'].notna().astype(int)
    match_df['match_wickets_taken'] = match_df.groupby('innings')['match_wickets_taken'].cumsum()

    match_df['match_team_rr'] = match_df['match_team_total_runs'] / (match_df['over'] + match_df['ball'] / 6)

    # --- Chase: target, balls remaining and required run rate ---
    # Target is the final first-innings total plus one. It is NaN when the
    # match has no first-innings rows, instead of raising IndexError.
    first_innings_runs = match_df.loc[match_df['innings'] == 1, 'match_team_total_runs']
    target = first_innings_runs.iloc[-1] + 1 if not first_innings_runs.empty else np.nan
    # Only second-innings rows get a target; every other row is NaN.
    match_df['target'] = np.where(match_df['innings'] == 2, target, np.nan)

    if balls_per_innings is None:
        # No ball limit configured for this format, so "remaining balls"
        # is undefined rather than a TypeError.
        match_df['remaining_balls'] = np.nan
    else:
        match_df['remaining_balls'] = balls_per_innings - (match_df['over'] * 6 + match_df['ball'])

    # Required run rate = runs still needed / overs remaining. The runs
    # already scored are subtracted from the target; dividing the full target
    # by the remaining overs overstates the rate for the whole chase.
    runs_required = match_df['target'] - match_df['match_team_total_runs']
    match_df['rrr'] = runs_required / (match_df['remaining_balls'] / 6)

    processed_dfs.append(match_df)   # keep this match's enriched dataframe


Calculating Match Stats: 100%|██████████| 2937/2937 [00:09<00:00, 318.90it/s]
Calculating Match Stats: 100%|██████████| 2937/2937 [00:09<00:00, 318.90it/s]


In [52]:
# Stitch the per-match frames back together, parse the match dates and put
# the balls in chronological order (by date, match, innings, then over).
df = (
    pd.concat(processed_dfs, ignore_index=True)
    .assign(match_date=lambda frame: pd.to_datetime(frame['match_date']))
    .sort_values(by=['match_date', 'match_id', 'innings', 'over'])
    .reset_index(drop=True)
)

# Re-number the balls 0..n-1 in the sorted order and use that as the index.
df['ball_id'] = df.index
df = df.set_index('ball_id')

  df = pd.concat(processed_dfs, ignore_index=True)


In [53]:
# Write the enriched ball-by-ball table to disk, keeping the index as a column.
output_csv_path = "data/bbb_w_game_stats.csv"
df.to_csv(output_csv_path, index=True)