In [1]:
import numpy as np
import pandas as pd
import json
import os
from tqdm.notebook import tqdm

In [2]:
# Collect the match files to process.
# os.listdir returns every directory entry in arbitrary order, so keep only
# the *.json files (a stray README or .DS_Store would make json.load fail in
# the processing cell) and sort them so every run visits matches in the same
# order.
files = sorted(
    name for name in os.listdir('odis_json')
    if name.lower().endswith('.json')
)
len(files)

2924

In [3]:
def _match_rows(data, match_id, first_ball_id):
    """Flatten one parsed match dictionary into per-delivery row dicts.

    Parameters
    ----------
    data : dict
        Parsed match JSON. Reads ``data['info']`` (``registry.people``,
        ``teams``, ``dates``, ``venue``, ``gender``, ``players`` and the
        optional ``outcome.method``) and ``data['innings']``.
    match_id : str
        Identifier stored on every row of this match.
    first_ball_id : int
        ``ball_id`` assigned to the first delivery; later deliveries count up
        by one.

    Returns
    -------
    list[dict]
        One dict per delivery, in innings / over / delivery order.

    Raises
    ------
    KeyError
        If a required key is missing or a player name is not in the registry.
    """
    info = data['info']
    registry = info['registry']['people']
    teams = info['teams']
    match_date = info['dates'][0]
    venue = info['venue']
    gender = info['gender']

    # True only when the result was decided by the D/L method.
    dl = info.get('outcome', {}).get('method') == 'D/L'

    # Squad lists translated from player names to registry ids.
    players = {
        team: [registry[name] for name in names]
        for team, names in info['players'].items()
    }

    rows = []
    ball_id = first_ball_id

    for innings_number, innings in enumerate(data['innings'], start=1):
        bat_team = innings['team']
        bowl_team = teams[1] if bat_team == teams[0] else teams[0]
        overs = innings.get('overs', [])  # tolerate an innings with no overs

        # Batting side padded / truncated to exactly 11 slots. This is the
        # same for every ball of the innings, so build it once.
        bat_slots = (players[bat_team] + [None] * 11)[:11]

        # Count deliveries per bowler (extras included) to find the five
        # bowlers used most in this innings.
        bowler_counts = {}
        for over in overs:
            for ball in over['deliveries']:
                bowler_id = registry[ball['bowler']]
                bowler_counts[bowler_id] = bowler_counts.get(bowler_id, 0) + 1

        # Stable sort: ties keep first-appearance order.
        ranked = sorted(bowler_counts.items(), key=lambda item: item[1], reverse=True)
        top_5_bowlers_ids = [bowler_id for bowler_id, _ in ranked[:5]]
        # Fill with None if there are fewer than 5 bowlers
        top_5_bowlers_ids += [None] * (5 - len(top_5_bowlers_ids))

        for over in overs:
            over_number = over['over']
            ball_number = 1
            for ball in over['deliveries']:
                wicket_type = None
                player_out = None
                if 'wickets' in ball:
                    # Only the first dismissal on a delivery is recorded.
                    wicket = ball['wickets'][0]
                    wicket_type = wicket.get('kind')
                    player_out = registry[wicket.get('player_out')]

                row = {
                    'ball_id': ball_id,
                    'match_id': match_id,
                    'match_date': match_date,
                    'dl': dl,
                    'gender': gender,
                    'venue': venue,
                    'innings': innings_number,
                    'bat_team': bat_team,
                    'bowl_team': bowl_team,
                    'over': over_number,
                    'ball': ball_number,
                    'batter': registry[ball['batter']],
                    'batter_name': ball['batter'],
                    'bowler': registry[ball['bowler']],
                    'bowler_name': ball['bowler'],
                    'non_striker': registry[ball['non_striker']],
                    'runs_batter': ball['runs']['batter'],
                    'runs_extras': ball['runs']['extras'],
                    'runs_total': ball['runs']['total'],
                    'wicket_type': wicket_type,
                    'player_out': player_out,
                }

                # Batting team players, None where the side has fewer than 11.
                for slot, player_id in enumerate(bat_slots, start=1):
                    row[f'bat_team_player_{slot}'] = player_id

                # Only the top 5 bowlers of the bowling side.
                for slot, bowler_id in enumerate(top_5_bowlers_ids, start=1):
                    row[f'bowl_team_top_bowler_{slot}'] = bowler_id

                rows.append(row)
                ball_id += 1

                # Wides and no-balls are re-bowled, so they do not advance
                # the ball number within the over.
                extras = ball.get('extras', {})
                if 'wides' not in extras and 'noballs' not in extras:
                    ball_number += 1

    return rows


dfs = []
ball_id = 1

for file in tqdm(files, desc="Processing Matches: "):
    if not file.lower().endswith('.json'):
        # Skip stray non-JSON directory entries instead of crashing in json.load.
        continue

    # Explicit UTF-8 so decoding does not depend on the platform default;
    # the handle is closed as soon as the JSON has been parsed.
    with open(os.path.join('odis_json', file), encoding='utf-8') as f:
        data = json.load(f)

    rows = _match_rows(data, file.split('.')[0], ball_id)
    ball_id += len(rows)  # keep ball_id running across matches
    dfs.append(pd.DataFrame(rows))

Processing Matches:   0%|          | 0/2924 [00:00<?, ?it/s]

In [4]:
# Add cumulative per-match features to every match frame in place.
for df in tqdm(dfs, desc="Concatenating Matches: "):
    if df.empty:
        # A match with no deliveries yields a frame with no columns; there is
        # nothing to accumulate and the groupbys below would raise KeyError.
        continue

    # 1 for deliveries counted against batter/bowler, 0 for deliveries that
    # produced only extras (extras > 0 and no runs off the bat).
    # NOTE(review): the frame does not carry the extras type, so byes/leg-byes
    # are excluded and no-balls hit for runs are included by this heuristic.
    counted = (~((df['runs_extras'] > 0) & (df['runs_batter'] == 0))).astype('int64')

    df['batter_total_runs'] = df.groupby(['innings', 'batter'])['runs_batter'].cumsum()
    df['batter_balls_faced'] = counted
    df['batter_balls_faced'] = df.groupby(['innings', 'batter'])['batter_balls_faced'].cumsum()

    df['bowler_total_runs'] = df.groupby(['innings', 'bowler'])['runs_total'].cumsum()
    df['bowler_balls_bowled'] = counted
    df['bowler_balls_bowled'] = df.groupby(['innings', 'bowler'])['bowler_balls_bowled'].cumsum()

    df['team_total_runs'] = df.groupby('innings')['runs_total'].cumsum()
    df['wickets_taken'] = df['player_out'].notna().astype(int)
    df['wickets_taken'] = df.groupby('innings')['wickets_taken'].cumsum()

    # Current run rate: runs so far divided by overs so far ('ball' is 1-based).
    df['rr'] = df['team_total_runs'] / (df['over'] + df['ball'] / 6)

    # Target = first-innings total + 1, set only on second-innings rows.
    # Stays NaN (float) when there are no first-innings rows.
    first_innings_runs = df.loc[df['innings'] == 1, 'team_total_runs']
    target = first_innings_runs.iloc[-1] + 1 if len(first_innings_runs) else np.nan
    df['target'] = np.where(df['innings'] == 2, target, np.nan)

    # NOTE(review): 300 assumes a full 50-over innings; shortened matches
    # (e.g. dl == True) are not adjusted here.
    balls_per_innings = 300
    df['remaining_balls'] = balls_per_innings - (df['over'] * 6 + df['ball'])

    # Required run rate = runs still needed / overs left (not the full target).
    # Rows with no balls left become NaN instead of +/-inf.
    overs_left = df['remaining_balls'].where(df['remaining_balls'] > 0) / 6
    df['rrr'] = (df['target'] - df['team_total_runs']) / overs_left

Concatenating Matches:   0%|          | 0/2924 [00:00<?, ?it/s]

In [5]:
# Stack all match frames, parse the date strings, and order deliveries
# chronologically (ball_id breaks ties inside an over).
sort_keys = ['match_date', 'match_id', 'innings', 'over', 'ball_id']
df = (
    pd.concat(dfs, ignore_index=True)
    .assign(match_date=lambda frame: pd.to_datetime(frame['match_date']))
    .sort_values(by=sort_keys)
)

In [6]:
# df.to_csv("data/bbb_from_json.csv")

In [7]:
# Renumber deliveries 0..n-1 in the current row order and use that number
# as the index, replacing the old ball_id column.
df = (
    df.reset_index(drop=True)
    .assign(ball_id=lambda frame: frame.index)
    .set_index('ball_id')
)

In [8]:
# Write the final ball-by-ball table, keeping ball_id (the index) as the
# first column. Create the output folder first so a fresh checkout does not
# fail with "No such file or directory".
os.makedirs("data", exist_ok=True)
df.to_csv("data/bbb_reindexed.csv", index=True)