In [1]:
import numpy as np
import pandas as pd
import json
import os
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
files = os.listdir('odis_json')
len(files)

2938

In [4]:
dfs = []
ball_id = 1

for file in tqdm(files, desc="Processing Matches: "):
    with open('odis_json/' + file, encoding='utf-8', errors='replace') as f:
        try:
            data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Skipping file due to decoding error: {file}")
            continue

        rows = []
        registry = data['info']['registry']['people']
        teams = data['info']['teams']
        match_id = file.split('.')[0]
        match_date = data['info']['dates'][0]
        venue = data['info']['venue']

        dl = False

        if 'method' in data['info']['outcome']:
            if data['info']['outcome']['method'] == 'D/L':
                dl = True

        gender = data['info']['gender']

        players = data['info']['players']
        for team in players:
            for player in range(len(players[team])):
                players[team][player] = registry[players[team][player]]                

        for innings_number, innings in enumerate(data['innings']):
            bat_team = innings['team']
            bowl_team = teams[1] if bat_team == teams[0] else teams[0]

            bat_team_players = players[bat_team].copy()
            bowl_team_players = players[bowl_team]

            # Identify the top 5 bowlers who bowled the most in this innings
            bowler_counts = {}
            for over in innings['overs']:
                for ball in over['deliveries']:
                    bowler_id = registry[ball['bowler']]
                    if bowler_id in bowler_counts:
                        bowler_counts[bowler_id] += 1
                    else:
                        bowler_counts[bowler_id] = 1
            
            # Get the top 5 bowlers with the most balls bowled
            top_5_bowlers = sorted(bowler_counts.items(), key=lambda x: x[1], reverse=True)[:5]
            top_5_bowlers_ids = [bowler[0] for bowler in top_5_bowlers]
            
            # Fill with None if there are fewer than 5 bowlers
            while len(top_5_bowlers_ids) < 5:
                top_5_bowlers_ids.append(None)

            for over in innings['overs']:
                over_number = over['over']
                ball_number = 1
                for ball in over['deliveries']:
                    wicket_type = None
                    player_out = None
                    extra_type = None
                    if 'wickets' in ball:
                        wicket = ball['wickets'][0]
                        wicket_type = wicket.get('kind')
                        player_out_id = wicket.get('player_out')
                        player_out = registry[player_out_id]
                    if 'extras' in ball:
                        extra_type = list(ball['extras'].keys())[0]
                    
                    
                    # Create the row dictionary with current state of players
                    row = {
                        'ball_id': ball_id,
                        'match_id': match_id,
                        'match_date': match_date,
                        'dls': dl,
                        'gender': gender,
                        'venue': venue,
                        'innings': innings_number + 1,
                        'bat_team': bat_team,
                        'bowl_team': bowl_team,
                        'over': over_number,
                        'ball': ball_number,
                        'batter': registry[ball['batter']],
                        'batter_name': ball['batter'],
                        'bowler': registry[ball['bowler']],
                        'bowler_name': ball['bowler'],
                        'non_striker': registry[ball['non_striker']],
                        'non_striker_name': ball['non_striker'],
                        'runs_batter': ball['runs']['batter'],
                        'runs_extras': ball['runs']['extras'],
                        'extra_run_type': extra_type,
                        'runs_total': ball['runs']['total'],
                        'wicket_type': wicket_type,
                        'player_out': player_out,
                    }
                    
                    # To make sure that there are only 6 balls in an over:
                    if 'extras' in ball:
                        extras = ball['extras']
                        if 'wides' in extras or 'noballs' in extras:
                            ball_number -= 1

                    ball_number += 1
                    ball_id += 1 
                    rows.append(row)
                    
        df = pd.DataFrame(rows)
        dfs.append(df)

Processing Matches:  16%|█▌        | 477/2938 [00:01<00:06, 399.84it/s]

Skipping file due to decoding error: .DS_Store


Processing Matches: 100%|██████████| 2938/2938 [00:07<00:00, 399.52it/s]


In [5]:
df = pd.concat(dfs, ignore_index=True)
df['match_date'] = pd.to_datetime(df['match_date'])
df.sort_values(by=['match_date', 'match_id', 'innings', 'over', 'ball_id'], inplace=True)

In [6]:
df = df.reset_index(drop=True)
df['ball_id'] = df.index
df.set_index('ball_id', inplace=True)

In [7]:
df.to_csv("data/bbb.csv", index=True)

In [None]:
# TODO: Make sure this works for all formats