In [10]:
import numpy as np
import pandas as pd
import pickle
import os

from tqdm.notebook import tqdm

In [11]:
# Player metadata. Later cells read its 'player_id', 'bat_hand', 'bowl_hand'
# and 'bowl_style_simple' columns.
player_df = pd.read_csv('data/player_data.csv')

# Ball-by-ball data (presumably one row per delivery -- verify against the
# file). It gets the default RangeIndex, which later cells use as the
# delivery key.
df = pd.read_csv('data/bbb_reindexed.csv')

In [12]:
# Attach player attributes (batting hand, bowling hand/style) to every
# delivery. player_df is viewed three times, each time with the id column
# renamed to the role it is joined on, so the joins can use `on=<role>`.
batter_df = player_df.rename(columns={
    'player_id': 'batter',
    'bat_hand': 'batter_bat_hand'
})

non_striker_df = player_df.rename(columns={
    'player_id': 'non_striker',
    'bat_hand': 'non_striker_bat_hand'
})

bowler_df = player_df.rename(columns={
    'player_id': 'bowler',
    'bowl_hand': 'bowler_hand',
    'bowl_style_simple': 'bowler_style'
})

# Left joins keep every delivery even when the player is missing from
# player_df (the attribute columns are NaN for those rows).
# validate='many_to_one' makes pandas raise MergeError if a player id occurs
# more than once in player_df. Without it a duplicated id would silently
# duplicate delivery rows and shift the row labels that later cells use as
# delivery keys.
df = df.merge(
    batter_df[['batter', 'batter_bat_hand']],
    on='batter', how='left', validate='many_to_one',
)
df = df.merge(
    non_striker_df[['non_striker', 'non_striker_bat_hand']],
    on='non_striker', how='left', validate='many_to_one',
)
df = df.merge(
    bowler_df[['bowler', 'bowler_hand', 'bowler_style']],
    on='bowler', how='left', validate='many_to_one',
)

In [13]:
# Inspect the distinct bowling styles after the merge. NaN can appear because
# the bowler merge is a left join. Only 'offspin'/'legspin' and
# 'medium'/'fast' are used by the spin/pace masks further down; any other
# value falls into neither bucket.
df['bowler_style'].unique()

array(['medium', 'offspin', 'fast', 'legspin', 'unknown', 'slow', nan],
      dtype=object)

In [5]:
# List the columns available after the three player-attribute merges.
df.columns

Index(['ball_id', 'match_id', 'match_date', 'dl', 'gender', 'venue', 'innings',
       'bat_team', 'bowl_team', 'over', 'ball', 'batter', 'batter_name',
       'bowler', 'bowler_name', 'non_striker', 'runs_batter', 'runs_extras',
       'runs_total', 'wicket_type', 'player_out', 'bat_team_player_1',
       'bat_team_player_2', 'bat_team_player_3', 'bat_team_player_4',
       'bat_team_player_5', 'bat_team_player_6', 'bat_team_player_7',
       'bat_team_player_8', 'bat_team_player_9', 'bat_team_player_10',
       'bat_team_player_11', 'bowl_team_top_bowler_1',
       'bowl_team_top_bowler_2', 'bowl_team_top_bowler_3',
       'bowl_team_top_bowler_4', 'bowl_team_top_bowler_5', 'batter_total_runs',
       'batter_balls_faced', 'bowler_total_runs', 'bowler_balls_bowled',
       'team_total_runs', 'wickets_taken', 'rr', 'target', 'remaining_balls',
       'rrr', 'batter_bat_hand', 'non_striker_bat_hand', 'bowler_hand',
       'bowler_style'],
      dtype='object')

In [6]:
# Row labels of df, used to put the per-player stats back into delivery order.
# The timelines and lookup maps built below are keyed by these labels.
# NOTE(review): the lookup helper's `ball_id` argument is compared against
# these labels, not the 'ball_id' column -- confirm the two coincide.
all_deliveries = df.index

In [7]:
# Per-batter timelines: for every delivery a batter faced, the stats they had
# accumulated *before* that delivery (each cumulative sum is shifted down one
# row, so the current ball never contributes to its own row).
batter_timelines = {}
all_stats = []

for bat, group in tqdm(df.groupby('batter')):
    # A delivery with extras but no runs off the bat is not counted as a ball faced.
    legal_balls = ~((group['runs_extras'] > 0) & (group['runs_batter'] == 0))
    # Only dismissals of the striker on a ball they faced are counted here.
    dismissed = group['player_out'] == bat

    stats = pd.DataFrame({
        'batter': bat,
        'runs': group['runs_batter'].cumsum().shift(fill_value=0),
        'balls': legal_balls.astype(int).cumsum().shift(fill_value=0),
        'dismissals': dismissed.astype(int).cumsum().shift(fill_value=0)
    }, index=group.index)

    # Average is undefined (NaN) until the first dismissal; strike rate is
    # 0/0 = NaN until the first counted ball.
    stats['avg'] = stats['runs'] / stats['dismissals'].replace(0, np.nan)
    stats['sr'] = stats['runs'] / stats['balls'] * 100

    # Styles outside these two lists (and NaN) count towards neither split.
    spin_mask = group['bowler_style'].isin(['offspin', 'legspin'])
    pace_mask = group['bowler_style'].isin(['medium', 'fast'])

    stats['runs_vs_spin'] = (group['runs_batter'] * spin_mask).cumsum().shift(fill_value=0)
    stats['balls_vs_spin'] = (legal_balls & spin_mask).astype(int).cumsum().shift(fill_value=0)
    stats['dismissals_vs_spin'] = (dismissed & spin_mask).astype(int).cumsum().shift(fill_value=0)

    stats['runs_vs_pace'] = (group['runs_batter'] * pace_mask).cumsum().shift(fill_value=0)
    stats['balls_vs_pace'] = (legal_balls & pace_mask).astype(int).cumsum().shift(fill_value=0)
    stats['dismissals_vs_pace'] = (dismissed & pace_mask).astype(int).cumsum().shift(fill_value=0)

    stats['avg_vs_spin'] = stats['runs_vs_spin'] / stats['dismissals_vs_spin'].replace(0, np.nan)
    stats['sr_vs_spin'] = stats['runs_vs_spin'] / stats['balls_vs_spin'] * 100
    stats['avg_vs_pace'] = stats['runs_vs_pace'] / stats['dismissals_vs_pace'].replace(0, np.nan)
    stats['sr_vs_pace'] = stats['runs_vs_pace'] / stats['balls_vs_pace'] * 100

    stats['is_left'] = group['batter_bat_hand'] == 'left'
    stats['is_right'] = group['batter_bat_hand'] == 'right'

    # Fill gaps using this batter's own rows only, then default what is still
    # undefined to 0. Forward-filling after the concat (in global delivery
    # order) would copy the previous row's values into a NaN, and the previous
    # delivery usually belongs to a different player.
    all_stats.append(stats.ffill().fillna(0))

combined_stats = pd.concat(all_stats)

# Restore global delivery order. Deliveries with a missing batter are not in
# combined_stats (groupby drops NaN keys); they stay all-NaN here and are not
# attributed to any player.
full_stats_batter = combined_stats.reindex(all_deliveries)

# One pass over the frame instead of a full boolean scan per batter.
# sort=False keeps the players in order of first appearance.
for bat, timeline in tqdm(full_stats_batter.groupby('batter', sort=False)):
    batter_timelines[bat] = timeline.drop('batter', axis=1)

  0%|          | 0/2394 [00:00<?, ?it/s]

  0%|          | 0/2394 [00:00<?, ?it/s]

In [8]:
def create_fast_lookup_map_vectorized(batter_timelines):
    """Build, per player, sorted half-open intervals over their timeline labels.

    Args:
        batter_timelines: mapping of player id -> DataFrame; only ``.empty``
            and ``.index`` of each frame are read.

    Returns:
        dict of player id -> ``{'ranges': [...], 'mapping': {...}}`` where
        ``ranges`` is a list of ``(start, end)`` tuples in ascending order and
        ``mapping`` maps each tuple to the timeline label to read for it.
        The leading interval ``(0, first_label)`` maps to ``first_label``;
        every other interval maps to its own start; the last interval ends at
        ``float('inf')``. Players with an empty timeline get empty containers.
    """
    unbounded = float('inf')
    lookup_map = {}

    for player_id, player_timeline in tqdm(batter_timelines.items()):
        if player_timeline.empty:
            lookup_map[player_id] = {'ranges': [], 'mapping': {}}
            continue

        labels = sorted(player_timeline.index)
        # Each label's interval ends where the next label starts; the last
        # one is open-ended.
        upper_bounds = labels[1:] + [unbounded]

        # Anything before the first label resolves to the first row.
        leading = (0, labels[0])
        ranges = [leading]
        mapping = {leading: labels[0]}

        for lower, upper in zip(labels, upper_bounds):
            span = (lower, upper)
            ranges.append(span)
            mapping[span] = lower

        lookup_map[player_id] = {'ranges': ranges, 'mapping': mapping}

    return lookup_map

In [9]:
def get_stats_constant_time(batter_id, ball_id, lookup_map, batter_timelines):
    """Return the timeline row that applies to ``ball_id`` for one player.

    Despite the name, this is a binary search over the player's sorted
    ``ranges`` (logarithmic in their number), followed by one ``.loc`` read.

    Args:
        batter_id: key into ``lookup_map`` and ``batter_timelines``.
        ball_id: value compared against the ``(start, end)`` interval bounds.
        lookup_map: per-player ``{'ranges': [...], 'mapping': {...}}`` dict.
        batter_timelines: per-player DataFrame indexed by the labels that
            ``mapping`` points to.

    Returns:
        ``batter_timelines[batter_id].loc[label]`` for the interval with
        ``start <= ball_id < end``, or ``None`` when the player is unknown or
        no interval contains ``ball_id``.
    """
    if batter_id not in lookup_map:
        return None

    player_entry = lookup_map[batter_id]
    intervals = player_entry['ranges']
    label_for = player_entry['mapping']

    lo = 0
    hi = len(intervals) - 1

    while lo <= hi:
        probe = (lo + hi) // 2
        lower, upper = intervals[probe]

        if lower <= ball_id < upper:
            label = label_for[(lower, upper)]
            return batter_timelines[batter_id].loc[label]

        if ball_id >= upper:
            # Target lies in a later interval.
            lo = probe + 1
        else:
            # Target lies in an earlier interval.
            hi = probe - 1

    return None

In [10]:
# Interval lookup for batters: per batter, sorted (start, end) ranges over the
# labels of their timeline, as built by create_fast_lookup_map_vectorized.
lookup_batsmen = create_fast_lookup_map_vectorized(batter_timelines)

  0%|          | 0/2394 [00:00<?, ?it/s]

In [11]:
# Per-bowler timelines: for every delivery a bowler bowled, the stats they had
# accumulated *before* that delivery (each cumulative sum is shifted down one
# row, so the current ball never contributes to its own row).
bowler_timelines = {}
all_bowler_stats = []

for bowl, group in tqdm(df.groupby('bowler')):
    # A delivery with extras but no runs off the bat is not counted as a ball bowled.
    legal_balls = ~((group['runs_extras'] > 0) & (group['runs_batter'] == 0))
    # Any dismissal recorded on the bowler's delivery counts as a wicket.
    # NOTE(review): this includes dismissal types not normally credited to
    # the bowler (e.g. run outs) -- confirm that is intended.
    wicket_fell = group['player_out'].notna()

    stats = pd.DataFrame({
        'bowler': bowl,
        'runs_conceded': (group['runs_batter'] + group['runs_extras']).cumsum().shift(fill_value=0),
        'balls_bowled': legal_balls.astype(int).cumsum().shift(fill_value=0),
        'wickets': wicket_fell.astype(int).cumsum().shift(fill_value=0)
    }, index=group.index)

    # Zero denominators become NaN so the ratio is "undefined" rather than
    # inf. Extras can be conceded before the first counted ball, which would
    # otherwise make economy infinite.
    wickets_or_nan = stats['wickets'].replace(0, np.nan)
    overs_or_nan = stats['balls_bowled'].replace(0, np.nan) / 6

    stats['bowling_avg'] = stats['runs_conceded'] / wickets_or_nan
    stats['bowling_sr'] = stats['balls_bowled'] / wickets_or_nan
    stats['economy'] = stats['runs_conceded'] / overs_or_nan

    rh_batsman_mask = group['batter_bat_hand'] == 'right'
    lh_batsman_mask = group['batter_bat_hand'] == 'left'

    # NOTE(review): the per-hand run totals use runs off the bat only, while
    # 'runs_conceded' above also includes extras -- confirm the difference is
    # intended.
    stats['runs_vs_right'] = (group['runs_batter'] * rh_batsman_mask).cumsum().shift(fill_value=0)
    stats['balls_vs_right'] = (legal_balls & rh_batsman_mask).astype(int).cumsum().shift(fill_value=0)
    stats['wickets_vs_right'] = (wicket_fell & rh_batsman_mask).astype(int).cumsum().shift(fill_value=0)

    stats['runs_vs_left'] = (group['runs_batter'] * lh_batsman_mask).cumsum().shift(fill_value=0)
    stats['balls_vs_left'] = (legal_balls & lh_batsman_mask).astype(int).cumsum().shift(fill_value=0)
    stats['wickets_vs_left'] = (wicket_fell & lh_batsman_mask).astype(int).cumsum().shift(fill_value=0)

    stats['bowling_avg_vs_right'] = stats['runs_vs_right'] / stats['wickets_vs_right'].replace(0, np.nan)
    stats['bowling_sr_vs_right'] = stats['balls_vs_right'] / stats['wickets_vs_right'].replace(0, np.nan)
    stats['economy_vs_right'] = stats['runs_vs_right'] / (stats['balls_vs_right'].replace(0, np.nan) / 6)

    stats['bowling_avg_vs_left'] = stats['runs_vs_left'] / stats['wickets_vs_left'].replace(0, np.nan)
    stats['bowling_sr_vs_left'] = stats['balls_vs_left'] / stats['wickets_vs_left'].replace(0, np.nan)
    stats['economy_vs_left'] = stats['runs_vs_left'] / (stats['balls_vs_left'].replace(0, np.nan) / 6)

    # Bowler-type flags, taken from the style on the bowler's first row.
    # Styles outside these two lists (and NaN) set neither flag.
    first_style = group['bowler_style'].iloc[0]
    stats['is_spin'] = first_style in ['offspin', 'legspin']
    stats['is_pace'] = first_style in ['medium', 'fast']

    # Fill gaps using this bowler's own rows only, then default what is still
    # undefined to 0. Forward-filling after the concat (in global delivery
    # order) would copy the previous row's values into a NaN, and the previous
    # delivery may belong to a different bowler.
    all_bowler_stats.append(stats.ffill().fillna(0))

combined_bowler_stats = pd.concat(all_bowler_stats)

# Restore global delivery order. Deliveries with a missing bowler are not in
# combined_bowler_stats (groupby drops NaN keys); they stay all-NaN here and
# are not attributed to any player.
full_stats_bowler = combined_bowler_stats.reindex(all_deliveries)

# One pass over the frame instead of a full boolean scan per bowler.
# sort=False keeps the players in order of first appearance.
for bowl, timeline in tqdm(full_stats_bowler.groupby('bowler', sort=False)):
    bowler_timelines[bowl] = timeline.drop('bowler', axis=1)

  0%|          | 0/1844 [00:00<?, ?it/s]

  0%|          | 0/1844 [00:00<?, ?it/s]

In [12]:
# Interval lookup for bowlers, built by the same helper as the batter lookup
# (its parameter is named batter_timelines but it only reads each frame's
# .empty and .index).
lookup_bowlers = create_fast_lookup_map_vectorized(bowler_timelines)

  0%|          | 0/1844 [00:00<?, ?it/s]

In [13]:
# Persist the lookup maps and the timelines they point into, one pickle each.
os.makedirs('data/lookup_maps', exist_ok=True)

artifacts = [
    ('data/lookup_maps/lookup_batsmen.pkl', lookup_batsmen),
    ('data/lookup_maps/lookup_bowlers.pkl', lookup_bowlers),
    ('data/lookup_maps/batter_timelines.pkl', batter_timelines),
    ('data/lookup_maps/bowler_timelines.pkl', bowler_timelines),
]

for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)