In [38]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

import optuna

from tqdm.notebook import tqdm

In [39]:
def _read_pickle(path):
    """Unpickle and return the object stored at `path`."""
    # NOTE: pickle can execute arbitrary code on load; only use files produced
    # by this project's own pipeline.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Per-player timelines and the lookup maps used by get_stats below.
batter_timelines = _read_pickle('data/lookup_maps/batter_timelines.pkl')
bowler_timelines = _read_pickle('data/lookup_maps/bowler_timelines.pkl')
lookup_batsmen = _read_pickle('data/lookup_maps/lookup_batsmen.pkl')
lookup_bowlers = _read_pickle('data/lookup_maps/lookup_bowlers.pkl')

In [40]:
# Load the three CSV inputs in one pass: the main frame, the per-team rolling
# stats and the per-match weather table.
df, team_stats, weather_df = map(
    pd.read_csv,
    (
        'data/bbb_reindexed.csv',
        'data/team_match_rolling_stats.csv',
        'data/match_location_weather.csv',
    ),
)

In [41]:
# Remove four columns from team_stats; the merges further down select only
# match_id, team and the *_last_5 columns from it.
team_stats = team_stats.drop(
    ['match_date', 'runs_scored', 'won', 'wickets_taken'],
    axis='columns',
)

In [42]:
def get_stats(player_id, ball_id, lookup_map, timelines):
    """Return the timeline row that applies to `player_id` at `ball_id`.

    Parameters
    ----------
    player_id : hashable or None
        Key into `lookup_map` and `timelines`.
    ball_id : comparable
        Ball identifier to locate inside the player's ranges.
    lookup_map : dict
        Maps player_id to a dict with two entries:
        ``'ranges'``  - sequence of ``(start, end)`` pairs, searched as
        half-open intervals ``start <= ball_id < end``;
        ``'mapping'`` - dict keyed by the ``(start, end)`` tuple giving the
        label to look up in the player's timeline.
    timelines : dict
        Maps player_id to an object supporting ``.loc[label]``.

    Returns
    -------
    The result of ``timelines[player_id].loc[label]`` for the matching range,
    or ``None`` when the player is ``None``/unknown or no range contains
    `ball_id`.

    Notes
    -----
    The binary search is only correct if ``ranges`` is sorted ascending and
    the intervals do not overlap - assumed here, not checked.
    """
    # Identity test rather than `== None`: equality delegates to the operand's
    # __eq__, which for NA-like ids (e.g. pd.NA) does not yield a plain bool
    # and makes the `if` itself raise.
    if player_id is None or player_id not in lookup_map:
        return None

    player_data = lookup_map[player_id]
    ranges = player_data['ranges']
    mapping = player_data['mapping']

    left, right = 0, len(ranges) - 1

    while left <= right:
        mid = (left + right) // 2
        start, end = ranges[mid]

        if start <= ball_id < end:
            idx = mapping[(start, end)]
            return timelines[player_id].loc[idx]
        elif ball_id >= end:
            # Target lies in a later interval.
            left = mid + 1
        else:
            # Target lies before this interval's start.
            right = mid - 1

    return None

In [43]:
# Row-filter configuration: only rows whose bat_team AND bowl_team are in
# allowed_teams, whose gender is in allowed_genders and whose dl flag is in
# allowed_dls are kept by the filter cell below.
allowed_teams = [
    'New Zealand',
    'Pakistan',
    'South Africa',
    'Sri Lanka',
    'West Indies',
    'India',
    'Australia',
    'England',
]
allowed_genders = ['male']
allowed_dls = [False]

# Flag not referenced in the visible cells - presumably toggles Optuna tuning
# elsewhere; TODO confirm.
tuna = True

In [44]:
# Batting-side features: team_stats rows matched on (match_id, bat_team).
batting_form = team_stats[["match_id", "team", "win_rate_last_5", "avg_runs_last_5"]]
df = df.merge(
    batting_form,
    how="left",
    left_on=["match_id", "bat_team"],
    right_on=["match_id", "team"],
    suffixes=("", "_bat"),
)

# Bowling-side features: same table matched on (match_id, bowl_team). Columns
# that already exist in df (team, win_rate_last_5) receive the "_bowl" suffix.
bowling_form = team_stats[["match_id", "team", "win_rate_last_5", "avg_wkts_last_5"]]
df = df.merge(
    bowling_form,
    how="left",
    left_on=["match_id", "bowl_team"],
    right_on=["match_id", "team"],
    suffixes=("", "_bowl"),
)

# Weather features keyed on match_id.
match_weather = weather_df[
    ["match_id", "temperature_2m", "relative_humidity_2m", "cloud_cover", "wind_speed_10m", "dew_point_2m"]
]
df = df.merge(
    match_weather,
    how="left",
    left_on=["match_id"],
    right_on=["match_id"],
    suffixes=("", "_weather"),
)

In [45]:
# Parse match_date once, then keep only the calendar year as a feature.
match_dates = pd.to_datetime(df['match_date'])
df['year'] = match_dates.dt.year

In [46]:
# Display the column index after the merges and the added 'year' column
# (inspection only; no state is changed).
df.columns

Index(['ball_id', 'match_id', 'match_date', 'dl', 'gender', 'venue', 'innings',
       'bat_team', 'bowl_team', 'over', 'ball', 'batter', 'batter_name',
       'bowler', 'bowler_name', 'non_striker', 'runs_batter', 'runs_extras',
       'runs_total', 'wicket_type', 'player_out', 'bat_team_player_1',
       'bat_team_player_2', 'bat_team_player_3', 'bat_team_player_4',
       'bat_team_player_5', 'bat_team_player_6', 'bat_team_player_7',
       'bat_team_player_8', 'bat_team_player_9', 'bat_team_player_10',
       'bat_team_player_11', 'bowl_team_top_bowler_1',
       'bowl_team_top_bowler_2', 'bowl_team_top_bowler_3',
       'bowl_team_top_bowler_4', 'bowl_team_top_bowler_5', 'batter_total_runs',
       'batter_balls_faced', 'bowler_total_runs', 'bowler_balls_bowled',
       'team_total_runs', 'wickets_taken', 'rr', 'target', 'remaining_balls',
       'rrr', 'team', 'win_rate_last_5', 'avg_runs_last_5', 'team_bowl',
       'win_rate_last_5_bowl', 'avg_wkts_last_5', 'temperature_2m',
  

In [47]:
print(df.shape)

# A row is kept only if all four conditions hold: both teams are allowed and
# the gender and DL flags are in their allowed lists.
keep_rows = (
    df['bat_team'].isin(allowed_teams)
    & df['bowl_team'].isin(allowed_teams)
    & df['gender'].isin(allowed_genders)
    & df['dl'].isin(allowed_dls)
)
df = df[keep_rows]
print(df.shape)

(1547846, 59)
(690316, 59)


In [48]:
# Number of distinct match_id values in df at this point: .unique() returns
# the distinct values, so .shape is a 1-tuple holding their count.
df['match_id'].unique().shape

(1278,)

In [49]:
# Attach each innings' closing score to every row of that innings.
#
# The last row of each (match_id, innings) group supplies the value. This
# assumes rows are in playing order within an innings and team_total_runs is a
# running total - TODO confirm against the upstream data.
#
# .copy() makes last_balls an independent frame, so the column assignment
# below no longer writes into a slice of df (the original raised
# SettingWithCopyWarning here).
last_balls = df.groupby(['match_id', 'innings']).tail(1).copy()
last_balls['innings_total_score'] = last_balls['team_total_runs']

# Series indexed by (match_id, innings) -> closing score.
innings_score_map = last_balls.set_index(['match_id', 'innings'])['innings_total_score']

# Look the closing score up for every row via its (match_id, innings) key.
df['innings_total_score'] = df.set_index(['match_id', 'innings']).index.map(innings_score_map)
# Rows whose key found no match get 0.
df['innings_total_score'] = df['innings_total_score'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_balls['innings_total_score'] = last_balls['team_total_runs']


In [50]:
overs = list(range(51))
print(overs)

# Keep one row per (match_id, innings, over): the delivery numbered ball == 1,
# with repeats of the same key collapsed to the first occurrence.
first_ball_mask = df['over'].isin(overs) & (df['ball'] == 1)
df = df[first_ball_mask].drop_duplicates(subset=['match_id', 'innings', 'over', 'ball'])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]


In [51]:
BATTER_STATS = ['avg', 'sr']
BOWLER_STATS = ['bowling_avg', 'bowling_sr', 'economy']


def _add_player_stat_columns(frame, player_cols, stat_names, lookup_map, timelines):
    """Add one '<player_col>_<stat>' column per player column and stat name.

    For every row, the player id in `player_col` and the row's `ball_id` are
    passed to `get_stats`; each requested stat is then read from the returned
    object. A cell is None when the player id is missing, `get_stats` returns
    None, or the stat is not present in the returned object.

    Parameters
    ----------
    frame : pd.DataFrame
        Modified in place; must contain a 'ball_id' column.
    player_cols : list of str
        Candidate player-id columns; names absent from `frame` are skipped.
    stat_names : list of str
        Stats to extract for each player column.
    lookup_map, timelines :
        Forwarded unchanged to `get_stats`.
    """
    # Read ball_id once. Zipping two columns replaces DataFrame.iterrows(),
    # which builds a full Series for every row and was previously repeated for
    # each of the 16 player columns.
    ball_ids = frame['ball_id'].tolist()

    for player_col in tqdm(player_cols):
        if player_col not in frame.columns:
            continue

        # One get_stats lookup per row, reused for every stat of this player.
        stats_per_row = [
            get_stats(player_id, ball_id, lookup_map, timelines) if pd.notna(player_id) else None
            for player_id, ball_id in zip(frame[player_col], ball_ids)
        ]

        for stat in stat_names:
            frame[f'{player_col}_{stat}'] = [
                row_stats[stat] if row_stats is not None and stat in row_stats else None
                for row_stats in stats_per_row
            ]


# Batting line-up slots 1..11.
_add_player_stat_columns(
    df,
    [f'bat_team_player_{i}' for i in range(1, 12)],
    BATTER_STATS,
    lookup_batsmen,
    batter_timelines,
)

# Top-bowler slots 1..5.
_add_player_stat_columns(
    df,
    [f'bowl_team_top_bowler_{i}' for i in range(1, 6)],
    BOWLER_STATS,
    lookup_bowlers,
    bowler_timelines,
)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
 # Display df's column index again (inspection only; no state is changed).
 df.columns

Index(['ball_id', 'match_id', 'match_date', 'dl', 'gender', 'venue', 'innings',
       'bat_team', 'bowl_team', 'over', 'ball', 'batter', 'batter_name',
       'bowler', 'bowler_name', 'non_striker', 'runs_batter', 'runs_extras',
       'runs_total', 'wicket_type', 'player_out', 'bat_team_player_1',
       'bat_team_player_2', 'bat_team_player_3', 'bat_team_player_4',
       'bat_team_player_5', 'bat_team_player_6', 'bat_team_player_7',
       'bat_team_player_8', 'bat_team_player_9', 'bat_team_player_10',
       'bat_team_player_11', 'bowl_team_top_bowler_1',
       'bowl_team_top_bowler_2', 'bowl_team_top_bowler_3',
       'bowl_team_top_bowler_4', 'bowl_team_top_bowler_5', 'batter_total_runs',
       'batter_balls_faced', 'bowler_total_runs', 'bowler_balls_bowled',
       'team_total_runs', 'wickets_taken', 'rr', 'target', 'remaining_balls',
       'rrr', 'team', 'win_rate_last_5', 'avg_runs_last_5', 'team_bowl',
       'win_rate_last_5_bowl', 'avg_wkts_last_5', 'temperature_2m',
  

In [53]:
# Columns named explicitly for removal; names not present in df are simply
# never selected below.
drop_cols1 = [
    'target', 'batter', 'bowler', 'non_striker', 'player_out',
    'dl', 'gender', 'wicket_type', 'match_date', 'venue',
    'bat_team', 'bowl_team', 'team', 'team_bowl', 'team_bat',
]
# Player-id columns for the batting line-up and top-bowler slots.
drop_cols2 = (
    [f'bat_team_player_{i}' for i in range(1, 12)]
    + [f'bowl_team_top_bowler_{i}' for i in range(1, 12)]
)

# Select every existing column that is listed above or whose name contains
# 'name' or '_id'.
drop_cols = []
for col in df.columns:
    if 'name' in col or '_id' in col or col in drop_cols1 or col in drop_cols2:
        drop_cols.append(col)

# match_id matches the '_id' rule but has to stay in the output.
drop_cols.remove('match_id')

df = df.drop(columns=drop_cols)

In [54]:
# Write the final frame to disk. The DataFrame index is written too (to_csv
# default), so it appears as an unnamed first column in the file.
df.to_csv(path_or_buf='data/premades/over_by_over_data.csv')