In [81]:
import pandas as pd
import numpy as np
import pickle

In [82]:
ball2ball = pd.read_csv('../data/ipl_ball_by_ball_2008_2022.csv')

In [83]:
ball2ball.columns

Index(['id', 'innings', 'overs', 'ball_number', 'batter', 'bowler',
       'non_striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'iswicket_delivery', 'player_out', 'dismisal_kind',
       'fielders_involved', 'batting_team'],
      dtype='object')

In [84]:
matches = pd.read_csv('../data/ipl_matches_2008_2022.csv')

In [85]:
matches.columns

Index(['id', 'city', 'match_date', 'season', 'match_number', 'team1', 'team2',
       'venue', 'toss_winner', 'toss_decision', 'superover', 'winning_team',
       'won_by', 'margin', 'method', 'player_of_match', 'umpire1', 'umpire2'],
      dtype='object')

In [86]:
matches = matches.dropna(subset = 'match_date')

run this to add year column to ball2ball dataframe

In [87]:
import pandas as pd

# Assuming your original DataFrame is named 'original_df'
# Example:
# original_df = pd.read_csv('your_csv_file.csv')

# Create a new DataFrame with 'id' and 'year'
new_df = matches[['id', 'match_date']].copy()

# Convert 'match_date' to datetime format
new_df['match_date'] = pd.to_datetime(new_df['match_date'], format='%d-%m-%Y')

# Extract the year and create a new 'year' column
new_df['year'] = new_df['match_date'].dt.year

# Drop the 'match_date' column if you no longer need it
new_df.drop('match_date', axis=1, inplace=True)

# Print or use the new DataFrame 'new_df'
ball2ball = pd.merge(new_df, ball2ball, on='id', how='inner')


In [88]:
ball2ball['extra_type'].unique()

array([nan, 'legbyes', 'wides', 'byes', 'noballs', 'penalty'],
      dtype=object)

In [89]:
from collections import defaultdict
bowler_stats = {}
for index, row in ball2ball.iterrows():
    bowler = row['bowler']
    year = row['year']
    extra_type = row['extra_type']
    total_run = row['total_run']
    extras_run = row['extras_run']
    if year not in bowler_stats:
        bowler_stats[year] = {}
    if bowler not in bowler_stats[year]:
        bowler_stats[year][bowler] = {}
        bowler_stats[year][bowler]['runs_conceeded'] = 0
        bowler_stats[year][bowler]['balls_bowled'] = 0
    bowler_stats[year][bowler]['runs_conceeded'] += total_run - extras_run if extra_type in ['byes','legbyes'] else total_run
    bowler_stats[year][bowler]['balls_bowled'] += 1 if extra_type not in ['wides', 'noballs', 'penalty'] else 0
            


In [90]:
bowler_economy = bowler_stats
for year in bowler_economy:
    for player in bowler_economy[year]:
        bowler_economy[year][player]['economy'] = bowler_economy[year][player]['runs_conceeded'] * 6 / bowler_economy[year][player]['balls_bowled']

In [None]:
best_economies = {}
for year in bowler_economy:
    sorted_economies = sorted(bowler_economy[year], key = lambda x: bowler_economy[year][x]['economy'])
    best5 = sorted_economies[0:5]
    best_economies[year] = {player: round(bowler_economy[year][player]['economy'],2) for player in best5}
    
best_economies

In [92]:
import pickle
with open('best_economies.pkl','wb') as bef:
    pickle.dump(best_economies,bef)

In [93]:
ball2ball['dismisal_kind'].unique()

array([nan, 'caught', 'caught and bowled', 'run out', 'bowled', 'stumped',
       'lbw', 'hit wicket', 'retired hurt', 'retired out',
       'obstructing the field'], dtype=object)

In [94]:
grouped_df = ball2ball.groupby(['year', 'batter'])['batsman_run'].sum().reset_index()
top_scorers_df = grouped_df.groupby('year').apply(lambda x: x.nlargest(5, 'batsman_run')).reset_index(drop=True)
# Print or use the resulting DataFrame 'grouped_df'
# top_scorers_dict = top_scorers_df.to_dict(orient='records')

# Print or use the resulting dictionary 'top_scorers_dict'
top_scorers_dict_2d = {}

# Iterate through the rows of the DataFrame and populate the 2D dictionary
for index, row in top_scorers_df.iterrows():
    year = row['year']
    batter = row['batter']
    batsman_run = row['batsman_run']

    # If the year is not in the dictionary, add it
    if year not in top_scorers_dict_2d:
        top_scorers_dict_2d[year] = {}

    # Add the batter and batsman_run to the dictionary for the specific year
    top_scorers_dict_2d[year][batter] = batsman_run

# Print or use the resulting 2D dictionary 'top_scorers_dict_2d'
print(top_scorers_dict_2d)
with open('top_scorers.pkl','wb') as tf:
    pickle.dump(top_scorers_dict_2d,tf)

{2008: {'SE Marsh': 616, 'G Gambhir': 534, 'ST Jayasuriya': 514, 'SR Watson': 472, 'GC Smith': 441}, 2009: {'ML Hayden': 572, 'AC Gilchrist': 495, 'AB de Villiers': 465, 'SK Raina': 434, 'TM Dilshan': 418}, 2010: {'SR Tendulkar': 618, 'JH Kallis': 572, 'SK Raina': 528, 'SC Ganguly': 493, 'M Vijay': 458}, 2011: {'CH Gayle': 608, 'V Kohli': 557, 'SR Tendulkar': 553, 'SE Marsh': 504, 'MEK Hussey': 492}, 2012: {'CH Gayle': 733, 'G Gambhir': 590, 'S Dhawan': 569, 'AM Rahane': 560, 'V Sehwag': 495}, 2013: {'MEK Hussey': 733, 'CH Gayle': 720, 'V Kohli': 639, 'SK Raina': 548, 'SR Watson': 543}, 2014: {'RV Uthappa': 660, 'DR Smith': 566, 'GJ Maxwell': 552, 'DA Warner': 528, 'SK Raina': 523}, 2015: {'DA Warner': 562, 'AM Rahane': 540, 'LMP Simmons': 540, 'AB de Villiers': 513, 'V Kohli': 505}, 2016: {'V Kohli': 973, 'DA Warner': 848, 'AB de Villiers': 687, 'G Gambhir': 501, 'S Dhawan': 501}, 2017: {'DA Warner': 641, 'G Gambhir': 498, 'S Dhawan': 479, 'SPD Smith': 472, 'SK Raina': 442}, 2018: {'K

In [95]:
ball2ball.columns

Index(['id', 'year', 'innings', 'overs', 'ball_number', 'batter', 'bowler',
       'non_striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'iswicket_delivery', 'player_out', 'dismisal_kind',
       'fielders_involved', 'batting_team'],
      dtype='object')

In [None]:
grouped_df = ball2ball.groupby(['year', 'bowler'])['iswicket_delivery'].count().reset_index()

grouped_df
top_scorers_df = grouped_df.groupby('year').apply(lambda x: x.nlargest(5, 'iswicket_delivery')).reset_index(drop=True)
# Print or use the resulting DataFrame 'grouped_df'
# top_scorers_dict = top_scorers_df.to_dict(orient='records')

# Print or use the resulting dictionary 'top_scorers_dict'
top_scorers_dict_2d = {}

# Iterate through the rows of the DataFrame and populate the 2D dictionary
for index, row in top_scorers_df.iterrows():
    year = row['year']
    batter = row['bowler']
    batsman_run = row['batsman_run']

    # If the year is not in the dictionary, add it
    if year not in top_scorers_dict_2d:
        top_scorers_dict_2d[year] = {}

    # Add the batter and batsman_run to the dictionary for the specific year
    top_scorers_dict_2d[year][batter] = batsman_run

# Print or use the resulting 2D dictionary 'top_scorers_dict_2d'
print(top_scorers_dict_2d)
with open('top_scorers.pkl','wb') as tf:
    pickle.dump(top_scorers_dict_2d,tf)


In [None]:
ge = open('../data/player_stats/top_bowlers_by_season_corrected_economy.pkl','rb')
me = open('../data/player_stats/best_economies.pkl','rb')
ce = open('../data/player_stats/top_bowlers_by_season_int_year.pkl','rb')
gpt_economy = pickle.load(ge)
my_economy = pickle.load(me)
corrected_economy = pickle.load(ce)

In [None]:
ball2ball['batter']

0         YBK Jaiswal
1         YBK Jaiswal
2          JC Buttler
3         YBK Jaiswal
4         YBK Jaiswal
             ...     
225949        P Kumar
225950       SB Joshi
225951        P Kumar
225952       SB Joshi
225953       SB Joshi
Name: batter, Length: 225954, dtype: object

In [None]:
ball2ball.columns

Index(['id', 'year', 'innings', 'overs', 'ball_number', 'batter', 'bowler',
       'non_striker', 'extra_type', 'batsman_run', 'extras_run', 'total_run',
       'non_boundary', 'iswicket_delivery', 'player_out', 'dismisal_kind',
       'fielders_involved', 'batting_team'],
      dtype='object')

0         1
1         1
2         1
3         1
4         1
         ..
225949    2
225950    2
225951    2
225952    2
225953    2
Name: innings, Length: 225954, dtype: int64

In [None]:
outs = ball2ball[ball2ball['iswicket_delivery'] == 1].groupby(['year','player_out'])['iswicket_delivery'].count().reset_index()

In [None]:
runs = ball2ball.groupby(['year', 'batter']).agg(
    balls_faced =('batsman_run', 'size'),
    total_runs =('batsman_run', 'sum')
).reset_index()

Unnamed: 0,year,batter,balls_faced,total_runs
0,2008,A Chopra,55,42
1,2008,A Kumble,17,13
2,2008,A Mishra,42,37
3,2008,A Mukund,1,0
4,2008,A Nehra,13,3
...,...,...,...,...
2264,2022,WP Saha,266,317
2265,2022,Washington Sundar,76,101
2266,2022,YBK Jaiswal,198,258
2267,2022,YS Chahal,8,5


In [None]:
result = pd.merge(runs, outs, left_on=['batter','year'], right_on=['player_out','year'], how='left')
result = result.dropna(subset='iswicket_delivery')

In [None]:
result['batting_average'] = round(result['total_runs']/result['iswicket_delivery'],2)
result['strike_rate'] = round(result['total_runs']/result['balls_faced'],2)*100
result

In [None]:
def convert_group_to_dict(group):
    return {row['batter']: {'runs': row['total_runs'], 'batting_average': row['batting_average'], 'strike_rate': row['strike_rate'], 'balls_faced': row['balls_faced']} for index, row in group.iterrows()}

# Group by 'year' and convert each group into the required dictionary format
result_dict = {year: convert_group_to_dict(group) for year, group in result.groupby('year')}

In [106]:
pickle_file = '../data/player_stats/batsman_metrics.pkl'
with open(pickle_file, 'wb') as file:
    pickle.dump(result_dict, file)

In [47]:
ball2ball['dismisal_kind'].unique()

array([nan, 'caught', 'caught and bowled', 'run out', 'bowled', 'stumped',
       'lbw', 'hit wicket', 'retired hurt', 'retired out',
       'obstructing the field'], dtype=object)

In [99]:
from collections import defaultdict
bowler_stats = {}
accepted = ['caught', 'caught and bowled', 'bowled', 'stumped','lbw', 'hit wicket']
for index, row in ball2ball.iterrows():
    bowler = row['bowler']
    year = row['year']
    extra_type = row['extra_type']
    total_run = row['total_run']
    extras_run = row['extras_run']
    iswicket = 1 if row['iswicket_delivery'] == 1 and row['dismisal_kind'] in accepted else 0
    if year not in bowler_stats:
        bowler_stats[year] = {}
    if bowler not in bowler_stats[year]:
        bowler_stats[year][bowler] = {}
        bowler_stats[year][bowler]['runs_conceeded'] = 0
        bowler_stats[year][bowler]['balls_bowled'] = 0
        bowler_stats[year][bowler]['wickets_taken'] = 0
    bowler_stats[year][bowler]['runs_conceeded'] += total_run - extras_run if extra_type in ['byes','legbyes'] else total_run
    bowler_stats[year][bowler]['balls_bowled'] += 1 if extra_type not in ['wides', 'noballs', 'penalty'] else 0
    bowler_stats[year][bowler]['wickets_taken'] += iswicket

In [100]:
for year in bowler_stats:
    for player in bowler_stats[year]:
        bowler_stats[year][player]['economy'] = bowler_stats[year][player]['runs_conceeded'] * 6 / bowler_stats[year][player]['balls_bowled']

In [102]:
file = '../data/player_stats/bowler_metrics.pkl'
with open(file,'wb') as bmf:
    pickle.dump(bowler_stats, bmf)

In [38]:
df = ball2ball.dropna(subset = 'fielders_involved')
df = df.groupby(['fielders_involved','year'])['innings'].count().reset_index()

Unnamed: 0,fielders_involved,year,innings
0,A Ashish Reddy,2012,1
1,A Ashish Reddy,2013,6
2,A Ashish Reddy,2015,2
3,A Ashish Reddy,2016,1
4,A Badoni,2022,9
...,...,...,...
1960,Z Khan,2010,4
1961,Z Khan,2011,4
1962,Z Khan,2012,5
1963,Z Khan,2016,3
