# 1. INITIAL SETUP AND DATA LOADING

In [430]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce

In [431]:
# Load the dataset with proper dtype handling
print("Loading dataset...")
df = pd.read_csv('IPL.csv', low_memory=False)
print(f"Dataset loaded with shape: {df.shape}")

Loading dataset...
Dataset loaded with shape: (278205, 64)


In [432]:
df.head()

Unnamed: 0.1,Unnamed: 0,match_id,date,match_type,event_name,innings,batting_team,bowling_team,over,ball,...,team_runs,team_balls,team_wicket,new_batter,batter_runs,batter_balls,bowler_wicket,batting_partners,next_batter,striker_out
0,131970,335982,4/18/2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,...,1,1,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
1,131971,335982,4/18/2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,...,1,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
2,131972,335982,4/18/2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,...,2,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,False
3,131973,335982,4/18/2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,...,2,3,0,,0,2,0,"('BB McCullum', 'SC Ganguly')",,False
4,131974,335982,4/18/2008,T20,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,...,2,4,0,,0,3,0,"('BB McCullum', 'SC Ganguly')",,False


## 1.1 DATA QUALITY CHECKS AND VALIDATION

In [433]:

# Basic data quality checks
print(f"Total rows: {len(df)}")
print(f"Matches where innings > 2: {len(df[df['innings'] > 2])}") # Super overs
print(f"Invalid runs values: {len(df[df['runs_total'] < 0])}")


Total rows: 278205
Matches where innings > 2: 171
Invalid runs values: 0


In [434]:

# Separate super over data
super_overs = df[df['innings'] > 2].copy()
normal_matches = df[df['innings'] <= 2].copy()

print(f"Super over entries: {len(super_overs)}")
print(f"Normal match entries: {len(normal_matches)}")


Super over entries: 171
Normal match entries: 278034


In [435]:

# Check for missing values in key columns
print("\nMissing values in key columns:")
key_columns = ['batter', 'bowler', 'runs_total', 'wicket_kind', 'venue']
for col in key_columns:
    missing_pct = (df[col].isna().sum() / len(df)) * 100
    print(f"{col}: {missing_pct:.2f}% missing")



Missing values in key columns:
batter: 0.00% missing
bowler: 0.00% missing
runs_total: 0.00% missing
wicket_kind: 95.03% missing
venue: 0.00% missing


## 1.2 INITIAL DATA CLEANING

In [436]:

# Remove absolutely unnecessary columns
columns_to_drop = [
    'date', 'match_type', 'balls_faced', 'runs_not_boundary', 
    'day', 'month', 'gender', 'team_type', 'balls_per_over', 
    'overs', 'match_number', 'striker_out'
]

df.drop(columns=columns_to_drop, axis=1, inplace=True)
print(f"Dataset shape after dropping columns: {df.shape}")


Dataset shape after dropping columns: (278205, 52)


In [437]:

# Handle missing values in key columns
print("\nHandling missing values...")

# Wicket kind - create 'not_out' category
df['wicket_kind'] = df['wicket_kind'].fillna('not_out')

# DRS/review columns - meaningful imputation
review_columns = ['review_batter', 'team_reviewed', 'review_decision', 'umpire']
for col in review_columns:
    df[col] = df[col].fillna('no_review')



Handling missing values...


In [438]:

# Venue standardization
venue_mapping = {
    'Arun Jaitley Stadium, Delhi' : 'Arun Jaitley Stadium',
    'Brabourne Stadium, Mumbai' : 'Brabourne Stadium',
    'Dr DY Patil Sports Academy, Mumbai': 'Dr DY Patil Sports Academy',
    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam':'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
    'Eden Gardens, Kolkata' : 'Eden Gardens',
    'Himachal Pradesh Cricket Association Stadium, Dharamsala': 'Himachal Pradesh Cricket Association Stadium',
    'M Chinnaswamy Stadium, Bengaluru': 'M Chinnaswamy Stadium',
    'M.Chinnaswamy Stadium' : 'M Chinnaswamy Stadium',
    'MA Chidambaram Stadium, Chepauk, Chennai' : 'MA Chidambaram Stadium',
    'MA Chidambaram Stadium, Chepauk' :'MA Chidambaram Stadium',
    'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur' : 'Maharaja Yadavindra Singh International Cricket Stadium, New Chandigarh',
    'Maharashtra Cricket Association Stadium, Pune': 'Maharashtra Cricket Association Stadium',
    'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh': 'Punjab Cricket Association IS Bindra Stadium',
    'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
    'Punjab Cricket Association Stadium, Mohali':'Punjab Cricket Association IS Bindra Stadium',
    'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 'Rajiv Gandhi International Stadium',
    'Rajiv Gandhi International Stadium, Uppal':'Rajiv Gandhi International Stadium',
    'Sawai Mansingh Stadium, Jaipur': 'Sawai Mansingh Stadium',
    'Wankhede Stadium, Mumbai': 'Wankhede Stadium'
    # Add more mappings as discovered
}
df['venue_standardized'] = df['venue'].replace(venue_mapping)

print("Missing values handled successfully")


Missing values handled successfully


In [439]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,match_id,event_name,innings,batting_team,bowling_team,over,ball,ball_no,batter,...,team_runs,team_balls,team_wicket,new_batter,batter_runs,batter_balls,bowler_wicket,batting_partners,next_batter,venue_standardized
0,131970,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,0.1,SC Ganguly,...,1,1,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
1,131971,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0.2,BB McCullum,...,1,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
2,131972,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,...,2,2,0,,0,1,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
3,131973,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,...,2,3,0,,0,2,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
4,131974,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0.4,BB McCullum,...,2,4,0,,0,3,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
5,131975,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,0.5,BB McCullum,...,2,5,0,,0,4,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
6,131976,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,6,0.6,BB McCullum,...,3,6,0,,0,5,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
7,131977,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,1,1.1,BB McCullum,...,3,7,0,,0,6,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
8,131978,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,2,1.2,BB McCullum,...,7,8,0,,4,7,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium
9,131979,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,1,3,1.3,BB McCullum,...,11,9,0,,8,8,0,"('BB McCullum', 'SC Ganguly')",,M Chinnaswamy Stadium


## 1.3 ENHANCED FEATURE ENGINEERING

In [440]:
# Match phase classification
def get_match_phase(over):
    if over <= 6:
        return 'Powerplay'
    elif over <= 15:
        return 'Middle'
    else:
        return 'Death'

df['match_phase'] = df['over'].apply(get_match_phase)


In [441]:

# Pressure situations - required run rate
df['required_run_rate'] = np.where(
    df['runs_target'] > 0,
    (df['runs_target'] - df['team_runs']) / ((20 - df['over']) + 0.1),
    np.nan
)


In [442]:

# Boundary indicator
df['is_boundary'] = (df['runs_batter'] >= 4).astype(int)

# Dot ball indicator
df['is_dot_ball'] = (df['runs_batter'] == 0) & (df['runs_extras'] == 0)

# Partnership runs (cumulative partnership for current wicket)
df['partnership_runs'] = df.groupby(['match_id', 'innings', 'team_wicket'])['runs_total'].cumsum()

# High pressure indicator (last 5 overs chasing > 8 RRR)
df['high_pressure'] = ((df['over'] >= 15) & 
                       (df['innings'] == 2) & 
                       (df['required_run_rate'] > 8)).astype(int)


In [443]:
# HIGH-STAKES MATCH IDENTIFIER
print("Adding high-stakes match identification...")
df['is_high_stakes'] = df['stage'].isin(['Final', 'Semi Final', 'Qualifier 1', 'Qualifier 2', 'Eliminator','Elimination Final','3rd Place Play-Off']).astype(int)
df['is_knockout'] = df['stage'].isin(['Final', 'Semi Final', 'Qualifier 2', 'Eliminator','Elimination Final']).astype(int)

# Calculate match importance weight (1.0 for league, 1.5 for playoffs, 2.0 for finals)
def get_match_importance(stage):
    if stage == 'Final':
        return 2.0
    elif stage in [ 'Semi Final', 'Qualifier 1', 'Qualifier 2', 'Eliminator','Elimination Final','3rd Place Play-Off']:
        return 1.5
    else:
        return 1.0

df['match_importance'] = df['stage'].apply(get_match_importance)

print("High-stakes features added successfully")

Adding high-stakes match identification...
High-stakes features added successfully


In [444]:
df.head()


Unnamed: 0.1,Unnamed: 0,match_id,event_name,innings,batting_team,bowling_team,over,ball,ball_no,batter,...,venue_standardized,match_phase,required_run_rate,is_boundary,is_dot_ball,partnership_runs,high_pressure,is_high_stakes,is_knockout,match_importance
0,131970,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,0.1,SC Ganguly,...,M Chinnaswamy Stadium,Powerplay,,0,False,1,0,0,0,1.0
1,131971,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,0.2,BB McCullum,...,M Chinnaswamy Stadium,Powerplay,,0,True,1,0,0,0,1.0
2,131972,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,...,M Chinnaswamy Stadium,Powerplay,,0,False,2,0,0,0,1.0
3,131973,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,0.3,BB McCullum,...,M Chinnaswamy Stadium,Powerplay,,0,True,2,0,0,0,1.0
4,131974,335982,Indian Premier League,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,0.4,BB McCullum,...,M Chinnaswamy Stadium,Powerplay,,0,True,2,0,0,0,1.0


# 2. Consolidating into dataframes

## 2.1 PLAYER-TEAM-YEAR MAPPING

In [445]:
# Create working DataFrame
player_team_df = df[['year', 'batting_team', 'bowling_team', 'batter', 'bowler']].copy()

# For batters
batter_list = player_team_df[['year', 'batting_team', 'batter']].copy()
batter_list.columns = ['Season', 'Team', 'Player']

# For bowlers
bowler_list = player_team_df[['year', 'bowling_team', 'bowler']].copy()
bowler_list.columns = ['Season', 'Team', 'Player']

# Combine and remove duplicates
master_list = pd.concat([batter_list, bowler_list], ignore_index=True)
df_player_team_year = master_list.drop_duplicates()

print(f"Player-team-year mapping created with {len(df_player_team_year)} entries")


Player-team-year mapping created with 3138 entries


In [446]:
df_player_team_year.head()

Unnamed: 0,Season,Team,Player
0,2008,Kolkata Knight Riders,SC Ganguly
1,2008,Kolkata Knight Riders,BB McCullum
34,2008,Kolkata Knight Riders,RT Ponting
82,2008,Kolkata Knight Riders,DJ Hussey
108,2008,Kolkata Knight Riders,Mohammad Hafeez


In [447]:

# Handle players with multiple teams in same season
def get_primary_team_per_season(player_season_data):
    """For players with multiple teams in a season, pick the one they played most matches for"""
    if len(player_season_data) == 1:
        return player_season_data.iloc[0]['Team']
    
    # Count matches per team (simplified - in practice, you'd count actual matches)
    match_counts = player_season_data['Team'].value_counts()
    return match_counts.index[0]


In [448]:

# Apply enhanced team mapping
df_player_team_enhanced = df_player_team_year.groupby(['Player', 'Season']).apply(
    lambda x: pd.Series({
        'Team': get_primary_team_per_season(x),
        'Team_Count': len(x['Team'].unique())
    })
).reset_index()

print("Enhanced player-team mapping completed")


Enhanced player-team mapping completed


  df_player_team_enhanced = df_player_team_year.groupby(['Player', 'Season']).apply(


## 2.2 MATCH-WISE PERFORMANCE 

In [449]:
# Create focused DataFrame for match stats
cols_for_match_stats = [
    'match_id', 'year', 'innings', 'batter', 'bat_pos', 'bowler', 'valid_ball', 
    'ball_no', 'runs_batter', 'runs_extras', 'extra_type', 'wicket_kind', 
    'player_out', 'fielders', 'batter_runs', 'batter_balls', 'bowler_wicket', 
    'runs_bowler', 'stage', 'match_phase', 'is_boundary', 'is_dot_ball'
]

df_match_stats = df[cols_for_match_stats].copy()


### 2.2.1 Batting performance

In [450]:

print("Calculating batting performance...")
batting_perf = df_match_stats.groupby(['match_id', 'year', 'innings', 'batter']).agg(
    runs_scored=('batter_runs', 'max'),
    balls_faced=('batter_balls', 'max'),
    boundaries=('is_boundary', 'sum'),
    dot_balls_faced=('is_dot_ball', 'sum'),
    player_outs=('player_out', lambda x: x.tolist())
).reset_index()

batting_perf['was_dismissed'] = batting_perf.apply(
    lambda row: 'Yes' if row['batter'] in row['player_outs'] else 'No', 
    axis=1
)
batting_perf = batting_perf.drop(columns=['player_outs'])


Calculating batting performance...


In [451]:
batting_perf.head()

Unnamed: 0,match_id,year,innings,batter,runs_scored,balls_faced,boundaries,dot_balls_faced,was_dismissed
0,335982,2008,1,BB McCullum,158,73,23,17,No
1,335982,2008,1,DJ Hussey,12,12,1,4,Yes
2,335982,2008,1,Mohammad Hafeez,5,3,1,1,No
3,335982,2008,1,RT Ponting,20,20,2,8,Yes
4,335982,2008,1,SC Ganguly,10,12,2,6,Yes


### 2.2.2 Bowling performance

In [452]:
print("Calculating bowling performance...")

# Valid balls (wickets and regular runs)
df_bowling_valid = df_match_stats[df_match_stats['valid_ball'] == 1].copy()
bowling_perf_valid = df_bowling_valid.groupby(['match_id', 'year', 'innings', 'bowler']).agg(
    runs_conceded=('runs_bowler', 'sum'),
    wickets_taken=('player_out', lambda x: x.notnull().sum()),
    balls_bowled=('ball_no', 'count'),
    dot_balls_bowled=('is_dot_ball', 'sum'),
    boundaries_conceded=('is_boundary', 'sum')
).reset_index()

# Extras from invalid balls
df_bowling_extras = df_match_stats[df_match_stats['valid_ball'] == 0].copy()
bowling_perf_extras = df_bowling_extras.groupby(['match_id', 'year', 'innings', 'bowler']).agg(
    runs_from_extras=('runs_extras', 'sum')
).reset_index()

# Merge bowling stats
bowling_perf = pd.merge(bowling_perf_valid, bowling_perf_extras, 
                        on=['match_id', 'year', 'innings', 'bowler'], 
                        how='left')
bowling_perf['runs_from_extras'] = bowling_perf['runs_from_extras'].fillna(0)


Calculating bowling performance...


In [453]:
bowling_perf.head()

Unnamed: 0,match_id,year,innings,bowler,runs_conceded,wickets_taken,balls_bowled,dot_balls_bowled,boundaries_conceded,runs_from_extras
0,335982,2008,1,AA Noffke,35,1,24,6,4,5.0
1,335982,2008,1,CL White,22,0,6,0,4,2.0
2,335982,2008,1,JH Kallis,47,1,24,8,8,1.0
3,335982,2008,1,P Kumar,37,0,24,10,6,1.0
4,335982,2008,1,SB Joshi,26,0,18,4,2,0.0


### 2.2.3 Merging performance

In [454]:
print("Merging batting and bowling performance...")

batting_perf_merge = batting_perf.rename(columns={'batter': 'Player'})
bowling_perf_merge = bowling_perf.rename(columns={'bowler': 'Player', 'runs_from_extras': 'extras_of_bowler'})

# Merge all DataFrames
data_frames = [
    batting_perf_merge[['match_id', 'year', 'innings', 'Player', 'runs_scored', 'balls_faced', 'was_dismissed', 'boundaries', 'dot_balls_faced']],
    bowling_perf_merge[['match_id', 'year', 'innings', 'Player', 'runs_conceded', 'wickets_taken', 'balls_bowled', 'extras_of_bowler', 'dot_balls_bowled', 'boundaries_conceded']]
]

df_match_performance = reduce(lambda left, right: pd.merge(left, right, on=['match_id', 'innings', 'year', 'Player'], how='outer'), data_frames)


Merging batting and bowling performance...


In [455]:
df_match_performance.head()

Unnamed: 0,match_id,year,innings,Player,runs_scored,balls_faced,was_dismissed,boundaries,dot_balls_faced,runs_conceded,wickets_taken,balls_bowled,extras_of_bowler,dot_balls_bowled,boundaries_conceded
0,335982,2008,1,AA Noffke,,,,,,35.0,1.0,24.0,5.0,6.0,4.0
1,335982,2008,1,BB McCullum,158.0,73.0,No,23.0,17.0,,,,,,
2,335982,2008,1,CL White,,,,,,22.0,0.0,6.0,2.0,0.0,4.0
3,335982,2008,1,DJ Hussey,12.0,12.0,Yes,1.0,4.0,,,,,,
4,335982,2008,1,JH Kallis,,,,,,47.0,1.0,24.0,1.0,8.0,8.0


In [456]:
# Fill NaN values
performance_columns = ['runs_scored', 'balls_faced', 'runs_conceded', 'wickets_taken', 'balls_bowled', 'extras_of_bowler', 'boundaries', 'dot_balls_faced', 'dot_balls_bowled', 'boundaries_conceded']
df_match_performance[performance_columns] = df_match_performance[performance_columns].fillna(0)
df_match_performance[performance_columns] = df_match_performance[performance_columns].astype(int)


In [457]:
print("Adding team information...")
df_match_perf_with_team = pd.merge(
    df_match_performance.rename(columns={'year': 'Season'}),
    df_player_team_enhanced[['Season', 'Player', 'Team']],
    how='left',
    on=['Season', 'Player']
)

Adding team information...


In [458]:
# Move Team column to better position
def move_column(df, col_name, new_pos):
    cols = list(df.columns)
    cols.insert(new_pos, cols.pop(cols.index(col_name)))
    return df[cols]

df_match_perf_team = move_column(df_match_perf_with_team, 'Team', 3)

print(f"Final match performance DataFrame shape: {df_match_perf_team.shape}")


Final match performance DataFrame shape: (31585, 16)


In [459]:
df_match_perf_team.head()

Unnamed: 0,match_id,Season,innings,Team,Player,runs_scored,balls_faced,was_dismissed,boundaries,dot_balls_faced,runs_conceded,wickets_taken,balls_bowled,extras_of_bowler,dot_balls_bowled,boundaries_conceded
0,335982,2008,1,Royal Challengers Bangalore,AA Noffke,0,0,,0,0,35,1,24,5,6,4
1,335982,2008,1,Kolkata Knight Riders,BB McCullum,158,73,No,23,17,0,0,0,0,0,0
2,335982,2008,1,Royal Challengers Bangalore,CL White,0,0,,0,0,22,0,6,2,0,4
3,335982,2008,1,Kolkata Knight Riders,DJ Hussey,12,12,Yes,1,4,0,0,0,0,0,0
4,335982,2008,1,Royal Challengers Bangalore,JH Kallis,0,0,,0,0,47,1,24,1,8,8


## 2.3 SEASONAL PERFORMANCE AGGREGATION

In [460]:

df_season_perf = df_match_perf_team.groupby(['Player', 'Season', 'Team']).agg(
    total_runs=('runs_scored', 'sum'),
    total_balls_faced=('balls_faced', 'sum'),
    total_wickets_taken=('wickets_taken', 'sum'),
    total_runs_conceded=('runs_conceded', 'sum'),
    total_balls_bowled=('balls_bowled', 'sum'),
    total_boundaries=('boundaries', 'sum'),
    total_dot_balls_faced=('dot_balls_faced', 'sum'),
    total_dot_balls_bowled=('dot_balls_bowled', 'sum'),
    total_matches=('match_id', 'nunique')
).reset_index()


### 2.3.1 Batting stats

In [461]:
print("Calculating advanced batting statistics...")
seasonal_batting_stats = df_match_perf_team.groupby(['Player', 'Season', 'Team']).agg(
    total_runs=('runs_scored', 'sum'),
    total_balls_faced=('balls_faced', 'sum'),
    total_boundaries=('boundaries', 'sum'),
    times_dismissed=('was_dismissed', lambda x: (x == 'Yes').sum()),
    innings_batted=('was_dismissed', lambda x: x.notna().sum())
).reset_index()


Calculating advanced batting statistics...


In [462]:

# Batting metrics
seasonal_batting_stats['Strike_Rate'] = np.where(
    seasonal_batting_stats['total_balls_faced'] > 0,
    (seasonal_batting_stats['total_runs'] / seasonal_batting_stats['total_balls_faced']) * 100,
    0.0
)

seasonal_batting_stats['Batting_Average'] = np.where(
    seasonal_batting_stats['times_dismissed'] > 0,
    seasonal_batting_stats['total_runs'] / seasonal_batting_stats['times_dismissed'],
    np.nan
)

seasonal_batting_stats['Boundary_Percentage'] = np.where(
    seasonal_batting_stats['total_balls_faced'] > 0,
    (seasonal_batting_stats['total_boundaries'] / seasonal_batting_stats['total_balls_faced']) * 100,
    0.0
)


In [463]:
seasonal_batting_stats.head()

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,Boundary_Percentage
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.75,13.793103
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,14.606742
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,17.391304
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.5,20.689655
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125,13.846154


### 2.3.2 Bowling stats

In [464]:

# 7.3 Advanced Bowling Statistics
print("Calculating advanced bowling statistics...")
seasonal_bowling_stats = df_match_perf_team.groupby(['Player', 'Season']).agg(
    total_wickets=('wickets_taken', 'sum'),
    total_runs_conceded=('runs_conceded', 'sum'),
    total_balls_bowled=('balls_bowled', 'sum'),
    total_dot_balls=('dot_balls_bowled', 'sum'),
    total_boundaries_conceded=('boundaries_conceded', 'sum'),
    total_innings_bowled=('match_id', 'nunique')
).reset_index()


Calculating advanced bowling statistics...


In [465]:

# Bowling metrics
seasonal_bowling_stats['Economy_Rate'] = np.where(
    seasonal_bowling_stats['total_balls_bowled'] > 0,
    (seasonal_bowling_stats['total_runs_conceded'] / seasonal_bowling_stats['total_balls_bowled']) * 6,
    np.nan
)

seasonal_bowling_stats['Balls_Per_Wicket'] = np.where(
    seasonal_bowling_stats['total_wickets'] > 0,
    seasonal_bowling_stats['total_balls_bowled'] / seasonal_bowling_stats['total_wickets'],
    np.nan
)

seasonal_bowling_stats['Dot_Ball_Percentage'] = np.where(
    seasonal_bowling_stats['total_balls_bowled'] > 0,
    (seasonal_bowling_stats['total_dot_balls'] / seasonal_bowling_stats['total_balls_bowled']) * 100,
    0.0
)


In [466]:
seasonal_bowling_stats.head()

Unnamed: 0,Player,Season,total_wickets,total_runs_conceded,total_balls_bowled,total_dot_balls,total_boundaries_conceded,total_innings_bowled,Economy_Rate,Balls_Per_Wicket,Dot_Ball_Percentage
0,A Ashish Reddy,2012,11,227,163,55,26,9,8.355828,14.818182,33.742331
1,A Ashish Reddy,2013,3,69,40,10,10,10,10.35,13.333333,25.0
2,A Ashish Reddy,2015,4,49,36,9,4,6,8.166667,9.0,25.0
3,A Ashish Reddy,2016,1,39,23,5,6,3,10.173913,23.0,21.73913
4,A Badoni,2022,2,11,12,3,0,11,5.5,6.0,25.0


# 3. IMPACT SCORE CALCULATIONS

In [467]:
# Z-score normalization within seasons
def standardize_within_season(df, metric_column):
    return df.groupby('Season')[metric_column].transform(
        lambda x: (x - x.mean()) / x.std() if x.std() > 0 else 0
    )


## 3.1 Batting impact score

In [468]:
print("Calculating batting impact scores...")
batting_stats_clean = seasonal_batting_stats.copy()
batting_stats_clean['Batting_Average'] = batting_stats_clean['Batting_Average'].fillna(0)


# standardize batting metrics
batting_stats_clean['z_runs'] = standardize_within_season(batting_stats_clean, 'total_runs')
batting_stats_clean['z_avg'] = standardize_within_season(batting_stats_clean, 'Batting_Average')
batting_stats_clean['z_sr'] = standardize_within_season(batting_stats_clean, 'Strike_Rate')
#batting_stats_clean['z_boundary_pct'] = standardize_within_season(batting_stats_clean, 'Boundary_Percentage')


# Batting impact score (weights can be adjusted)
w_runs, w_avg, w_sr = 0.55, 0.15, 0.30
batting_stats_clean['Batting_Impact_Score'] = (
    (w_runs * batting_stats_clean['z_runs']) + 
    (w_avg * batting_stats_clean['z_avg']) + 
    (w_sr * batting_stats_clean['z_sr']) #+
    #(w_boundary * batting_stats_clean['z_boundary_pct'])
)


Calculating batting impact scores...


In [469]:
batting_stats_clean.head(200)

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,Boundary_Percentage,z_runs,z_avg,z_sr,Batting_Impact_Score
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.750000,13.793103,-0.524277,-0.447956,0.366190,-0.245689
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,14.606742,0.098457,0.438442,0.941692,0.402425
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,17.391304,-0.311240,0.474229,0.991535,0.197413
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.500000,20.689655,-0.404345,0.405249,1.085261,0.163976
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125000,13.846154,0.252398,0.201137,0.354430,0.275319
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,AJ Tye,2022,Lucknow Super Giants,0,0,0,0,0,0.000000,0.000000,0.000000,-0.771860,-1.070513,-1.717965,-1.100489
196,AK Markram,2021,Punjab Kings,146,119,16,5,6,122.689076,29.200000,13.445378,0.271141,0.866810,0.534809,0.439592
197,AK Markram,2022,Sunrisers Hyderabad,381,274,44,8,12,139.051095,47.625000,16.058394,1.652005,1.938795,0.608864,1.382081
198,AK Markram,2023,Sunrisers Hyderabad,248,197,26,11,13,125.888325,22.545455,13.197970,0.776179,0.474026,0.480932,0.642282


In [470]:
# Analyze the distribution
print("\nBatting Impact Score Distribution:")
print(f"Mean: {batting_stats_clean['Batting_Impact_Score'].mean():.2f}")
print(f"Std Dev: {batting_stats_clean['Batting_Impact_Score'].std():.2f}")
print(f"Min: {batting_stats_clean['Batting_Impact_Score'].min():.2f}")
print(f"Max: {batting_stats_clean['Batting_Impact_Score'].max():.2f}")



Batting Impact Score Distribution:
Mean: 0.00
Std Dev: 0.87
Min: -1.19
Max: 3.71


In [471]:

# Display results
print("Top Batters by Impact Score (Sample):")
top_batters = batting_stats_clean.sort_values('Batting_Impact_Score', ascending=False)
#print(top_batters[['Player', 'Season', 'total_runs', 'Batting_Average', 'Strike_Rate', 'Batting_Impact_Score']].head(15))
print(top_batters[['Player', 'Season', 'Batting_Impact_Score']].head(30))


Top Batters by Impact Score (Sample):
               Player  Season  Batting_Impact_Score
2935          V Kohli    2016              3.708310
2792     Shubman Gill    2023              3.241948
1066       JC Buttler    2022              3.212716
626         DA Warner    2016              3.107372
523          CH Gayle    2011              3.004216
525          CH Gayle    2013              3.000369
524          CH Gayle    2012              2.974329
1665        ML Hayden    2009              2.952288
1594       MEK Hussey    2013              2.859129
2943          V Kohli    2024              2.772466
2531         SE Marsh    2008              2.685534
1300         KL Rahul    2021              2.679047
807      F du Plessis    2023              2.670318
627         DA Warner    2017              2.654404
628         DA Warner    2019              2.623306
2652     SR Tendulkar    2010              2.602839
121    AB de Villiers    2016              2.592013
2500         SA Yadav    2

## 3.2 Bowling impact score

In [472]:

# 8.2 Bowling Impact Score
print("Calculating bowling impact scores...")
bowling_stats_clean = seasonal_bowling_stats.copy()

# standardize bowling metrics
bowling_stats_clean['z_wickets'] = standardize_within_season(bowling_stats_clean, 'total_wickets')
bowling_stats_clean['z_economy'] = standardize_within_season(bowling_stats_clean, 'Economy_Rate')
bowling_stats_clean['z_strike'] = standardize_within_season(bowling_stats_clean, 'Balls_Per_Wicket')
#bowling_stats_clean['z_dot_pct'] = standardize_within_season(bowling_stats_clean, 'Dot_Ball_Percentage')

# INVERT metrics where lower is better (economy, balls per wicket)
bowling_stats_clean['z_economy_inv'] = -bowling_stats_clean['z_economy']
bowling_stats_clean['z_strike_inv'] = -bowling_stats_clean['z_strike']

# Bowling impact score (weights can be adjusted)
w_wickets, w_economy, w_strike = 0.55, 0.30, 0.15
bowling_stats_clean['Bowling_Impact_Score'] =(
    (w_wickets * bowling_stats_clean['z_wickets']) + 
    (w_economy * bowling_stats_clean['z_economy_inv']) + 
    (w_strike * bowling_stats_clean['z_strike_inv']) #+
    #(w_dot * bowling_stats_clean['z_dot_pct'])
)


Calculating bowling impact scores...


In [473]:
bowling_stats_clean.head()

Unnamed: 0,Player,Season,total_wickets,total_runs_conceded,total_balls_bowled,total_dot_balls,total_boundaries_conceded,total_innings_bowled,Economy_Rate,Balls_Per_Wicket,Dot_Ball_Percentage,z_wickets,z_economy,z_strike,z_economy_inv,z_strike_inv,Bowling_Impact_Score
0,A Ashish Reddy,2012,11,227,163,55,26,9,8.355828,14.818182,33.742331,1.014001,-0.047893,-0.622472,0.047893,0.622472,0.66544
1,A Ashish Reddy,2013,3,69,40,10,10,10,10.35,13.333333,25.0,-0.227427,1.394067,-0.774791,-1.394067,0.774791,-0.427086
2,A Ashish Reddy,2015,4,49,36,9,4,6,8.166667,9.0,25.0,-0.110843,-0.078921,-0.921268,0.078921,0.921268,0.100903
3,A Ashish Reddy,2016,1,39,23,5,6,3,10.173913,23.0,21.73913,-0.540163,1.110638,-0.189578,-1.110638,0.189578,-0.601844
4,A Badoni,2022,2,11,12,3,0,11,5.5,6.0,25.0,-0.413646,-1.109358,-1.50942,1.109358,1.50942,0.331715


In [474]:
# Analyze the distribution
print("\nbowling Impact Score Distribution:")
print(f"Mean: {bowling_stats_clean['Bowling_Impact_Score'].mean():.2f}")
print(f"Std Dev: {bowling_stats_clean['Bowling_Impact_Score'].std():.2f}")
print(f"Min: {bowling_stats_clean['Bowling_Impact_Score'].min():.2f}")
print(f"Max: {bowling_stats_clean['Bowling_Impact_Score'].max():.2f}")



bowling Impact Score Distribution:
Mean: 0.36
Std Dev: 0.74
Min: -1.40
Max: 2.86


In [475]:

# Display results
print("Top Bowlers by Impact Score (Sample):")
top_bowlers = bowling_stats_clean.sort_values('Bowling_Impact_Score', ascending=False)
#print(top_bowlers[['Player', 'Season', 'total_wickets', 'Economy_Rate', 'Balls_Per_Wicket', 'Bowling_Impact_Score']].head(15))
print(top_bowlers[['Player', 'Season', 'total_wickets',  'Bowling_Impact_Score']].head(15))


Top Bowlers by Impact Score (Sample):
           Player  Season  total_wickets  Bowling_Impact_Score
914      HV Patel    2021             35              2.864172
2573   SL Malinga    2011             30              2.858554
2627    SP Narine    2012             29              2.595003
1116    JJ Bumrah    2020             30              2.560436
1690    MM Sharma    2023             31              2.556630
402       B Kumar    2017             28              2.523608
1492     M Morkel    2012             30              2.503494
917      HV Patel    2024             30              2.489731
1158  JP Faulkner    2013             33              2.484125
1200     K Rabada    2020             32              2.467900
1199     K Rabada    2019             29              2.459007
1080   JD Unadkat    2017             27              2.430356
658      DJ Bravo    2013             34              2.379113
192        AJ Tye    2018             28              2.364792
2294     RP Singh

## 3.3 Merging

In [476]:
print("Merging impact scores...")

# Merge batting and bowling impact scores
player_impact_merged = pd.merge(
    batting_stats_clean,
    bowling_stats_clean[['Player', 'Season', 'Bowling_Impact_Score', 'total_wickets', 'Economy_Rate', 'total_balls_bowled']],
    on=['Player', 'Season'],
    how='outer'  # Keep all players (even if they only bat or only bowl)
)

# Fill NaN values for players who don't have both batting and bowling data
player_impact_merged['total_wickets'] = player_impact_merged['total_wickets'].fillna(0)
player_impact_merged['total_balls_bowled'] = player_impact_merged['total_balls_bowled'].fillna(0)
player_impact_merged['Economy_Rate'] = player_impact_merged['Economy_Rate'].fillna(999)  # High economy for non-bowlers

print(f"Merged dataset shape: {player_impact_merged.shape}")


Merging impact scores...
Merged dataset shape: (3137, 19)


In [477]:
player_impact_merged

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,Boundary_Percentage,z_runs,z_avg,z_sr,Batting_Impact_Score,Bowling_Impact_Score,total_wickets,Economy_Rate,total_balls_bowled
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.750000,13.793103,-0.524277,-0.447956,0.366190,-0.245689,0.665440,11,8.355828,163
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,14.606742,0.098457,0.438442,0.941692,0.402425,-0.427086,3,10.350000,40
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,17.391304,-0.311240,0.474229,0.991535,0.197413,0.100903,4,8.166667,36
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.500000,20.689655,-0.404345,0.405249,1.085261,0.163976,-0.601844,1,10.173913,23
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125000,13.846154,0.252398,0.201137,0.354430,0.275319,0.331715,2,5.500000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,Z Khan,2014,Mumbai Indians,8,7,1,1,3,114.285714,8.000000,14.285714,-0.707860,-0.524091,0.297987,-0.378541,0.489800,6,6.358209,134
3133,Z Khan,2015,Delhi Daredevils,0,0,0,0,0,0.000000,0.000000,0.000000,-0.792747,-1.072419,-1.882780,-1.161708,0.729679,8,6.000000,145
3134,Z Khan,2016,Delhi Daredevils,6,13,1,2,2,46.153846,3.000000,7.692308,-0.652954,-0.800379,-0.910912,-0.752455,0.892399,12,7.419847,262
3135,Z Khan,2017,Delhi Daredevils,4,13,0,1,3,30.769231,4.000000,0.000000,-0.748226,-0.785173,-1.258652,-0.906896,0.864578,12,7.668050,241


In [478]:
player_impact_merged.head()

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,Boundary_Percentage,z_runs,z_avg,z_sr,Batting_Impact_Score,Bowling_Impact_Score,total_wickets,Economy_Rate,total_balls_bowled
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.75,13.793103,-0.524277,-0.447956,0.36619,-0.245689,0.66544,11,8.355828,163
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,14.606742,0.098457,0.438442,0.941692,0.402425,-0.427086,3,10.35,40
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,17.391304,-0.31124,0.474229,0.991535,0.197413,0.100903,4,8.166667,36
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.5,20.689655,-0.404345,0.405249,1.085261,0.163976,-0.601844,1,10.173913,23
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125,13.846154,0.252398,0.201137,0.35443,0.275319,0.331715,2,5.5,12


## 3.4 Player role classification

In [495]:
#print("Adding standardized scores for fair comparison...")

# Calculate Z-scores based on your distribution statistics
player_impact_merged['Batting_Impact_Z'] = player_impact_merged['Batting_Impact_Score'] + 5
player_impact_merged['Bowling_Impact_Z'] = player_impact_merged['Bowling_Impact_Score'] -0.36 + 5


In [496]:

print("Standardized scores calculated:")
print(f"Batting Z-scores: Mean = {player_impact_merged['Batting_Impact_Z'].mean():.2f}, Std = {player_impact_merged['Batting_Impact_Z'].std():.2f}, min = {player_impact_merged['Batting_Impact_Z'].min():.2f}, max = {player_impact_merged['Batting_Impact_Z'].max():.2f}")
print(f"Bowling Z-scores: Mean = {player_impact_merged['Bowling_Impact_Z'].mean():.2f}, Std = {player_impact_merged['Bowling_Impact_Z'].std():.2f}, min = {player_impact_merged['Bowling_Impact_Z'].min():.2f}, max = {player_impact_merged['Bowling_Impact_Z'].max():.2f}")


Standardized scores calculated:
Batting Z-scores: Mean = 5.00, Std = 0.87, min = 3.81, max = 8.71
Bowling Z-scores: Mean = 5.00, Std = 0.74, min = 3.24, max = 7.50


In [497]:
player_impact_merged.head()

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,...,total_wickets,Economy_Rate,total_balls_bowled,Batting_Impact_Z,Bowling_Impact_Z,Weighted_Impact_Score,Player_Role,Batting_Impact_Normalized,Bowling_Impact_Normalized,Overall_Impact_Normalized
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.75,...,11,8.355828,163,4.754311,5.30544,5.029875,All-Rounder,19.279551,48.464774,57.268795
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,...,3,10.35,40,5.402425,4.212914,5.283474,Batter,32.510729,22.857489,60.214771
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,...,4,8.166667,36,5.197413,4.740903,4.969158,All-Rounder,28.325424,35.232814,56.563458
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.5,...,1,10.173913,23,5.163976,4.038156,5.163976,Batter,27.642813,18.761404,58.826599
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125,...,2,5.5,12,5.275319,4.971715,5.275319,Batter,29.91586,40.642737,60.120031


In [498]:
print("Classifying player roles with enhanced logic...")

def classify_player_role(row):
    """
    Determine player role based on Score batting and bowling impact scores
    """
    batting_score = row['Batting_Impact_Z']
    bowling_score = row['Bowling_Impact_Z']
    
    # Minimum qualification criteria
    MIN_BATTING_INNINGS = 2
    MIN_BOWLING_BALLS = 35
    
    is_batter = row['innings_batted'] >= MIN_BATTING_INNINGS
    is_bowler = row['total_balls_bowled'] >= MIN_BOWLING_BALLS
    
    # Handle players who don't meet minimum criteria
    if not is_batter and not is_bowler:
        return 0.1, 'Low data'  
     # Single-role players
    if is_bowler and not is_batter:
        # Pure bowler - use bowling score directly
        return bowling_score, 'Bowler'
    
    if is_batter and not is_bowler:
        # Pure batter - use batting score directly  
        return batting_score, 'Batter'
    

    # Both qualify - use simple difference in Z-scores
    batting_advantage = batting_score - bowling_score
    
    if batting_advantage > 1:
        # Significantly better at batting
        weighted_score = 0.9 * batting_score + 0.1 * bowling_score
        return weighted_score, 'Batter'
    elif batting_advantage < -0.75:
        # Significantly better at bowling
        weighted_score = 0.1 * batting_score + 0.9 * bowling_score
        return weighted_score, 'Bowler'
    else:
        if batting_advantage > 0.4:
            weighted_score = 0.8 * batting_score + 0.2 * bowling_score
            return weighted_score, 'All-Rounder'
        elif batting_advantage < -0.3:
            weighted_score = 0.2 * batting_score + 0.3 * bowling_score
            return weighted_score, 'All-Rounder'
        weighted_score = 0.55 * batting_score + 0.45 * bowling_score
        return weighted_score, 'All-Rounder'


# Apply role classification
role_results = player_impact_merged.apply(classify_player_role, axis=1)
player_impact_merged['Weighted_Impact_Score'] = role_results.apply(lambda x: x[0])
player_impact_merged['Player_Role'] = role_results.apply(lambda x: x[1])

print("Player roles classified")


Classifying player roles with enhanced logic...
Player roles classified


In [499]:
# Role distribution
print(f"ROLE DISTRIBUTION:")
role_counts = player_impact_merged['Player_Role'].value_counts()
for role, count in role_counts.items():
    print(f"{role}: {count} player-seasons")


ROLE DISTRIBUTION:
Batter: 1313 player-seasons
Bowler: 786 player-seasons
All-Rounder: 635 player-seasons
Low data: 403 player-seasons


In [500]:
print("Normalizing final scores to 0-100 scale...")

def normalize_to_100(series):
    """Normalize any series to 0-100 scale"""
    min_val = series.min()
    max_val = series.max()
    if max_val == min_val:
        return 50  # All same values → middle score
    return (series - min_val) / (max_val - min_val) * 100

# Normalize all impact scores to 0-100 for easier interpretation
player_impact_merged['Batting_Impact_Normalized'] = normalize_to_100(player_impact_merged['Batting_Impact_Score'])
player_impact_merged['Bowling_Impact_Normalized'] = normalize_to_100(player_impact_merged['Bowling_Impact_Score'])
player_impact_merged['Overall_Impact_Normalized'] = normalize_to_100(player_impact_merged['Weighted_Impact_Score'])

print("All scores normalized to 0-100 scale")


Normalizing final scores to 0-100 scale...
All scores normalized to 0-100 scale


In [501]:
# Top overall players (using normalized scores for readability)
print("TOP 10 OVERALL PLAYERS (0-100 Scale):")
top_overall = player_impact_merged.nlargest(10, 'Overall_Impact_Normalized')[
    ['Player', 'Season', 'Team', 'Player_Role', 'Overall_Impact_Normalized', 
     'Batting_Impact_Normalized', 'Bowling_Impact_Normalized']
]
print(top_overall.round(1))


TOP 10 OVERALL PLAYERS (0-100 Scale):
            Player  Season                         Team Player_Role  \
2935       V Kohli    2016  Royal Challengers Bangalore      Batter   
2792  Shubman Gill    2023               Gujarat Titans      Batter   
1066    JC Buttler    2022             Rajasthan Royals      Batter   
626      DA Warner    2016          Sunrisers Hyderabad      Batter   
525       CH Gayle    2013  Royal Challengers Bangalore      Batter   
524       CH Gayle    2012  Royal Challengers Bangalore      Batter   
1665     ML Hayden    2009          Chennai Super Kings      Batter   
1594    MEK Hussey    2013          Chennai Super Kings      Batter   
2943       V Kohli    2024  Royal Challengers Bengaluru      Batter   
523       CH Gayle    2011  Royal Challengers Bangalore      Batter   

      Overall_Impact_Normalized  Batting_Impact_Normalized  \
2935                      100.0                      100.0   
2792                       94.6                       90

In [502]:
# Top by role
print("TOP PLAYERS BY ROLE:")
for role in player_impact_merged['Player_Role'].unique():
    role_players = player_impact_merged[player_impact_merged['Player_Role'] == role]
    if len(role_players) > 0:
        top_in_role = role_players.nlargest(3, 'Overall_Impact_Normalized')[
            ['Player', 'Season', 'Overall_Impact_Normalized', 'Player_Role']
        ]
        print(f"\n{role}:")
        print(top_in_role.round(1))


TOP PLAYERS BY ROLE:

All-Rounder:
         Player  Season  Overall_Impact_Normalized  Player_Role
2656  SR Watson    2008                       79.2  All-Rounder
2639  SP Narine    2024                       76.5  All-Rounder
895   HH Pandya    2019                       74.4  All-Rounder

Batter:
            Player  Season  Overall_Impact_Normalized Player_Role
2935       V Kohli    2016                      100.0      Batter
2792  Shubman Gill    2023                       94.6      Batter
1066    JC Buttler    2022                       94.2      Batter

Bowler:
          Player  Season  Overall_Impact_Normalized Player_Role
914     HV Patel    2021                       83.1      Bowler
1116   JJ Bumrah    2020                       82.5      Bowler
2573  SL Malinga    2011                       82.5      Bowler

Low data:
         Player  Season  Overall_Impact_Normalized Player_Role
11     A Chopra    2009                        0.0    Low data
13  A Dananjaya    2018           

In [503]:
# VALIDATION CHECKS

# Check score ranges
print("Score Ranges (0-100 scale):")
print(f"Batting Impact: {player_impact_merged['Batting_Impact_Normalized'].min():.1f} to {player_impact_merged['Batting_Impact_Normalized'].max():.1f}")
print(f"Bowling Impact: {player_impact_merged['Bowling_Impact_Normalized'].min():.1f} to {player_impact_merged['Bowling_Impact_Normalized'].max():.1f}")
print(f"Overall Impact: {player_impact_merged['Overall_Impact_Normalized'].min():.1f} to {player_impact_merged['Overall_Impact_Normalized'].max():.1f}")

# Check if known all-rounders are classified correctly
all_rounders = ['RA Jadeja', 'SH Khan', 'AD Russell', 'BA Stokes']
print(f"\nAll-Rounder Classification Check:")
for player in all_rounders:
    player_data = player_impact_merged[player_impact_merged['Player'] == player]
    if not player_data.empty:
        roles = player_data['Player_Role'].unique()
        print(f"{player}: {', '.join(roles)}")


Score Ranges (0-100 scale):
Batting Impact: 0.0 to 100.0
Bowling Impact: 0.0 to 100.0
Overall Impact: 0.0 to 100.0

All-Rounder Classification Check:
RA Jadeja: Batter, All-Rounder, Bowler
AD Russell: Batter, All-Rounder
BA Stokes: All-Rounder, Batter, Low data


# 4. FINAL OUTPUTS AND SUMMARY

In [488]:

# Top performers sample
print("\nTop 5 Batters by Impact Score:")
top_batters = batting_stats_clean.sort_values('Batting_Impact_Score', ascending=False).head()
print(top_batters[['Player', 'Season', 'Batting_Impact_Score']])

print("\nTop 5 Bowlers by Impact Score:")
top_bowlers = bowling_stats_clean.sort_values('Bowling_Impact_Score', ascending=False).head()
print(top_bowlers[['Player', 'Season', 'Bowling_Impact_Score']])



Top 5 Batters by Impact Score:
            Player  Season  Batting_Impact_Score
2935       V Kohli    2016              3.708310
2792  Shubman Gill    2023              3.241948
1066    JC Buttler    2022              3.212716
626      DA Warner    2016              3.107372
523       CH Gayle    2011              3.004216

Top 5 Bowlers by Impact Score:
          Player  Season  Bowling_Impact_Score
914     HV Patel    2021              2.864172
2573  SL Malinga    2011              2.858554
2627   SP Narine    2012              2.595003
1116   JJ Bumrah    2020              2.560436
1690   MM Sharma    2023              2.556630


## 4.1 SAVE PROCESSED DATASETS

In [489]:
# Save all processed datasets
df_player_team_year.to_csv('outputs/df_player_team_year.csv', index=False)
df_match_perf_team.to_csv('outputs/df_match_wise_performance.csv', index=False)
df_season_perf.to_csv('outputs/df_season_wise_performance.csv', index=False)
seasonal_batting_stats.to_csv('outputs/seasonal_batting_stats.csv', index=False)
seasonal_bowling_stats.to_csv('outputs/seasonal_bowling_stats.csv', index=False)
batting_stats_clean.to_csv('outputs/batting_impact_scores.csv', index=False)
bowling_stats_clean.to_csv('outputs/bowling_impact_scores.csv', index=False)

print("All datasets saved successfully!")
print("\nPreprocessing pipeline completed!")

All datasets saved successfully!

Preprocessing pipeline completed!


In [490]:
df.to_csv('outputs/df_cleaned.csv', index=False)

In [504]:
player_impact_merged.to_csv('outputs/impact_scores_with_role.csv', index = False)

In [494]:
player_impact_merged.head()

Unnamed: 0,Player,Season,Team,total_runs,total_balls_faced,total_boundaries,times_dismissed,innings_batted,Strike_Rate,Batting_Average,...,total_wickets,Economy_Rate,total_balls_bowled,Batting_Impact_Z,Bowling_Impact_Z,Weighted_Impact_Score,Player_Role,Batting_Impact_Normalized,Bowling_Impact_Normalized,Overall_Impact_Normalized
0,A Ashish Reddy,2012,Deccan Chargers,35,29,4,4,5,120.689655,8.75,...,11,8.355828,163,4.754311,5.30544,5.029875,All-Rounder,19.279551,48.464774,57.268795
1,A Ashish Reddy,2013,Sunrisers Hyderabad,125,89,13,6,10,140.449438,20.833333,...,3,10.35,40,5.402425,4.212914,5.283474,Batter,32.510729,22.857489,60.214771
2,A Ashish Reddy,2015,Sunrisers Hyderabad,73,46,8,3,5,158.695652,24.333333,...,4,8.166667,36,5.197413,4.740903,4.969158,All-Rounder,28.325424,35.232814,56.563458
3,A Ashish Reddy,2016,Sunrisers Hyderabad,47,29,6,2,3,162.068966,23.5,...,1,10.173913,23,5.163976,4.038156,5.163976,Batter,27.642813,18.761404,58.826599
4,A Badoni,2022,Lucknow Super Giants,161,130,18,8,11,123.846154,20.125,...,2,5.5,12,5.275319,4.971715,5.275319,Batter,29.91586,40.642737,60.120031
