In [1]:
import pandas as pd

df = pd.read_csv("ipl_dataset_2025.csv")

df.head(2)


Unnamed: 0,match_id,date,venue,team_1,team_2,stage,toss-winner,toss_decision,first_ings_score,first_ings_wkts,second_ings_score,second_ings_wkts,match_winner,won_by,margin,player_of_the_match,top_scorer,highscore,best_bowling,Unnamed: 19
0,1,22-Mar-25,Kolkata,Kolkata Knight Riders,Royal Challengers Bengaluru,,Royal Challengers Bengaluru,Bowling,174,8,177,3.0,Royal Challengers Bengaluru,Wickets,7.0,Krunal Pandya,Virat Kohli,59.0,Krunal Pandya,
1,2,23-Mar-25,Hyderabad,Sunrisers Hyderabad,Rajasthan Royals,,Rajasthan Royals,Bowling,286,6,242,6.0,Sunrisers Hyderabad,Runs,44.0,Ishan Kishan,Ishan Kishan,106.0,Tushar Deshpande,


In [2]:
# Check data types of all columns
df.dtypes


match_id                 int64
date                    object
venue                   object
team_1                  object
team_2                  object
stage                  float64
toss-winner             object
toss_decision           object
first_ings_score        object
first_ings_wkts         object
second_ings_score       object
second_ings_wkts       float64
match_winner            object
won_by                  object
margin                 float64
player_of_the_match     object
top_scorer              object
highscore              float64
best_bowling            object
Unnamed: 19            float64
dtype: object

In [8]:
# Which team had the highest win rate in high-scoring matches (total score > 180)?
df['first_ings_score'] = pd.to_numeric(df['first_ings_score'], errors='coerce')
df['second_ings_score'] = pd.to_numeric(df['second_ings_score'], errors='coerce')

high_score_matches = df[(df['first_ings_score'] > 180) | (df['second_ings_score'] > 180)]
win_counts = high_score_matches['match_winner'].value_counts(normalize=True).reset_index()
win_counts.columns = ['Team', 'Win_Rate']

print("Team with Highest Win Rate in High-Scoring Matches (>180):")
print(win_counts.head(1))

Team with Highest Win Rate in High-Scoring Matches (>180):
           Team  Win_Rate
0  Punjab Kings  0.170213


In [9]:
# When a team batted first, how often did they win compared to chasing teams?
df['team_batting_first'] = df.apply(
    lambda row: row['team_1'] if row['toss_decision'] == 'Batting' else row['team_2'], axis=1)
bat_first_wins = (df['team_batting_first'] == df['match_winner']).sum()
total_matches = len(df)

bat_win_rate = round(bat_first_wins / total_matches * 100, 2)
chase_win_rate = round(100 - bat_win_rate, 2)

print(f" Batting First Win Rate: {bat_win_rate}%")
print(f" Chasing Win Rate: {chase_win_rate}%")

 Batting First Win Rate: 52.7%
 Chasing Win Rate: 47.3%


In [10]:
# Which city/venue saw the most closely contested matches (margin < 10 runs or < 2 wickets)?
df['margin'] = pd.to_numeric(df['margin'], errors='coerce')

close_matches = df[
    ((df['won_by'] == 'Runs') & (df['margin'] < 10)) |
    ((df['won_by'] == 'Wickets') & (df['margin'] < 2))
]

venue_counts = close_matches['venue'].value_counts().reset_index()
venue_counts.columns = ['Venue', 'Close_Match_Count']

print(" Venue with Most Close Matches:")
print(venue_counts.head(1))

 Venue with Most Close Matches:
            Venue  Close_Match_Count
0  Vishakhapatnam                  1


In [11]:
# How often did the Player of the Match come from the losing team?
df['potm_lost'] = df['player_of_the_match'] != df['match_winner']
count_lost = df['potm_lost'].sum()
total = len(df)
rate = round(count_lost / total * 100, 2)

print(f" Player of the Match from Losing Team: {rate}% of matches")

 Player of the Match from Losing Team: 98.65% of matches


In [13]:
#  Which player contributed more: Krunal Pandya or Prasidh Krishna?
def impact(player):
    return (
        (df['player_of_the_match'] == player).sum() +
        (df['top_scorer'] == player).sum() +
        (df['best_bowling'] == player).sum()
    )

krunal = impact("Krunal Pandya")
prasidh = impact("Prasidh Krishna")

print(f"Krunal Pandya impact score: {krunal}")
print(f"Prasidh Krishna impact score: {prasidh}")

if krunal > prasidh:
    print("Krunal Pandya contributed more")
elif prasidh > krunal:
    print("Prasidh Krishna contributed more")
else:
    print("Both contributed equally")

Krunal Pandya impact score: 9
Prasidh Krishna impact score: 7
Krunal Pandya contributed more


In [15]:
# Who had more match-winning performances — bowlers or batters?
bowlers = df['best_bowling'].value_counts().sum()
batters = df['top_scorer'].value_counts().sum()

print(f"🎯 Bowler performances (best bowling): {bowlers}")
print(f"🏏 Batter performances (top scorer): {batters}")

if bowlers > batters:
    print("Bowlers had more match-winning performances")
elif batters > bowlers:
    print("Batters had more match-winning performances")
else:
    print("Equal match-winning performances")

🎯 Bowler performances (best bowling): 74
🏏 Batter performances (top scorer): 74
Equal match-winning performances


In [16]:
# Clean column names
# Which team made the most effective toss decisions — i.e., had the highest win rate after winning the toss?
df['match_winner'] = df['match_winner'].astype(str)
df['toss-winner'] = df['toss-winner'].astype(str)

# Calculate win rate after winning toss
toss_wins = df[df['toss-winner'] == df['match_winner']]
toss_effectiveness = toss_wins['toss-winner'].value_counts() / df['toss-winner'].value_counts()
toss_effectiveness = toss_effectiveness.dropna().sort_values(ascending=False)

top_team = toss_effectiveness.index[0]
top_rate = round(toss_effectiveness.iloc[0] * 100, 2)

print(f" Most effective toss team: {top_team} with a {top_rate}% win rate after winning the toss.")

 Most effective toss team: Royal Challengers Bengaluru with a 85.71% win rate after winning the toss.


In [19]:
#  Is there a statistically significant advantage to bowling first vs. batting first across all venues?
from scipy.stats import chi2_contingency

# Determine if toss winner chose to bat or bowl
df['batting_first'] = df['toss_decision'].apply(lambda x: 1 if x == 'Batting' else 0)
df['batting_first_won'] = df.apply(
    lambda row: row['match_winner'] == row['team_1'] if row['toss_decision'] == 'Batting'
    else row['match_winner'] == row['team_2'],
    axis=1
)

# Build contingency table
bat = df[df['batting_first'] == 1]['batting_first_won'].value_counts()
bowl = df[df['batting_first'] == 0]['batting_first_won'].value_counts()

table = [
    [bat.get(True, 0), bat.get(False, 0)],
    [bowl.get(True, 0), bowl.get(False, 0)]
]

chi2, p, _, _ = chi2_contingency(table)

print("Chi-Square Test: Bowling vs Batting First")
print(f"P-value: {round(p, 4)}")
if p < 0.05:
    print("Statistically significant difference found.")
else:
    print("No statistically significant advantage between bowling or batting first.")

Chi-Square Test: Bowling vs Batting First
P-value: 0.7848
No statistically significant advantage between bowling or batting first.


In [18]:
# Which player had the highest average match impact when they were Player of the Match — combining top scorer and best bowling frequency?
# Ensure numeric type
df['highscore'] = pd.to_numeric(df['highscore'], errors='coerce')

# Flag 70+ scores
df['highscore_70plus'] = df['highscore'] >= 70
df['top_scorer_won'] = df['top_scorer'] == df['match_winner']

# Drop missing highscores
df = df[df['highscore'].notna()]

# Aggregate win/loss stats
above_70 = df[df['highscore_70plus']]['top_scorer_won'].value_counts()
below_70 = df[~df['highscore_70plus']]['top_scorer_won'].value_counts()

print("🎯 Impact of 70+ Top Scorer:")
print(f"70+ and won:  {above_70.get(True, 0)}")
print(f"70+ and lost: {above_70.get(False, 0)}")
print(f"<70 and won:  {below_70.get(True, 0)}")
print(f"<70 and lost: {below_70.get(False, 0)}")


🎯 Impact of 70+ Top Scorer:
70+ and won:  0
70+ and lost: 41
<70 and won:  0
<70 and lost: 32
