## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

## Loading the data

In [3]:
df = pd.read_csv(r'..\the_real_deal\cricket_prediction_dataset.csv')

In [4]:
df.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,...,team2_win_streak,team1_last_5_wins,team2_last_5_wins,team1_has_previous_player_of_match,team2_has_previous_player_of_match,team1_player_of_match_count,team2_player_of_match_count,match_number_in_season,total_matches_in_season,season_progress
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,...,0,0.0,0.0,0,0,0,0,1,58,0.017241
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,...,0,0.0,0.0,0,0,0,0,2,58,0.034483
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Feroz Shah Kotla,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,...,0,0.0,0.0,0,0,0,0,3,58,0.051724
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,...,0,0.0,0.0,0,0,0,0,4,58,0.068966
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,...,0,1.0,0.0,0,1,0,1,5,58,0.086207


## Some basic statistics run:-

In [5]:
# Match outcomes
print("\nMatch outcomes:")
print(f"Total matches: {len(df)}")
print(f"Matches won by runs: {df[df['result'] == 'runs'].shape[0]}")
print(f"Matches won by wickets: {df[df['result'] == 'wickets'].shape[0]}")


Match outcomes:
Total matches: 1095
Matches won by runs: 498
Matches won by wickets: 578


In [6]:
toss_win_match_win = df[df['toss_winner'] == df['winner']].shape[0]
toss_win_percentage = (toss_win_match_win / len(df)) * 100
print(f"\nToss win led to match win: {toss_win_match_win} times ({toss_win_percentage:.2f}%)")


Toss win led to match win: 554 times (50.59%)


In [7]:
batting_first_win = df[df['batting_first_team'] == df['winner']].shape[0]
batting_first_win_percentage = (batting_first_win / len(df)) * 100
print(f"Batting first led to win: {batting_first_win} times ({batting_first_win_percentage:.2f}%)")

Batting first led to win: 500 times (45.66%)


## Analyze team performance

In [15]:
team_map ={"Mumbai Indians":"Mumbai Indians",
          "Chennai Super Kings":"Chennai Super Kings",
          "Kolkata Knight Riders":"Kolkata Knight Riders",
          "Royal Challengers Bangalore":"Royal Challengers Bangalore",
          "Royal Challengers Bengaluru":"Royal Challengers Bangalore",
          "Rajasthan Royals":"Rajasthan Royals",
          "Kings XI Punjab":"Kings XI Punjab",
          "Punjab Kings":"Kings XI Punjab",
          "Sunrisers Hyderabad":"Sunrisers Hyderabad",
          "Deccan Chargers":"Sunrisers Hyderabad",
          "Delhi Capitals":"Delhi Capitals",
          "Delhi Daredevils":"Delhi Capitals",
          "Gujarat Titans":"Gujarat Titans",
          "Gujarat Lions":"Gujarat Titans",
          "Lucknow Super Giants":"Lucknow Super Giants",
          "Pune Warriors":"Pune Warriors",
          "Rising Pune Supergiant":"Pune Warriors",
          "Rising Pune Supergiants":"Pune Warriors",
          "Kochi Tuskers Kerala":"Kochi Tuskers Kerala"}

df['team1']= df['team1'].map(team_map)
df['team2']= df['team2'].map(team_map)

In [16]:
teams = list(set(df['team1'].unique()) | set(df['team2'].unique()))

team_stats = {team: {'matches': 0, 'wins': 0, 'win_rate': 0, 'avg_elo': 0, 'avg_form': 0} for team in teams}

# Calculate team statistics
for _, row in df.iterrows():
    team1 = row['team1']
    team2 = row['team2']
    winner = row['winner']
    
    # Update team1 statistics
    team_stats[team1]['matches'] += 1
    team_stats[team1]['wins'] += 1 if row['team1_won'] == 1 else 0
    team_stats[team1]['avg_elo'] += row['team1_avg_elo']
    team_stats[team1]['avg_form'] += row['team1_avg_form']
    
    # Update team2 statistics
    team_stats[team2]['matches'] += 1
    team_stats[team2]['wins'] += 1 if row['team2_won'] == 1 else 0
    team_stats[team2]['avg_elo'] += row['team2_avg_elo']
    team_stats[team2]['avg_form'] += row['team2_avg_form']

# Calculate win rates and average stats
for team in team_stats:
    matches = team_stats[team]['matches']
    if matches > 0:
        team_stats[team]['win_rate'] = (team_stats[team]['wins'] / matches) * 100
        team_stats[team]['avg_elo'] /= matches
        team_stats[team]['avg_form'] /= matches

# Convert to DataFrame for better visualization
team_stats_df = pd.DataFrame.from_dict(team_stats, orient='index')
team_stats_df = team_stats_df.sort_values('win_rate', ascending=False)

print("\nTeam Performance Analysis:")
print(team_stats_df)


Team Performance Analysis:
                             matches  wins   win_rate      avg_elo  avg_form
Chennai Super Kings              238   138  57.983193  1609.481029  0.197063
Mumbai Indians                   261   144  55.172414  1605.395529  0.191851
Gujarat Titans                    75    41  54.666667  1631.385091  0.194795
Lucknow Super Giants              44    24  54.545455  1587.510576  0.185586
Kolkata Knight Riders            251   131  52.191235  1630.364456  0.189654
Rajasthan Royals                 221   112  50.678733  1592.142758  0.186282
Royal Challengers Bangalore      255   123  48.235294  1602.124756  0.190742
Delhi Capitals                   252   115  45.634921  1600.175056  0.193021
Kings XI Punjab                  246   112  45.528455  1578.994698  0.183305
Sunrisers Hyderabad              257   117  45.525292  1595.788651  0.187043
Kochi Tuskers Kerala              14     6  42.857143  1587.084663  0.174772
Pune Warriors                     76    27  35.5

In [None]:
print("VENUE ANALYSIS")

df['venue']=df['venue'].str.replace('Feroz Shah Kotla','Arun Jaitley Stadium')
df['venue']=df['venue'].str.replace('Arun Jaitley Stadium, Delhi','Arun Jaitley Stadium')
df['venue']=df['venue'].str.replace('Eden Gardens, Kolkata','Eden Gardens')
df['venue']=df['venue'].str.replace('Eden Gardens, Kolkata, Kolkata','Eden Gardens')
df['venue']=df['venue'].str.replace('Wankhede Stadium, Mumbai','Wankhede Stadium')
df['venue']=df['venue'].str.replace('Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh','Punjab Cricket Association Stadium, Mohali')
df['venue']=df['venue'].str.replace('Rajiv Gandhi International Stadium, Uppal, Hyderabad','Rajiv Gandhi International Stadium, Uppal')
df['venue']=df['venue'].str.replace('MA Chidambaram Stadium','MA Chidambaram Stadium, Chepauk')
df['venue']=df['venue'].str.replace('MA Chidambaram Stadium, Chepauk, Chennai','MA Chidambaram Stadium, Chepauk')

venue_counts = df['venue'].value_counts()

print(f"Number of unique venues: {len(venue_counts)}")
print(f"Most common venues: {', '.join(venue_counts.head(5).index)}")

# Batting first advantage by venue
venues = df.groupby('venue').agg(
    matches=('id', 'count'),
    batting_first_wins=('batting_first_won', 'sum')
)

venues['batting_first_win_rate'] = venues['batting_first_wins'] / venues['matches']

# Sort venues by batting first advantage
top_batting_venues = venues.sort_values('batting_first_win_rate', ascending=False).head(3)
bottom_batting_venues = venues.sort_values('batting_first_win_rate').head(3)

print("\nTop venues for batting first:")
for venue, row in top_batting_venues.iterrows():
    print(f"{venue}: {row['batting_first_win_rate']:.3f} ({row['batting_first_wins']}/{row['matches']})")

print("\nTop venues for bowling first:")
for venue, row in bottom_batting_venues.iterrows():
    print(f"{venue}: {1 - row['batting_first_win_rate']:.3f} ({row['matches'] - row['batting_first_wins']}/{row['matches']})")

# Analyze average first innings scores by venue
venue_scores = df.groupby('venue').agg(
    avg_first_innings=('venue_avg_first_innings', 'mean')
).sort_values('avg_first_innings', ascending=False)

print("\nVenues by average first innings score:")
for venue, row in venue_scores.head(3).iterrows():
    print(f"{venue}: {row['avg_first_innings']:.2f}")

for venue, row in venue_scores.tail(3).iterrows():
    print(f"{venue}: {row['avg_first_innings']:.2f}")

print("\n")

VENUE ANALYSIS
Number of unique venues: 52
Most common venues: Wankhede Stadium, Eden Gardens, Arun Jaitley Stadium

Top venues for batting first:
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam: 1.000 (2.0/2.0)
Maharashtra Cricket Association Stadium, Pune: 0.769 (10.0/13.0)
Himachal Pradesh Cricket Association Stadium, Dharamsala: 0.750 (3.0/4.0)

Top venues for bowling first:
Green Park: 1.000 (4.0/4.0)
Holkar Cricket Stadium: 0.889 (8.0/9.0)
JSCA International Stadium Complex: 0.714 (5.0/7.0)

Venues by average first innings score:
Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam: 232.50
Himachal Pradesh Cricket Association Stadium, Dharamsala: 203.00
M Chinnaswamy Stadium, Bengaluru: 197.21
New Wanderers Stadium: 145.00
OUTsurance Oval: 136.00
Newlands: 133.14




In [23]:
"""Analyze player performances"""
# Extract players of the match
players_of_match = df['player_of_match'].value_counts().reset_index()
players_of_match.columns = ['Player', 'Count']
players_of_match = players_of_match.sort_values('Count', ascending=False)

print("\nTop Players of the Match:")
print(players_of_match.head(10))


Top Players of the Match:
           Player  Count
0  AB de Villiers     25
1        CH Gayle     22
2       RG Sharma     19
4         V Kohli     18
3       DA Warner     18
5        MS Dhoni     17
6       SR Watson     16
7       YK Pathan     16
8       RA Jadeja     16
9      AD Russell     15


## Analyze factors that could predict match outcomes

In [None]:
df['match_outcome'] = df['team1_won']

# Create a correlation matrix
correlation_columns = [
    'team1_avg_elo', 'team2_avg_elo', 'team1_avg_form', 'team2_avg_form',
    'elo_diff_team1_team2', 'form_diff_team1_team2',
    'team1_batting_vs_team2_bowling', 'team2_batting_vs_team1_bowling',
    'team1_vs_team2_wins', 'team2_vs_team1_wins',
    'team1_win_streak', 'team2_win_streak',
    'team1_last_5_wins', 'team2_last_5_wins',
    'venue_batting_first_win_rate', 'venue_toss_win_rate',
    'toss_winner_is_team1', 'match_outcome'
]

correlation_df = df[correlation_columns].corr()
outcome_correlations = correlation_df['match_outcome'].sort_values(ascending=False)

print("\nFactors correlated with match outcome (team1 winning):")
print(outcome_correlations)

# Create a feature importance dataframe for visualization
features = outcome_correlations.index.tolist()
features.remove('match_outcome')
importance = outcome_correlations[features].abs().sort_values(ascending=False)

print("\nFeature importance for predicting match outcomes:")
print(importance)


Factors correlated with match outcome (team1 winning):
match_outcome                     1.000000
elo_diff_team1_team2              0.157602
form_diff_team1_team2             0.150027
team1_vs_team2_wins               0.138996
venue_batting_first_win_rate      0.099668
team1_batting_vs_team2_bowling    0.094037
team1_avg_elo                     0.083281
team1_avg_form                    0.082406
team2_win_streak                  0.058172
team1_last_5_wins                 0.040793
team2_last_5_wins                 0.034563
toss_winner_is_team1              0.020950
venue_toss_win_rate               0.006359
team2_avg_elo                     0.000101
team1_win_streak                 -0.007933
team2_vs_team1_wins              -0.065324
team2_avg_form                   -0.097669
team2_batting_vs_team1_bowling   -0.170784
Name: match_outcome, dtype: float64

Feature importance for predicting match outcomes:
team2_batting_vs_team1_bowling    0.170784
elo_diff_team1_team2              0.1576

In [29]:
with open("IDK.csv", 'w') as f:
    f.write(df.to_csv())