### Importing the required libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings

# Notebook-wide display configuration.
# NOTE(review): this hides every warning for the whole session, including
# pandas/plotly deprecation notices raised by later cells.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Apply the 'ggplot' style sheet to matplotlib figures.
plt.style.use('ggplot')

In [2]:
# Load the two CSV exports that the later cells read as match_df and delivery_df.
try:
    match_df = pd.read_csv('matchwise_data.csv')
    delivery_df = pd.read_csv('deliverywise_data.csv')
except FileNotFoundError as exc:
    # Fail here with the helpful message instead of only printing it:
    # carrying on without the frames would surface later as a NameError
    # that no longer points at the real cause.
    raise FileNotFoundError(
        "Error: Datasets not found. Please ensure 'matchwise_data.csv' and "
        "'deliverywise_data.csv' are in the same directory."
    ) from exc
else:
    print("Datasets loaded successfully.")

Datasets loaded successfully.


# Extensive Data Analysis

### Inspecting the Data structure

In [3]:
# Inspect Match Data: column dtypes / non-null counts, then the first rows.
print("Match Data Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
match_df.info()
display(match_df.head())

Match Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3037 entries, 0 to 3036
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   match_number         3037 non-null   int64  
 1   match_id             3037 non-null   int64  
 2   date                 3037 non-null   object 
 3   event_name           2975 non-null   object 
 4   ground_name          3037 non-null   object 
 5   ground_city          2914 non-null   object 
 6   team_1               3037 non-null   object 
 7   team_2               3037 non-null   object 
 8   toss_winner          3037 non-null   object 
 9   toss_decision        3037 non-null   object 
 10  team_1_total_runs    3037 non-null   int64  
 11  team_2_total_runs    3037 non-null   int64  
 12  winner               3037 non-null   object 
 13  margin_runs          1456 non-null   float64
 14  margin_wickets       1475 non-null   float64
 15  winning_method       

Unnamed: 0,match_number,match_id,date,event_name,ground_name,ground_city,team_1,team_2,toss_winner,toss_decision,team_1_total_runs,team_2_total_runs,winner,margin_runs,margin_wickets,winning_method,player_of_the_match
0,1,211048,2005-02-17,,Eden Park,Auckland,New Zealand,Australia,Australia,bat,170,214,Australia,44.0,,,RT Ponting
1,2,211028,2005-06-13,,The Rose Bowl,Southampton,England,Australia,England,bat,179,79,England,100.0,,,KP Pietersen
2,3,222678,2005-10-21,,New Wanderers Stadium,Johannesburg,South Africa,New Zealand,New Zealand,field,133,134,New Zealand,,5.0,,JS Patel
3,4,226374,2006-01-09,,"Brisbane Cricket Ground, Woolloongabba",Brisbane,Australia,South Africa,Australia,bat,209,114,Australia,95.0,,,DR Martyn
4,5,237242,2006-02-16,,Eden Park,Auckland,New Zealand,West Indies,New Zealand,field,126,126,tie,,,,DR Smith


In [4]:
# Inspect Delivery Data: column dtypes / non-null counts, then the first rows.
print("Delivery Data Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
delivery_df.info()
display(delivery_df.head())

Delivery Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 685558 entries, 0 to 685557
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          685558 non-null  int64 
 1   innings_number    685558 non-null  int64 
 2   batting_team      685558 non-null  object
 3   bowling_team      685558 non-null  object
 4   over_number       685558 non-null  int64 
 5   ball_number       685558 non-null  int64 
 6   batter            685558 non-null  object
 7   bowler            685558 non-null  object
 8   non_striker       685558 non-null  object
 9   wide_runs         685558 non-null  int64 
 10  leg_bye_runs      685558 non-null  int64 
 11  bye_runs          685558 non-null  int64 
 12  no_ball_runs      685558 non-null  int64 
 13  penalty_runs      685558 non-null  int64 
 14  batsman_runs      685558 non-null  int64 
 15  extra_runs        685558 non-null  int64 
 16  total_runs        

Unnamed: 0,match_id,innings_number,batting_team,bowling_team,over_number,ball_number,batter,bowler,non_striker,wide_runs,leg_bye_runs,bye_runs,no_ball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_type,fielder_name,match_number
0,211048,1,Australia,New Zealand,0,1,AC Gilchrist,DR Tuffey,MJ Clarke,1,0,0,0,0,0,1,1,,,,1
1,211048,1,Australia,New Zealand,0,2,AC Gilchrist,DR Tuffey,MJ Clarke,0,1,0,0,0,0,1,1,,,,1
2,211048,1,Australia,New Zealand,0,3,MJ Clarke,DR Tuffey,AC Gilchrist,0,0,0,0,0,0,0,0,,,,1
3,211048,1,Australia,New Zealand,0,4,MJ Clarke,DR Tuffey,AC Gilchrist,0,0,0,0,0,1,0,1,,,,1
4,211048,1,Australia,New Zealand,0,5,AC Gilchrist,DR Tuffey,MJ Clarke,0,0,0,0,0,1,0,1,,,,1


### Data Cleaning & Preprocessing

In [5]:
# Parse the match date column; errors='coerce' turns unparseable values into NaT
# instead of raising.
match_df['date'] = pd.to_datetime(match_df['date'], errors='coerce')

# For each frame, list only the columns that contain at least one missing value,
# together with their missing-value counts.
for heading, frame in (
    ("Missing values in Match Data:", match_df),
    ("\nMissing values in Delivery Data:", delivery_df),
):
    null_counts = frame.isnull().sum()
    print(heading)
    print(null_counts[null_counts > 0])

Missing values in Match Data:
event_name               62
ground_city             123
margin_runs            1581
margin_wickets         1562
winning_method         2931
player_of_the_match     261
dtype: int64

Missing values in Delivery Data:
player_dismissed    646556
dismissal_type      646556
fielder_name        659572
dtype: int64


## Distribution of matches by year

In [6]:
# Calendar year of every match, taken from the parsed 'date' column.
match_df['year'] = match_df['date'].dt.year

# Every team that appears on either side of a fixture, alphabetically.
team_1 = set(match_df['team_1'])
team_2 = set(match_df['team_2'])
teams = sorted(team_1.union(team_2))

# Matches per year for each team (a match counts for both participants).
team_year_data = {
    team: match_df[(match_df['team_1'] == team) | (match_df['team_2'] == team)]['year'].value_counts().sort_index()
    for team in teams
}

# Matches per year across all teams.
all_year_counts = match_df['year'].value_counts().sort_index()

# Not used by this cell any more; kept defined in case a later cell reads it.
colors = px.colors.sequential.Viridis


def _year_marker(counts):
    """Return the bar marker spec that colours each bar by its match count.

    The colour scale is given by name ("Viridis"). The same dict is used for
    the initial trace and inside the dropdown button args; button args are
    handed to plotly.js as-is, so a bare Python list of colour strings would
    not be converted into a valid colour scale there.
    """
    return {
        "color": counts.values,
        "colorscale": "Viridis",
        "showscale": True,
        "colorbar": {"title": {"text": "Number of Matches"}},
    }


def _year_title(label):
    """Return the figure title for the selection named `label`."""
    return f"Number of T20I Matches per Year — {label}"


def _year_button(label, counts):
    """Build one dropdown entry that swaps the bar trace to `counts`.

    `counts` is a year-indexed Series of match counts. With method="update",
    args[0] restyles the trace (values wrapped in a list, one entry per trace)
    and args[1] updates the layout.
    """
    return dict(
        label=label,
        method="update",
        args=[
            {
                "x": [counts.index],
                "y": [counts.values],
                "text": [counts.values],
                "marker": _year_marker(counts),
            },
            # Explicit title.text form rather than the bare-string shorthand.
            {"title": {"text": _year_title(label)}},
        ],
    )


fig = go.Figure()

# Initial view: all teams combined.
fig.add_trace(go.Bar(
    x=all_year_counts.index,
    y=all_year_counts.values,
    name="All Matches",
    text=all_year_counts.values,
    textposition='outside',
    marker=_year_marker(all_year_counts),
))

# First entry restores the combined view, followed by one entry per team.
dropdown_buttons = [_year_button("All Teams", all_year_counts)]
for team, year_counts in team_year_data.items():
    dropdown_buttons.append(_year_button(team, year_counts))

fig.update_layout(
    title={"text": _year_title("All Teams")},
    xaxis_title="Year",
    yaxis_title="Number of Matches",
    updatemenus=[
        dict(
            type="dropdown",
            direction="down",
            showactive=True,
            buttons=dropdown_buttons,
            # Anchor the menu to the right of the plotting area.
            x=1.25,
            xanchor="left",
            y=1,
            yanchor="top"
        )
    ],
    bargap=0.2
)

fig.show()


## Percentage & Count of No-Result Matches in T20Is

In [7]:
# Headline counts: all matches, matches recorded as 'no result', and the rest.
total_number_of_matches = len(match_df)
print(f"Total number of matches: {total_number_of_matches}")

# Case-insensitive comparison against the 'winner' column.
is_no_result = match_df['winner'].str.lower().eq('no result')
no_result_matches = match_df.loc[is_no_result]
no_result_matches_count = len(no_result_matches)
print(f"Number of matches with no result: {no_result_matches_count}")

# "Completed" here simply means "not recorded as no result".
completed_matches_count = total_number_of_matches - no_result_matches_count
print(f"Number of completed matches: {completed_matches_count}")


Total number of matches: 3037
Number of matches with no result: 69
Number of completed matches: 2968


In [8]:
# Donut chart comparing no-result matches with completed ones.
labels = ['No Result', 'Completed']
values = [no_result_matches_count, completed_matches_count]

no_result_pie = go.Pie(
    labels=labels,
    values=values,
    hole=0.55,  # ring instead of a full pie
    textinfo='label+value+percent',
    insidetextorientation='auto',
)

fig = go.Figure(
    data=[no_result_pie],
    layout=dict(
        title="Percentage & Count of No-Result Matches in T20Is",
        showlegend=True,
    ),
)

fig.show()

## Impact of Toss in the match outcome

In [22]:
# How often the toss winner chose each option in 'toss_decision'.
toss_preference = match_df['toss_decision'].value_counts()

toss_pie = go.Pie(
    labels=toss_preference.index,
    values=toss_preference.values,
    hole=0.55,  # ring instead of a full pie
    textinfo='label+value+percent',
)

fig = go.Figure(
    data=[toss_pie],
    layout=dict(
        title="Toss Winning Team's Preferred Decision: Bat vs Field",
        showlegend=True,
    ),
)

fig.show()

### Does winning the toss translate into winning the match?

In [10]:
# Matches not recorded as 'no result'. The explicit .copy() makes this an
# independent frame, so the column added below is not written onto a slice
# of match_df.
completed_matches_df = match_df[match_df['winner'].str.lower() != 'no result'].copy()
completed_matches_df['toss_winner_won'] = completed_matches_df['toss_winner'] == completed_matches_df['winner']

# Only matches actually won by one of the two sides say anything about the toss.
# Rows whose 'winner' is not a team (e.g. 'tie') would otherwise all be counted
# as defeats for the toss winner.
has_winning_team = (
    completed_matches_df['winner'].eq(completed_matches_df['team_1'])
    | completed_matches_df['winner'].eq(completed_matches_df['team_2'])
)
decided_toss_results = completed_matches_df.loc[has_winning_team, 'toss_winner_won']
toss_impact = decided_toss_results.value_counts()

# Count both outcomes directly from the boolean column so a missing category
# yields 0 instead of a KeyError on toss_impact[True] / toss_impact[False].
toss_winner_won_count = int(decided_toss_results.sum())
toss_winner_lost_count = int((~decided_toss_results).sum())

fig = go.Figure(
    data=[
        go.Pie(
            labels=['Toss Winner Won', 'Toss Winner Lost'],
            values=[toss_winner_won_count, toss_winner_lost_count],
            title='Impact of Toss on Match Result'
        )
    ]
)
fig.show()

### Does choosing to bat or field after winning the toss affect the final match result?

In [11]:
# Restrict to matches actually won by one of the two sides. Rows whose 'winner'
# is not a team (e.g. 'tie') have toss_winner_won == False and would otherwise
# be reported as 'Lost Match'.
decided_matches_df = completed_matches_df[
    completed_matches_df['winner'].eq(completed_matches_df['team_1'])
    | completed_matches_df['winner'].eq(completed_matches_df['team_2'])
]

# One row per (toss decision, did the toss winner win?) with its match count.
combined = decided_matches_df.groupby(['toss_decision', 'toss_winner_won']).size().reset_index(name='count')
combined['outcome'] = combined['toss_winner_won'].map({True: 'Won Match', False: 'Lost Match'})


fig = go.Figure()

# One stacked bar segment per outcome.
for result in ['Won Match', 'Lost Match']:
    # Filter once per outcome instead of once per trace argument.
    outcome_rows = combined[combined['outcome'] == result]
    fig.add_trace(go.Bar(
        x=outcome_rows['toss_decision'],
        y=outcome_rows['count'],
        name=result,
        text=outcome_rows['count'],
        textposition='auto'
    ))

fig.update_layout(
    barmode='stack',
    title="Toss Decision vs Match Outcome (Did Toss Decision Help?)",
    xaxis_title="Toss Decision (Bat / Field)",
    yaxis_title="Match Count"
)

fig.show()


## Match Result Analysis

In [18]:
# Completed matches with a recorded runs margin (non-null 'margin_runs').
win_by_runs_counts = int(completed_matches_df['margin_runs'].count())
print(win_by_runs_counts)

# Completed matches with a recorded wickets margin (non-null 'margin_wickets').
win_by_wickets_counts = int(completed_matches_df['margin_wickets'].count())
print(win_by_wickets_counts)

1456
1475


In [34]:
# Donut chart: wins by wickets vs. wins by runs.
result_pie = go.Pie(
    labels=["Win by Wickets", "Win by Runs"],
    values=[win_by_wickets_counts, win_by_runs_counts],
    hole=0.55,  # ring instead of a full pie
    textinfo='label+value+percent',
    insidetextorientation='auto',
)

fig = go.Figure(
    data=[result_pie],
    layout=dict(
        title='Match Result Distribution - Runs vs Wickets',
        title_x=0.5,  # centre the title horizontally
        showlegend=True,
    ),
)
# Fixed slice colours, in the same order as the labels above.
fig.update_traces(marker_colors=["#1f77b4", "#ff7f0e"])

fig.show()
