In [3]:
# Importing essential libraries

# Data Gathering 
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns



#### Data Gathering 

In [174]:
# Load the ball-by-ball deliveries data from the local CSV file
deliveries = pd.read_csv('deliveries.csv')
# Preview 7 randomly chosen rows (no seed is passed, so the rows differ on each run)
deliveries.sample(7)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
41911,501199,1,Deccan Chargers,Rajasthan Royals,12,2,JP Duminy,SK Warne,DT Christian,0,0,0,,0,,,
2809,335993,2,Chennai Super Kings,Kolkata Knight Riders,12,2,MS Dhoni,LR Shukla,ML Hayden,0,0,0,,0,,,
164904,1175356,2,Chennai Super Kings,Royal Challengers Bangalore,8,6,AT Rayudu,UT Yadav,SK Raina,0,0,0,,0,,,
112676,829735,2,Rajasthan Royals,Chennai Super Kings,6,2,SR Watson,RA Jadeja,AM Rahane,0,0,0,,0,,,
144978,1082627,1,Sunrisers Hyderabad,Kolkata Knight Riders,18,7,KS Williamson,CR Woakes,Yuvraj Singh,4,0,4,,0,,,
77644,598003,1,Pune Warriors,Kings XI Punjab,8,1,LRPL Taylor,PP Chawla,RV Uthappa,0,0,0,,0,,,
101385,733987,2,Chennai Super Kings,Kings XI Punjab,5,5,BB McCullum,Sandeep Sharma,SK Raina,1,0,1,,0,,,


`Column` | Description
:-|:-
`match_id` |	Unique identifier for each match, used to link with match details.
`inning`|	Indicates the innings (1 or 2) in the match.
`batting_team`|Name of the team batting in the given inning.
`bowling_team`|	Name of the team bowling in the given inning.
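`over`|	Zero-based over number in the current inning (0, 1, ..., 19 in this dataset).
`ball`|	The ball number within the over, starting at 1; it can exceed 6 when the over contains extra deliveries such as wides or no-balls.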
`batter`|	Name of the player currently batting.
`bowler`	|Name of the player currently bowling.
`non_striker`|	Name of the non-striker at the other end of the pitch.
`batsman_runs`|	Number of runs scored by the batsman on the current delivery (can include 0 for dot balls or 4/6 for boundaries).
`extra_runs`	|Number of extra runs (such as no-balls, wides) on this delivery.
`total_runs`	|Total runs scored on this ball (sum of batsman runs and extra runs).
`extras_type`	|Type of extra run, if applicable (e.g., wide, no-ball, bye, legbye).
`is_wicket`	|Indicates if the delivery resulted in a wicket (1 for wicket, 0 for no wicket).
`player_dismissed`|	Name of the player dismissed on the ball (if applicable, otherwise null).
`dismissal_kind`	|Type of dismissal (e.g., bowled, caught, run out, LBW).
`fielder`	|Name of the fielder involved in the dismissal (for dismissals like catch, run out, etc.; null for dismissals not involving a fielder).

In [21]:
# Load the match-wise details from the local CSV file
matches = pd.read_csv('matches.csv')
# Preview 7 randomly chosen rows (no seed is passed, so the rows differ on each run)
matches.sample(7)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
210,501234,2011,Kolkata,2011-04-30,League,Iqbal Abdulla,Eden Gardens,Kolkata Knight Riders,Kings XI Punjab,Kolkata Knight Riders,field,Kolkata Knight Riders,wickets,8.0,120.0,20.0,N,,AM Saheba,SL Shastri
827,1254069,2021,Mumbai,2021-04-19,League,MM Ali,"Wankhede Stadium, Mumbai",Chennai Super Kings,Rajasthan Royals,Rajasthan Royals,field,Chennai Super Kings,runs,45.0,189.0,20.0,N,,PR Reiffel,VK Sharma
54,336019,2007/08,Chandigarh,2008-05-28,League,SE Marsh,"Punjab Cricket Association Stadium, Mohali",Kings XI Punjab,Rajasthan Royals,Rajasthan Royals,field,Kings XI Punjab,runs,41.0,222.0,20.0,N,,SJ Davis,K Hariharan
282,548343,2012,Chennai,2012-04-28,League,Mandeep Singh,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Kings XI Punjab,Kings XI Punjab,bat,Kings XI Punjab,runs,7.0,157.0,20.0,N,,BF Bowden,SK Tarapore
692,1136617,2018,Mumbai,2018-05-22,Qualifier 1,F du Plessis,Wankhede Stadium,Sunrisers Hyderabad,Chennai Super Kings,Chennai Super Kings,field,Chennai Super Kings,wickets,2.0,140.0,20.0,N,,C Shamshuddin,M Erasmus
1033,1422128,2024,Bengaluru,2024-03-29,League,SP Narine,"M Chinnaswamy Stadium, Bengaluru",Royal Challengers Bengaluru,Kolkata Knight Riders,Kolkata Knight Riders,field,Kolkata Knight Riders,wickets,7.0,183.0,20.0,N,,AK Chaudhary,R Pandit
159,419150,2009/10,Jaipur,2010-04-11,League,SR Tendulkar,Sawai Mansingh Stadium,Rajasthan Royals,Mumbai Indians,Rajasthan Royals,field,Mumbai Indians,runs,37.0,175.0,20.0,N,,BR Doctrove,SK Tarapore


`Column`	|Description
:-|:-
`id`	|Unique identifier for each match, similar to match_id in the ball-by-ball dataset, useful for linking datasets.
`season`|	Year in which the IPL season took place.
`city`	|City where the match was played.
`date`	|Date of the match.
`match_type`|	Type of match (could indicate regular season, playoff, or final).
`player_of_match`|	Name of the player awarded "Player of the Match."
`venue`|	Stadium where the match was played.
`team1`	|Name of the first team (order doesn’t imply batting or bowling first).
`team2`	|Name of the second team.
`toss_winner`|	Team that won the toss.
`toss_decision`|	Decision taken by the toss-winning team (bat or field).
`winner`|Name of the team that won the match.
`result`|	How the match was decided (e.g., runs, wickets, tie, no result).
`result_margin`|	Margin by which the team won (e.g., runs or wickets).
`target_runs`	|Target runs set for the chasing team (null if not applicable).
`target_overs`	|Target overs if the match is shortened (e.g., in rain-affected matches; null if not applicable).
`super_over`	|Indicates if the match had a super over (`Y` for yes, `N` for no).
`method`	|Method used to determine the result in rain-affected games (e.g., Duckworth-Lewis-Stern, or null if not applicable).
`umpire1`	|Name of the first on-field umpire.
`umpire2`	|Name of the second on-field umpire.

- Here we have two datasets: one containing ball-by-ball details and the other containing match-wise details.
- We are going to analyse each dataset for insights.


#### Data Cleaning

In [28]:
deliveries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder           9354 non-null    obj

- here we get data information where we know basics about our data
    - There are 260920 rows and 17 columns in deliveries dataset.
    - some columns have missing values.


In [30]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   object 
 17  method        

- Here we have 1095 rows and 20 columns.
- There are some missing values in this data.
- The `date` column is stored as `object`; it can be converted to datetime for better analysis.

##### Next, I cleaned both datasets column by column

#### Deliveries cleaning 

In [70]:
deliveries.sample(25)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
219490,1304094,1,Gujarat Titans,Punjab Kings,3,6,HH Pandya,K Rabada,B Sai Sudharsan,0,0,0,,0,,,
228394,1359485,1,Rajasthan Royals,Delhi Capitals,5,2,YBK Jaiswal,A Nortje,JC Buttler,0,0,0,,0,,,
259976,1426307,2,Sunrisers Hyderabad,Punjab Kings,9,6,Abhishek Sharma,Harpreet Brar,Nithish Kumar Reddy,6,0,6,,0,,,
249845,1426263,2,Mumbai Indians,Royal Challengers Bengaluru,0,6,Ishan Kishan,RJW Topley,RG Sharma,1,0,1,,0,,,
230911,1359495,1,Lucknow Super Giants,Punjab Kings,12,5,KL Rahul,Arshdeep Singh,KH Pandya,0,0,0,,0,,,
68149,548347,2,Kolkata Knight Riders,Chennai Super Kings,19,1,YK Pathan,R Ashwin,DB Das,2,0,2,,0,,,
42345,501201,1,Delhi Daredevils,Mumbai Indians,4,2,AJ Finch,AG Murtaza,V Sehwag,4,0,4,,0,,,
180576,1216498,1,Kings XI Punjab,Sunrisers Hyderabad,13,6,DJ Hooda,Sandeep Sharma,N Pooran,0,1,1,wides,0,,,
192648,1237177,1,Mumbai Indians,Delhi Capitals,1,5,Q de Kock,R Ashwin,SA Yadav,4,0,4,,0,,,
8097,336016,1,Royal Challengers Bangalore,Kings XI Punjab,15,4,MV Boucher,JR Hopes,P Kumar,6,0,6,,0,,,


#### match_id

In [43]:
deliveries['match_id'].isnull().sum()

0

In [53]:
deliveries['match_id'].nunique()

1095

- The match_id column has no null values.
- There are 1095 unique match IDs, which matches the number of rows in the matches dataset,
   i.e. we have ball-by-ball information for every match.

#### inning

In [56]:
deliveries['inning'].isnull().sum()

0

In [137]:
deliveries['inning'].unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

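- Here the data does not have null values.
- But the inning values look off: a match normally has only 2 innings, yet the values go up to 6.
    - The 3rd and 4th innings come from a super over; the 5th and 6th innings are treated here as an entry error (note: they could also come from a second super over in a twice-tied match, so this is worth verifying).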

In [176]:
# Remap inning 5 -> 3 and inning 6 -> 4; every other value passes through
def inning_replacing(inning):
    """Return the corrected inning number for a single value.

    A value equal to 5 becomes 3 and a value equal to 6 becomes 4.
    Any other value is returned unchanged.

    NOTE(review): this assumes innings 5 and 6 are mislabelled super-over
    innings; confirm they are not a genuine second super over.
    """
    # (recorded value, corrected value) pairs, checked in order
    for recorded, corrected in ((5, 3), (6, 4)):
        if inning == recorded:
            return corrected
    return inning

In [178]:
# Run inning_replacing on every value and overwrite the 'inning' column with the result
deliveries['inning']=deliveries['inning'].apply(inning_replacing)

In [180]:
deliveries['inning'].unique()

array([1, 2, 3, 4], dtype=int64)

#### batting team

In [149]:
deliveries['batting_team'].isnull().sum()

0

In [182]:
deliveries['batting_team'].nunique()

19

In [154]:
deliveries['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

- There is no null value in batting_team.
- Over the years team names have changed, so the same team appears under several names, and there are some spelling differences.

In [184]:
# Map alternative / older team names onto a single name per team
def change_team_name(team_name):
    """Return the unified name for ``team_name``.

    Names listed in the alias table below are replaced by the name they are
    paired with; any other value is returned unchanged.

    NOTE(review): some entries (e.g. 'Pune Warriors', 'Deccan Chargers',
    'Gujarat Lions') may be separate franchises rather than renames of the
    target team - confirm that merging them is intended.
    """
    # (names to replace, replacement) pairs, checked in order
    alias_table = (
        (('Rising Pune Supergiants', 'Pune Warriors'), 'Rising Pune Supergiant'),
        (('Royal Challengers Bengaluru',), 'Royal Challengers Bangalore'),
        (('Deccan Chargers',), 'Sunrisers Hyderabad'),
        (('Delhi Daredevils',), 'Delhi Capitals'),
        (('Gujarat Lions',), 'Gujarat Titans'),
        (('Kings XI Punjab',), 'Punjab Kings'),
    )
    for old_names, unified_name in alias_table:
        if team_name in old_names:
            return unified_name
    return team_name

In [186]:
# Run change_team_name on every batting-team value and overwrite the column with the result
deliveries['batting_team'] = deliveries['batting_team'].apply(change_team_name)

In [188]:
deliveries['batting_team'].nunique()

12

- There were 19 team names in the batting_team column, which reduced to 12 after the renaming; as far as we know, 12 teams have played in the IPL to date.

#### bowling team

In [194]:
deliveries['bowling_team'].isnull().sum()

0

In [198]:
deliveries['bowling_team'].nunique()

19

In [200]:
# Reuse change_team_name so bowling-team values are rewritten the same way as batting-team values
deliveries['bowling_team'] = deliveries['bowling_team'].apply(change_team_name)

In [204]:
deliveries['bowling_team'].nunique()

12

- There were 19 team names, reduced to 12; this is the same naming problem as in batting_team.

#### Over

In [207]:
deliveries['over'].isnull().sum()

0

In [212]:
deliveries['over'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

- All 20 overs are present; overs are numbered 0–19 because an over is counted only after it has been completed.

#### ball

In [218]:
deliveries['ball'].isnull().sum()

0

In [220]:
deliveries['ball'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

- As we know, each over has 6 legal balls; there can be extra deliveries (wides and no-balls) in some overs, so ball numbers above 6 appear.

#### batter

In [226]:
deliveries['batter'].isnull().sum()

0

In [230]:
deliveries['batter'].nunique()

673

In [236]:
deliveries['batter'].unique()

array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'L Ronchi',
       'ST Jayasuriya', 'DJ Thornely', 'RV Uthappa', 'PR Shah',
       'AM Nayar', 'SM Pollock', 'Harbhajan Singh', 'S Chanderpaul',
       'LRPL Taylor', 'AC Gilchrist', 'Y Venugopal Rao', 'VVS Laxman',
       'A Symonds', 'RG Sharma', 'SB Styris', 'AS Yadav', 'SB Bangar',
       'WPUJC Vaas', 'RP Singh', 'WP Saha', 'LR Shukla',
       'DPMD Jayawardene', 'S Sohal', 'B Lee', 'PP Cha

- A total of 673 batters have played in the IPL so far.

#### bowler

In [242]:
deliveries['bowler'].isnull().sum()

0

In [244]:
deliveries['bowler'].nunique()

530

In [246]:
deliveries['bowler'].unique()

array(['P Kumar', 'Z Khan', 'AA Noffke', 'JH Kallis', 'SB Joshi',
       'CL White', 'AB Dinda', 'I Sharma', 'AB Agarkar', 'SC Ganguly',
       'LR Shukla', 'B Lee', 'S Sreesanth', 'JR Hopes', 'IK Pathan',
       'K Goel', 'PP Chawla', 'WA Mota', 'JDP Oram', 'MS Gony',
       'M Muralitharan', 'P Amarnath', 'Joginder Sharma', 'GD McGrath',
       'B Geeves', 'MF Maharoof', 'R Bhatia', 'DL Vettori', 'MM Patel',
       'SR Watson', 'SK Trivedi', 'SK Warne', 'YK Pathan', 'D Salunkhe',
       'R Vinay Kumar', 'B Akhil', 'A Nehra', 'SM Pollock', 'DS Kulkarni',
       'ST Jayasuriya', 'Harbhajan Singh', 'AM Nayar', 'M Kartik',
       'Mohammad Hafeez', 'DJ Hussey', 'WPUJC Vaas', 'RP Singh',
       'SB Styris', 'SB Bangar', 'A Symonds', 'PP Ojha', 'Pankaj Singh',
       'Mohammad Asif', 'VY Mahesh', 'Shahid Afridi', 'DJ Bravo',
       'VS Yeligati', 'MA Khote', 'D Kalyankrishna', 'VRV Singh',
       'Sohail Tanvir', 'A Kumble', 'DNT Zoysa', 'SD Chitnis',
       'Yuvraj Singh', 'Shoaib Malik',

- 530 bowlers have played in the IPL so far.

#### non-striker

In [250]:
deliveries['non_striker'].isnull().sum()

0

In [254]:
deliveries['non_striker'].nunique()

663

In [256]:
deliveries['non_striker'].unique()

array(['BB McCullum', 'SC Ganguly', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'W Jaffer', 'R Dravid', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'ML Hayden', 'PA Patel', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'JR Hopes',
       'K Goel', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'YK Pathan', 'T Kohli', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'V Sehwag', 'G Gambhir', 'S Dhawan', 'ST Jayasuriya',
       'L Ronchi', 'DJ Thornely', 'RV Uthappa', 'PR Shah', 'AM Nayar',
       'SM Pollock', 'Harbhajan Singh', 'S Chanderpaul', 'LRPL Taylor',
       'Y Venugopal Rao', 'AC Gilchrist', 'VVS Laxman', 'A Symonds',
       'SB Styris', 'AS Yadav', 'SB Bangar', 'WPUJC Vaas', 'RP Singh',
       'PP Ojha', 'WP Saha', 'LR Shukla', 'DPMD Jayawardene', 'S Sohal',
       'PP Chawla', 'WA M

- there are total 663 unique values in non-striker

#### batsman_runs

In [263]:
deliveries['batsman_runs'].isnull().sum()

0

In [265]:
deliveries['batsman_runs'].unique()

array([0, 4, 6, 1, 2, 5, 3], dtype=int64)

- Values range from 0 to 6, the runs a batter can score off a single delivery.

#### extra_runs

In [269]:
deliveries['extra_runs'].isnull().sum()

0

In [271]:
deliveries['extra_runs'].unique()

array([1, 0, 5, 4, 2, 3, 7], dtype=int64)

- Because of mistakes by the bowling side, penalty runs are sometimes given as extras, which is why the value reaches 7 (see the `penalty` rows below).

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
1614,335988,2,Delhi Capitals,Sunrisers Hyderabad,11,6,V Sehwag,PP Ojha,S Dhawan,0,5,5,penalty,0,,,
98231,729309,2,Delhi Capitals,Mumbai Indians,4,2,M Vijay,SL Malinga,Q de Kock,0,7,7,penalty,0,,,


array(['legbyes', nan, 'wides', 'byes', 'noballs', 'penalty'],
      dtype=object)