In [456]:
# Importing essential libraries

# Data Gathering 
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns

# to find corresponding string in data
import regex

#### Data Gathering 

In [250]:
deliveries = pd.read_csv('deliveries.csv')
deliveries.sample(7)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
139837,1082604,2,Sunrisers Hyderabad,Kolkata Knight Riders,7,3,DA Warner,Kuldeep Yadav,MC Henriques,0,0,0,,0,,,
69755,548354,2,Kings XI Punjab,Rajasthan Royals,3,4,SE Marsh,SR Watson,N Saini,4,0,4,,0,,,
113715,829745,1,Chennai Super Kings,Royal Challengers Bangalore,1,6,SK Raina,YS Chahal,DR Smith,4,0,4,,0,,,
141143,1082609,2,Kings XI Punjab,Sunrisers Hyderabad,17,4,M Vohra,S Kaul,KC Cariappa,1,0,1,,0,,,
157641,1136591,1,Royal Challengers Bangalore,Mumbai Indians,6,3,BB McCullum,M Markande,M Vohra,0,0,0,,0,,,
23578,392225,2,Mumbai Indians,Rajasthan Royals,6,1,SR Tendulkar,SK Warne,ST Jayasuriya,1,0,1,,0,,,
24698,392230,1,Delhi Daredevils,Rajasthan Royals,6,3,AB de Villiers,J Botha,TM Dilshan,0,0,0,,0,,,


`Column` | Description
:-|:-
`match_id` |	Unique identifier for each match, used to link with match details.
`inning`|	Indicates the innings (1 or 2) in the match.
`batting_team`|Name of the team batting in the given inning.
`bowling_team`|	Name of the team bowling in the given inning.
`over`|	The over number in the current inning (e.g., 1, 2, ..., 20).
`ball`|	The ball number within the over (1 to 6).
`batter`|	Name of the player currently batting.
`bowler`	|Name of the player currently bowling.
`non_striker`|	Name of the non-striker at the other end of the pitch.
`batsman_runs`|	Number of runs scored by the batsman on the current delivery (can include 0 for dot balls or 4/6 for boundaries).
`extra_runs`	|Number of extra runs (such as no-balls, wides) on this delivery.
`total_runs`	|Total runs scored on this ball (sum of batsman runs and extra runs).
`extras_type`	|Type of extra run, if applicable (e.g., wide, no-ball, bye, legbye).
`is_wicket`	|Indicates if the delivery resulted in a wicket (1 for wicket, 0 for no wicket).
`player_dismissed`|	Name of the player dismissed on the ball (if applicable, otherwise null).
`dismissal_kind`	|Type of dismissal (e.g., bowled, caught, run out, LBW).
`fielder`	|Name of the fielder involved in the dismissal (for dismissals like catch, run out, etc.; null for dismissals not involving a fielder).

In [252]:
matches = pd.read_csv('matches.csv')
matches.sample(7)

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
322,597998,2013,Kolkata,2013-04-03,League,SP Narine,Eden Gardens,Kolkata Knight Riders,Delhi Daredevils,Kolkata Knight Riders,field,Kolkata Knight Riders,wickets,6.0,129.0,20.0,N,,S Ravi,SJA Taufel
188,501211,2011,Chennai,2011-04-16,League,MEK Hussey,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Royal Challengers Bangalore,Chennai Super Kings,bat,Chennai Super Kings,runs,21.0,184.0,20.0,N,,HDPK Dharmasena,AL Hill
379,598055,2013,Pune,2013-05-11,League,MG Johnson,Subrata Roy Sahara Stadium,Pune Warriors,Mumbai Indians,Pune Warriors,bat,Mumbai Indians,wickets,5.0,113.0,20.0,N,,Asad Rauf,AK Chaudhary
609,1082624,2017,Pune,2017-04-29,League,LH Ferguson,Maharashtra Cricket Association Stadium,Rising Pune Supergiant,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Rising Pune Supergiant,runs,61.0,158.0,20.0,N,,KN Ananthapadmanabhan,M Erasmus
541,980949,2016,Pune,2016-04-29,League,DR Smith,Maharashtra Cricket Association Stadium,Rising Pune Supergiants,Gujarat Lions,Gujarat Lions,field,Gujarat Lions,wickets,3.0,196.0,20.0,N,,CB Gaffaney,BNJ Oxenford
565,980997,2016,Visakhapatnam,2016-05-17,League,AB Dinda,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,Rising Pune Supergiants,Delhi Daredevils,Rising Pune Supergiants,field,Rising Pune Supergiants,runs,19.0,58.0,11.0,N,D/L,Nitin Menon,C Shamshuddin
968,1359493,2023,Kolkata,2023-04-14,League,HC Brook,"Eden Gardens, Kolkata",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,field,Sunrisers Hyderabad,runs,23.0,229.0,20.0,N,,CB Gaffaney,GR Sadashiv Iyer


`Column`	|Description
:-|:-
`id`	|Unique identifier for each match, similar to match_id in the ball-by-ball dataset, useful for linking datasets.
`season`|	Year in which the IPL season took place.
`city`	|City where the match was played.
`date`	|Date of the match.
`match_type`|	Type of match (could indicate regular season, playoff, or final).
`player_of_match`|	Name of the player awarded "Player of the Match."
`venue`|	Stadium where the match was played.
`team1`	|Name of the first team (order doesn’t imply batting or bowling first).
`team2`	|Name of the second team.
`toss_winner`|	Team that won the toss.
`toss_decision`|	Decision taken by the toss-winning team (bat or field).
`winner`|Name of the team that won the match.
`result`|	Outcome type of the match (e.g., normal, tie, no result).
`result_margin`|	Margin by which the team won (e.g., runs or wickets).
`target_runs`	|Target runs set for the chasing team (null if not applicable).
`target_overs`	|Target overs if the match is shortened (e.g., in rain-affected matches; null if not applicable).
`super_over`	|Indicates if the match had a super over (1 for yes, 0 for no).
`method`	|Method used to determine the result in rain-affected games (e.g., Duckworth-Lewis-Stern, or null if not applicable).
`umpire1`	|Name of the first on-field umpire.
`umpire2`	|Name of the second on-field umpire.

- here we have two datasets one containing ball by ball details and other containing match wise details.
- we are going to analyse each data set for insights.


#### Data Cleaning

In [256]:
deliveries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   match_id          260920 non-null  int64 
 1   inning            260920 non-null  int64 
 2   batting_team      260920 non-null  object
 3   bowling_team      260920 non-null  object
 4   over              260920 non-null  int64 
 5   ball              260920 non-null  int64 
 6   batter            260920 non-null  object
 7   bowler            260920 non-null  object
 8   non_striker       260920 non-null  object
 9   batsman_runs      260920 non-null  int64 
 10  extra_runs        260920 non-null  int64 
 11  total_runs        260920 non-null  int64 
 12  extras_type       14125 non-null   object
 13  is_wicket         260920 non-null  int64 
 14  player_dismissed  12950 non-null   object
 15  dismissal_kind    12950 non-null   object
 16  fielder           9354 non-null    obj

- here we get data information where we know basics about our data
    - There are 260920 rows and 17 columns in deliveries dataset.
    - some columns have missing values.


In [258]:
matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   object 
 17  method        

- Here  we have 1095 rows and 20 columns .
- there are some missing values in this data.
- Date column is in form of object which can be converted into date time for better analysis.

##### Further I cleaned both datas column wise 

#### Deliveries cleaning 

In [262]:
deliveries.sample(25)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
206582,1254112,1,Mumbai Indians,Delhi Capitals,2,3,SA Yadav,R Ashwin,Q de Kock,0,0,0,,0,,,
9240,336021,1,Kolkata Knight Riders,Mumbai Indians,10,2,AB Agarkar,DJ Thornely,SC Ganguly,0,0,0,,0,,,
257039,1426293,1,Sunrisers Hyderabad,Mumbai Indians,14,4,M Jansen,PP Chawla,Shahbaz Ahmed,0,0,0,,0,,,
236243,1359516,2,Mumbai Indians,Rajasthan Royals,17,4,Tilak Varma,TA Boult,TH David,1,0,1,,0,,,
101995,733993,1,Delhi Daredevils,Sunrisers Hyderabad,3,4,KP Pietersen,B Kumar,MA Agarwal,0,0,0,,0,,,
225675,1312199,2,Rajasthan Royals,Royal Challengers Bangalore,10,4,SV Samson,HV Patel,JC Buttler,6,0,6,,0,,,
247295,1422133,1,Lucknow Super Giants,Royal Challengers Bengaluru,5,4,D Padikkal,GJ Maxwell,Q de Kock,0,0,0,,0,,,
13308,336040,1,Chennai Super Kings,Rajasthan Royals,11,3,SK Raina,SK Warne,JA Morkel,1,0,1,,0,,,
85594,598036,1,Royal Challengers Bangalore,Rajasthan Royals,5,1,V Kohli,SR Watson,A Mukund,0,0,0,,0,,,
253112,1426277,1,Chennai Super Kings,Lucknow Super Giants,6,2,RA Jadeja,Ravi Bishnoi,RD Gaikwad,1,0,1,,0,,,


#### match_id

In [264]:
deliveries['match_id'].isnull().sum()

0

In [265]:
deliveries['match_id'].nunique()

1095

- In match_id column there isn't any null value .
- Their are 1095 unique match id's which matches #rows in matches dataset.
   i.e. We have information of all matches.

#### inning

In [268]:
deliveries['inning'].isnull().sum()

0

In [269]:
deliveries['inning'].unique()

array([1, 2, 3, 4, 5, 6], dtype=int64)

- here data does not have null values.
- but innings values are little disturbed, as we know their should be only 2 innings but have till 6.
    - 3rd and 4th inning is their due to a super over but 5th and 6th inning is a entry error.

In [271]:
# If inning are 5 changing to 3 and if 6 changed to 4
def inning_replacing(inning):
    if inning == 5:
        return 3
    elif inning == 6:
        return 4
    else:
        return inning

In [272]:
deliveries['inning']=deliveries['inning'].apply(inning_replacing)

In [273]:
deliveries['inning'].unique()

array([1, 2, 3, 4], dtype=int64)

#### batting team

In [275]:
deliveries['batting_team'].isnull().sum()

0

In [276]:
deliveries['batting_team'].nunique()

19

In [277]:
deliveries['batting_team'].unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

- their is no null value in batting team
- over the years names of teams are changed so same team is present with several names or their are some speeling difference

In [279]:
# correcting teams name if differs
def change_team_name(team_name):
    if team_name in ['Rising Pune Supergiants','Pune Warriors']:
        return 'Rising Pune Supergiant'
    elif team_name == 'Royal Challengers Bengaluru':
        return 'Royal Challengers Bangalore'
    elif team_name == 'Deccan Chargers':
        return 'Sunrisers Hyderabad'
    elif team_name == 'Delhi Daredevils':
        return 'Delhi Capitals'
    elif team_name == 'Gujarat Lions':
        return 'Gujarat Titans'
    elif team_name == 'Kings XI Punjab':
        return 'Punjab Kings'
    else:
        return team_name

In [280]:
deliveries['batting_team'] = deliveries['batting_team'].apply(change_team_name)

In [281]:
deliveries['batting_team'].nunique()

12

- there were 19 teams in batting team column which reduced to 12 as we know till today 12 teams were played in IPL

#### bowling team

In [284]:
deliveries['bowling_team'].isnull().sum()

0

In [285]:
deliveries['bowling_team'].nunique()

19

In [286]:
deliveries['bowling_team'] = deliveries['bowling_team'].apply(change_team_name)

In [287]:
deliveries['bowling_team'].nunique()

12

- their were 19 teams reduced to 12 with same problem as batting team

#### Over

In [290]:
deliveries['over'].isnull().sum()

0

In [291]:
deliveries['over'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

- 20 overs are present 1 over is calculated after completing.

#### ball

In [294]:
deliveries['ball'].isnull().sum()

0

In [295]:
deliveries['ball'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int64)

- as we know each over has 6 balls their can be wides and no balls in some overs.

#### batter

In [298]:
deliveries['batter'].isnull().sum()

0

In [299]:
deliveries['batter'].nunique()

673

In [300]:
deliveries['batter'].unique()

array(['SC Ganguly', 'BB McCullum', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'R Dravid', 'W Jaffer', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'PA Patel', 'ML Hayden', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'K Goel',
       'JR Hopes', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'T Kohli', 'YK Pathan', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'G Gambhir', 'V Sehwag', 'S Dhawan', 'L Ronchi',
       'ST Jayasuriya', 'DJ Thornely', 'RV Uthappa', 'PR Shah',
       'AM Nayar', 'SM Pollock', 'Harbhajan Singh', 'S Chanderpaul',
       'LRPL Taylor', 'AC Gilchrist', 'Y Venugopal Rao', 'VVS Laxman',
       'A Symonds', 'RG Sharma', 'SB Styris', 'AS Yadav', 'SB Bangar',
       'WPUJC Vaas', 'RP Singh', 'WP Saha', 'LR Shukla',
       'DPMD Jayawardene', 'S Sohal', 'B Lee', 'PP Cha

- there are total 673 batter played in IPL till now

#### bowler

In [303]:
deliveries['bowler'].isnull().sum()

0

In [304]:
deliveries['bowler'].nunique()

530

In [305]:
deliveries['bowler'].unique()

array(['P Kumar', 'Z Khan', 'AA Noffke', 'JH Kallis', 'SB Joshi',
       'CL White', 'AB Dinda', 'I Sharma', 'AB Agarkar', 'SC Ganguly',
       'LR Shukla', 'B Lee', 'S Sreesanth', 'JR Hopes', 'IK Pathan',
       'K Goel', 'PP Chawla', 'WA Mota', 'JDP Oram', 'MS Gony',
       'M Muralitharan', 'P Amarnath', 'Joginder Sharma', 'GD McGrath',
       'B Geeves', 'MF Maharoof', 'R Bhatia', 'DL Vettori', 'MM Patel',
       'SR Watson', 'SK Trivedi', 'SK Warne', 'YK Pathan', 'D Salunkhe',
       'R Vinay Kumar', 'B Akhil', 'A Nehra', 'SM Pollock', 'DS Kulkarni',
       'ST Jayasuriya', 'Harbhajan Singh', 'AM Nayar', 'M Kartik',
       'Mohammad Hafeez', 'DJ Hussey', 'WPUJC Vaas', 'RP Singh',
       'SB Styris', 'SB Bangar', 'A Symonds', 'PP Ojha', 'Pankaj Singh',
       'Mohammad Asif', 'VY Mahesh', 'Shahid Afridi', 'DJ Bravo',
       'VS Yeligati', 'MA Khote', 'D Kalyankrishna', 'VRV Singh',
       'Sohail Tanvir', 'A Kumble', 'DNT Zoysa', 'SD Chitnis',
       'Yuvraj Singh', 'Shoaib Malik',

- 530 bowlers played IPL till now

#### non-striker

In [308]:
deliveries['non_striker'].isnull().sum()

0

In [309]:
deliveries['non_striker'].nunique()

663

In [310]:
deliveries['non_striker'].unique()

array(['BB McCullum', 'SC Ganguly', 'RT Ponting', 'DJ Hussey',
       'Mohammad Hafeez', 'W Jaffer', 'R Dravid', 'V Kohli', 'JH Kallis',
       'CL White', 'MV Boucher', 'B Akhil', 'AA Noffke', 'P Kumar',
       'Z Khan', 'SB Joshi', 'ML Hayden', 'PA Patel', 'MEK Hussey',
       'MS Dhoni', 'SK Raina', 'JDP Oram', 'S Badrinath', 'JR Hopes',
       'K Goel', 'KC Sangakkara', 'Yuvraj Singh', 'SM Katich',
       'IK Pathan', 'YK Pathan', 'T Kohli', 'SR Watson', 'M Kaif',
       'DS Lehmann', 'RA Jadeja', 'M Rawat', 'D Salunkhe', 'SK Warne',
       'SK Trivedi', 'V Sehwag', 'G Gambhir', 'S Dhawan', 'ST Jayasuriya',
       'L Ronchi', 'DJ Thornely', 'RV Uthappa', 'PR Shah', 'AM Nayar',
       'SM Pollock', 'Harbhajan Singh', 'S Chanderpaul', 'LRPL Taylor',
       'Y Venugopal Rao', 'AC Gilchrist', 'VVS Laxman', 'A Symonds',
       'SB Styris', 'AS Yadav', 'SB Bangar', 'WPUJC Vaas', 'RP Singh',
       'PP Ojha', 'WP Saha', 'LR Shukla', 'DPMD Jayawardene', 'S Sohal',
       'PP Chawla', 'WA M

- there are total 663 unique values in non-striker

#### batsmam_runs

In [313]:
deliveries['batsman_runs'].isnull().sum()

0

In [314]:
deliveries['batsman_runs'].unique()

array([0, 4, 6, 1, 2, 5, 3], dtype=int64)

- values are till 6 that a batter can strike

#### extra_runs

In [317]:
deliveries['extra_runs'].isnull().sum()

0

In [318]:
deliveries['extra_runs'].unique()

array([1, 0, 5, 4, 2, 3, 7], dtype=int64)

- because of bowlers mistake some penalties are gives so the value is 7

#### Total_runs

In [321]:
deliveries['total_runs'].isnull().sum()

0

In [322]:
deliveries['total_runs'].unique()

array([1, 0, 4, 6, 5, 2, 3, 7], dtype=int64)

- there is no anamolous behaviour in this column

#### extras_type

In [325]:
deliveries['extras_type'].isnull().sum()

246795

In [326]:
deliveries['extras_type'].unique()

array(['legbyes', nan, 'wides', 'byes', 'noballs', 'penalty'],
      dtype=object)

- here we have 246795 null values which is valid as their isnt any extra runs for these ball.
- there is no need to fill these null values 
- we can futher drop this column as it is of no use.

#### is_wicket

In [329]:
deliveries['is_wicket'].isnull().sum()

0

In [330]:
deliveries['is_wicket'].unique()

array([0, 1], dtype=int64)

- in this column 0 means no wicket and 1 mean they caught a wicket.


#### player_dismissed

In [333]:
deliveries['player_dismissed'].isnull().sum()

247970

In [334]:
deliveries['player_dismissed'].unique()

array([nan, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil',
       'CL White', 'AA Noffke', 'Z Khan', 'SB Joshi', 'PA Patel',
       'ML Hayden', 'MS Dhoni', 'SK Raina', 'JDP Oram', 'K Goel',
       'JR Hopes', 'Yuvraj Singh', 'KC Sangakkara', 'T Kohli',
       'YK Pathan', 'SR Watson', 'DS Lehmann', 'M Kaif', 'M Rawat',
       'RA Jadeja', 'SK Warne', 'V Sehwag', 'L Ronchi', 'DJ Thornely',
       'ST Jayasuriya', 'PR Shah', 'RV Uthappa', 'AM Nayar', 'SM Pollock',
       'S Chanderpaul', 'LRPL Taylor', 'Y Venugopal Rao', 'VVS Laxman',
       'AC Gilchrist', 'RG Sharma', 'SB Styris', 'AS Yadav', 'A Symonds',
       'WPUJC Vaas', 'SB Bangar', 'PP Ojha', 'BB McCullum', 'WP Saha',
       'Mohammad Hafeez', 'DPMD Jayawardene', 'IK Pathan', 'B Lee',
       'S Sohal', 'Kamran Akmal', 'Shahid Afridi', 'G Gambhir',
       'MEK Hussey', 'DJ Bravo', 'MA Khote', 'Harbhajan Singh',
       'GC Smith', 'D Salunkhe', 'RR Sarwan',

- in this column we have the player of whose the wicket is caught
- the data is of every ball so their are many null value which is valid. 
- the column does not gives any specific value so we might delete it later.

#### dismissal_kind

In [337]:
deliveries['dismissal_kind'].isnull().sum()

247970

In [338]:
deliveries['dismissal_kind'].unique()

array([nan, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

- here also null values are valid due to same reason

#### fielder

In [341]:
deliveries['fielder'].isnull().sum()

251566

In [342]:
deliveries['fielder'].unique()

array([nan, 'JH Kallis', 'P Kumar', 'CL White', 'M Kartik', 'RT Ponting',
       'WP Saha', 'AB Agarkar', 'BB McCullum', 'KC Sangakkara',
       'IK Pathan', 'PA Patel', 'Joginder Sharma', 'P Amarnath',
       'M Manhas', 'B Geeves', 'V Kohli', 'Z Khan', 'MV Boucher',
       'AM Nayar', 'L Ronchi', 'LR Shukla', 'Mohammad Hafeez', 'AB Dinda',
       'DJ Hussey', 'SC Ganguly', 'RG Sharma', 'SB Styris', 'A Symonds',
       'Kamran Akmal', 'RA Jadeja', 'PP Chawla', 'S Dhawan', 'KD Karthik',
       'MK Tiwary', 'GD McGrath', 'G Gambhir', 'AC Gilchrist', 'DJ Bravo',
       'Harbhajan Singh', 'S Badrinath', 'JDP Oram', 'SK Raina',
       'D Salunkhe', 'SB Bangar', 'D Kalyankrishna', 'VVS Laxman',
       'DS Kulkarni', 'AM Rahane', 'A Nehra', 'B Lee', 'RR Sarwan',
       'Yuvraj Singh', 'MM Patel', 'GC Smith', 'M Rawat', 'Sohail Tanvir',
       'A Kumble', 'M Muralitharan', 'MS Dhoni', 'I Sharma', 'PP Ojha',
       'DPMD Jayawardene', 'V Sehwag', 'W Jaffer', 'R Vinay Kumar',
       'R Dravid',

- fielders who caused the dismissal of the player
- therefore null values are valid.

#### checking duplicate rows

In [345]:
deliveries[deliveries.duplicated()]

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder


- their isn't any duplicate value in our data

#### cleaning matches dataset

In [348]:
matches['id'].isnull().sum()

0

In [349]:
matches['id'].nunique()

1095

In [350]:
matches.rename(columns={'id':'match_id'},inplace=True)

- for our convinience column name is changed into match_id 

#### season

In [353]:
matches['season'].isnull().sum()

0

In [354]:
matches['season'].unique()

array(['2007/08', '2009', '2009/10', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019', '2020/21', '2021', '2022',
       '2023', '2024'], dtype=object)

In [355]:
matches[matches['season']=='2020/21']

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
756,1216492,2020/21,Abu Dhabi,2020-09-19,League,AT Rayudu,Sheikh Zayed Stadium,Mumbai Indians,Chennai Super Kings,Chennai Super Kings,field,Chennai Super Kings,wickets,5.0,163.0,20.0,N,,CB Gaffaney,VK Sharma
757,1216493,2020/21,,2020-09-20,League,MP Stoinis,Dubai International Cricket Stadium,Delhi Capitals,Kings XI Punjab,Kings XI Punjab,field,Delhi Capitals,tie,,158.0,20.0,Y,,AK Chaudhary,Nitin Menon
758,1216534,2020/21,,2020-09-21,League,YS Chahal,Dubai International Cricket Stadium,Royal Challengers Bangalore,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Royal Challengers Bangalore,runs,10.0,164.0,20.0,N,,AY Dandekar,Nitin Menon
759,1216496,2020/21,,2020-09-22,League,SV Samson,Sharjah Cricket Stadium,Rajasthan Royals,Chennai Super Kings,Chennai Super Kings,field,Rajasthan Royals,runs,16.0,217.0,20.0,N,,C Shamshuddin,VA Kulkarni
760,1216508,2020/21,Abu Dhabi,2020-09-23,League,RG Sharma,Sheikh Zayed Stadium,Mumbai Indians,Kolkata Knight Riders,Kolkata Knight Riders,field,Mumbai Indians,runs,49.0,196.0,20.0,N,,CB Gaffaney,S Ravi
761,1216510,2020/21,,2020-09-24,League,KL Rahul,Dubai International Cricket Stadium,Kings XI Punjab,Royal Challengers Bangalore,Royal Challengers Bangalore,field,Kings XI Punjab,runs,97.0,207.0,20.0,N,,AK Chaudhary,PR Reiffel
762,1216539,2020/21,,2020-09-25,League,PP Shaw,Dubai International Cricket Stadium,Delhi Capitals,Chennai Super Kings,Chennai Super Kings,field,Delhi Capitals,runs,44.0,176.0,20.0,N,,KN Ananthapadmanabhan,RK Illingworth
763,1216545,2020/21,Abu Dhabi,2020-09-26,League,Shubman Gill,Sheikh Zayed Stadium,Sunrisers Hyderabad,Kolkata Knight Riders,Sunrisers Hyderabad,bat,Kolkata Knight Riders,wickets,7.0,143.0,20.0,N,,CB Gaffaney,VK Sharma
764,1216527,2020/21,,2020-09-27,League,SV Samson,Sharjah Cricket Stadium,Kings XI Punjab,Rajasthan Royals,Rajasthan Royals,field,Rajasthan Royals,wickets,4.0,224.0,20.0,N,,RK Illingworth,K Srinivasan
765,1216547,2020/21,,2020-09-28,League,AB de Villiers,Dubai International Cricket Stadium,Royal Challengers Bangalore,Mumbai Indians,Mumbai Indians,field,Royal Challengers Bangalore,tie,,202.0,20.0,Y,,Nitin Menon,PR Reiffel


- this column has some seasons in 2 year form. we have to convert this to single season 
- may be we can do this using date column. 

#### date

In [358]:
matches['date'].isnull().sum()

0

In [359]:
matches['date']

0       2008-04-18
1       2008-04-19
2       2008-04-19
3       2008-04-20
4       2008-04-20
           ...    
1090    2024-05-19
1091    2024-05-21
1092    2024-05-22
1093    2024-05-24
1094    2024-05-26
Name: date, Length: 1095, dtype: object

In [360]:
matches['date']=pd.to_datetime(matches['date'])
matches['season']=matches['date'].dt.year

- here converted datatype from object to datetime 
- consequently fetched seasons.

In [362]:
matches['season'].value_counts()

season
2013    76
2023    74
2012    74
2022    74
2011    73
2024    71
2018    60
2021    60
2020    60
2019    60
2016    60
2014    60
2010    60
2017    59
2015    59
2008    58
2009    57
Name: count, dtype: int64

#### city

In [432]:
matches['city'].isnull().sum()

51

In [434]:
matches['city'].unique()

array(['Bangalore', 'Chandigarh', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamsala',
       'Kochi', 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
       'Abu Dhabi', nan, 'Rajkot', 'Kanpur', 'Bengaluru', 'Dubai',
       'Sharjah', 'Navi Mumbai', 'Lucknow', 'Guwahati', 'Mohali'],
      dtype=object)

- here we have 51 null values for some matches may be we can fill them in feature engineering.

#### match_type

In [440]:
matches['match_type'].isnull().sum()

0

In [442]:
matches['match_type'].unique()

array(['League', 'Semi Final', 'Final', '3rd Place Play-Off',
       'Qualifier 1', 'Elimination Final', 'Qualifier 2', 'Eliminator'],
      dtype=object)

#### player_of_match

In [447]:
matches['player_of_match'].isnull().sum()

5

- if needed we can handle these missing value

#### venue

In [451]:
matches['venue'].isnull().sum()

0

In [453]:
matches['venue'].unique()

array(['M Chinnaswamy Stadium',
       'Punjab Cricket Association Stadium, Mohali', 'Feroz Shah Kotla',
       'Wankhede Stadium', 'Eden Gardens', 'Sawai Mansingh Stadium',
       'Rajiv Gandhi International Stadium, Uppal',
       'MA Chidambaram Stadium, Chepauk', 'Dr DY Patil Sports Academy',
       'Newlands', "St George's Park", 'Kingsmead', 'SuperSport Park',
       'Buffalo Park', 'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Brabourne Stadium, Mumbai',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Holkar Cricket Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Maharashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'JSCA International Stadium Complex', 'Sh

- here some values denotes same venue but has different values we can solve that 

In [467]:
# Mapping dictionary for cleaning
venue_mapping = {
    'M Chinnaswamy Stadium, Bengaluru': 'M Chinnaswamy Stadium',
    'M.Chinnaswamy Stadium': 'M Chinnaswamy Stadium',
    'Punjab Cricket Association Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
    'Punjab Cricket Association IS Bindra Stadium, Mohali': 'Punjab Cricket Association IS Bindra Stadium',
    'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh': 'Punjab Cricket Association IS Bindra Stadium',
    'MA Chidambaram Stadium, Chepauk': 'MA Chidambaram Stadium',
    'MA Chidambaram Stadium, Chepauk, Chennai': 'MA Chidambaram Stadium',
    'Wankhede Stadium, Mumbai': 'Wankhede Stadium',
    'Arun Jaitley Stadium, Delhi': 'Arun Jaitley Stadium',
    'Eden Gardens, Kolkata': 'Eden Gardens',
    'Rajiv Gandhi International Stadium, Uppal': 'Rajiv Gandhi International Stadium',
    'Rajiv Gandhi International Stadium, Uppal, Hyderabad': 'Rajiv Gandhi International Stadium',
    'Dr DY Patil Sports Academy, Mumbai': 'Dr DY Patil Sports Academy',
    'Maharashtra Cricket Association Stadium, Pune': 'Maharashtra Cricket Association Stadium',
    'Himachal Pradesh Cricket Association Stadium, Dharamsala': 'Himachal Pradesh Cricket Association Stadium',
    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam': 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
    'Brabourne Stadium, Mumbai':'Brabourne Stadium',
    'Sawai Mansingh Stadium, Jaipur':'Sawai Mansingh Stadium',
    'Narendra Modi Stadium, Ahmedabad': 'Narendra Modi Stadium',
    'Zayed Cricket Stadium, Abu Dhabi': 'Zayed Cricket Stadium',
    'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow': 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium',
    'Barsapara Cricket Stadium, Guwahati': 'Barsapara Cricket Stadium',
    'Maharaja Yadavindra Singh International Cricket Stadium, Mullanpur': 'Maharaja Yadavindra Singh International Cricket Stadium'
    
    # Add more mappings as necessary
}

# Replace venue names
matches['venue']=matches['venue'].replace(venue_mapping)

In [477]:
matches['venue'].unique()

array(['M Chinnaswamy Stadium',
       'Punjab Cricket Association IS Bindra Stadium', 'Feroz Shah Kotla',
       'Wankhede Stadium', 'Eden Gardens', 'Sawai Mansingh Stadium',
       'Rajiv Gandhi International Stadium', 'MA Chidambaram Stadium',
       'Dr DY Patil Sports Academy', 'Newlands', "St George's Park",
       'Kingsmead', 'SuperSport Park', 'Buffalo Park',
       'New Wanderers Stadium', 'De Beers Diamond Oval',
       'OUTsurance Oval', 'Brabourne Stadium',
       'Sardar Patel Stadium, Motera', 'Barabati Stadium',
       'Vidarbha Cricket Association Stadium, Jamtha',
       'Himachal Pradesh Cricket Association Stadium', 'Nehru Stadium',
       'Holkar Cricket Stadium',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Subrata Roy Sahara Stadium',
       'Maharashtra Cricket Association Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'JSCA International Stadium Complex', 'Sheikh Zayed Stadium',
       'Sharjah Cricket Stadi

In [479]:
venue_to_city = {
    'M Chinnaswamy Stadium': 'Bengaluru',
    'Punjab Cricket Association IS Bindra Stadium': 'Mohali',
    'Feroz Shah Kotla': 'Delhi',
    'Wankhede Stadium': 'Mumbai',
    'Eden Gardens': 'Kolkata',
    'Sawai Mansingh Stadium': 'Jaipur',
    'Rajiv Gandhi International Stadium': 'Hyderabad',
    'MA Chidambaram Stadium': 'Chennai',
    'Dr DY Patil Sports Academy': 'Mumbai',
    'Newlands': 'Cape Town',
    "St George's Park": 'Port Elizabeth',
    'Kingsmead': 'Durban',
    'SuperSport Park': 'Centurion',
    'Buffalo Park': 'East London',
    'New Wanderers Stadium': 'Johannesburg',
    'De Beers Diamond Oval': 'Kimberley',
    'OUTsurance Oval': 'Bloemfontein',
    'Brabourne Stadium': 'Mumbai',
    'Sardar Patel Stadium, Motera': 'Ahmedabad',
    'Barabati Stadium': 'Cuttack',
    'Vidarbha Cricket Association Stadium, Jamtha': 'Nagpur',
    'Himachal Pradesh Cricket Association Stadium': 'Dharamshala',
    'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium': 'Visakhapatnam',
    'Subrata Roy Sahara Stadium': 'Pune',
    'Shaheed Veer Narayan Singh International Stadium': 'Raipur',
    'JSCA International Stadium Complex': 'Ranchi',
    'Sheikh Zayed Stadium': 'Abu Dhabi',
    'Sharjah Cricket Stadium': 'Sharjah',
    'Dubai International Cricket Stadium': 'Dubai',
    'Maharashtra Cricket Association Stadium': 'Pune',
    'Saurashtra Cricket Association Stadium': 'Rajkot',
    'Green Park': 'Kanpur',
    'Holkar Cricket Stadium': 'Indore',
    'Arun Jaitley Stadium': 'Delhi',
    'Narendra Modi Stadium': 'Ahmedabad',
    'Zayed Cricket Stadium': 'Abu Dhabi',
    'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium': 'Lucknow',
    'Barsapara Cricket Stadium': 'Guwahati',
    'Maharaja Yadavindra Singh International Cricket Stadium': 'Chandigarh'
}

matches['city']=matches['venue'].map(venue_to_city)

In [485]:
matches['city'].unique()

array(['Bengaluru', 'Mohali', 'Delhi', 'Mumbai', 'Kolkata', 'Jaipur',
       'Hyderabad', 'Chennai', 'Cape Town', 'Port Elizabeth', 'Durban',
       'Centurion', 'East London', 'Johannesburg', 'Kimberley',
       'Bloemfontein', 'Ahmedabad', 'Cuttack', 'Nagpur', 'Dharamshala',
       nan, 'Indore', 'Visakhapatnam', 'Pune', 'Raipur', 'Ranchi',
       'Abu Dhabi', 'Sharjah', 'Dubai', 'Rajkot', 'Kanpur', 'Lucknow',
       'Guwahati', 'Chandigarh'], dtype=object)

- converted cities via venues but still na values are present

#### team1

In [491]:
matches['team1'].isnull().sum()

0

In [493]:
matches['team1'].unique()

array(['Royal Challengers Bangalore', 'Kings XI Punjab',
       'Delhi Daredevils', 'Mumbai Indians', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Deccan Chargers', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Lucknow Super Giants', 'Gujarat Titans',
       'Royal Challengers Bengaluru'], dtype=object)

- here due to change of same team name we have many teams can be resolved as before.

In [496]:
matches['team1'] = matches['team1'].apply(change_team_name)

In [502]:
matches['team1'].unique()

array(['Royal Challengers Bangalore', 'Punjab Kings', 'Delhi Capitals',
       'Mumbai Indians', 'Kolkata Knight Riders', 'Rajasthan Royals',
       'Sunrisers Hyderabad', 'Chennai Super Kings',
       'Kochi Tuskers Kerala', 'Rising Pune Supergiant', 'Gujarat Titans',
       'Lucknow Super Giants'], dtype=object)

#### team2

In [505]:
matches['team2'].isnull().sum()

0

In [507]:
matches['team2'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Royal Challengers Bangalore', 'Deccan Chargers',
       'Kings XI Punjab', 'Delhi Daredevils', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [509]:
matches['team2']=matches['team2'].apply(change_team_name)

In [511]:
matches['team2'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Royal Challengers Bangalore', 'Sunrisers Hyderabad',
       'Punjab Kings', 'Delhi Capitals', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Rising Pune Supergiant', 'Gujarat Titans',
       'Lucknow Super Giants'], dtype=object)

#### toss_winner

In [514]:
matches['toss_winner'].isnull().sum()

0

In [516]:
matches['toss_winner'].unique()

array(['Royal Challengers Bangalore', 'Chennai Super Kings',
       'Rajasthan Royals', 'Mumbai Indians', 'Deccan Chargers',
       'Kings XI Punjab', 'Kolkata Knight Riders', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Gujarat Lions', 'Rising Pune Supergiants',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [518]:
matches['toss_winner']=matches['toss_winner'].apply(change_team_name)

In [520]:
matches['toss_winner'].unique()

array(['Royal Challengers Bangalore', 'Chennai Super Kings',
       'Rajasthan Royals', 'Mumbai Indians', 'Sunrisers Hyderabad',
       'Punjab Kings', 'Kolkata Knight Riders', 'Delhi Capitals',
       'Kochi Tuskers Kerala', 'Rising Pune Supergiant', 'Gujarat Titans',
       'Lucknow Super Giants'], dtype=object)

#### toss_decision

In [523]:
matches['toss_decision'].isnull().sum()

0

In [525]:
matches['toss_decision'].unique()

array(['field', 'bat'], dtype=object)

#### winner

In [528]:
matches['winner'].isnull().sum()

5

In [530]:
matches[matches['winner'].isnull()]

Unnamed: 0,match_id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,target_runs,target_overs,super_over,method,umpire1,umpire2
241,501265,2011,Delhi,2011-05-21,League,,Feroz Shah Kotla,Delhi Capitals,Rising Pune Supergiant,Delhi Capitals,bat,,no result,,,,N,,SS Hazare,RJ Tucker
485,829763,2015,Bengaluru,2015-04-29,League,,M Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,no result,,,,N,,JD Cloete,PG Pathak
511,829813,2015,Bengaluru,2015-05-17,League,,M Chinnaswamy Stadium,Royal Challengers Bangalore,Delhi Capitals,Royal Challengers Bangalore,field,,no result,,188.0,20.0,N,,HDPK Dharmasena,K Srinivasan
744,1178424,2019,Bengaluru,2019-04-30,League,,M Chinnaswamy Stadium,Royal Challengers Bangalore,Rajasthan Royals,Rajasthan Royals,field,,no result,,63.0,5.0,N,,NJ Llong,UV Gandhe
994,1359519,2023,Lucknow,2023-05-03,League,,Bharat Ratna Shri Atal Bihari Vajpayee Ekana C...,Lucknow Super Giants,Chennai Super Kings,Chennai Super Kings,field,,no result,,,,N,,AK Chaudhary,NA Patwardhan


In [532]:
matches['winner'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Delhi Daredevils',
       'Royal Challengers Bangalore', 'Rajasthan Royals',
       'Kings XI Punjab', 'Deccan Chargers', 'Mumbai Indians',
       'Pune Warriors', 'Kochi Tuskers Kerala', nan,
       'Sunrisers Hyderabad', 'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants',
       'Royal Challengers Bengaluru'], dtype=object)

In [534]:
matches['winner']=matches['winner'].apply(change_team_name)

In [536]:
matches['winner'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Delhi Capitals',
       'Royal Challengers Bangalore', 'Rajasthan Royals', 'Punjab Kings',
       'Sunrisers Hyderabad', 'Mumbai Indians', 'Rising Pune Supergiant',
       'Kochi Tuskers Kerala', nan, 'Gujarat Titans',
       'Lucknow Super Giants'], dtype=object)

#### result

In [364]:
matches.columns

Index(['match_id', 'season', 'city', 'date', 'match_type', 'player_of_match',
       'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
       'result', 'result_margin', 'target_runs', 'target_overs', 'super_over',
       'method', 'umpire1', 'umpire2'],
      dtype='object')