Import Required Libraries

In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the Datasets

In [149]:
matches = pd.read_csv("matches.csv")
deliveries = pd.read_csv("deliveries.csv")


check datasets shape

In [150]:
matches.shape
deliveries.shape


(260920, 17)

View Column Names

In [151]:
print(matches.columns)
print(deliveries.columns)


Index(['id', 'season', 'city', 'date', 'match_type', 'player_of_match',
       'venue', 'team1', 'team2', 'toss_winner', 'toss_decision', 'winner',
       'result', 'result_margin', 'target_runs', 'target_overs', 'super_over',
       'method', 'umpire1', 'umpire2'],
      dtype='object')
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'extras_type', 'is_wicket', 'player_dismissed',
       'dismissal_kind', 'fielder'],
      dtype='object')


View First 5 sample Records

In [152]:
matches.head()
deliveries.head()


Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


Dataset Information

In [153]:
print("Matches dataset info:")
matches.info()

print("Deliveries dataset info:")
deliveries.info()


Matches dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               1095 non-null   int64  
 1   season           1095 non-null   object 
 2   city             1044 non-null   object 
 3   date             1095 non-null   object 
 4   match_type       1095 non-null   object 
 5   player_of_match  1090 non-null   object 
 6   venue            1095 non-null   object 
 7   team1            1095 non-null   object 
 8   team2            1095 non-null   object 
 9   toss_winner      1095 non-null   object 
 10  toss_decision    1095 non-null   object 
 11  winner           1090 non-null   object 
 12  result           1095 non-null   object 
 13  result_margin    1076 non-null   float64
 14  target_runs      1092 non-null   float64
 15  target_overs     1092 non-null   float64
 16  super_over       1095 non-null   objec

Find Missing (Null) Values

In [154]:
matches.isnull().sum()
deliveries.isnull().sum()


match_id                 0
inning                   0
batting_team             0
bowling_team             0
over                     0
ball                     0
batter                   0
bowler                   0
non_striker              0
batsman_runs             0
extra_runs               0
total_runs               0
extras_type         246795
is_wicket                0
player_dismissed    247970
dismissal_kind      247970
fielder             251566
dtype: int64

Handle Missing Values (Cleaning)

In [155]:
matches["city"].fillna("Unknown", inplace=True)
matches["player_of_match"].fillna("Not Awarded", inplace=True)
matches["winner"].fillna("No Result", inplace=True)
matches["result_margin"].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  matches["city"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  matches["player_of_match"].fillna("Not Awarded", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

Convert Date Column

In [156]:
matches["date"] = pd.to_datetime(matches["date"])


Verify Match ID Consistency

In [157]:
matches["id"].nunique(), deliveries["match_id"].nunique()


(1095, 1095)

Merge the Datasets

In [158]:
merged_df = pd.merge(
    matches,
    deliveries,
    left_on="id",
    right_on="match_id",
    how="inner"
)


Verify Merged Data

In [159]:
merged_df.head()
merged_df.shape


(260920, 37)

Select Numerical Columns for Normalization

In [160]:
num_cols = ["batsman_runs", "extra_runs", "total_runs"]


Normalization (Min-Max Scaling)

In [161]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
merged_df[num_cols] = scaler.fit_transform(merged_df[num_cols])


Verify Normalized Data

In [162]:
merged_df[num_cols].head()


Unnamed: 0,batsman_runs,extra_runs,total_runs
0,0.0,0.142857,0.142857
1,0.0,0.0,0.0
2,0.0,0.142857,0.142857
3,0.0,0.0,0.0
4,0.0,0.0,0.0


Save Final Cleaned Dataset

In [163]:
merged_df.to_csv("final_cleaned_ipl_data.csv", index=False)
