### Import required libraries

In [1]:
import pandas as pd

### Load the data

In [2]:
# Read the scraped ODI statistics CSV into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="../data/bcci_odi_stats.csv")

In [3]:
# Display initial dataset information: print a heading, then let DataFrame.info()
# write its summary (index range, per-column non-null counts, dtypes, memory usage).
# This is the "before" snapshot: several stat columns are still object (text) dtype here.
print("Initial Data Info:")
df.info()

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           236 non-null    object 
 1   Total Matches  236 non-null    int64  
 2   Innings        236 non-null    int64  
 3   Average Score  236 non-null    float64
 4   Strike Rate    236 non-null    float64
 5   Highest Score  236 non-null    object 
 6   Fours          236 non-null    object 
 7   Sixes          236 non-null    object 
 8   Fifties        236 non-null    object 
 9   Hundreds       236 non-null    object 
 10  Total Runs     236 non-null    int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 20.4+ KB


In [4]:
# Preview the raw rows before any cleaning; n=5 is spelled out to match the heading
print("\nFirst 5 rows before cleaning:")
df.head(n=5)


First 5 rows before cleaning:


Unnamed: 0,Name,Total Matches,Innings,Average Score,Strike Rate,Highest Score,Fours,Sixes,Fifties,Hundreds,Total Runs
0,Sachin TENDULKAR,463,452,44.83,86.23,200,2016,195,96,49,18426
1,Virat Kohli,297,285,57.93,93.52,183,1310,152,73,50,13963
2,Sourav Ganguly,311,300,41.02,73.7,183,1122,190,72,22,11363
3,Rohit Sharma,268,260,49.05,92.69,264,1024,338,57,32,10988
4,Rahul Dravid,344,318,39.16,71.23,153,950,42,83,12,10889


### Convert Numeric Columns to Proper Data Types

In [5]:
# Columns that are expected to hold numbers (some are loaded as text / object dtype)
numeric_columns = [
    "Total Matches",
    "Innings",
    "Average Score",
    "Strike Rate",
    "Highest Score",
    "Fours",
    "Sixes",
    "Fifties",
    "Hundreds",
    "Total Runs",
]

# Convert one column at a time. errors="coerce" turns any value that cannot be
# parsed as a number into NaN instead of raising, so bad entries surface as
# missing values in the next step.
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors="coerce")

# Confirm the dtypes after conversion
print("\nUpdated Data Types:")
print(df.dtypes)


Updated Data Types:
Name              object
Total Matches      int64
Innings            int64
Average Score    float64
Strike Rate      float64
Highest Score    float64
Fours            float64
Sixes            float64
Fifties          float64
Hundreds         float64
Total Runs         int64
dtype: object


### Check for Missing Values

In [6]:
# Count NaN entries per column (isna is the canonical spelling of isnull)
print("\nMissing Values in Each Column:")
df.isna().sum()


Missing Values in Each Column:


Name               0
Total Matches      0
Innings            0
Average Score      0
Strike Rate        0
Highest Score      1
Fours             50
Sixes            112
Fifties          136
Hundreds         194
Total Runs         0
dtype: int64

### Fill Missing Values with Zero

In [7]:
# Replace missing values with 0 in the numeric stat columns only.
# A frame-wide fillna(0) would also write the integer 0 into the text column
# "Name" if a name were ever missing, hiding the problem; scoping the fill to
# `numeric_columns` (defined in the type-conversion cell) leaves such gaps
# visible in the check below. Assigning the result back avoids inplace=True.
df[numeric_columns] = df[numeric_columns].fillna(0)

# Verify again if missing values are handled (all columns, including "Name")
print("\nMissing Values After Filling:")
df.isnull().sum()


Missing Values After Filling:


Name             0
Total Matches    0
Innings          0
Average Score    0
Strike Rate      0
Highest Score    0
Fours            0
Sixes            0
Fifties          0
Hundreds         0
Total Runs       0
dtype: int64

### Clamp Negative Values to Zero (Ensure No Negative Values)

In [8]:
# Clamp any negative value in the numeric stat columns up to 0.
# DataFrame.clip(lower=0) does this in one vectorized call and keeps each
# column's dtype, replacing a per-element Python lambda. The column list is the
# `numeric_columns` defined in the type-conversion cell, so the two stay in sync.
df[numeric_columns] = df[numeric_columns].clip(lower=0)

# Verify that no negative values exist
print("\nMinimum Values in Each Column (Should Be >= 0):")
df.min()


Minimum Values in Each Column (Should Be >= 0):


Name                -
Total Matches       1
Innings             1
Average Score     0.0
Strike Rate      6.25
Highest Score     0.0
Fours             0.0
Sixes             0.0
Fifties           0.0
Hundreds          0.0
Total Runs          1
dtype: object

### Save the Cleaned Data

In [9]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV
df.to_csv(path_or_buf="../data/bcci_odi_stats_cleaned.csv", index=False)

print("\nData cleaning completed! Cleaned file saved as '../data/bcci_odi_stats_cleaned.csv'.")



Data cleaning completed! Cleaned file saved as '../data/bcci_odi_stats_cleaned.csv'.
