# In this jupyter notebook, ...

### Import Required Libraries

In [503]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Load the data

In [504]:
from pathlib import Path

def load_data(file_path) -> pd.DataFrame:
    p = Path(str(file_path).strip()).expanduser().resolve()

    if not p.exists():
        raise FileNotFoundError(f"File not found: {p}")
    
    return pd.read_csv(p)
    

In [505]:
t20_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\t20.csv")
odi_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\ODI data.csv")
test_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\test.csv")

In [506]:
df_dict = {
    't20_bat': t20_bat,
    'odi_bat': odi_bat,
    'test_bat': test_bat
}

### Preview the data

In [507]:
for name, df in df_dict.items():
    print()
    print(f"DataFrame: {name}")
    print(f"Shape: {df.shape}")
    display(df.head(6))
    print("-"*135)



DataFrame: t20_bat
Shape: (2006, 17)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
0,0,V Kohli (INDIA),2010-2019,75,70,20,2633,94*,52.66,1907,138.07,0,24,2,247,71,
1,1,RG Sharma (INDIA),2007-2019,104,96,14,2633,118,32.1,1905,138.21,4,19,6,234,120,
2,2,MJ Guptill (NZ),2009-2019,83,80,7,2436,105,33.36,1810,134.58,2,15,2,215,113,
3,3,Shoaib Malik (ICC/PAK),2006-2019,111,104,30,2263,75,30.58,1824,124.06,0,7,1,186,61,
4,4,BB McCullum (NZ),2005-2015,71,70,10,2140,123,35.66,1571,136.21,2,13,3,199,91,
5,5,DA Warner (AUS),2009-2019,76,76,8,2079,100*,30.57,1476,140.85,1,15,5,203,86,


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: odi_bat
Shape: (2500, 15)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,
1,1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,
3,3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,
4,4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,
5,5,Inzamam-ul-Haq (Asia/PAK),1991-2007,378,350,53,11739,137*,39.52,15812,74.24,10,83,20,


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: test_bat
Shape: (3001, 13)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,100,50,0,Unnamed: 11
0,0,SR Tendulkar (INDIA),1989-2013,200,329,33,15921,248*,53.78,51,68,14,
1,1,RT Ponting (AUS),1995-2012,168,287,29,13378,257,51.85,41,62,17,
2,2,JH Kallis (ICC/SA),1995-2013,166,280,40,13289,224,55.37,45,58,16,
3,3,R Dravid (ICC/INDIA),1996-2012,164,286,32,13288,270,52.31,36,63,8,
4,4,AN Cook (ENG),2006-2018,161,291,16,12472,294,45.35,33,57,9,
5,5,KC Sangakkara (SL),2000-2015,134,233,17,12400,319,57.4,38,52,11,


---------------------------------------------------------------------------------------------------------------------------------------


In [508]:
for name, df in df_dict.items():
    print(f"\nDataFrame: {name}\n")
    df.info()
    print('-' * 100)
    display(df.describe())
    print('=' * 135)
    print('=' * 135)


DataFrame: t20_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2006 entries, 0 to 2005
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2006 non-null   int64  
 1   Player       2006 non-null   object 
 2   Span         2006 non-null   object 
 3   Mat          2006 non-null   int64  
 4   Inns         2006 non-null   object 
 5   NO           2006 non-null   object 
 6   Runs         2006 non-null   object 
 7   HS           2006 non-null   object 
 8   Ave          2006 non-null   object 
 9   BF           2006 non-null   object 
 10  SR           2006 non-null   object 
 11  100          2006 non-null   object 
 12  50           2006 non-null   object 
 13  0            2006 non-null   object 
 14  4s           2006 non-null   object 
 15  6s           2006 non-null   object 
 16  Unnamed: 15  0 non-null      float64
dtypes: float64(1), int64(2), object(14)
memory usage: 266.6+ KB
---------

Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 15
count,2006.0,2006.0,0.0
mean,24.434197,11.231805,
std,14.463176,14.923332,
min,0.0,1.0,
25%,12.0,3.0,
50%,24.0,5.0,
75%,37.0,13.0,
max,49.0,111.0,



DataFrame: odi_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2500 non-null   int64  
 1   Player       2500 non-null   object 
 2   Span         2500 non-null   object 
 3   Mat          2500 non-null   int64  
 4   Inns         2500 non-null   object 
 5   NO           2500 non-null   object 
 6   Runs         2500 non-null   object 
 7   HS           2500 non-null   object 
 8   Ave          2500 non-null   object 
 9   BF           2500 non-null   object 
 10  SR           2500 non-null   object 
 11  100          2500 non-null   object 
 12  50           2500 non-null   object 
 13  0            2500 non-null   object 
 14  Unnamed: 13  0 non-null      float64
dtypes: float64(1), int64(2), object(12)
memory usage: 293.1+ KB
---------------------------------------------------------------------------------------------

Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 13
count,2500.0,2500.0,0.0
mean,24.5,37.1616,
std,14.433757,58.885075,
min,0.0,1.0,
25%,12.0,4.0,
50%,24.5,13.0,
75%,37.0,43.0,
max,49.0,463.0,



DataFrame: test_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   3001 non-null   int64  
 1   Player       3001 non-null   object 
 2   Span         3001 non-null   object 
 3   Mat          3001 non-null   int64  
 4   Inns         3001 non-null   object 
 5   NO           3001 non-null   object 
 6   Runs         3001 non-null   object 
 7   HS           3001 non-null   object 
 8   Ave          3001 non-null   object 
 9   100          3001 non-null   object 
 10  50           3001 non-null   object 
 11  0            3001 non-null   object 
 12  Unnamed: 11  0 non-null      float64
dtypes: float64(1), int64(2), object(10)
memory usage: 304.9+ KB
----------------------------------------------------------------------------------------------------


Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 11
count,3001.0,3001.0,0.0
mean,24.491836,17.427191,
std,14.437798,24.954654,
min,0.0,1.0,
25%,12.0,2.0,
50%,24.0,7.0,
75%,37.0,21.0,
max,49.0,200.0,




##### There is only one string column in all these three dfs, which is player, and everything else should be integer or float. 
##### Therefore:

## Data Cleaning & Feature engineering

In [509]:
for name, df in df_dict.items():
    print(f"\nDataFrame: {name}")
    print('-'*20)
    display(df.isna().sum())
    print('='*50)


DataFrame: t20_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
BF                0
SR                0
100               0
50                0
0                 0
4s                0
6s                0
Unnamed: 15    2006
dtype: int64


DataFrame: odi_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
BF                0
SR                0
100               0
50                0
0                 0
Unnamed: 13    2500
dtype: int64


DataFrame: test_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
100               0
50                0
0                 0
Unnamed: 11    3001
dtype: int64



- First column and last column are useless. 
- Also, we need to change some dtypes here.
- t20_bat have 15 cols out of which 2 cols (4s and 6s) are not in odi_bat. <--> ( lacks -> [ 4s, 6s ] cols)
- odi_bat have 13 cols out of which 2 cols (BF and SR) are not present in test_bat. <--> ( lacks -> [ 4s, 6s, BF, SR ] cols)

In [510]:
for name, df in df_dict.items():
    if name == 't20_bat':
        i = 156
    elif name == 'odi_bat':
        i = 11
    elif name == 'test_bat':
        i = 21
    print(f"\nDataFrame: {name}")   
    display(df.tail(i))
    print("="*120)


DataFrame: t20_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
1850,0,Taijul Islam (BDESH),2019-2019,2,1,1,0,0*,-,1,0.00,0,0,0,0,0,
1851,1,Ziaur Rehman (Fin),2019-2019,1,1,0,0,0,0.00,4,0.00,0,0,1,0,0,
1852,2,M Zondeki (SA),2006-2006,1,1,0,0,0,0.00,1,0.00,0,0,1,0,0,
1853,3,Zulqarnain Haider (ESP),2019-2019,5,1,0,0,0,0.00,0,-,0,0,1,0,0,
1854,4,YA Abdulla (SA),2009-2009,2,-,-,-,-,-,-,-,-,-,-,-,-,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001,1,SG Whittingham (SCOT),2018-2018,3,-,-,-,-,-,-,-,-,-,-,-,-,
2002,2,LJ Woodcock (NZ),2010-2011,3,-,-,-,-,-,-,-,-,-,-,-,-,
2003,3,Zamir Khan (AFG),2012-2012,1,-,-,-,-,-,-,-,-,-,-,-,-,
2004,4,S Zargar (Mex),2019-2019,1,-,-,-,-,-,-,-,-,-,-,-,-,



DataFrame: odi_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
2489,39,Zakir Hossain (BDESH),1998-1998,1,1,0,0,0,0.00,1,0.00,0,0,1,
2490,40,Aamer Hameed (PAK),1977-1978,2,-,-,-,-,-,-,-,-,-,-,
2491,41,Abdur Rauf (PAK),2008-2008,4,-,-,-,-,-,-,-,-,-,-,
2492,42,Abu Jayed (BDESH),2019-2019,2,-,-,-,-,-,-,-,-,-,-,
2493,43,FS Ahangama (SL),1985-1985,1,-,-,-,-,-,-,-,-,-,-,
2494,44,GW Aldridge (NZ),2011-2011,2,-,-,-,-,-,-,-,-,-,-,
2495,45,ZS Ansari (ENG),2015-2015,1,-,-,-,-,-,-,-,-,-,-,
2496,46,Ariful Haque (BDESH),2018-2018,1,-,-,-,-,-,-,-,-,-,-,
2497,47,Ashfaq Ahmed (PAK),1994-1994,3,-,-,-,-,-,-,-,-,-,-,
2498,48,MD Bailey (NZ),1998-1998,1,-,-,-,-,-,-,-,-,-,-,



DataFrame: test_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,100,50,0,Unnamed: 11
2980,30,Zahir Khan (AFG),2019-2019,2,4,3,0,0*,0.00,0,0,1,
2981,31,Ali Hussain Rizvi (PAK),1997-1997,1,-,-,-,-,-,-,-,-,
2982,32,PJ Allan (AUS),1965-1965,1,-,-,-,-,-,-,-,-,
2983,33,JC Clay (ENG),1935-1935,1,-,-,-,-,-,-,-,-,
2984,34,DJ Cullen (AUS),2006-2006,1,-,-,-,-,-,-,-,-,
2985,35,HT Dani (INDIA),1952-1952,1,-,-,-,-,-,-,-,-,
2986,36,Farrukh Zaman (PAK),1976-1976,1,-,-,-,-,-,-,-,-,
2987,37,AB Howard (WI),1972-1972,1,-,-,-,-,-,-,-,-,
2988,38,A Khan (ENG),2009-2009,1,-,-,-,-,-,-,-,-,
2989,39,JCW MacBryan (ENG),1924-1924,1,-,-,-,-,-,-,-,-,




#### We can see
- In t20_bat df, we have null values from 1853,
- In odi_bat df, we have null values starting from 2490,
- In test_bat df, we have null values from 2981.

-> We should drop these rows and Unnamed columns as they contain No Data

####  We also have some null values in Ave and SR.
- All of these are not in NaN, but they are string '-'. Therefore, we first have to convert them into nan, then change the dtypes.

#### HS has some values like 98*, it means the player was not out at his highest score.
- We will make another feature named as 'hs_not_out', it will contain boolean values (True if '*' else False)
- HS will then contain plain numeric values without '*'

In [511]:
# i = 15
# for name, df in df_dict.items():
#     df.drop(columns=[col for col in [f'Unnamed: 0', f'Unnamed: {i}'] if col in df.columns], inplace=True)
#     i -= 2

In [512]:
# Row indices after which we drop data
drop_start_index = {
    't20_bat': 1853,
    'odi_bat': 2490,
    'test_bat': 2981
}


# Loop over each dataframe
for name, df in df_dict.items():
    # Drop columns named Unnamed
    df.drop(columns=[col for col in df.columns if col.startswith('Unnamed')], inplace=True)

    # Drop rows starting from given index
    df.drop(df.index[drop_start_index[name]:], inplace=True)

    # Replace '-' with NaN (in place)
    df.replace('-', np.nan, inplace=True)

    # Span split
    df[['span_start', 'span_end']] = df['Span'].str.split('-', expand=True)
    
    # Optional: convert to datetime
    # df['span_start'] = pd.to_datetime(df['span_start'], format='%Y')
    # df['span_end'] = pd.to_datetime(df['span_end'], format='%Y')

    df.drop(columns='Span', inplace=True)

    # HS feature extraction
    df['hs_not_out'] = df['HS'].str.contains(r"\*")
    df['HS'] = df['HS'].str.extract(r"(\d+)")
    
    # Convert dtypes accordingly (excluding Player, Span, hs_not_out)
    for col in df.columns[2:-1]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Persist changes
    df_dict[name] = df

    tnv = df.isna().sum().sum()  # total null values (in a df)
    print(f"\nDataframe: '{name}'\n")
    print(f"Total null values: {tnv} ({(tnv / df.shape[0])*100:.2f}%)")
    print(f"Total duplicate values: {df.duplicated().sum()}")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print('-'*50)
    df.info()
    print('-'*35)
    display(df.sample(10))
    print('='*135)


Dataframe: 't20_bat'

Total null values: 198 (10.69%)
Total duplicate values: 0
Shape: 1853 rows × 17 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      1853 non-null   object 
 1   Mat         1853 non-null   int64  
 2   Inns        1853 non-null   int64  
 3   NO          1853 non-null   int64  
 4   Runs        1853 non-null   int64  
 5   HS          1853 non-null   int64  
 6   Ave         1677 non-null   float64
 7   BF          1853 non-null   int64  
 8   SR          1831 non-null   float64
 9   100         1853 non-null   int64  
 10  50          1853 non-null   int64  
 11  0           1853 non-null   int64  
 12  4s          1853 non-null   int64  
 13  6s          1853 non-null   int64  
 14  span_start  1853 non-null   int64  
 15  span_end    1853 non-null   i

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,span_start,span_end,hs_not_out
835,Z Hoque (THAI),3,3,0,46,20,15.33,35,131.42,0,0,0,4,2,2019,2019,False
1733,SD Parry (ENG),5,1,0,1,1,1.0,1,100.0,0,0,0,0,0,2014,2015,False
1818,Nawaf Ahmed (KUW),1,1,1,0,0,,0,,0,0,0,0,0,2019,2019,True
999,VS Padhaal (Fin),5,5,0,31,14,6.2,44,70.45,0,0,0,4,0,2019,2019,False
315,KMDN Kulasekara (SL),58,30,9,215,31,10.23,184,116.84,0,0,4,14,7,2008,2017,False
120,RN ten Doeschate (NL),22,22,10,533,59,44.41,400,133.25,0,3,1,27,19,2008,2019,False
1269,JAH Marshall (NZ),3,2,0,14,13,7.0,15,93.33,0,0,0,2,0,2005,2008,False
1111,AB Dinda (INDIA),9,2,1,22,19,22.0,24,91.66,0,0,0,1,0,2009,2012,False
106,RR Hendricks (SA),22,22,0,593,74,26.95,487,121.76,0,4,2,71,8,2014,2019,False
280,J Mubarak (SL),16,15,4,238,46,21.63,188,126.59,0,0,2,21,8,2007,2009,True



Dataframe: 'odi_bat'

Total null values: 129 (5.18%)
Total duplicate values: 0
Shape: 2490 rows × 15 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2490 entries, 0 to 2489
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      2490 non-null   object 
 1   Mat         2490 non-null   int64  
 2   Inns        2490 non-null   int64  
 3   NO          2490 non-null   int64  
 4   Runs        2490 non-null   int64  
 5   HS          2490 non-null   int64  
 6   Ave         2370 non-null   float64
 7   BF          2490 non-null   int64  
 8   SR          2481 non-null   float64
 9   100         2490 non-null   int64  
 10  50          2490 non-null   int64  
 11  0           2490 non-null   int64  
 12  span_start  2490 non-null   int64  
 13  span_end    2490 non-null   int64  
 14  hs_not_out  2490 non-null   bool   
dtypes: bool(1), float64(2), int64(

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,span_start,span_end,hs_not_out
2454,KP Meiyappan (UAE),2,1,0,0,0,0.0,1,0.0,0,0,1,2019,2019,False
1867,UNK Fernando (SL),2,2,2,22,20,,25,88.0,0,0,0,1994,1994,True
1968,S Somasunder (INDIA),2,2,0,16,9,8.0,63,25.39,0,0,0,1996,1996,False
2369,MS Tshabalala (SA),4,1,1,2,2,,3,66.66,0,0,0,2007,2007,True
1916,AG Wharf (ENG),13,5,3,19,9,9.5,28,67.85,0,0,1,2004,2005,False
1545,AH Gray (WI),25,11,5,51,10,8.5,77,66.23,0,0,1,1985,1991,True
1791,BL Kotze (NAM),5,4,1,27,24,9.0,51,52.94,0,0,2,2003,2003,True
1721,Mohammad Asif (Asia/PAK),38,16,7,34,6,3.77,100,34.0,0,0,3,2005,2010,False
1609,BP Patterson (WI),59,20,15,44,13,8.8,101,43.56,0,0,1,1986,1993,True
2077,SC Kuggeleijn (NZ),2,1,1,11,11,,6,183.33,0,0,0,2017,2017,True



Dataframe: 'test_bat'

Total null values: 68 (2.28%)
Total duplicate values: 0
Shape: 2981 rows × 13 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2981 entries, 0 to 2980
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      2981 non-null   object 
 1   Mat         2981 non-null   int64  
 2   Inns        2981 non-null   int64  
 3   NO          2981 non-null   int64  
 4   Runs        2981 non-null   int64  
 5   HS          2981 non-null   int64  
 6   Ave         2913 non-null   float64
 7   100         2981 non-null   int64  
 8   50          2981 non-null   int64  
 9   0           2981 non-null   int64  
 10  span_start  2981 non-null   int64  
 11  span_end    2981 non-null   int64  
 12  hs_not_out  2981 non-null   bool   
dtypes: bool(1), float64(1), int64(10), object(1)
memory usage: 282.5+ KB
-----------------------------------


Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,100,50,0,span_start,span_end,hs_not_out
1623,CS Martin (NZ),71,104,52,123,12,2.36,0,0,36,2000,2013,True
618,W Watson (ENG),23,37,3,879,116,25.85,2,3,3,1951,1959,False
360,HM Nicholls (NZ),31,47,5,1711,162,40.73,5,9,4,2016,2019,True
195,Saeed Ahmed (PAK),41,78,4,2991,172,40.41,5,16,2,1958,1973,False
2612,Ghulam Abbas (PAK),1,2,0,12,12,6.0,0,0,1,1967,1967,False
619,L Amarnath (INDIA),24,40,4,878,118,24.38,1,4,5,1933,1952,False
1160,Zahid Fazal (PAK),9,16,0,288,78,18.0,0,1,0,1990,1995,False
106,PBH May (ENG),66,106,9,4537,285,46.77,13,22,8,1951,1961,True
642,AM Bacher (SA),19,33,1,833,96,26.03,0,5,3,1996,1999,False
1807,SMS Kaluperuma (SL),4,8,0,88,23,11.0,0,0,1,1984,1988,False




#### Now that the dtype is correct and null values are gone, let's do some more feature engineering now.
#### -> Starting from Player column

In [513]:
t20_bat['Player'].head()

0           V Kohli (INDIA)
1         RG Sharma (INDIA)
2           MJ Guptill (NZ)
3    Shoaib Malik (ICC/PAK)
4          BB McCullum (NZ)
Name: Player, dtype: object

##### As you can see, Player column have two components:
- Player Name
- Country Code with some ICC/Asia kind of noise.

#### We need to split this into 'Player_Name' and 'Country_Name' (NOT Country Code)

In [514]:
# import re

# pattern = re.compile(r"^(.*?)\s*\((?:.*/)?([A-Z]+)\)$")

In [515]:
import re

for name, df in df_dict.items():
    print(f"\n\n{name} split started...")

    # Clean leading numbering in the whole Player field
    df["Player"] = (
        df["Player"]
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)   # remove at start
        .str.strip()
    )

    # Split once at first '('
    df[['Player_Name', 'Country_Code']] = df['Player'].str.split('(', n=1, expand=True)

    # Clean Player_Name
    df['Player_Name'] = (
        df['Player_Name']
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)  # just in case numbering here too
        .str.strip()
    )

    # Clean Country_Code
    df['Country_Code'] = (
        df['Country_Code']
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)  # remove 1) etc
        .str.rstrip(')')
        .str.split('/')
        .str[-1]
        .str.replace(r"^[\(]\s*", "", regex=True)
        .str.strip()
    )

    print(f'{name} split is done.\n')




t20_bat split started...
t20_bat split is done.



odi_bat split started...
odi_bat split is done.



test_bat split started...
test_bat split is done.



In [516]:
for name, df in df_dict.items():
    print(f'\nDataFrame: {name}\n')

    print('-'*60)
    print('Unique Player Names: ')
    display(df['Player_Name'].unique())  # You can use loops or adjust pd display options here to show all without truncation

    print('-'*50)
    print('Unique Country Codes:')
    display(df['Country_Code'].unique())
    print('='*100)


DataFrame: t20_bat

------------------------------------------------------------
Unique Player Names: 


array(['V Kohli', 'RG Sharma', 'MJ Guptill', ..., 'Taijul Islam',
       'Ziaur Rehman', 'M Zondeki'], shape=(1843,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'NZ', 'PAK', 'AUS', 'ENG', 'AFG', 'SA', 'IRE', 'SL',
       'ZIM', 'WI', 'World', 'BDESH', 'SCOT', 'NL', 'UAE', 'NEPAL',
       'KENYA', 'PNG', 'OMAN', 'HKG', 'CAN', 'QAT', 'MAL', 'SGP', 'NAM',
       'VAN', 'JER', 'KUW', 'ICC', 'USA', 'BMUDA', 'Mald', 'Moz', 'CZK-R',
       'Aut', 'ITA', 'GUE', 'ESP', 'GER', 'DEN', 'Mex', 'Fin', 'BAH',
       'ROM', 'UGA', 'MWI', 'PNM', 'Saudi', 'THAI', 'PORT', 'GIBR', 'NGA',
       'Botsw', '', 'LUX', 'Serb', 'Blz', 'Caym', 'CRC', 'NOR', 'Arg',
       'Samoa', 'MALTA', 'Peru', 'Chile', 'Belg', 'PHI', 'BHU', 'Ghana',
       'TKY'], dtype=object)


DataFrame: odi_bat

------------------------------------------------------------
Unique Player Names: 


array(['SR Tendulkar', 'KC Sangakkara', 'RT Ponting', ..., 'R Walters',
       'CM Willoughby', 'Zakir Hossain'], shape=(2482,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'SL', 'ICC', 'PAK', 'SA', 'WI', 'AUS', 'NZ', 'IRE',
       'BDESH', 'ZIM', 'ENG', 'KENYA', 'AFG', 'SCOT', 'CAN', 'NL', 'UAE',
       'PNG', 'HKG', 'BMUDA', 'USA', 'NAM', 'NEPAL', 'OMAN', 'EAf'],
      dtype=object)


DataFrame: test_bat

------------------------------------------------------------
Unique Player Names: 


array(['SR Tendulkar', 'RT Ponting', 'JH Kallis', ..., 'P Wilson',
       'CS Wimble', 'Zahir Khan'], shape=(2966,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'AUS', 'SA', 'ENG', 'SL', 'WI', 'PAK', 'NZ', 'ZIM',
       'BDESH', 'ICC', 'AFG', 'IRE'], dtype=object)



### Let's map country names from country code.

In [517]:
for name, df in df_dict.items():
    print(f'\nDataFrame: {name}')
    print('-'*20)
    display(df['Country_Code'].unique())
    print('='*35)


DataFrame: t20_bat
--------------------


array(['INDIA', 'NZ', 'PAK', 'AUS', 'ENG', 'AFG', 'SA', 'IRE', 'SL',
       'ZIM', 'WI', 'World', 'BDESH', 'SCOT', 'NL', 'UAE', 'NEPAL',
       'KENYA', 'PNG', 'OMAN', 'HKG', 'CAN', 'QAT', 'MAL', 'SGP', 'NAM',
       'VAN', 'JER', 'KUW', 'ICC', 'USA', 'BMUDA', 'Mald', 'Moz', 'CZK-R',
       'Aut', 'ITA', 'GUE', 'ESP', 'GER', 'DEN', 'Mex', 'Fin', 'BAH',
       'ROM', 'UGA', 'MWI', 'PNM', 'Saudi', 'THAI', 'PORT', 'GIBR', 'NGA',
       'Botsw', '', 'LUX', 'Serb', 'Blz', 'Caym', 'CRC', 'NOR', 'Arg',
       'Samoa', 'MALTA', 'Peru', 'Chile', 'Belg', 'PHI', 'BHU', 'Ghana',
       'TKY'], dtype=object)


DataFrame: odi_bat
--------------------


array(['INDIA', 'SL', 'ICC', 'PAK', 'SA', 'WI', 'AUS', 'NZ', 'IRE',
       'BDESH', 'ZIM', 'ENG', 'KENYA', 'AFG', 'SCOT', 'CAN', 'NL', 'UAE',
       'PNG', 'HKG', 'BMUDA', 'USA', 'NAM', 'NEPAL', 'OMAN', 'EAf'],
      dtype=object)


DataFrame: test_bat
--------------------


array(['INDIA', 'AUS', 'SA', 'ENG', 'SL', 'WI', 'PAK', 'NZ', 'ZIM',
       'BDESH', 'ICC', 'AFG', 'IRE'], dtype=object)



#### ISO codes won't help here as these are in different formats, Manual mapping is only the sane way as these aren't alot anyway.

In [518]:
country_mapping = {
    'INDIA': 'India',
    'NZ': 'New Zealand',
    'PAK': 'Pakistan',
    'AUS': 'Australia',
    'ENG': 'England',
    'AFG': 'Afghanistan',
    'SA': 'South Africa',
    'IRE': 'Ireland',
    'SL': 'Sri Lanka',
    'ZIM': 'Zimbabwe',
    'WI': 'West Indies',
    'World': 'World XI',
    'BDESH': 'Bangladesh',
    'SCOT': 'Scotland',
    'NL': 'Netherlands',
    'UAE': 'United Arab Emirates',
    'NEPAL': 'Nepal',
    'KENYA': 'Kenya',
    'PNG': 'Papua New Guinea',
    'OMAN': 'Oman',
    'HKG': 'Hong Kong',
    'CAN': 'Canada',
    'QAT': 'Qatar',
    'MAL': 'Malaysia',
    'SGP': 'Singapore',
    'NAM': 'Namibia',
    'VAN': 'Vanuatu',
    'JER': 'Jersey',
    'KUW': 'Kuwait',
    'ICC': 'ICC World XI',
    'USA': 'United States',
    'BMUDA': 'Bermuda',
    'Mald': 'Maldives',
    'Moz': 'Mozambique',
    'CZK-R': 'Czech Republic',
    'Aut': 'Austria',
    'ITA': 'Italy',
    'GUE': 'Guernsey',
    'ESP': 'Spain',
    'GER': 'Germany',
    'DEN': 'Denmark',
    'Mex': 'Mexico',
    'Fin': 'Finland',
    'BAH': 'Bahamas',
    'ROM': 'Romania',
    'UGA': 'Uganda',
    'MWI': 'Malawi',
    'PNM': 'Panama',
    'Saudi': 'Saudi Arabia',
    'THAI': 'Thailand',
    'PORT': 'Portugal',
    'GIBR': 'Gibraltar',
    'NGA': 'Nigeria',
    'Botsw': 'Botswana',
    '': None,  # Missing / Blank
    'LUX': 'Luxembourg',
    'Serb': 'Serbia',
    'Blz': 'Belize',
    'Caym': 'Cayman Islands',
    'CRC': 'Costa Rica',
    'NOR': 'Norway',
    'Arg': 'Argentina',
    'Samoa': 'Samoa',
    'MALTA': 'Malta',
    'Peru': 'Peru',
    'Chile': 'Chile',
    'Belg': 'Belgium',
    'PHI': 'Philippines',
    'BHU': 'Bhutan',
    'Ghana': 'Ghana',
    'TKY': 'Turkey',
    'EAf': 'East Africa'
}


# Order priority
col_priority = [
    "Player_Name", "Country_Name", "span_start", "span_end",
    "Mat", "Inns", "NO", "Runs", "HS", "hs_not_out", "Ave",
    "BF", "SR", "100", "50", "0", "4s", "6s"
]


for name, df in df_dict.items():    
    df['Country_Name'] = df['Country_Code'].map(country_mapping)

    # drop some features
    df.drop(columns=['Player','Country_Code'], inplace=True)

    existing_cols = [col for col in col_priority if col in df.columns]

    df = df[existing_cols]  # reorder
    df_dict[name] = df  # update

In [519]:
for name, df in df_dict.items():
    print(f'\nDataFrame: {name}')
    display(df.head())
    print('-'*135)


DataFrame: t20_bat


Unnamed: 0,Player_Name,Country_Name,span_start,span_end,Mat,Inns,NO,Runs,HS,hs_not_out,Ave,BF,SR,100,50,0,4s,6s
0,V Kohli,India,2010,2019,75,70,20,2633,94,True,52.66,1907,138.07,0,24,2,247,71
1,RG Sharma,India,2007,2019,104,96,14,2633,118,False,32.1,1905,138.21,4,19,6,234,120
2,MJ Guptill,New Zealand,2009,2019,83,80,7,2436,105,False,33.36,1810,134.58,2,15,2,215,113
3,Shoaib Malik,Pakistan,2006,2019,111,104,30,2263,75,False,30.58,1824,124.06,0,7,1,186,61
4,BB McCullum,New Zealand,2005,2015,71,70,10,2140,123,False,35.66,1571,136.21,2,13,3,199,91


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: odi_bat


Unnamed: 0,Player_Name,Country_Name,span_start,span_end,Mat,Inns,NO,Runs,HS,hs_not_out,Ave,BF,SR,100,50,0
0,SR Tendulkar,India,1989,2012,463,452,41,18426,200,True,44.83,21367,86.23,49,96,20
1,KC Sangakkara,Sri Lanka,2000,2015,404,380,41,14234,169,False,41.98,18048,78.86,25,93,15
2,RT Ponting,ICC World XI,1995,2012,375,365,39,13704,164,False,42.03,17046,80.39,30,82,20
3,ST Jayasuriya,Sri Lanka,1989,2011,445,433,18,13430,189,False,32.36,14725,91.2,28,68,34
4,DPMD Jayawardene,Sri Lanka,1998,2015,448,418,39,12650,144,False,33.37,16020,78.96,19,77,28


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: test_bat


Unnamed: 0,Player_Name,Country_Name,span_start,span_end,Mat,Inns,NO,Runs,HS,hs_not_out,Ave,100,50,0
0,SR Tendulkar,India,1989,2013,200,329,33,15921,248,True,53.78,51,68,14
1,RT Ponting,Australia,1995,2012,168,287,29,13378,257,False,51.85,41,62,17
2,JH Kallis,South Africa,1995,2013,166,280,40,13289,224,False,55.37,45,58,16
3,R Dravid,India,1996,2012,164,286,32,13288,270,False,52.31,36,63,8
4,AN Cook,England,2006,2018,161,291,16,12472,294,False,45.35,33,57,9


---------------------------------------------------------------------------------------------------------------------------------------


### Let's make some more feature now out of exisiting features.
- Career Length = span_end - span_start + 1
- Innings per Match = Inns / Mat
- Not Out % = (NO / Inns) × 100
- Centuries per Match = 100 / Mat
- Fifties per Match = 50 / Mat
- Ducks per Match = 0 / Mat
- Boundary Count (if 4s and 6s exist) = 4s + 6s
- Boundary % of Runs = (4s × 4 + 6s × 6) / Runs * 100
- Balls per Innings = BF / Inns
- Runs per Innings = Runs / Inns
- Boundary Frequency = (4s + 6s) / BF × 100
- Conversion Rate = 100 / (100 + 50) → how well 50s turn into 100s
- Consistency Index = Ave × SR (blended metric, high means consistent + fast scoring)
- Power Hitter Score = 6s / Inns or 6s / BF

In [None]:
for name, df in df_dict.items():
    
    df['Career_length'] = df['span_end'] - df['span_start'] + 1

    if 'BF' in df.columns:
        df['BF_per_inn'] = (df['BF'] / df['Inns']).round(2)

    df['Runs_per_inn'] = (df['Runs'] / df['Inns']).round(2)
    df['Runs_per_mat'] = (df['Runs'] / df['Mat']).round(2)


    df['Inn_per_mat'] = (df['Inns'] / df['Mat']).round(2)
    
    df['Centuries_per_mat'] = (df['100'] / df['Mat']).apply(lambda x: round(x, 2))
    df['Fifties_per_mat'] = (df['50'] / df['Mat']).apply(lambda x: round(x, 2))
    df['ducks_per_mat'] = (df['0'] / df['Mat']).apply(lambda x: round(x, 2))

    df['Not_out_%'] = ((df['NO'] / df['Inns']) * 100).apply(lambda x: round(x, 2))

    if '4s' in df.columns and '6s' in df.columns:
        df['Boundary_count'] = df['4s'] + df['6s']
        df['Boundary_runs'] = ((df['4s'] * 4 + df['6s'] * 6)).apply(lambda x: round(x, 2))
        df['Boundary_%_of_runs'] = ((df['Boundary_runs'] / df['Runs']) * 100).round(2)
        df['Boundary_freq'] = np.where(
            df['BF'] > 0,
            ((df['4s'] + df['6s']) / df['BF']).round(2),
            0
        )
        df['Power_hitter_score'] = np.where(
            df['BF'] > 0,
            (df['6s'] / df['BF']).round(2),
            0
        )

    df['Conversion_rate'] = df.apply(
        lambda row: round(row['100'] / (row['100'] + row['50']), 2) if (row['100'] + row['50']) > 0 else 0,
        axis=1
    )  # how well 50s turn into 100s

    if 'SR' in df.columns:
        df['Consistency_index'] = (
            df['Ave'].fillna(0) * df['SR'].fillna(0)
        ).round(2)  # blended metric -> High means consistent + fast scoring and vice versa.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Career_length'] = df['span_end'] - df['span_start'] + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BF_per_inn'] = (df['BF'] / df['Inns']).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Runs_per_inn'] = (df['Runs'] / df['Inns']).round(2)
A value is trying to be set on a copy

In [523]:
for name, df in df_dict.items():
    print(f'\nDataFrame: {name}')
    print(f'Shape: {df.shape[0]} rows × {df.shape[1]} columns')
    # display(df.iloc[:6, 14:])
    display(df.iloc[0, :])
    print('-'*135)
    print('-'*135)


DataFrame: t20_bat
Shape: 1853 rows × 34 columns


Player_Name           V Kohli
Country_Name            India
span_start               2010
span_end                 2019
Mat                        75
Inns                       70
NO                         20
Runs                     2633
HS                         94
hs_not_out               True
Ave                     52.66
BF                       1907
SR                     138.07
100                         0
50                         24
0                           2
4s                        247
6s                         71
Career_length              10
BF_per_inn              27.24
Runs_per_inn            37.61
Runs_per_mat            35.11
Inn_per_mat              0.93
Centuries_per_mat         0.0
Fifties_per_mat          0.32
ducks_per_mat            0.03
Not_out_%               28.57
Boundary_count            318
Boundary_runs            1414
Boundary_%_of_runs       53.7
Boundary_freq            0.17
Power_hitter_score       0.04
Conversion_rate           0.0
Consistenc

---------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: odi_bat
Shape: 2490 rows × 27 columns


Player_Name          SR Tendulkar
Country_Name                India
span_start                   1989
span_end                     2012
Mat                           463
Inns                          452
NO                             41
Runs                        18426
HS                            200
hs_not_out                   True
Ave                         44.83
BF                          21367
SR                          86.23
100                            49
50                             96
0                              20
Career_length                  24
BF_per_inn                  47.27
Runs_per_inn                40.77
Runs_per_mat                 39.8
Inn_per_mat                  0.98
Centuries_per_mat            0.11
Fifties_per_mat              0.21
ducks_per_mat                0.04
Not_out_%                    9.07
Conversion_rate              0.34
Consistency_index         3865.69
Name: 0, dtype: object

---------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: test_bat
Shape: 2981 rows × 23 columns


Player_Name          SR Tendulkar
Country_Name                India
span_start                   1989
span_end                     2013
Mat                           200
Inns                          329
NO                             33
Runs                        15921
HS                            248
hs_not_out                   True
Ave                         53.78
100                            51
50                             68
0                              14
Career_length                  25
Runs_per_inn                48.39
Runs_per_mat                 79.6
Inn_per_mat                  1.64
Centuries_per_mat            0.26
Fifties_per_mat              0.34
ducks_per_mat                0.07
Not_out_%                   10.03
Conversion_rate              0.43
Name: 0, dtype: object

---------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------


# EDA 🚀