# In this jupyter notebook, ...

### Import Required Libraries

In [287]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Load the data

In [288]:
from pathlib import Path

def load_data(file_path) -> pd.DataFrame:
    p = Path(str(file_path).strip()).expanduser().resolve()

    if not p.exists():
        raise FileNotFoundError(f"File not found: {p}")
    
    return pd.read_csv(p)
    

In [289]:
t20_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\t20.csv")
odi_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\ODI data.csv")
test_bat = load_data(r"D:\Languages\Projects\ml-portfolio\EDA\data\Tabular\Batting\test.csv")

In [290]:
df_dict = {
    't20_bat': t20_bat,
    'odi_bat': odi_bat,
    'test_bat': test_bat
}

### Preview the data

In [291]:
for name, df in df_dict.items():
    print()
    print(f"DataFrame: {name}")
    print(f"Shape: {df.shape}")
    display(df.head(6))
    print("-"*135)



DataFrame: t20_bat
Shape: (2006, 17)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
0,0,V Kohli (INDIA),2010-2019,75,70,20,2633,94*,52.66,1907,138.07,0,24,2,247,71,
1,1,RG Sharma (INDIA),2007-2019,104,96,14,2633,118,32.1,1905,138.21,4,19,6,234,120,
2,2,MJ Guptill (NZ),2009-2019,83,80,7,2436,105,33.36,1810,134.58,2,15,2,215,113,
3,3,Shoaib Malik (ICC/PAK),2006-2019,111,104,30,2263,75,30.58,1824,124.06,0,7,1,186,61,
4,4,BB McCullum (NZ),2005-2015,71,70,10,2140,123,35.66,1571,136.21,2,13,3,199,91,
5,5,DA Warner (AUS),2009-2019,76,76,8,2079,100*,30.57,1476,140.85,1,15,5,203,86,


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: odi_bat
Shape: (2500, 15)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
0,0,SR Tendulkar (INDIA),1989-2012,463,452,41,18426,200*,44.83,21367,86.23,49,96,20,
1,1,KC Sangakkara (Asia/ICC/SL),2000-2015,404,380,41,14234,169,41.98,18048,78.86,25,93,15,
2,2,RT Ponting (AUS/ICC),1995-2012,375,365,39,13704,164,42.03,17046,80.39,30,82,20,
3,3,ST Jayasuriya (Asia/SL),1989-2011,445,433,18,13430,189,32.36,14725,91.2,28,68,34,
4,4,DPMD Jayawardene (Asia/SL),1998-2015,448,418,39,12650,144,33.37,16020,78.96,19,77,28,
5,5,Inzamam-ul-Haq (Asia/PAK),1991-2007,378,350,53,11739,137*,39.52,15812,74.24,10,83,20,


---------------------------------------------------------------------------------------------------------------------------------------

DataFrame: test_bat
Shape: (3001, 13)


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,100,50,0,Unnamed: 11
0,0,SR Tendulkar (INDIA),1989-2013,200,329,33,15921,248*,53.78,51,68,14,
1,1,RT Ponting (AUS),1995-2012,168,287,29,13378,257,51.85,41,62,17,
2,2,JH Kallis (ICC/SA),1995-2013,166,280,40,13289,224,55.37,45,58,16,
3,3,R Dravid (ICC/INDIA),1996-2012,164,286,32,13288,270,52.31,36,63,8,
4,4,AN Cook (ENG),2006-2018,161,291,16,12472,294,45.35,33,57,9,
5,5,KC Sangakkara (SL),2000-2015,134,233,17,12400,319,57.4,38,52,11,


---------------------------------------------------------------------------------------------------------------------------------------


In [292]:
for name, df in df_dict.items():
    print(f"\nDataFrame: {name}\n")
    df.info()
    print('-' * 100)
    display(df.describe())
    print('=' * 135)
    print('=' * 135)


DataFrame: t20_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2006 entries, 0 to 2005
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2006 non-null   int64  
 1   Player       2006 non-null   object 
 2   Span         2006 non-null   object 
 3   Mat          2006 non-null   int64  
 4   Inns         2006 non-null   object 
 5   NO           2006 non-null   object 
 6   Runs         2006 non-null   object 
 7   HS           2006 non-null   object 
 8   Ave          2006 non-null   object 
 9   BF           2006 non-null   object 
 10  SR           2006 non-null   object 
 11  100          2006 non-null   object 
 12  50           2006 non-null   object 
 13  0            2006 non-null   object 
 14  4s           2006 non-null   object 
 15  6s           2006 non-null   object 
 16  Unnamed: 15  0 non-null      float64
dtypes: float64(1), int64(2), object(14)
memory usage: 266.6+ KB
---------

Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 15
count,2006.0,2006.0,0.0
mean,24.434197,11.231805,
std,14.463176,14.923332,
min,0.0,1.0,
25%,12.0,3.0,
50%,24.0,5.0,
75%,37.0,13.0,
max,49.0,111.0,



DataFrame: odi_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2500 non-null   int64  
 1   Player       2500 non-null   object 
 2   Span         2500 non-null   object 
 3   Mat          2500 non-null   int64  
 4   Inns         2500 non-null   object 
 5   NO           2500 non-null   object 
 6   Runs         2500 non-null   object 
 7   HS           2500 non-null   object 
 8   Ave          2500 non-null   object 
 9   BF           2500 non-null   object 
 10  SR           2500 non-null   object 
 11  100          2500 non-null   object 
 12  50           2500 non-null   object 
 13  0            2500 non-null   object 
 14  Unnamed: 13  0 non-null      float64
dtypes: float64(1), int64(2), object(12)
memory usage: 293.1+ KB
---------------------------------------------------------------------------------------------

Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 13
count,2500.0,2500.0,0.0
mean,24.5,37.1616,
std,14.433757,58.885075,
min,0.0,1.0,
25%,12.0,4.0,
50%,24.5,13.0,
75%,37.0,43.0,
max,49.0,463.0,



DataFrame: test_bat

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   3001 non-null   int64  
 1   Player       3001 non-null   object 
 2   Span         3001 non-null   object 
 3   Mat          3001 non-null   int64  
 4   Inns         3001 non-null   object 
 5   NO           3001 non-null   object 
 6   Runs         3001 non-null   object 
 7   HS           3001 non-null   object 
 8   Ave          3001 non-null   object 
 9   100          3001 non-null   object 
 10  50           3001 non-null   object 
 11  0            3001 non-null   object 
 12  Unnamed: 11  0 non-null      float64
dtypes: float64(1), int64(2), object(10)
memory usage: 304.9+ KB
----------------------------------------------------------------------------------------------------


Unnamed: 0.1,Unnamed: 0,Mat,Unnamed: 11
count,3001.0,3001.0,0.0
mean,24.491836,17.427191,
std,14.437798,24.954654,
min,0.0,1.0,
25%,12.0,2.0,
50%,24.0,7.0,
75%,37.0,21.0,
max,49.0,200.0,




##### There is only one string column in all these three dfs, which is player, and everything else should be integer or float. 
##### Therefore:

## Data Cleaning & Feature engineering

In [293]:
for name, df in df_dict.items():
    print(f"\nDataFrame: {name}")
    print('-'*20)
    display(df.isna().sum())
    print('='*50)


DataFrame: t20_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
BF                0
SR                0
100               0
50                0
0                 0
4s                0
6s                0
Unnamed: 15    2006
dtype: int64


DataFrame: odi_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
BF                0
SR                0
100               0
50                0
0                 0
Unnamed: 13    2500
dtype: int64


DataFrame: test_bat
--------------------


Unnamed: 0        0
Player            0
Span              0
Mat               0
Inns              0
NO                0
Runs              0
HS                0
Ave               0
100               0
50                0
0                 0
Unnamed: 11    3001
dtype: int64



- First column and last column are useless. 
- Also, we need to change some dtypes here.
- t20_bat have 15 cols out of which 2 cols (4s and 6s) are not in odi_bat. <--> ( lacks -> [ 4s, 6s ] cols)
- odi_bat have 13 cols out of which 2 cols (BF and SR) are not present in test_bat. <--> ( lacks -> [ 4s, 6s, BF, SR ] cols)

In [294]:
for name, df in df_dict.items():
    if name == 't20_bat':
        i = 156
    elif name == 'odi_bat':
        i = 11
    elif name == 'test_bat':
        i = 21
    print(f"\nDataFrame: {name}")   
    display(df.tail(i))
    print("="*120)


DataFrame: t20_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,Unnamed: 15
1850,0,Taijul Islam (BDESH),2019-2019,2,1,1,0,0*,-,1,0.00,0,0,0,0,0,
1851,1,Ziaur Rehman (Fin),2019-2019,1,1,0,0,0,0.00,4,0.00,0,0,1,0,0,
1852,2,M Zondeki (SA),2006-2006,1,1,0,0,0,0.00,1,0.00,0,0,1,0,0,
1853,3,Zulqarnain Haider (ESP),2019-2019,5,1,0,0,0,0.00,0,-,0,0,1,0,0,
1854,4,YA Abdulla (SA),2009-2009,2,-,-,-,-,-,-,-,-,-,-,-,-,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2001,1,SG Whittingham (SCOT),2018-2018,3,-,-,-,-,-,-,-,-,-,-,-,-,
2002,2,LJ Woodcock (NZ),2010-2011,3,-,-,-,-,-,-,-,-,-,-,-,-,
2003,3,Zamir Khan (AFG),2012-2012,1,-,-,-,-,-,-,-,-,-,-,-,-,
2004,4,S Zargar (Mex),2019-2019,1,-,-,-,-,-,-,-,-,-,-,-,-,



DataFrame: odi_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,Unnamed: 13
2489,39,Zakir Hossain (BDESH),1998-1998,1,1,0,0,0,0.00,1,0.00,0,0,1,
2490,40,Aamer Hameed (PAK),1977-1978,2,-,-,-,-,-,-,-,-,-,-,
2491,41,Abdur Rauf (PAK),2008-2008,4,-,-,-,-,-,-,-,-,-,-,
2492,42,Abu Jayed (BDESH),2019-2019,2,-,-,-,-,-,-,-,-,-,-,
2493,43,FS Ahangama (SL),1985-1985,1,-,-,-,-,-,-,-,-,-,-,
2494,44,GW Aldridge (NZ),2011-2011,2,-,-,-,-,-,-,-,-,-,-,
2495,45,ZS Ansari (ENG),2015-2015,1,-,-,-,-,-,-,-,-,-,-,
2496,46,Ariful Haque (BDESH),2018-2018,1,-,-,-,-,-,-,-,-,-,-,
2497,47,Ashfaq Ahmed (PAK),1994-1994,3,-,-,-,-,-,-,-,-,-,-,
2498,48,MD Bailey (NZ),1998-1998,1,-,-,-,-,-,-,-,-,-,-,



DataFrame: test_bat


Unnamed: 0.1,Unnamed: 0,Player,Span,Mat,Inns,NO,Runs,HS,Ave,100,50,0,Unnamed: 11
2980,30,Zahir Khan (AFG),2019-2019,2,4,3,0,0*,0.00,0,0,1,
2981,31,Ali Hussain Rizvi (PAK),1997-1997,1,-,-,-,-,-,-,-,-,
2982,32,PJ Allan (AUS),1965-1965,1,-,-,-,-,-,-,-,-,
2983,33,JC Clay (ENG),1935-1935,1,-,-,-,-,-,-,-,-,
2984,34,DJ Cullen (AUS),2006-2006,1,-,-,-,-,-,-,-,-,
2985,35,HT Dani (INDIA),1952-1952,1,-,-,-,-,-,-,-,-,
2986,36,Farrukh Zaman (PAK),1976-1976,1,-,-,-,-,-,-,-,-,
2987,37,AB Howard (WI),1972-1972,1,-,-,-,-,-,-,-,-,
2988,38,A Khan (ENG),2009-2009,1,-,-,-,-,-,-,-,-,
2989,39,JCW MacBryan (ENG),1924-1924,1,-,-,-,-,-,-,-,-,




#### We can see
- In t20_bat df, we have null values from 1853,
- In odi_bat df, we have null values starting from 2490,
- In test_bat df, we have null values from 2981.

-> We should drop these rows and Unnamed columns as they contain No Data

####  We also have some null values in Ave and SR.
- All of these are not in NaN, but they are string '-'. Therefore, we first have to convert them into nan, then change the dtypes.

#### HS has some values like 98*, it means the player was not out at his highest score.
- We will make another feature named as 'hs_not_out', it will contain boolean values (True if '*' else False)
- HS will then contain plain numeric values without '*'

In [295]:
# i = 15
# for name, df in df_dict.items():
#     df.drop(columns=[col for col in [f'Unnamed: 0', f'Unnamed: {i}'] if col in df.columns], inplace=True)
#     i -= 2

In [296]:
# Row indices after which we drop data
drop_start_index = {
    't20_bat': 1853,
    'odi_bat': 2490,
    'test_bat': 2981
}


# Loop over each dataframe
for name, df in df_dict.items():
    # Drop columns named Unnamed
    df.drop(columns=[col for col in df.columns if col.startswith('Unnamed')], inplace=True)

    # Drop rows starting from given index
    df.drop(df.index[drop_start_index[name]:], inplace=True)

    # Replace '-' with NaN (in place)
    df.replace('-', np.nan, inplace=True)

    # Span split
    df[['span_start', 'span_end']] = df['Span'].str.split('-', expand=True)
    
    # Optional: convert to datetime
    # df['span_start'] = pd.to_datetime(df['span_start'], format='%Y')
    # df['span_end'] = pd.to_datetime(df['span_end'], format='%Y')

    df.drop(columns='Span', inplace=True)

    # HS feature extraction
    df['hs_not_out'] = df['HS'].str.contains(r"\*")
    df['HS'] = df['HS'].str.extract(r"(\d+)")
    
    # Convert dtypes accordingly (excluding Player, Span, hs_not_out)
    for col in df.columns[2:-1]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Persist changes
    df_dict[name] = df

    tnv = df.isna().sum().sum()  # total null values (in a df)
    print(f"\nDataframe: '{name}'\n")
    print(f"Total null values: {tnv} ({(tnv / df.shape[0])*100:.2f}%)")
    print(f"Total duplicate values: {df.duplicated().sum()}")
    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print('-'*50)
    df.info()
    print('-'*35)
    display(df.sample(10))
    print('='*135)


Dataframe: 't20_bat'

Total null values: 198 (10.69%)
Total duplicate values: 0
Shape: 1853 rows × 17 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      1853 non-null   object 
 1   Mat         1853 non-null   int64  
 2   Inns        1853 non-null   int64  
 3   NO          1853 non-null   int64  
 4   Runs        1853 non-null   int64  
 5   HS          1853 non-null   int64  
 6   Ave         1677 non-null   float64
 7   BF          1853 non-null   int64  
 8   SR          1831 non-null   float64
 9   100         1853 non-null   int64  
 10  50          1853 non-null   int64  
 11  0           1853 non-null   int64  
 12  4s          1853 non-null   int64  
 13  6s          1853 non-null   int64  
 14  span_start  1853 non-null   int64  
 15  span_end    1853 non-null   i

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,4s,6s,span_start,span_end,hs_not_out
1741,O Sam Arthur (CRC),3,2,1,1,1,1.0,1,100.0,0,0,0,0,0,2019,2019,False
1759,JD Wildermuth (AUS),2,1,1,1,1,,1,100.0,0,0,0,0,0,2018,2018,True
1484,NM Hauritz (AUS),3,2,0,6,4,3.0,10,60.0,0,0,0,1,0,2009,2009,False
602,SM Pollock (SA),12,9,2,86,36,12.28,70,122.85,0,0,3,4,4,2005,2008,True
659,AR McBrine (IRE),19,11,5,73,14,12.16,70,104.28,0,0,1,6,1,2014,2017,True
74,Sarfaraz Ahmed (PAK),58,41,12,812,89,28.0,641,126.67,0,3,2,79,15,2010,2019,True
125,V Sibanda (ZIM),26,26,1,511,59,20.44,503,101.59,0,1,3,52,11,2007,2016,False
430,DF Watts (SCOT),11,9,0,137,46,15.22,127,107.87,0,0,2,13,3,2007,2012,False
398,SP Narine (WI),51,23,8,155,30,10.33,138,112.31,0,0,2,13,5,2012,2019,False
1823,A Orfila (GIBR),1,1,0,0,0,0.0,2,0.0,0,0,1,0,0,2019,2019,False



Dataframe: 'odi_bat'

Total null values: 129 (5.18%)
Total duplicate values: 0
Shape: 2490 rows × 15 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2490 entries, 0 to 2489
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      2490 non-null   object 
 1   Mat         2490 non-null   int64  
 2   Inns        2490 non-null   int64  
 3   NO          2490 non-null   int64  
 4   Runs        2490 non-null   int64  
 5   HS          2490 non-null   int64  
 6   Ave         2370 non-null   float64
 7   BF          2490 non-null   int64  
 8   SR          2481 non-null   float64
 9   100         2490 non-null   int64  
 10  50          2490 non-null   int64  
 11  0           2490 non-null   int64  
 12  span_start  2490 non-null   int64  
 13  span_end    2490 non-null   int64  
 14  hs_not_out  2490 non-null   bool   
dtypes: bool(1), float64(2), int64(

Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,BF,SR,100,50,0,span_start,span_end,hs_not_out
2416,LB Taylor (ENG),2,1,1,1,1,,6,16.66,0,0,0,1986,1986,True
1370,MW Rushmere (SA),4,4,0,78,35,19.5,174,44.82,0,0,0,1992,1992,False
366,Asif Mujtaba (PAK),66,55,14,1068,113,26.04,1671,63.91,1,6,8,1986,1996,True
27,V Sehwag (Asia/ICC/INDIA),251,245,9,8273,219,35.05,7929,104.33,15,38,14,1999,2013,False
1001,Hasibul Hossain (BDESH),32,26,6,172,21,8.6,247,69.63,0,0,3,1995,2004,True
698,S Bau (PNG),21,21,1,378,59,18.9,602,62.79,0,1,4,2016,2019,False
1996,Ahsan Malik (NL),12,5,5,14,10,,27,51.85,0,0,0,2011,2014,True
2019,Asad Ali (PAK),4,2,0,13,11,6.5,16,81.25,0,0,0,2013,2013,False
1196,MJ Cosgrove (AUS),3,3,0,112,74,37.33,116,96.55,0,1,0,2006,2006,False
1077,MRJ Watt (SCOT),27,17,7,147,31,14.7,202,72.77,0,0,0,2016,2019,True



Dataframe: 'test_bat'

Total null values: 68 (2.28%)
Total duplicate values: 0
Shape: 2981 rows × 13 columns
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2981 entries, 0 to 2980
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Player      2981 non-null   object 
 1   Mat         2981 non-null   int64  
 2   Inns        2981 non-null   int64  
 3   NO          2981 non-null   int64  
 4   Runs        2981 non-null   int64  
 5   HS          2981 non-null   int64  
 6   Ave         2913 non-null   float64
 7   100         2981 non-null   int64  
 8   50          2981 non-null   int64  
 9   0           2981 non-null   int64  
 10  span_start  2981 non-null   int64  
 11  span_end    2981 non-null   int64  
 12  hs_not_out  2981 non-null   bool   
dtypes: bool(1), float64(1), int64(10), object(1)
memory usage: 282.5+ KB
-----------------------------------


Unnamed: 0,Player,Mat,Inns,NO,Runs,HS,Ave,100,50,0,span_start,span_end,hs_not_out
2436,SV Samuelson (SA),1,2,0,22,15,11.0,0,0,0,1910,1910,False
2961,RL Park (AUS),1,1,0,0,0,0.0,0,0,1,1920,1921,False
783,UDU Chandana (SL),16,24,1,616,92,26.78,0,2,0,1999,2005,False
2161,BC Hollioake (ENG),2,4,0,44,28,11.0,0,0,1,1997,1998,False
2151,J Vine (ENG),2,3,2,46,36,46.0,0,0,0,1912,1912,False
1015,RIC Holder (WI),11,17,2,380,91,25.33,0,2,1,1997,1999,False
2030,JA Morkel (SA),1,1,0,58,58,58.0,0,1,0,2009,2009,False
1064,SCG MacGill (AUS),44,47,11,349,43,9.69,0,0,12,1998,2008,False
1517,SR Patel (ENG),6,9,0,151,42,16.77,0,0,2,2012,2015,False
778,TW Jarvis (NZ),13,22,1,625,182,29.76,1,2,6,1965,1973,False




#### Now that the dtype is correct and null values are gone, let's do some more feature engineering now.
#### -> Starting from Player column

In [297]:
t20_bat['Player'].head()

0           V Kohli (INDIA)
1         RG Sharma (INDIA)
2           MJ Guptill (NZ)
3    Shoaib Malik (ICC/PAK)
4          BB McCullum (NZ)
Name: Player, dtype: object

##### As you can see, Player column have two components:
- Player Name
- Country Code with some ICC/Asia kind of noise.

#### We need to split this into 'Player_Name' and 'Country_Name' (NOT Country Code)

In [298]:
# import re

# pattern = re.compile(r"^(.*?)\s*\((?:.*/)?([A-Z]+)\)$")

In [None]:
import re

for name, df in df_dict.items():
    print(f"\n\n{name} split started...")

    # Clean leading numbering in the whole Player field
    df["Player"] = (
        df["Player"]
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)   # remove at start
        .str.strip()
    )

    # Split once at first '('
    df[['Player_Name', 'Country_Code']] = df['Player'].str.split('(', n=1, expand=True)

    # Clean Player_Name
    df['Player_Name'] = (
        df['Player_Name']
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)  # just in case numbering here too
        .str.strip()
    )

    # Clean Country_Code
    df['Country_Code'] = (
        df['Country_Code']
        .str.replace(r"^\d+[\)\]]\s*", "", regex=True)  # remove 1) etc
        .str.rstrip(')')
        .str.split('/')
        .str[-1]
        .str.replace(r"^\d+[\(\[]\s*", "", regex=True)
        .str.strip()
    )

    print(f'{name} split is done.\n')




t20_bat split started...
t20_bat split is done.



odi_bat split started...
odi_bat split is done.



test_bat split started...
test_bat split is done.



In [300]:
for name, df in df_dict.items():
    print(f'\nDataFrame: {name}\n')
    print('-'*60)
    print('Unique Player Names: ')
    display(df['Player_Name'].unique())
    print('-'*50)
    print('Unique Country Codes:')
    display(df['Country_Code'].unique())
    print('='*100)


DataFrame: t20_bat

------------------------------------------------------------
Unique Player Names: 


array(['V Kohli', 'RG Sharma', 'MJ Guptill', ..., 'Taijul Islam',
       'Ziaur Rehman', 'M Zondeki'], shape=(1843,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'NZ', 'PAK', 'AUS', 'ENG', 'AFG', 'SA', 'IRE', 'SL',
       'ZIM', 'WI', 'World', 'BDESH', 'SCOT', 'NL', 'UAE', 'NEPAL',
       'KENYA', 'PNG', 'OMAN', 'HKG', 'CAN', 'QAT', 'MAL', 'SGP', 'NAM',
       'VAN', 'JER', 'KUW', 'ICC', 'USA', 'BMUDA', 'Mald', 'Moz', 'CZK-R',
       'Aut', 'ITA', 'GUE', 'ESP', 'GER', 'DEN', 'Mex', 'Fin', 'BAH',
       'ROM', 'UGA', 'MWI', 'PNM', 'Saudi', 'THAI', 'PORT', 'GIBR', 'NGA',
       'Botsw', '', 'LUX', 'Serb', 'Blz', 'Caym', 'CRC', 'NOR', 'Arg',
       'Samoa', 'MALTA', 'Peru', 'Chile', 'Belg', 'PHI', 'BHU', 'Ghana',
       'TKY', '(PAK', '(DEN'], dtype=object)


DataFrame: odi_bat

------------------------------------------------------------
Unique Player Names: 


array(['SR Tendulkar', 'KC Sangakkara', 'RT Ponting', ..., 'R Walters',
       'CM Willoughby', 'Zakir Hossain'], shape=(2482,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'SL', 'ICC', 'PAK', 'SA', 'WI', 'AUS', 'NZ', 'IRE',
       'BDESH', 'ZIM', 'ENG', 'KENYA', 'AFG', 'SCOT', 'CAN', 'NL', 'UAE',
       'PNG', 'HKG', 'BMUDA', 'USA', 'NAM', 'NEPAL', 'OMAN', '(PAK',
       'EAf', '(UAE'], dtype=object)


DataFrame: test_bat

------------------------------------------------------------
Unique Player Names: 


array(['SR Tendulkar', 'RT Ponting', 'JH Kallis', ..., 'P Wilson',
       'CS Wimble', 'Zahir Khan'], shape=(2966,), dtype=object)

--------------------------------------------------
Unique Country Codes:


array(['INDIA', 'AUS', 'SA', 'ENG', 'SL', 'WI', 'PAK', 'NZ', 'ZIM',
       'BDESH', 'ICC', 'AFG', 'IRE', '(PAK'], dtype=object)



### Preprocessing

In [301]:
# import pandas as pd

# # URL from the gist
# url = "https://gist.githubusercontent.com/radcliff/f09c0f88344a7fcef373/raw/wikipedia-iso-country-codes.csv"
# df_codes = pd.read_csv(url)  # columns: country name, alpha-2, alpha-3, etc.
# iso_map = dict(zip(df_codes['Alpha-3 code'].str.upper(), df_codes['English short name lower case'].str.title()))