In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from pyswarms.single.global_best import GlobalBestPSO 

In [3]:
# Load the per-player, per-match profile table.
# player_id holds short codes such as "00015688" and "fffa744b". Reading it
# explicitly as text keeps leading zeros and stops pandas' chunked type
# inference from producing a mix of ints and strings in that column.
df_original_data = pd.read_csv('mw_pw_profiles.csv', dtype={'player_id': str})
print("Original Data")
df_original_data

Original Data


Unnamed: 0,player_id,match_id,gender,balls_per_over,start_date,series_name,match_type,name_x,runs_scored,player_out,...,key_pulse,key_pulse_2,full_name,batting_style,bowling_style,playing_role,teams,fantasy_score_batting,fantasy_score_bowling,fantasy_score_total
0,00015688,1158348,female,6,2018-08-21,Botswana Cricket Association Women's T20I Series,T20,M King,10,1,...,,,,,,,[],5,8,17
1,00015688,1182644,female,6,2019-05-05,ICC Women's T20 World Cup Africa Region Qualifier,T20,M King,8,1,...,,,,,,,[],3,0,7
2,00015688,1275107,female,6,2021-09-09,ICC Women's T20 World Cup Africa Region Qualifier,T20,M King,1,0,...,,,,,,,[],1,0,5
3,00015688,1275113,female,6,2021-09-15,ICC Women's T20 World Cup Africa Region Qualifier,T20,M King,0,0,...,,,,,,,[],0,0,4
4,00015688,1275125,female,6,2021-09-14,ICC Women's T20 World Cup Africa Region Qualifier,T20,M King,1,1,...,,,,,,,[],-5,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416707,fffa744b,1473475,male,6,2025-04-20,Indian Premier League,T20,Naman Dhir,0,0,...,,,,,,,[],0,0,4
416708,fffa744b,1473478,male,6,2025-04-23,Indian Premier League,T20,Naman Dhir,0,0,...,,,,,,,[],0,8,12
416709,fffa744b,1473482,male,6,2025-04-27,Indian Premier League,T20,Naman Dhir,25,0,...,,,,,,,[],37,8,49
416710,fffa744b,1473487,male,6,2025-05-01,Indian Premier League,T20,Naman Dhir,0,0,...,,,,,,,[],0,0,4


In [18]:
# List every column label of the loaded table.
df_original_data.columns

Index(['player_id', 'match_id', 'gender', 'balls_per_over', 'start_date',
       'series_name', 'match_type', 'name_x', 'runs_scored', 'player_out',
       'balls_faced', 'fours_scored', 'sixes_scored', 'catches_taken',
       'run_out_direct', 'run_out_throw', 'stumpings_done', 'out_kind',
       'dot_balls_as_batsman', 'order_seen', 'balls_bowled', 'runs_conceded',
       'wickets_taken', 'bowled_done', 'lbw_done', 'maidens',
       'dot_balls_as_bowler', 'player_team', 'opposition_team', 'name_y',
       'unique_name', 'key_bcci', 'key_bcci_2', 'key_bigbash', 'key_cricbuzz',
       'key_cricheroes', 'key_crichq', 'key_cricinfo', 'key_cricinfo_2',
       'key_cricinfo_3', 'key_cricingif', 'key_cricketarchive',
       'key_cricketarchive_2', 'key_cricketworld', 'key_nvplay',
       'key_nvplay_2', 'key_opta', 'key_opta_2', 'key_pulse', 'key_pulse_2',
       'full_name', 'batting_style', 'bowling_style', 'playing_role', 'teams',
       'fantasy_score_batting', 'fantasy_score_bowling',


In [20]:
# Number of distinct unique_name values (nunique skips missing values by default).
df_original_data['unique_name'].nunique()

11769

In [22]:
# Show the first 20 unique_name values.
df_original_data['unique_name'].head(20)

0          M King
1          M King
2          M King
3          M King
4          M King
5       M Mwamadi
6       M Mwamadi
7       M Mwamadi
8       M Mwamadi
9       M Mwamadi
10      M Mwamadi
11      M Mwamadi
12     Hongki Kim
13    M Tarimiala
14    M Tarimiala
15    M Tarimiala
16    M Tarimiala
17    M Tarimiala
18    M Tarimiala
19    M Tarimiala
Name: unique_name, dtype: object

In [24]:
# Canonicalise unique_name in three passes and write it back to the column.
cleaned_names = df_original_data['unique_name'].str.strip()             # trim both ends
cleaned_names = cleaned_names.str.replace(r'\s+', ' ', regex=True)     # collapse whitespace runs to one space
# Title-case each word; note this lower-cases the remaining letters of a word
# (e.g. "SPD Smith" becomes "Spd Smith").
df_original_data['unique_name'] = cleaned_names.str.title()

In [25]:
# Distinct unique_name count again, now that the column has been normalised.
df_original_data['unique_name'].nunique()

11769

In [26]:
# Number of distinct player_id values, for comparison with the name count.
df_original_data['player_id'].nunique()

11771

In [32]:
# Sanity check: is any unique_name attached to more than one player_id?
# dropna=False keeps rows whose unique_name is missing as a group of their own;
# groupby discards them by default, so ids without a name would never be checked.
name_to_many_ids = (
    df_original_data.groupby('unique_name', dropna=False)['player_id']
      # list each id once (a player has many rows), ignoring missing ids
      .agg(ids=lambda s: sorted(s.dropna().unique()), id_count='nunique')
      .query('id_count > 1')        # keep only names tied to several ids
      .reset_index()
      .sort_values('id_count', ascending=False)
)

# The grouping key is unique_name, so label the count accordingly.
print(f"Problematic unique_name rows: {len(name_to_many_ids)}")

Problematic full_name rows: 0


In [33]:
# Parse start_date into datetimes; values that cannot be parsed become NaT
# because of errors='coerce'.
df_original_data['start_date'] = pd.to_datetime(
    df_original_data['start_date'], errors='coerce'
)

# Order the rows chronologically and renumber them from 0.
df_original_data = (
    df_original_data
      .sort_values(by=['start_date'])
      .reset_index(drop=True)
)

In [100]:
# Keep only rows whose start_date is on or after the cutoff.
# Rows with NaT dates compare False and are therefore excluded.
cutoff_date = pd.to_datetime('2023-01-01')

is_on_or_after_cutoff = df_original_data['start_date'].ge(cutoff_date)
df_latest = df_original_data.loc[is_on_or_after_cutoff]

In [101]:
# Boolean Series marking the df_latest rows whose team is Australia.
# The team label is cast to text (so missing values do not break .str),
# trimmed and case-folded before the comparison.
team_label = df_latest['player_team'].astype(str)
team_label = team_label.str.strip().str.casefold()
mask_aus = team_label == 'australia'

In [102]:
# Distinct (unique_name, player_id) pairs among the rows flagged by mask_aus,
# ordered by name and re-indexed from 0.
aus_name_id_pairs = df_latest.loc[mask_aus, ['unique_name', 'player_id']].drop_duplicates()
aus_players = aus_name_id_pairs.sort_values('unique_name').reset_index(drop=True)

In [103]:
# Display the first 55 rows of the Australia name/id table.
aus_players.head(55)

Unnamed: 0,unique_name,player_id
0,A Gardner,bc969efb
1,A King,83558266
2,A Sutherland,65d9b6b6
3,A Zampa,14f96089
4,Ac Agar,a2421394
5,Aj Healy,321644de
6,Aj Turner,ff1e12a0
7,Am Hardie,d405c0d5
8,At Carey,69d03465
9,Bj Dwarshuis,e1b9f3a9


In [104]:
# Squad members to look up, spelled the way unique_name is stored after the
# title-casing applied earlier in the notebook.
australlia_playing_players = ['Nm Lyon', 'Ma Starc', 'C Green', 'Pj Cummins', 'At Carey', 'Bj Webster', 'Jp Inglis', 'Ut Khawaja', 'Mp Kuhnemann', 'Tm Head', 'Sa Abbott', 'Sm Boland', 'Spd Smith', 'Jr Hazlewood', 'S Konstas', 'M Labuschagne']

filtered = aus_players.loc[aus_players['unique_name'].isin(australlia_playing_players)]

# isin() silently skips names that have no match, so report them instead of
# letting the exported squad come out short without notice.
missing_aus_names = sorted(set(australlia_playing_players) - set(filtered['unique_name']))
if missing_aus_names:
    print(f"Squad names not found in aus_players: {missing_aus_names}")

In [105]:
# Renumber the filtered rows from 0, discarding the previous index.
filtered = filtered.reset_index(drop=True)

In [106]:
# Display the matched Australia squad rows.
filtered

Unnamed: 0,unique_name,player_id
0,At Carey,69d03465
1,Bj Webster,56b93d46
2,C Green,eaa76d3c
3,Jp Inglis,989889ff
4,Jr Hazlewood,03806cf8
5,M Labuschagne,fa433be6
6,Ma Starc,3fb19989
7,Mp Kuhnemann,7b953689
8,Nm Lyon,96a6a7ad
9,Pj Cummins,ded9240e


In [107]:
# Write the matched squad to CSV without the index column.
# NOTE(review): the file name is spelled 'australlia' — confirm what any
# downstream reader expects before renaming it.
filtered.to_csv('australlia_players.csv', index=False)

In [108]:
# Boolean Series marking the df_latest rows whose team is West Indies
# (comparison ignores case and surrounding whitespace).
mask_wi = (
    df_latest['player_team']
      .astype(str)           # cast to text so missing values do not break .str
      .str.strip()           # trim spaces
      .str.casefold()        # robust lower-case
      .eq('west indies')     # True/False Series
)
# This mask used to be stored under the Australia name. Keep that alias so any
# cell that still reads `mask_aus` from here on continues to select West Indies rows.
mask_aus = mask_wi

In [109]:
# Distinct (unique_name, player_id) pairs of the West Indies rows in df_latest,
# ordered by name and re-indexed from 0.
# The team mask is computed here instead of being read from a shared variable,
# so this cell cannot pick up another team's mask when cells run out of order.
is_west_indies = (
    df_latest['player_team']
      .astype(str)           # cast to text so missing values do not break .str
      .str.strip()
      .str.casefold()
      .eq('west indies')
)
wi_players = (
    df_latest.loc[is_west_indies, ['unique_name', 'player_id']]
      .drop_duplicates()          # keep unique rows only
      .sort_values('unique_name') # alphabetical order
      .reset_index(drop=True)
)

In [110]:
# Display the West Indies name/id table.
wi_players

Unnamed: 0,unique_name,player_id
0,A Athanaze,58c2fac4
1,A Munisar,a811cb83
2,Aa Alleyne,4b0e3049
3,Aa Jangoo,88626ed2
4,Ad Russell,bbd41817
...,...,...
78,T Holder,41fd2907
79,Ta Imlach,f0f8f105
80,Tj Bishop,c68554a6
81,Y Cariah,78bb68d4


In [111]:
# Display the first 60 rows of the West Indies name/id table.
wi_players.head(60)

Unnamed: 0,unique_name,player_id
0,A Athanaze,58c2fac4
1,A Munisar,a811cb83
2,Aa Alleyne,4b0e3049
3,Aa Jangoo,88626ed2
4,Ad Russell,bbd41817
5,Ads Fletcher,1bae756b
6,Aj Hosein,4d7f517e
7,Akeem Jordan,77bf0483
8,As Joseph,b0946605
9,Ass Fletcher,42d9dd09


In [112]:
# Display the last 20 rows of the West Indies name/id table.
wi_players.tail(20)

Unnamed: 0,unique_name,player_id
63,Sc Selman,c29ada99
64,Sd Hope,1fc6ef83
65,Se Rutherford,d014d5ac
66,Sj Bruce,aeccf5cc
67,Sk Springer,4175d211
68,So Hetmyer,48a1d7b7
69,Song Yangyang,cf33e1ce
70,Sr Taylor,92cf79a8
71,Ss Connell,2f9a6bd1
72,Ss Cottrell,a1d053dd


In [113]:
# West Indies squad members to look up, one unique_name spelling per line.
wi_playing_players = [
    'Ba King',
    'Ku Carty',
    'Kc Brathwaite',
    'M Louis',
    'Jp Greaves',
    'Rl Chase',
    'Sd Hope',
    'Ta Imlach',
    'As Joseph',
    'Jnt Seales',
    'Ja Warrican',
    'S Joseph',
]

In [116]:
# Keep only the West Indies rows whose name is in the squad list.
filtered2 = wi_players.loc[wi_players['unique_name'].isin(wi_playing_players)]

# isin() silently skips names that have no match, so report them instead of
# letting the exported squad come out short without notice.
missing_wi_names = sorted(set(wi_playing_players) - set(filtered2['unique_name']))
if missing_wi_names:
    print(f"Squad names not found in wi_players: {missing_wi_names}")

In [117]:
# Renumber the filtered rows from 0, discarding the previous index.
filtered2 = filtered2.reset_index(drop=True)

In [118]:
# Display the matched West Indies squad rows.
filtered2

Unnamed: 0,unique_name,player_id
0,As Joseph,b0946605
1,Ba King,7fca84b7
2,Ja Warrican,cc23e91e
3,Jnt Seales,ded9ff1e
4,Jp Greaves,74d12124
5,Kc Brathwaite,7d5af2ea
6,Ku Carty,6020a3c6
7,M Louis,1d8c298b
8,Rl Chase,3feda4fa
9,S Joseph,97290faf


In [119]:
# Write the matched West Indies squad to CSV without the index column.
filtered2.to_csv('west_indies_players.csv', index=False)