In [1]:
import time
import difflib
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

#to display all rows columns 
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

In [2]:
len(pd.read_csv('raw_playerprofile_dataset.csv'))

13335

In [3]:
df = pd.read_csv('raw_playerprofile_dataset.csv')

In [4]:
all_roles = list(df.role.unique())

standard_roles = [' Batsman ', ' -- ', ' Bowler ', ' Captain ',
                  ' Batting Allrounder ', ' WK-Batsman ', 
                  ' Bowling Allrounder ', ' Vice captain ']

misaligned_roles = [i for i in all_roles if i not in standard_roles ]

In [5]:
# for all misaligned roles
df['birth_place'] = df.apply(lambda x: x['height'] if x['role'] in misaligned_roles else x['birth_place'], axis = 1) 
df['height'] = df.apply(lambda x: x['role'] if x['role'] in misaligned_roles else x['height'], axis = 1) 
df['role'] = df.apply(lambda x: x['bat_style'] if x['role'] in misaligned_roles else x['role'], axis = 1) 
df['bat_style'] = df.apply(lambda x: x['bowl_style'] if x['role'] in misaligned_roles else x['bat_style'], axis = 1) 

# bat - bowl style misaligned
df['bowl_style'] = df['bowl_style'].apply(lambda x: x if (x[:6] == ' Right') | (x[:6] == ' Left-') else ' -- ' )
df['bat_style'] = df.apply(lambda x: x['bowl_style'] if x['bowl_style'] == ' Right Handed Bat ' else x['bat_style'], axis = 1)
df['bowl_style'] = df.apply(lambda x: ' -- ' if ((x['bat_style'] == ' Right Handed Bat ') & (x['bowl_style'] == ' Right Handed Bat ')) else x['bowl_style'], axis = 1)

In [6]:
df.head()

Unnamed: 0,player_id,player,img_url,born,birth_place,height,role,bat_style,bowl_style
0,25,Sachin Tendulkar,/a/img/v1/152x152/i1/c171004/sachin-tendulkar.jpg,"Apr 24, 1973 (48 years)","Bombay (now Mumbai), Maharashtra",5 ft 5 in,Batsman,Right Handed Bat,Right-arm legbreak
1,26,Virender Sehwag,/a/img/v1/152x152/i1/c171002/virender-sehwag.jpg,"Oct 20, 1978 (42 years)",Delhi,5 ft 8 in (173 cm),Batsman,Right Handed Bat,Right-arm offbreak
2,27,Rahul Dravid,/a/img/v1/152x152/i1/c156286/rahul-dravid.jpg,"Jan 11, 1973 (48 years)","Indore, Madhya Pradesh",--,Batsman,Right Handed Bat,Right-arm offbreak
3,28,VVS Laxman,/a/img/v1/152x152/i1/c156770/vvs-laxman.jpg,"Nov 01, 1974 (46 years)","Hyderabad, Andhra Pradesh",--,--,Right Handed Bat,Right-arm offbreak
4,29,Sourav Ganguly,/a/img/v1/152x152/i1/c171006/sourav-ganguly.jpg,"Jul 08, 1972 (49 years)","Calcutta (now Kolkata), Bengal",5 ft 11 in,Batsman,Left Handed Bat,Right-arm medium


In [7]:
df.bat_style.unique()

array([' Right Handed Bat ', ' Left Handed Bat ', ' Batsman ',
       ' Bowling Allrounder ', ' -- '], dtype=object)

In [8]:
df.bowl_style.unique()

array([' Right-arm legbreak ', ' Right-arm offbreak ',
       ' Right-arm medium ', ' Right-arm fast ',
       ' Right-arm fast-medium ', ' Left-arm orthodox ', ' -- ',
       ' Left-arm fast-medium ', ' Left-arm chinaman ',
       ' Left-arm medium ', ' Left-arm fast '], dtype=object)

In [9]:
# cricsheet data - ex: ipl, t20i

In [10]:
ipl_df = pd.read_csv("IPL_ball_by_ball_updated.csv")
t20i_df = pd.read_csv("T20I_ball_by_ball_updated.csv")

In [11]:
# striker, non-striker, bowler
strikers = list(ipl_df.striker.unique()) + list(t20i_df.striker.unique())
non_strikers = list(ipl_df.non_striker.unique()) + list(t20i_df.non_striker.unique())
bowler = list(ipl_df.bowler.unique()) + list(t20i_df.bowler.unique())

all_players = set(strikers).union(set(non_strikers)).union(set(bowler))

In [14]:
df.head()

Unnamed: 0,player_id,player,img_url,born,birth_place,height,role,bat_style,bowl_style
0,25,Sachin Tendulkar,/a/img/v1/152x152/i1/c171004/sachin-tendulkar.jpg,"Apr 24, 1973 (48 years)","Bombay (now Mumbai), Maharashtra",5 ft 5 in,Batsman,Right Handed Bat,Right-arm legbreak
1,26,Virender Sehwag,/a/img/v1/152x152/i1/c171002/virender-sehwag.jpg,"Oct 20, 1978 (42 years)",Delhi,5 ft 8 in (173 cm),Batsman,Right Handed Bat,Right-arm offbreak
2,27,Rahul Dravid,/a/img/v1/152x152/i1/c156286/rahul-dravid.jpg,"Jan 11, 1973 (48 years)","Indore, Madhya Pradesh",--,Batsman,Right Handed Bat,Right-arm offbreak
3,28,VVS Laxman,/a/img/v1/152x152/i1/c156770/vvs-laxman.jpg,"Nov 01, 1974 (46 years)","Hyderabad, Andhra Pradesh",--,--,Right Handed Bat,Right-arm offbreak
4,29,Sourav Ganguly,/a/img/v1/152x152/i1/c171006/sourav-ganguly.jpg,"Jul 08, 1972 (49 years)","Calcutta (now Kolkata), Bengal",5 ft 11 in,Batsman,Left Handed Bat,Right-arm medium


In [13]:
# sample closest name: cricbuzz vs cricinfo
difflib.get_close_matches(df.player[0], all_players)[0]

'SR Tendulkar'

In [15]:
s = time.time()
df['closest_cricsheet_names'] = df['player'].apply(lambda x: difflib.get_close_matches(x, all_players))
e = time.time()

print(e-s)

164.9946792125702


In [16]:
df.head()

Unnamed: 0,player_id,player,img_url,born,birth_place,height,role,bat_style,bowl_style,closest_cricsheet_names
0,25,Sachin Tendulkar,/a/img/v1/152x152/i1/c171004/sachin-tendulkar.jpg,"Apr 24, 1973 (48 years)","Bombay (now Mumbai), Maharashtra",5 ft 5 in,Batsman,Right Handed Bat,Right-arm legbreak,[SR Tendulkar]
1,26,Virender Sehwag,/a/img/v1/152x152/i1/c171002/virender-sehwag.jpg,"Oct 20, 1978 (42 years)",Delhi,5 ft 8 in (173 cm),Batsman,Right Handed Bat,Right-arm offbreak,"[V Sehwag, Virandeep Singh, Joginder Sharma]"
2,27,Rahul Dravid,/a/img/v1/152x152/i1/c156286/rahul-dravid.jpg,"Jan 11, 1973 (48 years)","Indore, Madhya Pradesh",--,Batsman,Right Handed Bat,Right-arm offbreak,"[R Dravid, TH David, H Davids]"
3,28,VVS Laxman,/a/img/v1/152x152/i1/c156770/vvs-laxman.jpg,"Nov 01, 1974 (46 years)","Hyderabad, Andhra Pradesh",--,--,Right Handed Bat,Right-arm offbreak,"[VVS Laxman, S Lomani, S Lamichhane]"
4,29,Sourav Ganguly,/a/img/v1/152x152/i1/c171006/sourav-ganguly.jpg,"Jul 08, 1972 (49 years)","Calcutta (now Kolkata), Bengal",5 ft 11 in,Batsman,Left Handed Bat,Right-arm medium,[SC Ganguly]


In [17]:
df['closest_cricsheet_name'] = df['closest_cricsheet_names'].apply(lambda x: x[0] if len(x) > 0 else ' -- ')

In [19]:
df['img_url'] = df['img_url'].apply(lambda x: 'cricbuzz.com' + x)

In [20]:
df.head()

Unnamed: 0,player_id,player,img_url,born,birth_place,height,role,bat_style,bowl_style,closest_cricsheet_names,closest_cricsheet_name
0,25,Sachin Tendulkar,cricbuzz.com/a/img/v1/152x152/i1/c171004/sachin-tendulkar.jpg,"Apr 24, 1973 (48 years)","Bombay (now Mumbai), Maharashtra",5 ft 5 in,Batsman,Right Handed Bat,Right-arm legbreak,[SR Tendulkar],SR Tendulkar
1,26,Virender Sehwag,cricbuzz.com/a/img/v1/152x152/i1/c171002/virender-sehwag.jpg,"Oct 20, 1978 (42 years)",Delhi,5 ft 8 in (173 cm),Batsman,Right Handed Bat,Right-arm offbreak,"[V Sehwag, Virandeep Singh, Joginder Sharma]",V Sehwag
2,27,Rahul Dravid,cricbuzz.com/a/img/v1/152x152/i1/c156286/rahul-dravid.jpg,"Jan 11, 1973 (48 years)","Indore, Madhya Pradesh",--,Batsman,Right Handed Bat,Right-arm offbreak,"[R Dravid, TH David, H Davids]",R Dravid
3,28,VVS Laxman,cricbuzz.com/a/img/v1/152x152/i1/c156770/vvs-laxman.jpg,"Nov 01, 1974 (46 years)","Hyderabad, Andhra Pradesh",--,--,Right Handed Bat,Right-arm offbreak,"[VVS Laxman, S Lomani, S Lamichhane]",VVS Laxman
4,29,Sourav Ganguly,cricbuzz.com/a/img/v1/152x152/i1/c171006/sourav-ganguly.jpg,"Jul 08, 1972 (49 years)","Calcutta (now Kolkata), Bengal",5 ft 11 in,Batsman,Left Handed Bat,Right-arm medium,[SC Ganguly],SC Ganguly


In [21]:
# df.to_csv('players_dump_updated.csv', index = False)