In [22]:
#This file uses web scraping to extract data about matches and teams from the ESPNcricinfo website.
#It follows the ETL process: Extract, Transform and Load.

**This notebook follows the ETL (Extract, Transform, Load) process.**
**Data is extracted from web sources, transformed to JSON (which makes it more flexible), and then saved locally.**

# This File does Extraction

In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import codecs
import re
import json
import requests
import pandas as pd
from lxml import html,etree
from webdriver_manager.chrome import ChromeDriverManager

**Set webdriver options so that no browser window pops up when the driver calls `get`**

In [24]:
# Chrome options shared by every webdriver created in this notebook.
# '--headless' asks Chrome to run without opening a visible browser window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')

**Extract match results data of World Cup**

In [27]:
# Results page for the 2022/23 season, filtered by trophy id 89.
url = 'https://www.espncricinfo.com/records/season/team-match-results/2022to23-2022to23?trophy=89'

# read_html returns one DataFrame per <table> found on the page;
# the first table is the one used as the match results.
match_results = pd.DataFrame(pd.read_html(url)[0])

In [28]:
# Output key for each positional column of match_results (index in this tuple
# == column index read from the DataFrame).
match_fields = ('team 1', 'team 2', 'winner', 'margin', 'ground', 'matchDate', 'scorecard')

# One record per row of the results table. Built with comprehensions so that
# no variable named `dict` is created (the previous loop shadowed the builtin
# for the rest of the kernel session).
match_rows = [
    {field: match_results.iloc[row, col] for col, field in enumerate(match_fields)}
    for row in range(len(match_results))
]

# Final shape written to disk: a one-element list holding {"matchSummary": [...]}.
match = {"matchSummary" : match_rows}
match_res = [match]

**Dump JSON file of match results**

In [12]:
# Persist the match summary structure as JSON.
results_path = 't20_json_files/t20_world_cup_match_results.json'
with open(results_path, 'w') as results_file:
    json.dump(match_res, results_file)

**Get all the links from the match result webpage**

In [32]:
#get all the links in the webpage
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
try:
    driver.get(url)
    # get_attribute('href') returns None for anchors that have no href;
    # those are dropped here so later substring filters only ever see strings.
    list_of_scorecards = [
        href
        for href in (anchor.get_attribute('href') for anchor in driver.find_elements(By.TAG_NAME,'a'))
        if href
    ]
finally:
    # always shut the browser down, even if loading or reading the page failed
    driver.quit()
#this cell runs for 3-4 minutes to get all the links

**Keep only the scorecard links from the links collected above**

In [49]:
#extracts links of scorecards of each match
#only 42 matches were played partially or completely, no ball was bowled in 3 matches hence their data is not added here
#entries can be None (anchors without an href), so guard before the substring test
scorecard_links = [
    link
    for link in list_of_scorecards
    if link and "icc-men-s-t20-world-cup-2022-23" in link
]

**Extract Batting and Bowling scorecard for each match from each page**

In [50]:
#variable all_batting_scorecard has the batting scorecard of both innings of all matches
#variable all_bowling_scorecard has the bowling scorecard of both innings of all matches

REQUEST_TIMEOUT_SECONDS = 30  #fail instead of hanging forever on a stalled connection


def _batting_rows(table, match_name, team):
    """Return one record per batter from a raw innings batting table.

    table      -- DataFrame for one innings as returned by pd.read_html
    match_name -- value stored under "match" in every record
    team       -- value stored under "teamInnings" in every record

    Only the first 8 columns are used and rows containing NaN are dropped.
    The rows left at the bottom are treated as non-batter trailer rows:
    two are removed by default, but only one when the last row does not
    contain 'Fall of wickets', or when there are exactly 12 rows and row 10
    does not contain 'Did not bat'.
    """
    cleaned = table.iloc[:, 0:8].dropna()
    if cleaned.empty:
        return []

    trailer_rows = 2
    if 'Fall of wickets' not in str(cleaned.iloc[-1, 0]):
        trailer_rows = 1
    elif len(cleaned) == 12 and 'Did not bat' not in str(cleaned.iloc[10, 0]):
        trailer_rows = 1

    batters = cleaned.iloc[:max(len(cleaned) - trailer_rows, 0)]

    # column 4 of the table is intentionally not exported
    return [
        {
            "match" : match_name,
            "teamInnings" : team,
            "battingPos" : i + 1,
            "batsmanName" : batters.iloc[i,0],
            "dismissal" : batters.iloc[i,1],
            "runs" : batters.iloc[i,2],
            "balls" : batters.iloc[i,3],
            "4s" : batters.iloc[i,5],
            "6s" : batters.iloc[i,6],
            "SR" : batters.iloc[i,7],
        }
        for i in range(len(batters))
    ]


def _bowling_rows(table, match_name, team):
    """Return one record per bowler from a raw innings bowling table.

    Rows whose first cell starts with a digit are skipped (per the table
    notes below, the bowling tables also carry wicket commentary rows).
    """
    records = []
    for i in range(len(table)):
        if table.iloc[i,0][0].isdigit():
            continue
        row = table.iloc[i,:]
        records.append({
            "match" : match_name,
            "bowlingTeam" : team,
            "bowlerName" : row.iloc[0],
            "overs" : row.iloc[1],
            "maiden" : row.iloc[2],
            "runs" : row.iloc[3],
            "wickets" : row.iloc[4],
            "economy" : row.iloc[5],
            "0s" : row.iloc[6],
            "4s" : row.iloc[7],
            "6s" : row.iloc[8],
            "wides" : row.iloc[9],
            "noBalls" : row.iloc[10],
        })
    return records


all_batting_scorecard = []
all_bowling_scorecard = []


for link in scorecard_links:
    r = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(r.content,'html.parser')

    #variable match_name has match name (text of the first child of the title <h1>)
    title_tag = soup.find_all("h1",{"class":"ds-text-title-xs ds-font-bold ds-mb-2 ds-m-1"})[0]
    match_name = next((child.string for child in title_tag), title_tag)

    #innings has the order in which teams batted
    innings = [
        bat.string
        for bat in soup.find_all("span",{"class":"ds-text-title-xs ds-font-bold ds-capitalize"})
    ]
    #use the same short name for the U.A.E. as the squad data does
    for idx in (0, 1):
        if innings[idx] == 'United Arab Emirates':
            innings[idx] = 'U.A.E.'

    #0th table -> Batting Scorecard of 1st innings
    #1st table -> Bowling Scorecard of 1st innings with wicket commentary
    #2nd table -> Batting Scorecard of 2nd innings
    #3rd table -> Bowling Scorecard of 2nd innings with wicket commentary
    #4th table -> Ground info, Umpire Info and Match info
    #5th table -> Brief Batter Info of 2nd innings
    #6th table -> Bowling Scorecard of 2nd innings
    #7th table -> Partnership of batters in 2nd innings
    #8th and 9th table -> Table of preliminary round groups
    #10th and 11th table ->Table of group stages
    data1 = pd.read_html(link)

    #batting: the team listed first bats in table 0, the second team in table 2
    batting_card3 = (
        _batting_rows(data1[0], match_name, innings[0])
        + _batting_rows(data1[2], match_name, innings[1])
    )
    all_batting_scorecard.append({"battingSummary" : batting_card3})

    #bowling: the team batting second bowls in the 1st innings and vice versa
    bowling_card3 = (
        _bowling_rows(data1[1], match_name, innings[1])
        + _bowling_rows(data1[3], match_name, innings[0])
    )
    all_bowling_scorecard.append({"bowlingSummary" : bowling_card3})

#ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
#is pinned to UTF-8 instead of depending on the platform default
with open('t20_json_files/t20_wc_batting_scorecard.json','w',encoding='utf-8') as file:
    json.dump(all_batting_scorecard,file,default=str,ensure_ascii=False)

with open('t20_json_files/t20_wc_bowling_scorecard.json','w',encoding='utf-8') as file:
    json.dump(all_bowling_scorecard,file,default=str,ensure_ascii=False)


#abandoned matches are not included in the list at all

**Extract player data from squad page**

In [39]:
#scrape the links from squad page

url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/squads'
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
try:
    driver.get(url)
    list_of_squads = [
        anchor.get_attribute('href')
        for anchor in driver.find_elements(By.TAG_NAME,'a')
    ]
finally:
    # always shut the browser down, even if loading or reading the page failed
    driver.quit()

#filter links containing squad data
#(href is None for anchors without one, so guard before the substring test)
squads = [s for s in list_of_squads if s and 'series-squads' in s]

**Extract player info from each squad**

In [55]:
#all_squad_player_info stores all player details

REQUEST_TIMEOUT_SECONDS = 30  #fail instead of hanging forever on a stalled connection


def _player_name_from_url(profile_url):
    """Build a display name from the last path segment of a player profile URL.

    The segment is split on '-', its final piece is dropped, and the remaining
    pieces are joined with spaces after upper-casing their first letter.
    """
    pieces = profile_url.split('/')[-1].split('-')[:-1]
    return ' '.join(piece[:1].upper() + piece[1:] for piece in pieces)


def _short_description(bio_blocks):
    """Return the first three '.'-separated pieces of the bio, each ending in '.'.

    bio_blocks is the list of bio texts found on the page. When it is empty an
    empty list is returned, which keeps the JSON output identical to what this
    cell produced before for players without a bio.
    """
    if not bio_blocks:
        return []
    sentences = ' '.join(bio_blocks).split('.')
    return ''.join(sentence + '.' for sentence in sentences[:3])


all_squad_player_info = []

#collect every href from each squad page with a single browser session
squad_page_links = []
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
try:
    for link in squads:
        driver.get(link)
        hrefs = [anchor.get_attribute('href') for anchor in driver.find_elements(By.TAG_NAME,'a')]
        squad_page_links.append((link, hrefs))
finally:
    # always shut the browser down, even if a page failed to load
    driver.quit()

for link, list_of_links in squad_page_links:

    #scrape team_name from the squad page
    r = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(r.content,'html.parser')
    squad_name = soup.find_all('span',{'class':'ds-text-title-xs ds-font-bold ds-text-typo ds-capitalize'})
    #the last matching span is used, and only when more than one span matched
    team_name = (squad_name[-1].string or '') if len(squad_name) > 1 else ''
    #drop the last six characters of the heading (presumably a " Squad" suffix -- TODO confirm)
    team_name = team_name[:-6]
    if team_name == 'United Arab Emirates':
        team_name = 'U.A.E.'

    #scrape links of cricketers from the squad page
    #(de-duplicated in page order so the output order is the same on every run;
    # href is None for anchors without one, so guard before the substring test)
    seen = set()
    cricketer_list = []
    for href in list_of_links:
        if href and 'cricketers' in href and href not in seen:
            seen.add(href)
            cricketer_list.append(href)

    for cric in cricketer_list:
        player_name = _player_name_from_url(cric)

        #TLS verification is left on, as for the scorecard pages fetched from the same site
        player_page = requests.get(cric, timeout=REQUEST_TIMEOUT_SECONDS)
        temp_soup = BeautifulSoup(player_page.content, 'html.parser')
        data_cric = temp_soup.find_all('span','ds-text-title-s ds-font-bold ds-text-typo')
        data_title = temp_soup.find_all('p','ds-text-tight-m ds-font-regular ds-uppercase ds-text-typo-mid3')

        #pair every title with the value at the same position; zip stops at the shorter list
        extracted_player_data = {
            title_tag.string: value_tag.string
            for title_tag, value_tag in zip(data_title, data_cric)
        }

        desc = _short_description(
            [bio.text for bio in temp_soup.find_all('div','ci-player-bio-content')]
        )

        #players without a listed bowling style get an empty list, as before
        extracted_player_data.setdefault('Bowling Style', [])

        all_squad_player_info.append({
            "name" : player_name,
            "team" : team_name,
            "battingStyle" : extracted_player_data['Batting Style'],
            "bowlingStyle" : extracted_player_data['Bowling Style'],
            "playingRole" : extracted_player_data['Playing Role'],
            "description" : desc,
        })

with open('t20_json_files/t20_wc_all_squad_player_info.json','w') as file:
    json.dump(all_squad_player_info,file)

