# Web Scraping for Ashes Cricket Data

by Jason Jiang

### Ashes Team Batting Innings Data

In [1]:
# Importing packages
import requests # HTML page requesting
from bs4 import BeautifulSoup # HTML page parser
import pandas as pd # data processing
from espncricinfo.match import Match # ashes cricket match data
from urllib.parse import quote

In [2]:
# Request to the Ashes Series results page and extract HTML contents
all_ashes_url = requests.get("https://www.espncricinfo.com/records/trophy/team-series-results/the-ashes-1")
all_ashes_read = BeautifulSoup(all_ashes_url.content,"html.parser")

In [3]:
# Get Ashes series urls and Ashes series season
ashes_table = all_ashes_read.find("table",class_="ds-w-full ds-table ds-table-xs ds-table-auto ds-w-full ds-overflow-scroll ds-scrollbar-hide")
ashes_series_urls = ['https://www.espncricinfo.com'+a['href'] for a in ashes_table.find_all('a')]

ashes_rows = ashes_table.find_all('tr')
ashes_series_season = [row.find_all('td')[1].text for row in ashes_rows[1:]]

For each series url, get hyperlinks for each match in the series and extract match ids from hyperlinks.

Every cricket match has its own match id and can be extracted from the match hyperlink in the series url html pages. 

Store the match ids by series in a dictionary (keys = series, values = match id lists)

In [4]:
# Create match id dictionary
dict_series_match_ids = {}
for i in range(len(ashes_series_urls)):
    ashes_series_url = requests.get(ashes_series_urls[i]) # Request to a series page and extract HTML contents
    ashes_series_read = BeautifulSoup(ashes_series_url.content,"html.parser")
    ashes_series_div = ashes_series_read.find("div",class_="ds-p-0")
    ashes_series_matches = ashes_series_div.find_all('a',class_="ds-no-tap-higlight")
    ashes_series_match_ids = [j['href'].split('/')[-2].split('-')[-1] for j in ashes_series_matches]
    dict_series_match_ids[ashes_series_season[i]] = ashes_series_match_ids

In [5]:
# Display match id dictionary
dict_series_match_ids

{'1882/83': ['62405', '62406', '62407'],
 '1884': ['62409', '62410', '62411'],
 '1884/85': ['62412', '62413', '62414', '62415', '62416'],
 '1886': ['62417', '62418', '62419'],
 '1886/87': ['62420', '62421'],
 '1887/88': ['62422'],
 '1888': ['62423', '62424', '62425'],
 '1890': ['62428', '62429', '64135'],
 '1891/92': ['62430', '62431', '62432'],
 '1893': ['62434', '62435', '62436'],
 '1894/95': ['62437', '62438', '62439', '62440', '62441'],
 '1896': ['62445', '62446', '62447'],
 '1897/98': ['62448', '62449', '62450', '62451', '62452'],
 '1899': ['62455', '62456', '62457', '62458', '62459'],
 '1901/02': ['62460', '62461', '62462', '62463', '62464'],
 '1902': ['62465', '62466', '62467', '62468', '62469'],
 '1903/04': ['62473', '62474', '62475', '62476', '62477'],
 '1905': ['62478', '62479', '62480', '62481', '62482'],
 '1907/08': ['62491', '62492', '62493', '62494', '62495'],
 '1909': ['62496', '62497', '62498', '62499', '62500'],
 '1911/12': ['62511', '62512', '62513', '62514', '62515']

Since we now have every Ashes test match stored in a dictionary, we will look into each match and extract the data about every innings in the match. 

This is done using the Python ESPNCricinfo Library to help web scrape the required data.

The final dataset contains every Ashes batting innings with the following columns/features:

|FEATURE(S)|DESCRIPTION|
|----|----|
|Match ID|Ashes test match ID|
|Runs|Runs scored in innings by batting team|
|Wickets|Wickets taken in innings by bowling team|
|Overs|Overs bowled by bowling team|
|Balls|Balls bowled by bowling team|
|BPO|Number of balls bowled per over|
|Run Rate|Runs scored per over|
|Minutes|Time taken in innings|
|Total Extras|Runs scored not by batting team|
|No-Balls|Runs when bowler commits illegal action when bowling|
|Wides|Runs when bowler delivers too far from batter to hit|
|Byes|Runs when batters run without striking batter's bat or body|
|Leg-Byes|Runs when batters run when batter is struck on body|
|Penalty Runs|Runs awarded for breaches of cricketing laws|
|Home Team & Home Team ID|Team which is playing on own ground|
|Season & Season ID|Time of year match is played|
|Series Name & Series ID|Name of series match played in|
|Team & Team ID|Country team of innings|
|Town Area, Town Name & Town ID|Geographic location of match played|
|Ground Name & Ground ID|Name of ground match played in|



In [6]:
# Retrieve the series/season keys and match id lists values from dictionary
series_list = list(dict_series_match_ids.keys())
match_id_list = list(dict_series_match_ids.values())

# Create dataframe for dataset
ashes_team_innings_data = pd.DataFrame(columns=['match_id','inns','runs', 'wickets','overs','balls','bpo','run_rate','minutes',
                                                'total_extras','no balls','wides','byes','leg-byes','penalty_runs','home_team',
                                                'home_team_id','season','season_id','series_name','series_id','team','team_id',
                                                'town_area','town_name','town_id','ground_name','ground_id'])
for i in range(len(match_id_list)):
    for j in range(len(match_id_list[i])):
        m = Match(match_id_list[i][j]) # Every single match
        if m.home_team == m.team_1_abbreviation:
            home_team_id = m.team_1_id
        elif m.home_team == m.team_2_abbreviation:
            home_team_id = m.team_2_id
        for k in range(len(m.innings)): # For every innings of match, create new row
            if int(m.innings[k]['batting_team_id']) == int(m.team_1_id):
                batting_team_name = m.team_1_abbreviation
            elif int(m.innings[k]['batting_team_id']) == int(m.team_2_id):
                batting_team_name = m.team_2_abbreviation
            # New row creation using dictionary with data values
            new_row = {'match_id':m.match_id,'inns':k+1,'runs':m.innings[k]['runs'],'wickets':m.innings[k]['wickets'],'overs':m.innings[k]['overs'],
                       'balls':m.innings[k]['balls'],'bpo':m.innings[k]['bpo'],'run_rate':m.innings[k]['run_rate'],'minutes':m.innings[k]['minutes'],
                       'total_extras':m.innings[k]['extras'],'no balls':m.innings[k]['noballs'],'wides':m.innings[k]['wides'],'byes':m.innings[k]['byes'],
                       'leg-byes':m.innings[k]['legbyes'],'penalty_runs':m.innings[k]['penalties'],'home_team':m.home_team,'home_team_id':home_team_id,
                       'season':m.season,'season_id':i,'series_name':m.series_name,'series_id':m.series_id,'team':batting_team_name,
                       'team_id':m.innings[k]['batting_team_id'],'town_area':m.town_area,'town_name':m.town_name,'town_id':m.town_id,
                       'ground_name':m.ground_name,'ground_id':m.ground_id}
            # Adding to dataframe
            ashes_team_innings_data.loc[len(ashes_team_innings_data)] = new_row

In [7]:
# Display five random Ashes batting innings in dataset
ashes_team_innings_data.sample(n=5)

Unnamed: 0,match_id,inns,runs,wickets,overs,balls,bpo,run_rate,minutes,total_extras,...,season_id,series_name,series_id,team,team_id,town_area,town_name,town_id,ground_name,ground_id
1062,64010,2,552,9,139.2,836,6,3.96,594.0,43,...,61,England tour of Australia,15278,AUS,2,South Australia,Adelaide,2,Adelaide Oval,131
862,63326,1,438,10,156.5,941,6,2.79,,24,...,51,England tour of Australia,16969,AUS,2,South Australia,Adelaide,2,Adelaide Oval,131
1265,1336043,4,282,8,92.3,555,6,3.04,,18,...,72,Australia tour of England,21044,AUS,2,,Birmingham,100,"Edgbaston, Birmingham",164
495,62668,1,460,10,153.0,1224,8,3.0,,11,...,32,England tour of Australia,17515,ENG,1,South Australia,Adelaide,2,Adelaide Oval,131
646,62924,1,393,10,103.1,825,8,3.81,,6,...,40,England [Marylebone Cricket Club] tour of Aust...,17355,AUS,2,South Australia,Adelaide,2,Adelaide Oval,131


In [8]:
# Write dataframe to CSV file
ashes_team_innings_data.to_csv("ashes_team_innings.csv", index=False)

_______________________________________________________________________

### Ashes Player Batting/Bowling Data

For this section, we are going to look into 12 players who have taken part in the Ashes and have done incredibly well. 6 batsmen and 6 bowlers. 6 from Australia and 6 from England.

|AUSTRALIA|ENGLAND|
|----|----|
|**Batting**||
|Steve Waugh|Alastair Cook|
|Allan Border|David Gower|
|Steven Smith|Graham Gooch|
|**Bowling**||
|Shane Warne|Stuart Broad|
|Dennis Lillee|Ian Botham|
|Glenn McGrath|James Anderson|

Each player's performances in Ashes test matches are gathered from their stats profiles.

In [9]:
series_list = list(dict_series_match_ids.keys())
match_id_list = list(dict_series_match_ids.values())

batting_inns_df = pd.DataFrame(columns=['name','dismissal','runs','bf','mins','4s','6s','sr','team','inns','pos','start date','ground','grnd_ctry'])
bowling_inns_df = pd.DataFrame(columns=['name','overs','maidens','runs','wickets','economy','0s','4s','6s','wides','no-balls','bpo','balls','team',
                                        'inns','pos','start date','ground','grnd_ctry'])
def get_player_stats(player,df,type):
    qstr = quote(player+'profile')
    google_player_url = f"https://www.google.com/search?q={qstr}"
    while True:
        google_player_request = requests.get(google_player_url)
        google_player_content = BeautifulSoup(google_player_request.content,"html.parser")
        player_profile_url = [a['href'] for a in google_player_content.find_all('a') if 'https://www.espncricinfo.com/cricketers/' in a['href']][0]
        start_idx = player_profile_url.find("/url?q=")
        end_idx = player_profile_url.find("&sa=U&ved=")
        if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
            break
    player_profile_url = player_profile_url[start_idx + len("/url?q="):end_idx]
    player_profile_request = requests.get(player_profile_url)
    player_profile_content = BeautifulSoup(player_profile_request.content,"html.parser")
    player_info = player_profile_content.find_all('span',class_="ds-text-comfortable-s")
    player_team = player_info[0].text
    player_careerspan = [int(player_info[-1].text.split(' ')[-3]),int(player_info[-1].text.split(' ')[-1])]
    for i in range(len(series_list)):
        if int(series_list[i].split('/')[0]) >= player_careerspan[0]:
            series_start_idx = i-1
            break
    for i in range(series_start_idx,len(series_list)):      
        if int(series_list[i].split('/')[0]) >= player_careerspan[1]:
            series_end_idx = i+1
            break
    for i in range(series_start_idx,series_end_idx):
        for j in match_id_list[i]:
            m = Match(j)
            match_url_request = requests.get(m.match_url)
            match_content = BeautifulSoup(match_url_request.content, "html.parser")
            match_inns_content = match_content.find_all("div",class_="ds-rounded-lg ds-mt-2")
            for k in range(len(match_inns_content)): # k = inns number (1,2,3,4)
                match_batting_team_inns = match_inns_content[k].find("span").text.replace('\xa0',' ').strip().split(' ')[0]
                if type == 'Batter' and match_batting_team_inns == player_team:
                    match_batting_inns_table = match_inns_content[k].find("table")
                    batting_inns_table_rows = match_batting_inns_table.find('tbody').find_all('tr')
                    batting_inns_table_rows = [c for c in batting_inns_table_rows if len(c.get('class')) == 0]
                    for l in range(len(batting_inns_table_rows)):
                        if batting_inns_table_rows[l].find('td').text.replace('\xa0',' ').replace('(c)','').strip() == player:
                            inns_row = [n.text for n in batting_inns_table_rows[l].find_all('td')]
                            inns_row[0] = inns_row[0].replace('\xa0',' ').replace('(c)','').strip()
                            inns_row[1] = inns_row[1].replace('\xa0',' ').strip()
                            if inns_row[1].split(' ')[0] == 'c':
                                inns_row[1] = 'caught'
                            elif inns_row[1].split(' ')[0] == 'b':
                                inns_row[1] = 'bowled'
                            elif inns_row[1].split(' ')[0] == 'lbw':
                                inns_row[1] = 'lbw'
                            elif inns_row[1].split(' ')[0] == 'st':
                                inns_row[1] = 'stumped'
                            elif inns_row[1].split(' ')[0] == 'not' and inns_row[1].split(' ')[1] == 'out':
                                inns_row[1] = 'not out'
                            elif inns_row[1].split(' ')[0] == 'run' and inns_row[1].split(' ')[1] == 'out':
                                inns_row[1] = 'run out'
                            elif inns_row[1].split(' ')[0] == 'retired' and inns_row[1].split(' ')[1] == 'hurt':
                                inns_row[1] = 'retired notout'
                            elif inns_row[1].split(' ')[0] == 'handled' and inns_row[1].split(' ')[1] == 'the' and inns_row[1].split(' ')[1] == 'ball':
                                inns_row[1] = 'handled ball'
                            inns_row[2:7] = [int(inns_row[n]) for n in range(2,7)]
                            inns_row[7] = float(inns_row[7])
                            inns_row.append(player_team)
                            inns_row.append(k+1)
                            inns_row.append(l+1)
                            inns_row.append(m.date)
                            if m.town_name == 'London': # Append ground of match
                                if m.ground_name.split(",")[0] == 'Kennington Oval':
                                    inns_row.append('The Oval')
                                elif m.ground_name.split(",")[0] == "Lord's":
                                    inns_row.append("Lord's")
                            else:
                                inns_row.append(m.town_name)
                            if m.continent == 'Europe':
                                inns_row.append("England")
                            elif m.continent == 'Oceania':
                                inns_row.append("Australia")
                            df.loc[len(df)] = inns_row
                            break
                elif type == 'Bowler' and match_batting_team_inns != player_team:
                    match_bowling_inns_table = match_inns_content[k].find_all("table")[1]
                    bowling_inns_table_rows = match_bowling_inns_table.find('tbody').find_all('tr')
                    bowling_inns_table_rows = [c for c in bowling_inns_table_rows if len(c.get('class')) == 0]
                    for l in range(len(bowling_inns_table_rows)):
                        if bowling_inns_table_rows[l].find('td').text.replace('\xa0',' ').strip() == player:
                            inns_row = [n.text for n in bowling_inns_table_rows[l].find_all('td')]
                            inns_row[0] = inns_row[0].replace('\xa0',' ').strip()
                            inns_row[2:5] = [int(inns_row[n]) for n in range(2,5)]
                            inns_row[5] = float(inns_row[5])
                            inns_row[9:11] = [int(inns_row[n]) for n in range(9,11)]
                            inns_row.append([n['bpo'] for n in m.innings if int(n['innings_number']) == k+1][0])
                            if len(inns_row[1].split(".")) == 1: # Find total balls bowled and append
                                inns_row.append(int(inns_row[1].split(".")[0])*inns_row[-1])
                            elif len(inns_row[1].split(".")) == 2:
                                inns_row.append(int(inns_row[1].split(".")[0])*inns_row[-1] + int(inns_row[1].split(".")[1]))
                            inns_row.append(player_team)
                            inns_row.append(k+1)
                            inns_row.append(l+1)
                            inns_row.append(m.date)
                            if m.town_name == 'London': # Append ground of match
                                if m.ground_name.split(",")[0] == 'Kennington Oval':
                                    inns_row.append('The Oval')
                                elif m.ground_name.split(",")[0] == "Lord's":
                                    inns_row.append("Lord's")
                            else:
                                inns_row.append(m.town_name)
                            if m.continent == 'Europe':
                                inns_row.append("England")
                            elif m.continent == 'Oceania':
                                inns_row.append("Australia")
                            df.loc[len(df)] = inns_row
                            break

get_player_stats('Steve Waugh',batting_inns_df,'Batter')
get_player_stats('Steven Smith',batting_inns_df,'Batter')
get_player_stats('Allan Border',batting_inns_df,'Batter')
get_player_stats('Alastair Cook',batting_inns_df,'Batter')
get_player_stats('David Gower',batting_inns_df,'Batter')
get_player_stats('Graham Gooch',batting_inns_df,'Batter')
                    
get_player_stats('Shane Warne',bowling_inns_df,'Bowler')
get_player_stats('Dennis Lillee',bowling_inns_df,'Bowler')
get_player_stats('Glenn McGrath',bowling_inns_df,'Bowler')
get_player_stats('James Anderson',bowling_inns_df,'Bowler')
get_player_stats('Ian Botham',bowling_inns_df,'Bowler')
get_player_stats('Stuart Broad',bowling_inns_df,'Bowler')

In [11]:
# Display five random Ashes player innings in batting df
batting_inns_df.sample(n=5)

Unnamed: 0,name,dismissal,runs,bf,mins,4s,6s,sr,team,inns,pos,start date,ground,grnd_ctry
32,Steve Waugh,not out,26,73,92,0,0,35.61,Australia,3,6,1994-12-24,Melbourne,Australia
396,Graham Gooch,handled the ball,133,247,309,21,2,53.84,England,4,1,1993-06-03,Manchester,England
71,Steve Waugh,bowled,6,11,16,1,0,54.54,Australia,4,6,2003-01-02,Sydney,Australia
405,Graham Gooch,caught,56,66,89,10,0,84.84,England,1,1,1993-08-19,The Oval,England
7,Steve Waugh,caught,73,172,185,5,0,42.44,Australia,3,6,1987-01-10,Sydney,Australia


In [12]:
# Display five random Ashes player innings in bowling df
bowling_inns_df.sample(n=5)

Unnamed: 0,name,overs,maidens,runs,wickets,economy,0s,4s,6s,wides,no-balls,bpo,balls,team,inns,pos,start date,ground,grnd_ctry
38,Shane Warne,5.3,0,16,2,2.9,26,2,0,1,0,6,33,Australia,1,4,2001-07-19,Lord's,England
120,Glenn McGrath,19.0,4,61,0,3.21,-,-,-,0,0,6,114,Australia,4,1,1994-11-25,Brisbane,Australia
372,Stuart Broad,18.0,2,51,3,2.83,79,4,0,0,0,6,108,England,3,1,2022-01-14,Hobart,Australia
285,Ian Botham,30.0,8,75,4,2.5,-,-,-,0,0,6,180,England,1,3,1983-01-02,Sydney,Australia
52,Shane Warne,7.0,2,19,2,2.71,31,1,1,0,0,6,42,Australia,2,4,2005-07-21,Lord's,England


In [13]:
# Write dataframes to CSV files
batting_inns_df.to_csv("ashes_player_batting.csv", index=False)
bowling_inns_df.to_csv("ashes_player_bowling.csv", index=False)

_______________________________________________________________________