In [None]:
## The section below has two parts: 1. Get all match report URLs; 2. Use the match report URLs to get the corresponding MatchIDs

In [1]:
## This section below gets all match report URLs into a short version
## When run, don't click as Chrome driver is in session, might take a while to load up
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from bs4 import BeautifulSoup
##Tunables for the navigation below
SEASON_OPTION_INDEX = 7     ##position of the wanted year in the season dropdown: 2024 is 7, 2023 is 8 etc.
MAX_LOAD_MORE_CLICKS = 20   ##~15 clicks load a whole season, 20 to be safe
STEP_PAUSE_SECONDS = 1      ##pause before each step so the page can react to the previous click


def click_after_pause(xpath, pause=STEP_PAUSE_SECONDS):
    """Sleep for ``pause`` seconds, then find the first element matching ``xpath`` and click it.

    Uses the module-level ``driver``; ``find_element`` raises if nothing matches
    within the driver's implicit wait.
    """
    time.sleep(pause)
    driver.find_element(By.XPATH, xpath).click()


driver = webdriver.Chrome()
##Every element lookup below retries for up to 10 seconds before giving up
driver.implicitly_wait(10)

##click accept cookies (no pause needed straight after the page load)
driver.get("https://www.superleague.co.uk/match-centre")
click_after_pause("//button[@class='consent-give']", pause=0)

##click Results tab
click_after_pause("//button[@class='btn btn-primary pe-3 px-md-4 results ']")

##click Competition dropdown
click_after_pause("//input[@class='select-dropdown']")

##click Super League in dropdown
click_after_pause("//li[@class='   '][1]//span")

##click Season dropdown, don't need to run for 2024, 2024 is selected by default
click_after_pause("//div[@class='select-wrapper mdb-select seasons md-form']")

##click Year in season dropdown
click_after_pause(f"(//span[@class='filtrable'])[{SEASON_OPTION_INDEX}]")

##click load more button repeatedly to get all the games within a season
time.sleep(STEP_PAUSE_SECONDS)
for _ in range(MAX_LOAD_MORE_CLICKS):
    ##find_elements returns an empty list instead of raising, so the loop can
    ##stop cleanly if the button is removed once every match has been loaded
    load_more_buttons = driver.find_elements(By.XPATH, "//button[@class='load-more-matches btn btn-lg btn-primary btn-block font my-3']")
    if not load_more_buttons:
        break
    ##JavaScript click, as in the original, rather than a native WebElement.click()
    driver.execute_script("arguments[0].click();", load_more_buttons[0])
    time.sleep(STEP_PAUSE_SECONDS)

##Capture the fully loaded page, then close the browser: it is not needed again
##(the MatchID step further down starts its own Chrome session)
html = driver.page_source
driver.quit()

##Find every <a> tag; the match report URLs are stored in their href attributes
##The parser is named explicitly so the result does not depend on which parsers are installed
soup = BeautifulSoup(html, "html.parser")
matches_list = soup.find_all('a')

##Keep only the match report links. Anchors without an href come back as None and are
##skipped wherever they occur, instead of assuming they are always the last 3 entries
links = [
    href
    for href in (anchor.get("href") for anchor in matches_list)
    if href and '/report/' in href
]
links

['/match-centre/report/4455',
 '/match-centre/report/4453',
 '/match-centre/report/4454',
 '/match-centre/report/4452',
 '/match-centre/report/4451',
 '/match-centre/report/4450',
 '/match-centre/report/4449',
 '/match-centre/report/4448',
 '/match-centre/report/4447',
 '/match-centre/report/4446',
 '/match-centre/report/4445',
 '/match-centre/report/4444',
 '/match-centre/report/4443',
 '/match-centre/report/4442',
 '/match-centre/report/4441',
 '/match-centre/report/4439',
 '/match-centre/report/4440',
 '/match-centre/report/4438',
 '/match-centre/report/4437',
 '/match-centre/report/4436',
 '/match-centre/report/4434',
 '/match-centre/report/4433',
 '/match-centre/report/4435',
 '/match-centre/report/4432',
 '/match-centre/report/4430',
 '/match-centre/report/4429',
 '/match-centre/report/4431',
 '/match-centre/report/4428',
 '/match-centre/report/4427',
 '/match-centre/report/4426',
 '/match-centre/report/4425',
 '/match-centre/report/4424',
 '/match-centre/report/4423',
 '/match-c

In [2]:
##Load the report links into a one-column table so the site prefix can be added in the next step
##Repeated links are dropped, keeping the first occurrence (seasons before 2024 have a few)
data = (
    pd.DataFrame(links, columns=['URL'])
    .drop_duplicates(subset=['URL'], keep='first')
)
data

Unnamed: 0,URL
0,/match-centre/report/4455
1,/match-centre/report/4453
2,/match-centre/report/4454
3,/match-centre/report/4452
4,/match-centre/report/4451
...,...
66,/match-centre/report/4388
67,/match-centre/report/4385
68,/match-centre/report/4386
69,/match-centre/report/4387


In [3]:
##Prefix every relative report path with the site root so each row holds a full URL
site_root = 'https://www.superleague.co.uk'
relative_paths = data.astype(str)
data1 = site_root + relative_paths
data1

Unnamed: 0,URL
0,https://www.superleague.co.uk/match-centre/rep...
1,https://www.superleague.co.uk/match-centre/rep...
2,https://www.superleague.co.uk/match-centre/rep...
3,https://www.superleague.co.uk/match-centre/rep...
4,https://www.superleague.co.uk/match-centre/rep...
...,...
66,https://www.superleague.co.uk/match-centre/rep...
67,https://www.superleague.co.uk/match-centre/rep...
68,https://www.superleague.co.uk/match-centre/rep...
69,https://www.superleague.co.uk/match-centre/rep...


In [4]:
##Pull the full URLs out of the table into a plain Python list
URLdata = data1["URL"].tolist()
URLdata

['https://www.superleague.co.uk/match-centre/report/4455',
 'https://www.superleague.co.uk/match-centre/report/4453',
 'https://www.superleague.co.uk/match-centre/report/4454',
 'https://www.superleague.co.uk/match-centre/report/4452',
 'https://www.superleague.co.uk/match-centre/report/4451',
 'https://www.superleague.co.uk/match-centre/report/4450',
 'https://www.superleague.co.uk/match-centre/report/4449',
 'https://www.superleague.co.uk/match-centre/report/4448',
 'https://www.superleague.co.uk/match-centre/report/4447',
 'https://www.superleague.co.uk/match-centre/report/4446',
 'https://www.superleague.co.uk/match-centre/report/4445',
 'https://www.superleague.co.uk/match-centre/report/4444',
 'https://www.superleague.co.uk/match-centre/report/4443',
 'https://www.superleague.co.uk/match-centre/report/4442',
 'https://www.superleague.co.uk/match-centre/report/4441',
 'https://www.superleague.co.uk/match-centre/report/4439',
 'https://www.superleague.co.uk/match-centre/report/4440

In [5]:
## This section below uses the match report URLs to get corresponding MatchIDs
## Chrome runs headless here, so no browser window appears; takes about 4 min to finish

##Setup Chrome driver options
chrome_options = Options()
chrome_options.add_argument("--headless")  ##Run in headless mode (no GUI)

##The options must be handed to the driver, otherwise the headless flag is never applied
driver = webdriver.Chrome(options=chrome_options)

##Create an empty dataset to put data into when the for loop is run
matchiddata = []

##A loop to run all the full URLs into the driver
try:
    for d in URLdata:
        driver.get(d)
        ##Parse the rendered page with an explicitly named parser
        soup = BeautifulSoup(driver.page_source, "html.parser")
        ##Only div tags carrying a 'data-livedata' attribute are of interest; it contains the MatchID
        match_ids = [m.get("data-livedata") for m in soup.select('div[data-livedata]')]
        ##Store the report URL next to the stringified list of IDs, e.g. "['231192639']"
        ##(the brackets and quotes are stripped in the next step)
        matchiddata.append({'URL': d, 'MatchID': f"{match_ids}"})
finally:
    ##Always close the browser, even if a page fails to load part-way through
    driver.quit()

##Convert output into Pandas table; explicit columns keep the frame well-formed when URLdata is empty
MatchIDdata = pd.DataFrame(matchiddata, columns=['URL', 'MatchID'])
##Remove any duplicates, seasons before 2024 have a few
##Pages with no ID all produce '[]', so only the first such row survives this step
MatchIDdata = MatchIDdata.drop_duplicates(subset=['MatchID'], keep='first')
MatchIDdata

Unnamed: 0,URL,MatchID
0,https://www.superleague.co.uk/match-centre/rep...,['231192639']
1,https://www.superleague.co.uk/match-centre/rep...,['231192638']
2,https://www.superleague.co.uk/match-centre/rep...,['231192637']
3,https://www.superleague.co.uk/match-centre/rep...,['231192636']
4,https://www.superleague.co.uk/match-centre/rep...,['231192635']
...,...,...
66,https://www.superleague.co.uk/match-centre/rep...,['231192572']
67,https://www.superleague.co.uk/match-centre/rep...,['231192569']
68,https://www.superleague.co.uk/match-centre/rep...,['231192570']
69,https://www.superleague.co.uk/match-centre/rep...,['231192571']


In [6]:
##Turn the stringified list (e.g. "['231192639']") into the bare ID:
##first strip the square brackets, then the single quotes
cleaned_ids = MatchIDdata['MatchID'].str.replace(r'[\[\]]', '', regex=True)
cleaned_ids = cleaned_ids.str.replace("'", "", regex=False)
MatchIDdata['MatchID'] = cleaned_ids
MatchIDdata

Unnamed: 0,URL,MatchID
0,https://www.superleague.co.uk/match-centre/rep...,231192639
1,https://www.superleague.co.uk/match-centre/rep...,231192638
2,https://www.superleague.co.uk/match-centre/rep...,231192637
3,https://www.superleague.co.uk/match-centre/rep...,231192636
4,https://www.superleague.co.uk/match-centre/rep...,231192635
...,...,...
66,https://www.superleague.co.uk/match-centre/rep...,231192572
67,https://www.superleague.co.uk/match-centre/rep...,231192569
68,https://www.superleague.co.uk/match-centre/rep...,231192570
69,https://www.superleague.co.uk/match-centre/rep...,231192571


In [7]:
##Write the URL / MatchID table to disk as CSV (the row index is written as the first column)
match_report_csv = 'RSL_MatchReport_2024.csv'
MatchIDdata.to_csv(match_report_csv)

In [None]:
## This second section below is the code to get the data within the load_match_stats.php file for all the matches

In [8]:
## Go to curlcommand.com and copy in a cURL from a match-report and replace code below
import requests

##Static cookies and headers
##NOTE(review): these values (PHPSESSID in particular) were captured from one browser session;
##they look session-specific, so they may expire and should not be shared publicly - confirm
cookies = {
    'AWSALB': '1uAALraEjNlDV7vELHh0fl2KhGDexn+5xPb2pk7Mw9ZK6eMnzE4f4r0K0DPSDfvY73NcNCDvEJKGyZXcqbwyZNjS2BGa85t4+V/2WaDjP8iJzo5AzvXSX4r0+cvC',
    'AWSALBCORS': '1uAALraEjNlDV7vELHh0fl2KhGDexn+5xPb2pk7Mw9ZK6eMnzE4f4r0K0DPSDfvY73NcNCDvEJKGyZXcqbwyZNjS2BGa85t4+V/2WaDjP8iJzo5AzvXSX4r0+cvC',
    '_fbp': 'fb.2.1715165415002.1972851130',
    'cconsent': '{"version":1,"categories":{"necessary":{"wanted":true}},"services":[]}',
    '_ga': 'GA1.1.483648116.1715165414',
    '_ga_D2W90RWCWW': 'GS1.1.1715165414.1.0.1715165414.60.0.0',
    '_gcl_au': '1.1.679758680.1715165414',
    'PHPSESSID': 'sckr8kj36ep7lckrgv5ukt5tag',
}

headers = {
    'Accept': '*/*',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Dest': 'empty',
    'Accept-Language': 'en-GB,en;q=0.9',
    'Sec-Fetch-Mode': 'cors',
    'Host': 'www.superleague.co.uk',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
    'Connection': 'keep-alive',
}

##Endpoint that returns the stats for one match, and how long to wait for it before giving up
STATS_ENDPOINT = 'https://www.superleague.co.uk/plugins/match_report/load_match_stats.php'
REQUEST_TIMEOUT_SECONDS = 30

all_data = []

##Iterating over the DataFrame and making requests
for index, row in MatchIDdata.iterrows():
    ##Build per-request headers rather than mutating the shared dict; the Referer is the match report page
    request_headers = {**headers, 'Referer': row['URL']}

    try:
        response = requests.get(
            STATS_ENDPOINT,
            params={'match': row['MatchID']},
            cookies=cookies,
            headers=request_headers,
            timeout=REQUEST_TIMEOUT_SECONDS,  ##without a timeout a stalled connection hangs forever
        )
    except requests.RequestException as error:
        ##One unreachable match should not abort the whole run
        print(f"Request failed for index {index}: {error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for index {index}: {response.status_code}")
        continue

    try:
        ##Named 'payload' so the 'data' DataFrame from the URL step is not overwritten
        payload = response.json()
    except ValueError:
        print(f"Response for index {index} was not valid JSON")
        continue

    ##Flatten the nested JSON into a one-row table for this match
    all_data.append(pd.json_normalize(payload))
    print(f"Response for index {index}: {response.status_code}")

##Concatenate all DataFrames if there is any data
if all_data:
    result_df = pd.concat(all_data, ignore_index=True)
    print(f"Retrieved {len(result_df)} rows with {result_df.shape[1]} columns")
else:
    ##Define an empty table so the following cells do not fail with a NameError
    result_df = pd.DataFrame()
    print("No data retrieved")


Response for index 0: 200
Response for index 1: 200
Response for index 2: 200
Response for index 3: 200
Response for index 4: 200
Response for index 5: 200
Response for index 6: 200
Response for index 7: 200
Response for index 8: 200
Response for index 9: 200
Response for index 10: 200
Response for index 11: 200
Response for index 12: 200
Response for index 13: 200
Response for index 14: 200
Response for index 15: 200
Response for index 16: 200
Response for index 17: 200
Response for index 18: 200
Response for index 19: 200
Response for index 20: 200
Response for index 21: 200
Response for index 22: 200
Response for index 23: 200
Response for index 24: 200
Response for index 25: 200
Response for index 26: 200
Response for index 27: 200
Response for index 28: 200
Response for index 29: 200
Response for index 30: 200
Response for index 31: 200
Response for index 32: 200
Response for index 33: 200
Response for index 34: 200
Response for index 35: 200
Response for index 36: 200
Response fo

In [9]:
##Check output: display the combined match stats table built in the previous cell
result_df

Unnamed: 0,Team,@attributes.id,@attributes.attendance,@attributes.away_ht_score,@attributes.away_score,@attributes.away_team,@attributes.away_team_id,@attributes.away_teamNickname,@attributes.comp_id,@attributes.comp_name,...,@attributes.home_team_id,@attributes.home_teamNickname,@attributes.live_scores,@attributes.period_minute,@attributes.period_second,@attributes.season_id,@attributes.status,@attributes.venue_id,@attributes.venue_name,Officials.Official
0,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192639,,46,64,Hull KR,3600,The Robins,1,Super League,...,300,Broncos,11,80,0,2024,Result,361,The Cherry Red Records Stadium,"[{'@attributes': {'id': '22', 'official_name':..."
1,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192638,,18,26,Wigan,550,Warriors,1,Super League,...,400,Red Devils,11,80,0,2024,Result,130,Salford Stadium,"[{'@attributes': {'id': '204', 'official_name'..."
2,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192637,,2,16,Warrington,500,Wolves,1,Super League,...,800,Dragons,11,80,0,2024,Result,29,Stade Gilbert Brutus,"[{'@attributes': {'id': '143', 'official_name'..."
3,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192636,,6,10,Leeds,250,Rhinos,1,Super League,...,350,Saints,11,80,0,2024,Result,127,Totally Wicked Stadium,"[{'@attributes': {'id': '157', 'official_name'..."
4,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192635,,14,16,Leigh,700,Leopards,1,Super League,...,150,Giants,11,80,0,2024,Result,32,John Smith's Stadium,"[{'@attributes': {'id': '230', 'official_name'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192572,,16,32,Wigan,550,Warriors,1,Super League,...,50,Tigers,11,80,0,2024,Result,35,The Mend-A-Hose Jungle,"[{'@attributes': {'id': '204', 'official_name'..."
67,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192569,,14,16,Salford,400,Red Devils,1,Super League,...,250,Rhinos,11,80,0,2024,Result,13,AMT Headingley Stadium,"[{'@attributes': {'id': '281', 'official_name'..."
68,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192570,,0,4,London Broncos,300,Broncos,1,Super League,...,350,Saints,11,80,0,2024,Result,127,Totally Wicked Stadium,"[{'@attributes': {'id': '230', 'official_name'..."
69,"[{'@attributes': {'home_or_away': 'home', 'tea...",231192571,,12,16,Huddersfield,150,Giants,1,Super League,...,700,Leopards,11,80,0,2024,Result,93,Leigh Sports Village Stadium,"[{'@attributes': {'id': '158', 'official_name'..."


In [10]:
##Write the combined match stats table to disk as CSV (the row index is written as the first column)
match_data_csv = 'RSL_MatchReportData_2024.csv'
result_df.to_csv(match_data_csv)