Scrape Cricket Series Links from Cricbuzz into Structured Data

In [5]:
import os
import time
import pandas as pd
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException

In [6]:
# Build a headless Chrome session and open the Cricbuzz scorecard archive page.
# `driver` is the module-level WebDriver handle used by the scraping cell.
chrome_options = Options()
for chrome_flag in ("--headless", "--disable-notifications"):
    chrome_options.add_argument(chrome_flag)

# webdriver_manager resolves a chromedriver binary and returns its path.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Implicit wait: element lookups poll for up to 5 seconds before giving up.
driver.implicitly_wait(5)

driver.get('https://www.cricbuzz.com/cricket-scorecard-archives')

In [15]:
# Collect one record per series link on the Cricbuzz archive page:
#   {"year", "series_type", "series_name", "series_href"}
# The result is bound to `years_series_data` (a list of dicts) and shown as a
# DataFrame at the end of the cell.
#
# If the cache CSV exists it is loaded instead of scraping. Otherwise every
# year link in the timeline is clicked in turn and the series listed for that
# year are read from the page. Requires `driver` (created in the setup cell)
# to already be on the archive page.
SERIES_CACHE_PATH = 'supporting_cache_data/all_years_series_hrefs.csv'
# Legacy export location; kept so anything that reads this file keeps working.
SERIES_EXPORT_PATH = 'cricbuzz_series_data.csv'
YEAR_LINK_SELECTOR = ".cb-yr-tmline > a"
MAX_YEAR_RETRIES = 5
PAGE_SETTLE_SECONDS = 2

if os.path.exists(SERIES_CACHE_PATH):
  # NOTE(review): read_csv infers dtypes, so "year" may come back numeric here
  # while a fresh scrape yields strings - confirm what downstream cells expect.
  years_series_data = pd.read_csv(SERIES_CACHE_PATH).to_dict(orient='records')
  print("Fetched from cache")
else:
  years = driver.find_elements(By.CSS_SELECTOR, YEAR_LINK_SELECTOR)
  print(f"Total of {len(years)} years found")
  years_series_data = []
  failed_years = []
  year_i = 0
  year_retry = 0
  while year_i < len(years):
    # Reset per attempt so the except-branch never sees a stale or unbound name.
    year_name = None
    # Rows are buffered per attempt and committed only when the whole year was
    # read, so a retry after a stale element cannot duplicate rows.
    year_rows = []
    try:
      ActionChains(driver).move_to_element(years[year_i]).click().perform()
      year_name = years[year_i].text.strip()
      # Fixed pause to let the series list for the clicked year render.
      time.sleep(PAGE_SETTLE_SECONDS)
      try:
        series_containers = driver.find_elements(By.CSS_SELECTOR, ".cb-schdl > div.cb-col")
        for series_container in series_containers:
          try:
            # The category heading is the nearest preceding <h2> sibling.
            series_type = series_container.find_element(By.XPATH, "./preceding-sibling::h2[1][contains(@class, 'cb-srs-cat')]").text
            if not series_type:
              raise ValueError("Series type not found")
            series = series_container.find_elements(By.CSS_SELECTOR, ".cb-srs-lst-itm > a")

            for serie in series:
              year_rows.append({
                "year": year_name,
                "series_type": series_type,
                "series_name": serie.text.strip(),
                "series_href": serie.get_attribute("href"),
              })
          except (TimeoutException, NoSuchElementException) as e:
            # A container without a matching heading is skipped, not fatal.
            print("Element not found or timeout occurred.", e.msg)
      except (TimeoutException, NoSuchElementException) as e:
        print("Element not found or timeout occurred.", e.msg)
    except StaleElementReferenceException:
      # The DOM was re-rendered under us: re-locate the year links first.
      years = driver.find_elements(By.CSS_SELECTOR, YEAR_LINK_SELECTOR)
      if year_retry < MAX_YEAR_RETRIES:
        year_retry += 1
        continue
      # Retries exhausted: record the failure and move on to the next year
      # instead of abandoning every remaining year.
      year_label = year_name if year_name is not None else f"year #{year_i + 1}"
      print(f"StaleElementReferenceException Error with {year_label} after {MAX_YEAR_RETRIES} retries")
      failed_years.append(year_label)
      year_i += 1
      year_retry = 0
      continue

    years_series_data.extend(year_rows)
    year_i += 1

    progress = (year_i / len(years)) * 100
    # Report the retry count before resetting it for the next year.
    print(f"Year {year_name} processed after {year_retry} retries. Progress: {progress:.2f}%")
    year_retry = 0

  if years_series_data:
    series_df = pd.DataFrame(years_series_data)
    series_df.to_csv(SERIES_EXPORT_PATH, index=False, header=True)
    if failed_years:
      # Do not cache a partial scrape; later runs would silently reuse it.
      print(f"Cache not written; years that could not be scraped: {failed_years}")
    else:
      os.makedirs(os.path.dirname(SERIES_CACHE_PATH), exist_ok=True)
      series_df.to_csv(SERIES_CACHE_PATH, index=False, header=True)
  else:
    print("No data collected. Skipping CSV save.")

pd.DataFrame(years_series_data)

Total of 124 years found
Year 2021 processed after 0 retries. Progress: 0.81%
Year 2022 processed after 0 retries. Progress: 1.61%
Year 2023 processed after 0 retries. Progress: 2.42%
Year 2024 processed after 0 retries. Progress: 3.23%
Year 2025 processed after 0 retries. Progress: 4.03%
Year 2011 processed after 0 retries. Progress: 4.84%
Year 2012 processed after 0 retries. Progress: 5.65%
Year 2013 processed after 0 retries. Progress: 6.45%
Year 2014 processed after 0 retries. Progress: 7.26%
Year 2015 processed after 0 retries. Progress: 8.06%
Year 2016 processed after 0 retries. Progress: 8.87%
Year 2017 processed after 0 retries. Progress: 9.68%
Year 2018 processed after 0 retries. Progress: 10.48%
Year 2019 processed after 0 retries. Progress: 11.29%
Year 2020 processed after 0 retries. Progress: 12.10%
Year 2001 processed after 0 retries. Progress: 12.90%
Year 2002 processed after 0 retries. Progress: 13.71%
Year 2003 processed after 0 retries. Progress: 14.52%
Year 2004 proce

Unnamed: 0,year,series_type,series_name,series_href
0,2021,International,"Bangladesh tour of New Zealand, 2022",https://www.cricbuzz.com/cricket-series/3876/b...
1,2021,International,"India tour of South Africa, 2021-22",https://www.cricbuzz.com/cricket-series/3656/i...
2,2021,International,"Ireland tour of USA, 2021",https://www.cricbuzz.com/cricket-series/3866/i...
3,2021,International,"West Indies tour of Pakistan, 2021-22",https://www.cricbuzz.com/cricket-series/3858/w...
4,2021,International,"The Ashes, 2021-22",https://www.cricbuzz.com/cricket-series/3532/t...
...,...,...,...,...
3267,1887,International,"England in Australia, 1887",https://www.cricbuzz.com/cricket-series/1022/e...
3268,1888,International,"Australia in England, 1888",https://www.cricbuzz.com/cricket-series/1024/a...
3269,1888,International,"England in Australia, 1888",https://www.cricbuzz.com/cricket-series/1023/e...
3270,1889,International,England in South Africa Test Series,https://www.cricbuzz.com/cricket-series/1025/e...
