In [None]:
# Create a new virtual environment
# python3 -m venv venv

In [2]:
pip install requests beautifulsoup4 pandas selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [6]:
import random
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def extract_html_content(url):
    """Load ``url`` in headless Chrome, scroll until the page stops growing, and parse it.

    Each round presses END, then sends 10 random PAGE_UP/PAGE_DOWN keys and
    10 random ARROW_UP/ARROW_DOWN keys to the ``<body>`` element, saves a
    screenshot, and compares ``document.body.scrollHeight`` with the value
    from the previous round. The loop ends after 3 consecutive rounds in
    which the height did not change.

    Side effects: prints progress to stdout and writes ``screenshot_<n>.png``
    (relative path) once per round.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object built from the final ``driver.page_source``
        using the ``html.parser`` backend.

    Raises:
        Any exception raised by Selenium propagates to the caller; the
        browser is shut down first in every case.
    """
    print(f'Fetching URL: {url}')

    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode for faster execution
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    # try/finally guarantees driver.quit() runs even when navigation, scrolling
    # or the screenshot fails, so no Chrome process is left behind.
    try:
        driver.get(url)

        # Scroll until no more content is loaded
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts = 0        # consecutive rounds with unchanged page height
        scroll_successful = 0      # rounds in which the page height changed
        total_scroll_attempts = 0  # PAGE_*/ARROW_* key presses sent so far
        screenshot_count = 0

        while True:
            # Scroll down using large scrolls
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
            time.sleep(2)  # Wait for new content to load

            # Scroll up and down to load content in the viewport
            for _ in range(10):
                total_scroll_attempts += 1
                key = random.choice([Keys.PAGE_UP, Keys.PAGE_DOWN])
                driver.find_element(By.TAG_NAME, 'body').send_keys(key)
                time.sleep(0.5)

            # Fine-tune scrolling to ensure all content is loaded
            for _ in range(10):
                total_scroll_attempts += 1
                key = random.choice([Keys.ARROW_UP, Keys.ARROW_DOWN])
                driver.find_element(By.TAG_NAME, 'body').send_keys(key)
                time.sleep(0.2)

            # Capture a screenshot after scrolling
            screenshot_path = f'screenshot_{screenshot_count}.png'
            driver.save_screenshot(screenshot_path)
            print(f'Screenshot saved at {screenshot_path}')
            screenshot_count += 1

            # Check if we reached the end of the page
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                scroll_attempts += 1
                if scroll_attempts >= 3:
                    break
            else:
                scroll_attempts = 0
                scroll_successful += 1

            last_height = new_height

            # Print progress; the trailing '\r' returns the cursor to the start
            # of the line, so the next print overwrites this status text.
            print(f'Scroll Attempts: {total_scroll_attempts}, Scrolls Successful: {scroll_successful}', end='\r')

        # Grab the rendered HTML while the browser session is still alive
        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')
    print(f'\nSuccessfully fetched and parsed URL: {url}')
    return soup

def _text_or_none(elem):
    """Return ``elem.text`` stripped of surrounding whitespace, or None when ``elem`` is None."""
    return elem.text.strip() if elem is not None else None


def _find_child(parent, tag, class_name=None):
    """Run ``parent.find`` for ``tag`` (optionally filtered by class); None when ``parent`` is None."""
    if parent is None:
        return None
    if class_name is None:
        return parent.find(tag)
    return parent.find(tag, class_=class_name)


def extract_commentary_data(soup):
    """Collect one record per commentary block found in ``soup``.

    Every field is looked up independently and is set to ``None`` when the
    element (or any element on the way to it) is absent, so a block that
    lacks one part still produces a row with the parts that were found.

    Args:
        soup: Parsed page exposing ``find_all``; each returned block must
            expose ``find`` and the matched elements a ``text`` attribute
            (the interface of BeautifulSoup tags).

    Returns:
        A list of dicts with the keys ``'Over'``, ``'Runs'``,
        ``'Main Message'`` and ``'Complete Commentary'``, in the order the
        blocks appear in ``soup``.
    """
    print('Extracting commentary data')
    data = []
    # Find all commentary blocks
    commentary_blocks = soup.find_all('div', class_='ds-text-tight-m ds-font-regular ds-flex ds-px-3 ds-py-2 lg:ds-px-4 lg:ds-py-[10px] ds-items-start ds-select-none lg:ds-select-auto')
    print(f'Found {len(commentary_blocks)} commentary blocks')

    # Class strings of the badge <div> variants that can hold the runs value,
    # tried in this order; the first one present in the runs block wins.
    run_badge_classes = (
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-green-d2 ds-text-raw-white',
        'ds-text-tight-m ds-font-bold ds-flex ds-items-center ds-justify-center ds-text-center ds-w-10 ds-h-10 ds-text-raw-white',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-ui-fill-default-translucent ds-text-typo',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-red ds-text-raw-white',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-purple ds-text-raw-white',
    )

    for index, block in enumerate(commentary_blocks, start=1):
        try:
            # Extract the over
            over = _text_or_none(_find_child(block, 'span', 'ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1'))

            # Extract the runs: locate the runs block, then the first matching badge
            runs_block = _find_child(block, 'div', 'lg:ds-flex lg:ds-items-center lg:ds-px-2')
            runs_badge = None
            for badge_class in run_badge_classes:
                runs_badge = _find_child(runs_block, 'div', badge_class)
                if runs_badge is not None:
                    break
            runs = _text_or_none(_find_child(runs_badge, 'span'))

            # Extract the main message
            main_message_elem = _find_child(block, 'div', 'ds-leading-none ds-mb-0.5')
            main_message = _text_or_none(_find_child(main_message_elem, 'span'))

            # Extract the complete commentary; the container div may be absent,
            # in which case the field is None instead of raising AttributeError
            commentary_container = _find_child(block, 'div', 'first-letter:ds-capitalize')
            complete_commentary = _text_or_none(_find_child(commentary_container, 'p', 'ci-html-content'))

            # Append the extracted data to the list
            data.append({
                'Over': over,
                'Runs': runs,
                'Main Message': main_message,
                'Complete Commentary': complete_commentary
            })

            # Print progress for commentary processing
            print(f'Processing progress for commentary: {index}/{len(commentary_blocks)}', end='\r')
        except Exception as e:
            # Best-effort: an unexpected failure on one block must not abort the rest
            print(f'Skipping block {block} due to error: {e}')

    print('\nCompleted extracting commentary data')
    return data

def get_processed_data(data, output_path='commentary_results.csv'):
    """Build a DataFrame from the commentary records and save it as CSV.

    Args:
        data: Iterable of dicts keyed by ``'Over'``, ``'Runs'``,
            ``'Main Message'`` and ``'Complete Commentary'``; an empty list
            yields an empty frame that still has these four columns.
        output_path: Destination of the CSV file. Defaults to
            ``'commentary_results.csv'``; pass a distinct path per call to
            avoid overwriting the file written by a previous call.

    Returns:
        The ``pandas.DataFrame`` that was written, with the columns in the
        fixed order listed above.
    """
    print('Processing data into DataFrame')
    columns = ['Over', 'Runs', 'Main Message', 'Complete Commentary']
    df = pd.DataFrame(data, columns=columns)
    # index=False keeps the positional row index out of the file
    df.to_csv(output_path, index=False)
    print(f'Data saved to {output_path}')
    return df

def get_urls_to_scrape():
    """Return the commentary page URLs to scrape, as a list with a single entry."""
    commentary_url = 'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/ball-by-ball-commentary'
    return [commentary_url]

# Main execution: scrape each URL, extract the commentary rows, save them and
# show the resulting table.
# NOTE: get_processed_data() is called without a per-URL file name, so it
# writes to commentary_results.csv every time; with more than one URL each
# iteration replaces the file from the previous one. After the loop, `soup`,
# `data` and `df` hold the values from the last URL only.
urls = get_urls_to_scrape()
for url in urls:
    # Render the page in headless Chrome and parse the final HTML
    soup = extract_html_content(url)
    # Turn the parsed page into a list of per-ball records
    data = extract_commentary_data(soup)
    # Build the DataFrame and write it to CSV
    df = get_processed_data(data)
    print(df)


Fetching URL: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/ball-by-ball-commentary
Screenshot saved at screenshot_0.png
Screenshot saved at screenshot_1.pngful: 1
Screenshot saved at screenshot_2.pngful: 2
Screenshot saved at screenshot_3.pngful: 3
Screenshot saved at screenshot_4.pngful: 4
Screenshot saved at screenshot_5.pngsful: 5
Screenshot saved at screenshot_6.pngsful: 5
Screenshot saved at screenshot_7.pngsful: 5

Successfully fetched and parsed URL: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/ball-by-ball-commentary
Extracting commentary data
Found 114 commentary blocks
Processing progress for commentary: 114/114
Completed extracting commentary data
Processing data into DataFrame
Data saved to commentary_results.csv
     Over Runs               Main Message  \
0    18.6    1   Morea to Russell,  1 run   
1  