In [None]:
# Create a new virtual environment
# python3 -m venv venv

In [2]:
pip install requests beautifulsoup4 pandas selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [16]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [29]:
def _send_key_repeatedly(driver, key, presses):
    """Send `key` to the page body `presses` times, pausing 0.5 s on every 8th press.

    Returns the number of key presses sent so the caller can keep a running total.
    """
    for i in range(presses):
        # The body is looked up on every press, exactly as the page may re-render while scrolling.
        driver.find_element(By.TAG_NAME, 'body').send_keys(key)
        if i % 8 == 0:
            time.sleep(0.5)  # Wait for new content to load
    return presses


def _scroll_until_fully_loaded(driver, idx, item_idx):
    """Scroll the current page until its height stops growing, saving one screenshot per pass.

    A pass jumps to the end of the page, steps up 150 times and down 100 times.
    The loop ends after two consecutive passes in which document.body.scrollHeight
    did not change.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_attempts = 0          # consecutive passes without height growth
    scroll_successful = 0        # passes in which the height changed
    total_scroll_attempts = 0    # individual arrow-key presses, for the progress line
    screenshot_count = 0

    while True:
        # Scroll down using large scrolls
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)  # Wait for new content to load

        # Scroll up and then back down to load content in the viewport
        total_scroll_attempts += _send_key_repeatedly(driver, Keys.ARROW_UP, 150)
        total_scroll_attempts += _send_key_repeatedly(driver, Keys.ARROW_DOWN, 100)

        # Capture a screenshot after scrolling
        screenshot_path = f'screenshot_{idx}_{item_idx}_{screenshot_count}.png'
        driver.save_screenshot(screenshot_path)
        print(f'Screenshot saved at {screenshot_path}')
        screenshot_count += 1

        # Check if we reached the end of the page
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            scroll_attempts += 1
            if scroll_attempts >= 2:
                break
        else:
            scroll_attempts = 0
            scroll_successful += 1

        last_height = new_height

        # Print progress
        print(f'Scroll Attempts: {total_scroll_attempts}, Scrolls Successful: {scroll_successful}', end='\r')


def extract_html_content(idx, url):
    """Load `url` in headless Chrome and return one parsed page per dropdown item.

    The function opens the dropdown matched by `dropdown_xpath`, then for every
    item in it: clicks the item, scrolls until the page stops growing, and parses
    the resulting page source with BeautifulSoup.

    Args:
        idx: Index of the URL being processed; only used in screenshot file names.
        url: Page to load.

    Returns:
        list of BeautifulSoup objects, one per dropdown item that was processed
        without error (items that fail are reported and skipped).

    Raises:
        Any Selenium exception raised while loading the page or opening the
        dropdown for the first time. The browser is closed in every case.
    """
    print(f'Fetching URL: {url}')

    # Set up the Selenium WebDriver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run in headless mode for faster execution
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    soups = []
    dropdown_xpath = '//div[contains(@class, "ds-flex ds-items-center ds-border-ui-stroke ds-h-6 ds-px-4 ds-border ds-bg-ui-fill ds-rounded-full ds-w-full ds-min-w-max ds-cursor-pointer")]'
    dropdown_items_xpath = '//ul[contains(@class, "ds-flex ds-flex-col ds-text-typo-mid2 ds-justify-center ds-overflow-ellipsis ds-overflow-y-auto ds-w-full ds-grid ds-grid-cols-1 ds-items-center ds-gap-x-2 ds-max-h-96 ds-overflow-y-auto")]/li/div'

    # try/finally guarantees the browser process is released even when the
    # page load or the first dropdown lookup fails.
    try:
        driver.get(url)

        # Find and click the dropdown to expand it
        dropdown = driver.find_element(By.XPATH, dropdown_xpath)
        driver.execute_script("arguments[0].click();", dropdown)
        time.sleep(2)  # Wait for the dropdown to open

        # Find all items in the dropdown
        dropdown_items = driver.find_elements(By.XPATH, dropdown_items_xpath)
        item_count = len(dropdown_items)

        # Iterate by position so each pass uses the most recently fetched
        # element list instead of handles found before the page re-rendered.
        for item_idx in range(item_count):
            try:
                item = dropdown_items[item_idx]
                print(f'Clicking on item {item_idx}: {item.text.strip()}')  # Print the item text for debugging
                driver.execute_script("arguments[0].click();", item)
                time.sleep(2)  # Wait for the page to load

                # Scroll until no more content is loaded
                _scroll_until_fully_loaded(driver, idx, item_idx)

                # Get the page source and parse the HTML content with BeautifulSoup
                soups.append(BeautifulSoup(driver.page_source, 'html.parser'))

                # Click the dropdown again and refresh the item list for the next pass
                dropdown = driver.find_element(By.XPATH, dropdown_xpath)
                driver.execute_script("arguments[0].click();", dropdown)
                time.sleep(2)  # Wait for the dropdown to open
                dropdown_items = driver.find_elements(By.XPATH, dropdown_items_xpath)
            except Exception as e:
                # Best effort: report the failing item with its cause and continue with the next one.
                print(f"Error executing the click of dropdown for dropdown item index:{item_idx}: {e}")
    finally:
        driver.quit()

    print(f'\nSuccessfully fetched and parsed URL: {url}')
    return soups


def extract_match_details(url):
    """Parse team names, match number, match id and group letter out of a commentary URL.

    The URL must start with the ICC Men's T20 World Cup 2024 series prefix and
    contain a ``<team>-vs-<team>-<N><xx>-match-group-<letter>-<id>`` slug followed
    by ``/ball-by-ball-commentary``.

    Returns:
        (team_1, team_2, match_number, match_id, group_id) as strings, with team
        names title-cased and the group letter upper-cased, or None when the URL
        does not match the pattern.
    """
    url_regex = re.compile(
        r"https://www\.espncricinfo\.com/series/icc-men-s-t20-world-cup-2024-1411166/([a-zA-Z-]+)-vs-([a-zA-Z-]+)-(\d+)[a-z]{2}-match-group-([a-z])-([0-9]+)/ball-by-ball-commentary"
    )
    found = url_regex.match(url)

    # Guard clause: nothing to extract from an unrecognised URL.
    if not found:
        print("No match found.")  # Debug print
        return None

    details = found.groups()
    print(f"Match details found: {details}")  # Debug print

    first_slug, second_slug, match_number, group_letter, match_id = details
    # Team slugs are hyphen-separated lowercase words, e.g. "south-africa".
    team_1, team_2 = (slug.replace('-', ' ').title() for slug in (first_slug, second_slug))

    return team_1, team_2, match_number, match_id, group_letter.upper()

def extract_commentary_data(idx, soup, url):
    """Extract one row per commentary block from a parsed commentary page.

    Args:
        idx: Zero-based position of `soup` among the pages fetched for `url`;
            stored as ``idx + 1`` in the 'Innings' column.
        soup: BeautifulSoup document of the commentary page.
        url: Commentary URL, parsed with extract_match_details for the match columns.

    Returns:
        list of dicts with the keys 'Match Id', 'Match Number', 'Group Number',
        'Team A', 'Team B', 'Innings', 'Over', 'Runs', 'Main Message' and
        'Complete Commentary'. A field whose element is missing from a block is
        stored as None; a block that raises an unexpected error is reported and skipped.

    Raises:
        ValueError: if `url` cannot be parsed by extract_match_details.
    """
    print('Extracting commentary data')
    data = []

    # Extract match details; fail with a clear message instead of an opaque
    # "cannot unpack NoneType" when the URL is not recognised.
    match_details = extract_match_details(url)
    if match_details is None:
        raise ValueError(f'Could not parse match details from URL: {url}')
    team_1, team_2, match_number, match_id, group_id = match_details

    # Class strings of the runs badge variants, tried in this order; the first one found wins.
    runs_badge_classes = (
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-green-d2 ds-text-raw-white',
        'ds-text-tight-m ds-font-bold ds-flex ds-items-center ds-justify-center ds-text-center ds-w-10 ds-h-10 ds-text-raw-white',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-ui-fill-default-translucent ds-text-typo',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-red ds-text-raw-white',
        'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-purple ds-text-raw-white',
    )

    def stripped_text(element):
        """Return the element's stripped text, or None when the element was not found."""
        return element.text.strip() if element is not None else None

    # Find all commentary blocks
    commentary_blocks = soup.find_all('div', class_='ds-text-tight-m ds-font-regular ds-flex ds-px-3 ds-py-2 lg:ds-px-4 lg:ds-py-[10px] ds-items-start ds-select-none lg:ds-select-auto')
    total_blocks = len(commentary_blocks)
    print(f'Found {total_blocks} commentary blocks')

    for index, block in enumerate(commentary_blocks, start=1):
        try:
            # Extract the over
            over = stripped_text(block.find('span', class_='ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1'))

            # Extract the runs
            runs = None
            runs_block = block.find('div', class_='lg:ds-flex lg:ds-items-center lg:ds-px-2')
            if runs_block is not None:
                runs_badge = None
                for badge_class in runs_badge_classes:
                    runs_badge = runs_block.find('div', class_=badge_class)
                    if runs_badge is not None:
                        break
                if runs_badge is not None:
                    runs = stripped_text(runs_badge.find('span'))

            # Extract the main message
            main_message_elem = block.find('div', class_='ds-leading-none ds-mb-0.5')
            main_message = stripped_text(main_message_elem.find('span')) if main_message_elem is not None else None

            # Extract the complete commentary; each lookup is guarded so a missing
            # container yields None rather than dropping the whole ball.
            commentary_container = block.find('div', class_='first-letter:ds-capitalize')
            complete_commentary = (
                stripped_text(commentary_container.find('p', class_='ci-html-content'))
                if commentary_container is not None
                else None
            )

            # Append the extracted data to the list
            data.append({
                'Match Id': match_id,
                'Match Number': match_number,
                'Group Number': group_id,
                'Team A': team_1,
                'Team B': team_2,
                'Innings': idx+1,
                'Over': over,
                'Runs': runs,
                'Main Message': main_message,
                'Complete Commentary': complete_commentary
            })

            # Print progress for commentary processing
            print(f'Processing progress for commentary: {index}/{total_blocks}', end='\r')
        except Exception as e:
            # Report the block's position, not its full HTML, to keep the output readable.
            print(f'Skipping block {index} due to error: {e}')

    print('\nCompleted extracting commentary data')
    return data

def get_processed_data(data, file_appender=''):
    """Turn commentary rows into a DataFrame and write it to a CSV file.

    Args:
        data: Iterable of row dicts keyed by the commentary column names.
        file_appender: Suffix inserted into the output name
            ``commentary_results_<file_appender>.csv`` (written to the current directory).

    Returns:
        The DataFrame that was written, with the fixed column order below.
    """
    print('Processing data into DataFrame')
    file_name = f'commentary_results_{file_appender}.csv'
    frame = pd.DataFrame(
        data,
        columns=[
            'Match Id',
            'Match Number',
            'Group Number',
            'Team A',
            'Team B',
            'Innings',
            'Over',
            'Runs',
            'Main Message',
            'Complete Commentary',
        ],
    )
    # index=False keeps the positional index out of the file.
    frame.to_csv(file_name, index=False)
    print(f'Data saved to {file_name}')
    return frame

def get_urls_to_scrape(csv_file='urls_to_scrape.csv'):
    """Return the values of the 'Urls' column of `csv_file` as a list.

    Args:
        csv_file: Path to a CSV file that has a column named 'Urls'.

    Returns:
        list with one entry per row of the file, in file order.
    """
    # Load the file and pull the single column of interest in one expression.
    return pd.read_csv(csv_file)['Urls'].tolist()

# Main execution: scrape every URL, extract each fetched page, and save the results.
combined_data = []
urls = get_urls_to_scrape('urls_to_scrape.csv')
for url_idx, url in enumerate(urls):
    print(f'Processing URL {url_idx + 1}/{len(urls)}: {url}')
    try:
        soups = extract_html_content(url_idx, url)
        # A separate index name keeps the URL index intact; the page position
        # within a URL is what extract_commentary_data stores as the innings.
        for innings_idx, soup in enumerate(soups):
            data = extract_commentary_data(innings_idx, soup, url)
            combined_data.extend(data)
            # Include the URL index in the file name so later URLs do not
            # overwrite the per-innings files written for earlier ones.
            get_processed_data(data, f'{url_idx}_{innings_idx}')
    except Exception as e:
        # Best effort: report the failing URL with its cause and continue with the next one.
        print(f'Error processing URL {url}: {e}')

# Combined results for all URLs are written to commentary_results_.csv.
df = get_processed_data(combined_data)
df

Processing URL 1/3: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/south-africa-vs-sri-lanka-4th-match-group-d-1415704/ball-by-ball-commentary
Fetching URL: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/south-africa-vs-sri-lanka-4th-match-group-d-1415704/ball-by-ball-commentary
Clicking on item 0: SL
Screenshot saved at screenshot_0_0_0.png
Screenshot saved at screenshot_0_0_1.png: 1
Screenshot saved at screenshot_0_0_2.png: 1
Screenshot saved at screenshot_0_0_3.png: 2
Screenshot saved at screenshot_0_0_4.pngl: 3
Screenshot saved at screenshot_0_0_5.pngl: 4
Screenshot saved at screenshot_0_0_6.pngl: 5
Screenshot saved at screenshot_0_0_7.pngl: 6
Screenshot saved at screenshot_0_0_8.pngl: 6
Clicking on item 1: SA
Screenshot saved at screenshot_0_1_0.png
Screenshot saved at screenshot_0_1_1.png: 1
Screenshot saved at screenshot_0_1_2.png: 2
Screenshot saved at screenshot_0_1_3.png: 3
Screenshot saved at screenshot_0_1_4.pngl: 4
Screen

Unnamed: 0,Match Id,Match Number,Group Number,Team A,Team B,Innings,Over,Runs,Main Message,Complete Commentary
0,1415704,4,D,South Africa,Sri Lanka,1,19.1,1W,"Jansen to Theekshana, 1 run, OUT","full on off, mistimed heave down the ground. T..."
1,1415704,4,D,South Africa,Sri Lanka,1,18.6,1,"Baartman to Theekshana, 1 run","full and straight, worked down to long-on for one"
2,1415704,4,D,South Africa,Sri Lanka,1,18.5,•,"Baartman to Theekshana, no run",Swing and a miss! Short outside off. Quicker a...
3,1415704,4,D,South Africa,Sri Lanka,1,18.4,4,"Baartman to Theekshana, FOUR runs",Put away as a big cheer goes up! Sits up from ...
4,1415704,4,D,South Africa,Sri Lanka,1,18.3,•,"Baartman to Theekshana, no run",swing and a miss as this short one zips through
...,...,...,...,...,...,...,...,...,...,...
709,1415703,3,B,Namibia,Oman,3,0.5,4,"Bilal Khan to Erasmus, FOUR runs",Now Erasmus joins the party! Almost a Rashid K...
710,1415703,3,B,Namibia,Oman,3,0.4,1,"Bilal Khan to Wiese, 1 run","Full and wide outside off, he can hit it only ..."
711,1415703,3,B,Namibia,Oman,3,0.3,2,"Bilal Khan to Wiese, 2 runs",Bilal goes around teh wicket and bowls the yor...
712,1415703,3,B,Namibia,Oman,3,0.2,6,"Bilal Khan to Wiese, SIX runs","Gift, gift, gift! Wiese accepts it gleefully! ..."
