In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_html_content(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        The ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    print(f'Fetching URL: {url}')
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    print(f'Successfully fetched and parsed URL: {url}')
    return soup

# CSS class strings used to locate each piece of a commentary entry. They are
# passed to ``find``/``find_all`` as complete class-attribute values, so they
# must be kept in sync with the page markup.
_COMMENTARY_BLOCK_CLASS = 'ds-text-tight-m ds-font-regular ds-flex ds-px-3 ds-py-2 lg:ds-px-4 lg:ds-py-[10px] ds-items-start ds-select-none lg:ds-select-auto'
_OVER_CLASS = 'ds-text-tight-s ds-font-regular ds-mb-1 lg:ds-mb-0 lg:ds-mr-3 ds-block ds-text-center ds-text-typo-mid1'
_RUNS_WRAPPER_CLASS = 'lg:ds-flex lg:ds-items-center lg:ds-px-2'
# The runs element appears under one of several class strings; they are tried
# in this order and the first match wins.
_RUNS_CLASS_CANDIDATES = (
    'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-raw-green-d2 ds-text-raw-white',
    'ds-text-tight-m ds-font-bold ds-flex ds-items-center ds-justify-center ds-text-center ds-w-10 ds-h-10 ds-text-raw-white',
    'ds-flex ds-items-center ds-justify-center ds-rounded ds-overflow-hidden ds-bg-ui-fill-default-translucent ds-text-typo',
)
_MAIN_MESSAGE_CLASS = 'ds-leading-none ds-mb-0.5'
_COMMENTARY_WRAPPER_CLASS = 'first-letter:ds-capitalize'
_COMMENTARY_TEXT_CLASS = 'ci-html-content'


def _stripped_text(element):
    """Return the element's whitespace-stripped text, or None if it is missing."""
    return element.text.strip() if element is not None else None


def _child_text(parent, name, **filters):
    """Return the stripped text of the first matching child of ``parent``.

    Yields None when either ``parent`` or the child is absent, so a missing
    wrapper element never raises ``AttributeError``.
    """
    if parent is None:
        return None
    return _stripped_text(parent.find(name, **filters))


def extract_commentary_data(soup):
    """Extract one record per commentary block found in a parsed page.

    Args:
        soup: Parsed document exposing ``find_all``; the blocks it returns
            must expose ``find`` and the found elements a ``text`` attribute
            (as BeautifulSoup tags do).

    Returns:
        A list of dicts with the keys ``'Over'``, ``'Runs'``,
        ``'Main Message'`` and ``'Complete Commentary'``. Any value whose
        element cannot be located in the block is ``None``.
    """
    print('Extracting commentary data')
    commentary_blocks = soup.find_all('div', class_=_COMMENTARY_BLOCK_CLASS)
    print(f'Found {len(commentary_blocks)} commentary blocks')

    data = []
    for block in commentary_blocks:
        over = _stripped_text(block.find('span', class_=_OVER_CLASS))

        # The runs value lives in a <span> inside whichever candidate element
        # is present within the runs wrapper.
        runs = None
        runs_block = block.find('div', class_=_RUNS_WRAPPER_CLASS)
        if runs_block is not None:
            for candidate_class in _RUNS_CLASS_CANDIDATES:
                runs_elem = runs_block.find('div', class_=candidate_class)
                if runs_elem is not None:
                    runs = _child_text(runs_elem, 'span')
                    break

        main_message = _child_text(
            block.find('div', class_=_MAIN_MESSAGE_CLASS), 'span'
        )

        # Previously a missing wrapper div made the chained .find() raise
        # AttributeError; now the field simply becomes None.
        complete_commentary = _child_text(
            block.find('div', class_=_COMMENTARY_WRAPPER_CLASS),
            'p',
            class_=_COMMENTARY_TEXT_CLASS,
        )

        data.append({
            'Over': over,
            'Runs': runs,
            'Main Message': main_message,
            'Complete Commentary': complete_commentary,
        })

    print('Completed extracting commentary data')
    return data

def get_processed_data(data, output_path='commentary_results_without_scroll.csv'):
    """Build a DataFrame from extracted commentary rows and save it as CSV.

    Args:
        data: Iterable of row dicts keyed by ``'Over'``, ``'Runs'``,
            ``'Main Message'`` and ``'Complete Commentary'``.
        output_path: Destination of the CSV file. Defaults to the filename
            this function has always written to.

    Returns:
        The ``pandas.DataFrame`` with the four columns in the order above.
    """
    print('Processing data into DataFrame')
    columns = ['Over', 'Runs', 'Main Message', 'Complete Commentary']
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_path, index=False)
    # Report the path actually written; the old message named a different file.
    print(f'Data saved to {output_path}')
    return df

def get_urls_to_scrape():
    """Return the list of commentary page URLs to scrape (a single page for now)."""
    match_page = (
        'https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/'
        'west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/'
        'ball-by-ball-commentary'
    )
    return [match_page]

# Main execution
# For each configured URL: fetch and parse the page, extract the commentary
# rows, turn them into a DataFrame (which also writes the CSV) and print it.
# NOTE(review): get_processed_data is called without a per-URL destination, so
# with more than one URL each iteration would write to the same CSV file.
urls = get_urls_to_scrape()
for url in urls:
    soup = extract_html_content(url)
    data = extract_commentary_data(soup)
    df = get_processed_data(data)
    print(df)


Fetching URL: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/ball-by-ball-commentary
Successfully fetched and parsed URL: https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2024-1411166/west-indies-vs-papua-new-guinea-2nd-match-group-c-1415702/ball-by-ball-commentary
Extracting commentary data
Found 20 commentary blocks
Completed extracting commentary data
Processing data into DataFrame
Data saved to commentary_results.csv
    Over Runs                  Main Message  \
0   18.6    1      Morea to Russell,  1 run   
1   18.5    1        Morea to Chase,  1 run   
2   18.4    4     Morea to Chase, FOUR runs   
3   18.3    4     Morea to Chase, FOUR runs   
4   18.2    1      Morea to Russell,  1 run   
5   18.1    2     Morea to Russell,  2 runs   
6   17.6    4      Vala to Chase, FOUR runs   
7   17.5    6      Vala to Chase,  SIX runs   
8   17.4    1       Vala to Russell,  1 run   
9   17.3  