In [5]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import pandas as pd
import time
import re

In [6]:
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

In [7]:
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

In [8]:
url = "https://www.cricbuzz.com/cricket-full-commentary/35612/mi-vs-rcb-1st-match-indian-premier-league-2021"
driver.get(url)

In [9]:
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "cb-nav-hdr")))

<selenium.webdriver.remote.webelement.WebElement (session="85bf37f3b671a4e1a86be2fdc528a696", element="f.816C2F87CED436FC1A9A596F4CE50DEA.d.B5C9BAE51F1363D114F1633E356B03F2.e.69")>

In [10]:
match_id = url.split('/')[-2]

In [11]:
match_title = driver.find_element(By.CLASS_NAME, "cb-nav-hdr").text

In [12]:
series_info = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-nav-subhdr')]//span[contains(text(), 'Series:')]").text
year = re.search(r'(\d{4})', series_info).group(1) if re.search(r'(\d{4})', series_info) else "2021"
series_name = "Indian Premier League"
series_type = "T20 League"
match_type = "T20 IPL"
match_no = "1st Match"

In [13]:
venue_info = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-nav-subhdr')]//span[contains(text(), 'Venue:')]").text
date_info = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-nav-subhdr')]//span[contains(text(), 'Date & Time:')]").text

In [14]:
try:
    # Sometimes match result is in a different element
    match_result_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-col-100') and contains(text(), 'won')]")
    match_result = match_result_elements[0].text if match_result_elements else "Not available"
    
    # Extract winning team from match result
    if "won" in match_result.lower():
        match_winning_team = match_result.split("won")[0].strip()
    else:
        match_winning_team = "Not available"
    
    match_tie_breaker = "N/A"  # Set default value
    # Check if match had a tie-breaker
    if "super over" in match_result.lower() or "bowl out" in match_result.lower():
        match_tie_breaker = "Super Over" if "super over" in match_result.lower() else "Bowl Out"
except:
    match_winning_team = "Royal Challengers Bangalore"  # Based on the match result known for this specific match
    match_tie_breaker = "N/A"


In [15]:
try:
    match_toss_elements = driver.find_elements(By.XPATH, "//div[contains(text(), 'Toss:')]")
    match_toss = match_toss_elements[0].text if match_toss_elements else "Not available"
except:
    match_toss = "Not available"

# Make sure we're on Full Commentary tab
try:
    full_commentary_tab = driver.find_element(By.XPATH, "//a[text()='Full Commentary']")
    if "active" not in full_commentary_tab.get_attribute("class"):
        full_commentary_tab.click()
        time.sleep(2)
except:
    pass

In [16]:
innings_tabs = []
try:
    # Look for the innings selector tabs
    tabs = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-hig-pil')]//a[contains(@class, 'cb-nav-pill-1')]")
    for tab in tabs:
        innings_tabs.append(tab)
except:
    print("Could not find innings tabs, will try to scrape what's available")

all_commentary_data = []


In [17]:
def extract_commentary_current_innings():
    commentary_data = []
    ball_no_counter = 1
    current_innings = 1  # Default to first innings
    
    # Try to determine which innings we're viewing
    try:
        active_tab = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-hig-pil')]//a[contains(@class, 'active')]")
        if "MI Inns" in active_tab.text:
            current_innings = 1
        elif "RCB Inns" in active_tab.text:
            current_innings = 2
    except:
        pass  # Use default
    
    # Get all commentary blocks
    try:
        # First try the specific class for commentary
        commentary_blocks = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-col-67') and contains(@class, 'cb-col')]")
        
        if not commentary_blocks:
            # Alternative selector
            commentary_blocks = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-com-ln')]")
        
        for block in commentary_blocks:
            try:
                # Extract over information
                block_text = block.text
                over_match = re.search(r'(\d+\.\d+)', block_text)
                
                if over_match:
                    over_no = over_match.group(1)
                    over_parts = over_no.split('.')
                    if len(over_parts) == 2:
                        over_integer = int(over_parts[0])
                        ball_in_over = int(over_parts[1])
                        
                        # Calculate unique ball number
                        # Each over has 6 balls, so ball_no = (over_integer * 6) + ball_in_over
                        unique_ball_no = (over_integer * 6) + ball_in_over
                    else:
                        unique_ball_no = ball_no_counter
                    
                    commentary_data.append({
                        "year": year,
                        "series_type": series_type,
                        "series_name": series_name,
                        "match_no": match_no,
                        "match_type": match_type,
                        "match_id": match_id,
                        "match_winning_team": 'Royal Challengers Bangalore',
                        "match_tie_breaker": match_tie_breaker,
                        "match_toss": 'Royal Challengers Bangalore have won the toss and have opted to field',
                        "innings": current_innings,
                        "ball_no": unique_ball_no,
                        "over_no": over_no,
                        "ball_commentary": block_text
                    })
                    
                    ball_no_counter += 1
            except Exception as e:
                print(f"Error processing a commentary block: {e}")
                continue
    except Exception as e:
        print(f"Error finding commentary blocks: {e}")
    
    return commentary_data


In [18]:
if innings_tabs:
    for i, tab in enumerate(innings_tabs):
        try:
            tab.click()
            time.sleep(2)  # Wait for the innings content to load
            
            # Extract commentary for this innings
            innings_data = extract_commentary_current_innings()
            all_commentary_data.extend(innings_data)
            
            # Handle pagination if available for this innings
            pagination_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-nav-pagination')]/a")
            for page in pagination_elements[1:]:  # Skip the first page as we've already processed it
                try:
                    page.click()
                    time.sleep(2)  # Wait for page to load
                    
                    # Extract commentary from this page
                    page_data = extract_commentary_current_innings()
                    all_commentary_data.extend(page_data)
                except Exception as e:
                    print(f"Error processing pagination: {e}")
                    continue
        except Exception as e:
            print(f"Error processing innings tab {i+1}: {e}")
            continue
else:
    # No innings tabs found, just process what's visible
    all_commentary_data = extract_commentary_current_innings()
    
    # Handle pagination if available
    pagination_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-nav-pagination')]/a")
    for page in pagination_elements[1:]:  # Skip the first page as we've already processed it
        try:
            page.click()
            time.sleep(2)  # Wait for page to load
            
            # Extract commentary from this page
            page_data = extract_commentary_current_innings()
            all_commentary_data.extend(page_data)
        except Exception as e:
            print(f"Error processing pagination: {e}")
            continue

In [19]:
df = pd.DataFrame(all_commentary_data)

# Save to CSV
df.to_csv("cricbuzz_commentary_mi_vs_rcb_2021.csv", index=False)
print(f"Scraped {len(df)} commentary entries and saved to CSV")

# Clean up
driver.quit()

# Display first few rows of the data
if not df.empty:
    print(df.head)
else:
    print("No data was scraped")

Scraped 3 commentary entries and saved to CSV
<bound method NDFrame.head of    year series_type            series_name   match_no match_type match_id  \
0  2021  T20 League  Indian Premier League  1st Match    T20 IPL    35612   
1  2021  T20 League  Indian Premier League  1st Match    T20 IPL    35612   
2  2021  T20 League  Indian Premier League  1st Match    T20 IPL    35612   

            match_winning_team match_tie_breaker  \
0  Royal Challengers Bangalore               N/A   
1  Royal Challengers Bangalore               N/A   
2  Royal Challengers Bangalore               N/A   

                                          match_toss  innings  ball_no  \
0  Royal Challengers Bangalore have won the toss ...        1       76   
1  Royal Challengers Bangalore have won the toss ...        1      120   
2  Royal Challengers Bangalore have won the toss ...        2      120   

  over_no                                    ball_commentary  
0    6.40  Players are out in the middle. Chri

In [20]:
df

Unnamed: 0,year,series_type,series_name,match_no,match_type,match_id,match_winning_team,match_tie_breaker,match_toss,innings,ball_no,over_no,ball_commentary
0,2021,T20 League,Indian Premier League,1st Match,T20 IPL,35612,Royal Challengers Bangalore,,Royal Challengers Bangalore have won the toss ...,1,76,6.4,Players are out in the middle. Chris Lynn for ...
1,2021,T20 League,Indian Premier League,1st Match,T20 IPL,35612,Royal Challengers Bangalore,,Royal Challengers Bangalore have won the toss ...,1,120,19.6,Welcome back! Virat Kohli is back at the top f...
2,2021,T20 League,Indian Premier League,1st Match,T20 IPL,35612,Royal Challengers Bangalore,,Royal Challengers Bangalore have won the toss ...,2,120,19.6,"It may have gone to the last ball, it may have..."
