In [1]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.microsoft import EdgeChromiumDriverManager
import pandas as pd
import time
import random

In [3]:
def _expand_show_more(driver, max_clicks=30):
    """Click the first visible 'Show More' link repeatedly.

    Stops when no visible link is left, when ``max_clicks`` clicks have been
    made, or when any Selenium call raises. Returns the number of clicks made.
    """
    clicks = 0
    while clicks < max_clicks:
        try:
            buttons = driver.find_elements(By.XPATH, "//a[contains(text(), 'Show More')]")
            visible_button = next((b for b in buttons if b.is_displayed()), None)
            if visible_button is None:
                # No links, or only hidden ones: nothing can be clicked, so
                # leave the loop rather than polling the page forever.
                break
            driver.execute_script("arguments[0].scrollIntoView();", visible_button)
            time.sleep(1)
            visible_button.click()
            clicks += 1
            print(f"Clicked 'Show More' button {clicks} times")
            time.sleep(2)  # give the newly requested commentary time to render
        except Exception as e:
            print(f"Error clicking 'Show More': {e}")
            break
    return clicks


def _commentary_text(row):
    """Return the stripped commentary text found inside ``row``, or ''.

    The first descendant matching ``.cb-col-60`` is preferred; ``.cb-com-ln``
    is only consulted when no ``.cb-col-60`` element exists.
    """
    for selector in (".cb-col-60", ".cb-com-ln"):
        # find_elements returns an empty list instead of raising, so no
        # exception has to be swallowed to express "not present".
        matches = row.find_elements(By.CSS_SELECTOR, selector)
        if matches:
            return matches[0].text.strip()
    return ""


def _collect_innings_rows(driver, innings_name, match_details):
    """Build one record per commentary row currently present on the page.

    Short rows containing the word "OVER" are treated as over headers: they
    update the running over number (when the text before "OVER" is numeric)
    and are not recorded. Rows whose commentary text is 15 characters or
    shorter are ignored.
    """
    records = []
    ball_counter = 0
    current_over = "1"
    # NOTE(review): ".cb-col-100" may also match wrapper containers, which
    # could yield duplicated commentary - confirm against the live markup.
    rows = driver.find_elements(By.CSS_SELECTOR, ".cb-col-100")
    for row in rows:
        try:
            text = row.text.strip()
            if not text:
                continue
            upper_text = text.upper()
            if "OVER" in upper_text and len(text) < 100:
                over_text = upper_text.split("OVER")[0].strip()
                if over_text.isdigit():
                    current_over = over_text
                continue
            commentary_text = _commentary_text(row)
            if len(commentary_text) > 15:
                ball_counter += 1
                records.append({
                    **match_details,
                    "innings": innings_name,
                    "ball_no": ball_counter,
                    "over_no": current_over,
                    "ball_commentary": commentary_text
                })
                if ball_counter % 10 == 0:
                    print(f"Processed {ball_counter} balls")
        except Exception as e:
            # Best effort: one unreadable row must not abort the innings.
            print(f"Error processing row: {e}")
    return records


def scrape_cricket_commentary():
    """Scrape ball-by-ball commentary for one hard-coded Cricbuzz match.

    Opens the full-commentary page in a headless Edge browser, expands the
    'Show More' links, and collects commentary rows for each innings. If that
    yields nothing, falls back to collecting every element whose text
    contains " to ".

    Returns:
        pandas.DataFrame: one row per commentary entry, carrying the fixed
        match details plus ``innings``, ``ball_no``, ``over_no`` and
        ``ball_commentary``. An empty DataFrame is returned when an error
        occurs after the browser has started.
    """
    match_url = "https://www.cricbuzz.com/cricket-full-commentary/35612/mi-vs-rcb-1st-match-indian-premier-league-2021"
    match_id = "35612"
    print(f"Processing match: {match_id}")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
    service = Service(EdgeChromiumDriverManager().install())
    # Service, Options and the downloaded driver binary are all for Edge, so
    # the matching WebDriver class is Edge rather than Chrome.
    driver = webdriver.Edge(service=service, options=options)
    all_commentary_data = []
    try:
        driver.get(match_url)
        print("Opened the match page")
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, "cb-com-ln")))
        match_details = {
            "year": 2021,
            "series_name": "Indian Premier League",
            "match_type": "T20 IPL",
            "match_no": "1st Match",
            "match_date": "April 9, 2021",
            "match_winning_team": "Royal Challengers Bangalore",
            "match_toss": "Royal Challengers Bangalore, who chose to field"
        }
        innings_names = ["Mumbai Indians", "Royal Challengers Bangalore"]
        for innings_index, innings_name in enumerate(innings_names):
            print(f"\nGetting data for {innings_name} innings")
            if innings_index == 1:
                # The second innings sits behind the second navigation pill.
                innings_tabs = driver.find_elements(By.CSS_SELECTOR, ".cb-nav-pill-1")
                if len(innings_tabs) >= 2:
                    innings_tabs[1].click()
                    time.sleep(2)
                else:
                    print(f"Warning: innings tab for {innings_name} not found; "
                          "rows may repeat the previous innings")
            _expand_show_more(driver, max_clicks=30)
            all_commentary_data.extend(
                _collect_innings_rows(driver, innings_name, match_details)
            )
        df = pd.DataFrame(all_commentary_data)
        if df.empty:
            print("Trying one more approach to get any data")
            all_text_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'to')]")
            ball_counter = 0
            for element in all_text_elements:
                text = element.text.strip()
                if " to " in text and len(text) > 15:
                    ball_counter += 1
                    all_commentary_data.append({
                        **match_details,
                        "innings": "Unknown",
                        "ball_no": ball_counter,
                        "over_no": "Unknown",
                        "ball_commentary": text
                    })
            if ball_counter > 0:
                print(f"Found {ball_counter} ball commentaries")
                df = pd.DataFrame(all_commentary_data)
        return df
    except Exception as e:
        print(f"Error: {e}")
        return pd.DataFrame()
    finally:
        # Always release the browser process, on success and on failure.
        driver.quit()
# Script entry point: run the scraper and export the result as CSV.
if __name__ == "__main__":
    print("Starting cricket commentary scraper...")
    df = scrape_cricket_commentary()
    output_filename = "cricket_commentary.csv"
    if df.empty:
        # An empty frame means the scrape failed or found nothing; skip the
        # write so an earlier successful export is not overwritten.
        print(f"No commentary scraped; {output_filename} was not written")
    else:
        df.to_csv(output_filename, index=False)
        print(f"Saved {len(df)} ball commentaries to {output_filename}")

Starting cricket commentary scraper...
Processing match: 35612
Opened the match page

Getting data for Mumbai Indians innings
Processed 10 balls
Processed 20 balls

Getting data for Royal Challengers Bangalore innings
Processed 10 balls
Processed 20 balls
Processed 30 balls
Processed 40 balls
Processed 50 balls
Processed 60 balls
Processed 70 balls
Processed 80 balls
Processed 90 balls
Processed 100 balls
Processed 110 balls
Processed 120 balls
Processed 130 balls
Processed 140 balls
Processed 150 balls
Processed 160 balls
Saved 189 ball commentaries to cricket_commentary.csv
