In [19]:
import time
import random
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
def scrape_mi_rcb_commentary():
    match_url = "https://www.cricbuzz.com/cricket-full-commentary/35612/mi-vs-rcb-1st-match-indian-premier-league-2021"
    match_id = "35612"
    print(f"Processing match ID: {match_id}")
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36")
    chrome_options.add_experimental_option("prefs", {
        "profile.default_content_setting_values.notifications": 2,
        "credentials_enable_service": False,
        "profile.password_manager_enabled": False
    })
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    debug_folder = "debug_screenshots"
    os.makedirs(debug_folder, exist_ok=True)
    all_commentary_data = []  # Initialize data storage
    try:
        time.sleep(random.uniform(1, 3))
        driver.get(match_url)
        print("Successfully navigated to the match URL")
        driver.save_screenshot(f"{debug_folder}/01_initial_page.png")
        max_retries = 3
        retry_count = 0
        page_loaded = False
        while retry_count < max_retries and not page_loaded:
            try:
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "cb-com-ln"))
                )
                print("Primary page element loaded")
                page_loaded = True
            except TimeoutException:
                retry_count += 1
                print(f"Retry {retry_count}/{max_retries} - Refreshing page...")
                driver.refresh()
                time.sleep(3)
        if not page_loaded:
            print("Timeout waiting for primary page element. Trying alternative approach...")
            # Wait for any content to indicate page is loaded
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        driver.save_screenshot(f"{debug_folder}/02_page_loaded.png")
        year = 2021
        series_name = "Indian Premier League"
        series_type = "T20 League"
        match_type = "T20 IPL"
        match_no = "1st Match"
        match_date = "April 9, 2021"
        match_winning_team = "Royal Challengers Bangalore"  # We know this from the match
        match_tie_breaker = "None"
        match_toss = "Royal Challengers Bangalore, who chose to field"
        print(f"Page title: {driver.title}")
        def find_and_click_with_retry(selectors, max_attempts=3, delay=1):
            for attempt in range(max_attempts):
                for selector_type, selector in selectors:
                    try:
                        if selector_type == "xpath":
                            elements = driver.find_elements(By.XPATH, selector)
                        elif selector_type == "css":
                            elements = driver.find_elements(By.CSS_SELECTOR, selector)
                        elif selector_type == "class":
                            elements = driver.find_elements(By.CLASS_NAME, selector)
                        else:
                            continue
                        for element in elements:
                            if element.is_displayed():
                                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                                time.sleep(delay)
                                element.click()
                                time.sleep(delay)
                                return True
                    except (NoSuchElementException, StaleElementReferenceException) as e:
                        print(f"Attempt {attempt+1}: Error with selector {selector}: {e}")
                time.sleep(delay)
            return False
        commentary_selectors = [
            ("xpath", "//a[contains(text(), 'Commentary')]"),
            ("xpath", "//li[contains(@class, 'cb-nav-tab')]/a[contains(text(), 'Commentary')]"),
            ("css", ".cb-nav-tab a"),
            ("css", ".cb-nav-pill-1")
        ]
        if find_and_click_with_retry(commentary_selectors):
            print("Successfully clicked on Commentary tab")
        else:
            print("Could not click on Commentary tab, checking if already on Commentary page")
        driver.save_screenshot(f"{debug_folder}/03_after_commentary_navigation.png")
        innings_names = ["Mumbai Indians", "Royal Challengers Bangalore"]
        innings_tab_selectors = [
            ("css", ".cb-nav-tab-container .cb-nav-pill-1"),
            ("css", ".cb-nav-pill-1"),
            ("css", "ul.cb-nav-tab-container li"),
            ("css", ".cb-col-100 .cb-nav-pill-1"),
            ("xpath", "//li[contains(@class, 'cb-nav-pill-1')]")
        ]
        all_tabs = []
        for selector_type, selector in innings_tab_selectors:
            if selector_type == "css":
                all_tabs = driver.find_elements(By.CSS_SELECTOR, selector)
            else:
                all_tabs = driver.find_elements(By.XPATH, selector)
                
            if len(all_tabs) >= 2:
                print(f"Found {len(all_tabs)} innings tabs using {selector}")
                break
        if len(all_tabs) < 2:
            print("WARNING: Could not find enough innings tabs!")
            driver.save_screenshot(f"{debug_folder}/error_insufficient_tabs.png")           
        for i, innings_name in enumerate(innings_names):
            try:
                print(f"\nProcessing {innings_name} innings")
                for selector_type, selector in innings_tab_selectors:
                    if selector_type == "css":
                        all_tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                    else:
                        all_tabs = driver.find_elements(By.XPATH, selector)
                        
                    if len(all_tabs) >= 2:
                        break             
                if i < len(all_tabs):
                    tab_needs_click = True
                    if i == 0:
                        current_commentary_elements = driver.find_elements(By.CLASS_NAME, "cb-com-ln")
                        if len(current_commentary_elements) > 5:
                            tab_needs_click = False
                            print("Already on the first innings tab")
                    if tab_needs_click:
                        try:
                            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", all_tabs[i])
                            time.sleep(1)
                            all_tabs[i].click()
                            print(f"Clicked on {innings_name} tab")
                            time.sleep(random.uniform(2, 4))
                        except Exception as e:
                            print(f"Error clicking tab: {e}")
                    def expand_all_commentary():
                        show_more_count = 0
                        max_show_more_attempts = 50  # Set a reasonable limit                        
                        while show_more_count < max_show_more_attempts:
                            try:
                                show_more_found = False
                                show_more_selectors = [
                                    ("xpath", "//a[contains(text(), 'Show More')]"),
                                    ("xpath", "//a[text()='Show More']"),
                                    ("css", ".cb-col-100 .cb-col-txt-show-more"),
                                    ("css", ".cb-txt-show-more")
                                ]                               
                                for selector_type, selector in show_more_selectors:
                                    elements = []
                                    if selector_type == "xpath":
                                        elements = driver.find_elements(By.XPATH, selector)
                                    else:
                                        elements = driver.find_elements(By.CSS_SELECTOR, selector)                                      
                                    for element in elements:
                                        if element.is_displayed() and "Show More" in element.text:
                                            # Scroll to the show more button
                                            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
                                            time.sleep(random.uniform(0.5, 1.5))
                                            element.click()
                                            show_more_count += 1
                                            show_more_found = True
                                            print(f"Clicked 'Show More' button {show_more_count} times")
                                            if show_more_count % 5 == 0:
                                                driver.save_screenshot(f"{debug_folder}/06_{innings_name.replace(' ', '_')}_show_more_{show_more_count}.png")                               
                                            time.sleep(random.uniform(1.0, 2.5))
                                            break                                    
                                    if show_more_found:
                                        break                                
                                if not show_more_found:
                                    print("No more 'Show More' buttons found")
                                    return show_more_count                                    
                            except Exception as e:
                                print(f"Exception when clicking 'Show More': {e}")
                                time.sleep(1)                                
                        return show_more_count
                    show_more_clicks = expand_all_commentary()
                    print(f"Expanded commentary with {show_more_clicks} 'Show More' clicks")
                    driver.save_screenshot(f"{debug_folder}/07_{innings_name.replace(' ', '_')}_expanded.png")

                    def extract_commentary_data():
                        ball_counter = 0
                        
                        # Approach 1: Using combined over headers and commentary blocks
                        try:
                            print("Trying approach 1 for commentary extraction")
                            
                            # First find all over headers
                            over_headers = driver.find_elements(By.CSS_SELECTOR, ".cb-col-100.cb-col-comments")
                            commentary_blocks = driver.find_elements(By.CSS_SELECTOR, ".cb-col-60")
                            
                            print(f"Found {len(over_headers)} over headers and {len(commentary_blocks)} commentary blocks")
                            
                            current_over = "1"  # Default starting over
                            
                            # Combine all elements in order of appearance
                            all_elements = []
                            
                            # Find all rows that contain commentary
                            rows = driver.find_elements(By.CSS_SELECTOR, ".cb-col-100")
                            for row in rows:
                                try:
                                    text = row.text.strip()
                                    if not text:
                                        continue
                                        
                                    # Check if this is an over header
                                    if "OVER" in text.upper() and len(text) < 100:
                                        try:
                                            # Extract over number
                                            over_match = text.upper().split("OVER")[0].strip()
                                            if over_match and over_match.isdigit():
                                                current_over = over_match
                                                print(f"Found over: {current_over}")
                                            elif over_match:
                                                # Try to extract number part
                                                import re
                                                num_match = re.search(r'\d+', over_match)
                                                if num_match:
                                                    current_over = num_match.group()
                                                    print(f"Extracted over: {current_over}")
                                        except Exception as e:
                                            print(f"Error extracting over number: {e}")
                                        continue
                                        
                                    # Check if this is ball commentary by looking for specific elements
                                    commentary_element = None
                                    try:
                                        commentary_element = row.find_element(By.CSS_SELECTOR, ".cb-col-60")
                                    except:
                                        try:
                                            commentary_element = row.find_element(By.CSS_SELECTOR, ".cb-com-ln")
                                        except:
                                            pass
                                            
                                    if commentary_element:
                                        commentary_text = commentary_element.text.strip()
                                        if commentary_text and len(commentary_text) > 15:
                                            ball_counter += 1
                                            
                                            # Add to our data collection
                                            all_commentary_data.append({
                                                "year": year,
                                                "series_type": series_type,
                                                "series_name": series_name,
                                                "match_no": match_no,
                                                "match_type": match_type,
                                                "match_id": match_id,
                                                "match_date": match_date,
                                                "innings": innings_name,
                                                "match_winning_team": match_winning_team,
                                                "match_tie_breaker": match_tie_breaker,
                                                "match_toss": match_toss,
                                                "ball_no": ball_counter,
                                                "over_no": current_over,
                                                "ball_commentary": commentary_text
                                            })
                                            
                                            if ball_counter % 10 == 0:
                                                print(f"Processed {ball_counter} balls in {innings_name} innings")
                                                
                                except Exception as e:
                                    print(f"Error processing row: {e}")
                                    continue
                            
                            return ball_counter
                            
                        except Exception as e:
                            print(f"Error in approach 1: {e}")
                            return 0
                    
                    # Extract commentary
                    balls_processed = extract_commentary_data()
                    
                    # If primary approach doesn't work, try fallback
                    if balls_processed == 0:
                        try:
                            print("Trying fallback approach for commentary extraction")
                            
                            commentary_elements = driver.find_elements(By.CSS_SELECTOR, ".cb-com-ln")
                            current_over = "1"  # Default starting over
                            ball_counter = 0
                            
                            for element in commentary_elements:
                                text = element.text.strip()
                                
                                # Skip empty elements
                                if not text:
                                    continue
                                    
                                # Try to identify over numbers
                                if "OVER" in text.upper() and len(text) < 50:
                                    try:
                                        # Extract over number
                                        over_parts = text.upper().split("OVER")[0].strip().split()
                                        if over_parts:
                                            current_over = over_parts[-1]
                                            print(f"Found over: {current_over}")
                                    except:
                                        pass
                                    continue
                                
                                # If it has reasonable length, consider it ball commentary
                                if len(text) > 15 and not text.upper().startswith(("MATCH", "OVER", "END")):
                                    ball_counter += 1
                                    
                                    all_commentary_data.append({
                                        "year": year,
                                        "series_type": series_type,
                                        "series_name": series_name,
                                        "match_no": match_no,
                                        "match_type": match_type,
                                        "match_id": match_id,
                                        "match_date": match_date,
                                        "innings": innings_name,
                                        "match_winning_team": match_winning_team,
                                        "match_tie_breaker": match_tie_breaker,
                                        "match_toss": match_toss,
                                        "ball_no": ball_counter,
                                        "over_no": current_over,
                                        "ball_commentary": text
                                    })
                                    
                                    if ball_counter % 10 == 0:
                                        print(f"Processed {ball_counter} balls in {innings_name} innings")
                            
                            balls_processed = ball_counter
                            print(f"Fallback approach processed {balls_processed} balls")
                            
                        except Exception as e:
                            print(f"Error in fallback approach: {e}")
                    
                    print(f"Completed {innings_name} innings extraction with {balls_processed} balls")
                else:
                    print(f"Could not find tab for {innings_name} innings")
            except Exception as e:
                print(f"Error processing {innings_name} innings: {e}")
                driver.save_screenshot(f"{debug_folder}/error_{innings_name.replace(' ', '_')}.png")
        
        # Create DataFrame
        df = pd.DataFrame(all_commentary_data)
        
        # Handle empty dataframe case
        if df.empty:
            print("WARNING: No data was collected!")
            # Create a last-ditch effort to get any commentary
            try:
                print("Making final attempt to extract any commentary")
                all_text_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'to') and not(self::script)]")
                
                ball_counter = 0
                current_over = "1"
                
                for element in all_text_elements:
                    text = element.text.strip()
                    if text and " to " in text and len(text) > 15:
                        ball_counter += 1
                        
                        all_commentary_data.append({
                            "year": year,
                            "series_type": series_type,
                            "series_name": series_name,
                            "match_no": match_no,
                            "match_type": match_type,
                            "match_id": match_id,
                            "match_date": match_date,
                            "innings": "Unknown",  # Can't determine innings
                            "match_winning_team": match_winning_team,
                            "match_tie_breaker": match_tie_breaker,
                            "match_toss": match_toss,
                            "ball_no": ball_counter,
                            "over_no": "Unknown",  # Can't determine over
                            "ball_commentary": text
                        })
                
                if ball_counter > 0:
                    print(f"Final attempt collected {ball_counter} commentary entries")
                    df = pd.DataFrame(all_commentary_data)
                else:
                    print("Final attempt failed to collect any data")
                    # Look at what's on the page
                    print("Page source preview:")
                    print(driver.page_source[:1000])  # Print first 1000 chars of page source
                    driver.save_screenshot(f"{debug_folder}/final_empty_result.png")
                    
                    # Create empty DataFrame with expected columns
                    columns = [
                        "year", "series_type", "series_name", "match_no", "match_type", 
                        "match_id", "match_date", "innings", "match_winning_team", 
                        "match_tie_breaker", "match_toss", "ball_no", "over_no", "ball_commentary"
                    ]
                    df = pd.DataFrame(columns=columns)
            except Exception as e:
                print(f"Error in final attempt: {e}")
                # Create empty DataFrame with expected columns
                columns = [
                    "year", "series_type", "series_name", "match_no", "match_type", 
                    "match_id", "match_date", "innings", "match_winning_team", 
                    "match_tie_breaker", "match_toss", "ball_no", "over_no", "ball_commentary"
                ]
                df = pd.DataFrame(columns=columns)
        else:
            print(f"Successfully collected {len(df)} ball commentaries")
            driver.save_screenshot(f"{debug_folder}/final_success.png")
        
        return df
    
    except Exception as e:
        print(f"Main error: {e}")
        # Take screenshot of error state
        driver.save_screenshot(f"{debug_folder}/fatal_error.png")
        
        # Return empty DataFrame with correct columns
        columns = [
            "year", "series_type", "series_name", "match_no", "match_type", 
            "match_id", "match_date", "innings", "match_winning_team", 
            "match_tie_breaker", "match_toss", "ball_no", "over_no", "ball_commentary"
        ]
        return pd.DataFrame(columns=columns)
    
    finally:
        # Close browser
        driver.quit()
        print("Browser closed")

# Run the script
if __name__ == "__main__":
    print("Starting MI vs RCB match commentary scraper...")
    df = scrape_mi_rcb_commentary()
    
    # Save to CSV
    output_filename = "mi_vs_rcb_2021_ball_by_ball_commentary.csv"
    df.to_csv(output_filename, index=False)
    print(f"Saved data to {output_filename}")
    
    # Print summary
    if not df.empty:
        # Group by innings and count balls
        innings_summary = df.groupby('innings').size()
        print("\nSummary by innings:")
        for innings, count in innings_summary.items():
            print(f"{innings}: {count} balls")
        
        # Display sample from each innings
        print("\nSample from MI innings:")
        mi_sample = df[df['innings'] == 'Mumbai Indians'].head(3)
        if not mi_sample.empty:
            print(mi_sample[['over_no', 'ball_commentary']].to_string())
        else:
            print("No MI innings data found")
        
        print("\nSample from RCB innings:")
        rcb_sample = df[df['innings'] == 'Royal Challengers Bangalore'].head(3)
        if not rcb_sample.empty:
            print(rcb_sample[['over_no', 'ball_commentary']].to_string())
        else:
            print("No RCB innings data found")
    
    print(f"\nTotal balls scraped: {len(df)}")

Starting MI vs RCB match commentary scraper...
Processing match ID: 35612
Successfully navigated to the match URL
Primary page element loaded
Page title: Catch the Full Commentary of Mumbai Indians vs Royal Challengers Bangalore, 1st Match, Indian Premier League 2021 | Cricbuzz.com
Successfully clicked on Commentary tab

Processing Mumbai Indians innings
Could not find tab for Mumbai Indians innings

Processing Royal Challengers Bangalore innings
Could not find tab for Royal Challengers Bangalore innings
Making final attempt to extract any commentary
Final attempt collected 17 commentary entries
Browser closed
Saved data to mi_vs_rcb_2021_ball_by_ball_commentary.csv

Summary by innings:
Unknown: 17 balls

Sample from MI innings:
No MI innings data found

Sample from RCB innings:
No RCB innings data found

Total balls scraped: 17
