In [3]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

In [4]:
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()),options=options)
base_url = "https://www.cricbuzz.com/"
page_url = "https://www.cricbuzz.com/cricket-full-commentary/35612/mi-vs-rcb-1st-match-indian-premier-league-2021"
driver.get(page_url)
time.sleep(3)
page_source = driver.page_source
driver.quit()
soup = BeautifulSoup(page_source,"html.parser")

In [None]:
commentary = []


In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
import re

class CricbuzzScraper:
    def __init__(self):
        # Set up Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        
        # Initialize webdriver
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)
        
    def close(self):
        """Close the browser when done"""
        if self.driver:
            self.driver.quit()
            
    def extract_match_info(self, url):
        """Extract basic match information"""
        self.driver.get(url)
        
        # Wait for page to load
        try:
            self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, "cb-nav-hdr")))
        except TimeoutException:
            print("Page load timeout")
            return None
        
        match_info = {}
        
        # Extract title which contains teams
        try:
            title_element = self.driver.find_element(By.CSS_SELECTOR, "h1.cb-nav-hdr-text")
            title = title_element.text
            match_info['title'] = title
        except NoSuchElementException:
            match_info['title'] = "Title not found"
        
        # Extract series info
        try:
            series_element = self.driver.find_element(By.XPATH, "//div[contains(text(), 'Series:')]/following-sibling::div")
            match_info['series'] = series_element.text
        except NoSuchElementException:
            try:
                series_element = self.driver.find_element(By.XPATH, "//span[contains(text(), 'Series:')]/following-sibling::span")
                match_info['series'] = series_element.text
            except NoSuchElementException:
                match_info['series'] = "Series info not found"
        
        # Extract match winning team
        try:
            match_result = self.driver.find_element(By.CSS_SELECTOR, ".cb-text-complete")
            match_info['result'] = match_result.text
        except NoSuchElementException:
            match_info['result'] = "Result not found"
        
        # Extract match date
        try:
            date_element = self.driver.find_element(By.XPATH, "//span[contains(text(), 'Date & Time:')]/following-sibling::span")
            match_info['date'] = date_element.text
        except NoSuchElementException:
            match_info['date'] = "Date not found"
        
        # Extract match ID from URL
        match_id_match = re.search(r'/(\d+)/', url)
        if match_id_match:
            match_info['match_id'] = match_id_match.group(1)
        else:
            match_info['match_id'] = "Unknown"
        
        # Extract teams
        try:
            teams_elements = self.driver.find_elements(By.CSS_SELECTOR, ".cb-col-100.cb-mtch-info-itm")
            match_info['teams'] = {}
            
            for team_element in teams_elements:
                if "Playing XI" in team_element.text:
                    team_text = team_element.text
                    team_name = team_text.split("(Playing XI)")[0].strip()
                    players = team_text.split("(Playing XI):")[1].strip()
                    match_info['teams'][team_name] = players
        except Exception as e:
            match_info['teams'] = f"Team info extraction error: {str(e)}"
        
        return match_info
    
    def get_commentary(self, url):
        """Navigate to commentary section and extract ball-by-ball data"""
        # Navigate to the match page
        self.driver.get(url)
        
        # Get match info
        match_info = self.extract_match_info(url)
        
        # Navigate to the commentary tab
        try:
            commentary_link = self.wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Commentary') or contains(@href, 'commentary')]")))
            commentary_link.click()
        except TimeoutException:
            print("Could not find or click on commentary tab")
            return None
        
        # Wait for commentary to load
        time.sleep(2)
        
        # Extract series name and year from match info
        series_name = match_info.get('series', 'Unknown')
        match_year = None
        date_text = match_info.get('date', '')
        year_match = re.search(r'\b(20\d{2})\b', date_text)
        if year_match:
            match_year = year_match.group(1)
        else:
            # Try to get year from series name
            year_match = re.search(r'\b(20\d{2})\b', series_name)
            if year_match:
                match_year = year_match.group(1)
            else:
                match_year = "Unknown"
        
        # Determine series type
        series_type = "Unknown"
        if "IPL" in series_name or "Indian Premier League" in series_name:
            series_type = "IPL"
            match_type = "T20"
        elif "Test" in series_name:
            series_type = "Test"
            match_type = "Test"
        elif "ODI" in series_name:
            series_type = "ODI"
            match_type = "ODI"
        elif "T20" in series_name:
            series_type = "T20"
            match_type = "T20"
        else:
            match_type = "Unknown"
        
        # Extract match number
        match_no = "1"  # Default
        match_no_match = re.search(r'(\d+)(st|nd|rd|th) Match', match_info.get('title', ''))
        if match_no_match:
            match_no = match_no_match.group(1)
        
        # Extract winning team
        winning_team = "Unknown"
        result_text = match_info.get('result', '')
        for team_name in match_info.get('teams', {}).keys():
            if team_name in result_text and "won" in result_text:
                winning_team = team_name
                break
        
        # Extract toss info - This would need more specific selectors based on the page structure
        toss_info = "Not available"
        
        # Check if match was tied
        match_tie_breaker = "None"
        if "Super Over" in result_text or "tie" in result_text.lower():
            match_tie_breaker = "Super Over" if "Super Over" in result_text else "Tie"
        
        # Initialize the data collection
        commentary_data = []
        
        # Find all commentary blocks
        inning_tabs = self.driver.find_elements(By.CSS_SELECTOR, ".cb-nav-pill-1")
        
        # Process each innings
        for inning_idx, inning_tab in enumerate(inning_tabs):
            # Click on the innings tab
            try:
                inning_tab.click()
                time.sleep(2)
            except Exception as e:
                print(f"Error clicking innings tab: {e}")
                continue
            
            # Find all commentary blocks for this innings
            commentary_blocks = self.driver.find_elements(By.CSS_SELECTOR, ".cb-com-ln")
            
            for block in commentary_blocks:
                try:
                    # Extract over number
                    over_text = block.find_element(By.CSS_SELECTOR, ".cb-com-ln-tim").text
                    over_match = re.match(r'(\d+\.\d+)', over_text)
                    if over_match:
                        over_no = over_match.group(1)
                    else:
                        continue  # Skip if we can't determine the over number
                    
                    # Extract ball commentary
                    commentary_text = block.find_element(By.CSS_SELECTOR, ".cb-com-ln-txt").text
                    
                    # Generate unique ball number based on innings and over
                    ball_no = f"{inning_idx+1}.{over_no}"
                    
                    # Add to our dataset
                    commentary_data.append({
                        'year': match_year,
                        'series_type': series_type,
                        'series_name': series_name,
                        'match_no': match_no,
                        'match_type': match_type,
                        'match_id': match_info.get('match_id', 'Unknown'),
                        'match_winning_team': winning_team,
                        'match_tie_breaker': match_tie_breaker,
                        'match_toss': toss_info,
                        'ball_no': ball_no,
                        'over_no': over_no,
                        'ball_commentary': commentary_text
                    })
                    
                except Exception as e:
                    print(f"Error processing commentary block: {e}")
        
        # Convert to DataFrame
        df = pd.DataFrame(commentary_data)
        return df
    
    def save_to_csv(self, df, filename="cricbuzz_commentary.csv"):
        """Save the data to a CSV file"""
        if df is not None and not df.empty:
            df.to_csv(filename, index=False)
            print(f"Data saved to {filename}")
        else:
            print("No data to save")

# Example usage
def main():
    # Sample URL from the image - modify as needed for the actual match
    url = "https://www.cricbuzz.com/live-cricket-scorecard/33171/mi-vs-rcb-1st-match-indian-premier-league-2021"
    
    scraper = CricbuzzScraper()
    try:
        print("Extracting ball-by-ball commentary...")
        df = scraper.get_commentary(url)
        
        if df is not None:
            print(f"Extracted {len(df)} commentary entries")
            scraper.save_to_csv(df)
            
            # Display first few rows
            print("\nSample data:")
            print(df.head())
        else:
            print("Failed to extract commentary data")
    
    finally:
        scraper.close()

if __name__ == "__main__":
    main()

Extracting ball-by-ball commentary...
Error processing commentary block: Message: no such element: Unable to locate element: {"method":"css selector","selector":".cb-com-ln-tim"}
  (Session info: chrome=135.0.7049.96); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6CFA05355+78597]
	GetHandleVerifier [0x00007FF6CFA053B0+78688]
	(No symbol) [0x00007FF6CF7B91AA]
	(No symbol) [0x00007FF6CF80F149]
	(No symbol) [0x00007FF6CF80F3FC]
	(No symbol) [0x00007FF6CF801BEC]
	(No symbol) [0x00007FF6CF83712F]
	(No symbol) [0x00007FF6CF801AB6]
	(No symbol) [0x00007FF6CF837300]
	(No symbol) [0x00007FF6CF85F2BB]
	(No symbol) [0x00007FF6CF836EC3]
	(No symbol) [0x00007FF6CF8003F8]
	(No symbol) [0x00007FF6CF801163]
	GetHandleVerifier [0x00007FF6CFCAEF0D+2870973]
	GetHandleVerifier [0x00007FF6CFCA96B8+2848360]
	GetHandleVerifier [0x00007FF6CFCC6993+2967875]
	GetHandleVeri