In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
def get_venue_details(match_url):
    """
    Scrape venue details from Cricbuzz match scorecard URL
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(match_url, headers=headers)
        response.raise_for_status()       
        soup = BeautifulSoup(response.text, 'html.parser')
        venue_info = soup.select_one('.cb-nav-main.cb-col-100.cb-col.cb-bg-white .cb-nav-subhdr.cb-font-12')        
        if venue_info:
            venue_text = venue_info.text.strip()
            venue_pattern = r'Venue:\s*(.*?)(?:,\s*([^,]+))?$'
            match = re.search(venue_pattern, venue_text)            
            if match:
                stadium_name = match.group(1).strip() if match.group(1) else "Not available"
                city = match.group(2).strip() if match.group(2) else "Not available"               
                return {
                    'match_url': match_url,
                    'match_venue_stadium': stadium_name,
                    'match_venue_city': city,
                    'match_venue_capacity': get_stadium_capacity(stadium_name),
                    'match_venue_host_teams': get_host_teams(stadium_name, city)
                }   
    except Exception as e:
        print(f"Error scraping {match_url}: {e}")    
    return {
        'match_url': match_url,
        'match_venue_stadium': "Not available",
        'match_venue_city': "Not available",
        'match_venue_capacity': "Not available",
        'match_venue_host_teams': "Not available"
    }
def get_stadium_capacity(stadium_name):
    """
    Return stadium capacity based on stadium name
    Note: This is a simplified version with a few stadiums. 
    For a complete solution, you'd need a database or additional scraping.
    """
    stadium_capacity = {
        "MA Chidambaram Stadium": "50,000",
        "Eden Gardens": "68,000",
        "Wankhede Stadium": "33,108",
        "M Chinnaswamy Stadium": "40,000",
        "Narendra Modi Stadium": "132,000",
        "Arun Jaitley Stadium": "41,820",
        "Rajiv Gandhi International Stadium": "55,000",
        "Punjab Cricket Association Stadium": "26,000",
        "Sawai Mansingh Stadium": "30,000",
        "Brabourne Stadium": "20,000",
        "DY Patil Stadium": "55,000"
    }
    for known_stadium, capacity in stadium_capacity.items():
        if known_stadium in stadium_name or stadium_name in known_stadium:
            return capacity           
    return "Not available"
def get_host_teams(stadium_name, city):
    """
    Return host teams based on stadium and city
    Note: This is a simplified version based on common associations.
    For a complete solution, you'd need a database or additional scraping.
    """
    host_team_map = {
        "Chennai": "Chennai Super Kings",
        "Mumbai": "Mumbai Indians",
        "Bengaluru": "Royal Challengers Bangalore",
        "Kolkata": "Kolkata Knight Riders",
        "Delhi": "Delhi Capitals",
        "Hyderabad": "Sunrisers Hyderabad",
        "Punjab": "Punjab Kings",
        "Mohali": "Punjab Kings",
        "Jaipur": "Rajasthan Royals",
        "Ahmedabad": "Gujarat Titans",
        "Lucknow": "Lucknow Super Giants"
    }
    stadium_team_map = {
        "MA Chidambaram Stadium": "Chennai Super Kings",
        "Eden Gardens": "Kolkata Knight Riders",
        "Wankhede Stadium": "Mumbai Indians",
        "M Chinnaswamy Stadium": "Royal Challengers Bangalore",
        "Arun Jaitley Stadium": "Delhi Capitals",
        "Rajiv Gandhi International Stadium": "Sunrisers Hyderabad",
        "Punjab Cricket Association Stadium": "Punjab Kings",
        "Sawai Mansingh Stadium": "Rajasthan Royals",
        "Narendra Modi Stadium": "Gujarat Titans"
    }
    for known_stadium, team in stadium_team_map.items():
        if known_stadium in stadium_name or stadium_name in known_stadium:
            return team
    if city in host_team_map:
        return host_team_map[city]
    return "Not available"
def main():
    match_urls = [
        "https://www.cricbuzz.com/live-cricket-scorecard/35612/mi-vs-rcb-1st-match-indian-premier-league-2021",
        "https://www.cricbuzz.com/cricket-scores/35718/kkr-vs-rcb-30th-match-indian-premier-league-2021"
    ]
    results = []    
    for url in match_urls:
        print(f"Scraping: {url}")
        venue_data = get_venue_details(url)
        results.append(venue_data)
        time.sleep(random.uniform(1, 3))
    df = pd.DataFrame(results)
    df.to_csv('cricket_venue_details.csv', index=False)
    print(f"Scraped {len(results)} matches. Data saved to cricket_venue_details.csv")
    print("\nResults:")
    for result in results:
        print("-" * 50)
        for key, value in result.items():
            print(f"{key}: {value}")
if __name__ == "__main__":
    main()

Scraping: https://www.cricbuzz.com/live-cricket-scorecard/35612/mi-vs-rcb-1st-match-indian-premier-league-2021
Scraping: https://www.cricbuzz.com/cricket-scores/35718/kkr-vs-rcb-30th-match-indian-premier-league-2021
Scraped 2 matches. Data saved to cricket_venue_details.csv

Results:
--------------------------------------------------
match_url: https://www.cricbuzz.com/live-cricket-scorecard/35612/mi-vs-rcb-1st-match-indian-premier-league-2021
match_venue_stadium: MA Chidambaram Stadium, Chennai Date & Time: Apr 09
match_venue_city: 07:30 PM LOCAL
match_venue_capacity: 50,000
match_venue_host_teams: Chennai Super Kings
--------------------------------------------------
match_url: https://www.cricbuzz.com/cricket-scores/35718/kkr-vs-rcb-30th-match-indian-premier-league-2021
match_venue_stadium: Narendra Modi Stadium, Ahmedabad Date & Time: May 03
match_venue_city: 07:30 PM LOCAL
match_venue_capacity: 132,000
match_venue_host_teams: Gujarat Titans
