In [3]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
# Column names of one match record, in output order. Shared by the success
# and failure paths so both always return exactly the same set of keys.
_MATCH_FIELDS = (
    'match_url', 'match_status', 'match_winning_team', 'match_tie_breaker',
    'match_toss', 'umpires', 'match_referee', 'third_umpires',
    'match_datetime', 'team1_name', 'team2_name', 'team1_score',
    'team1_wickets', 'team2_score', 'team2_wickets', 'team1_captain',
    'team1_players', 'team1_bench', 'team1_support_staff', 'team2_captain',
    'team2_players', 'team2_bench', 'team2_support_staff', 'match_venue_stadium',
    'match_venue_city', 'match_venue_capacity', 'match_venue_host_teams'
)

# Browser-like User-Agent sent with every request made by the scraper.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}


def _new_match_record(match_url):
    """
    Return a record with every field set to None, except the URL and the
    default status of 'completed'.
    """
    record = dict.fromkeys(_MATCH_FIELDS)
    record['match_url'] = match_url
    record['match_status'] = 'completed'
    return record


def _parse_venue(soup, match_data):
    """
    Fill the date/time and venue fields from the scorecard sub-header.
    """
    venue_info = soup.select_one('.cb-nav-main.cb-col-100.cb-col.cb-bg-white .cb-nav-subhdr.cb-font-12')
    if not venue_info:
        return
    venue_text = venue_info.text.strip()

    date_match = re.search(r'Date & Time:\s*(.*?)(?=\s*\|)', venue_text)
    if date_match:
        match_data['match_datetime'] = date_match.group(1).strip()

    # Group 1 is the stadium; the optional group 2 is the text after the
    # last comma, which is treated as the city.
    venue_match = re.search(r'Venue:\s*(.*?)(?:,\s*([^,]+))?$', venue_text)
    if not venue_match:
        return
    stadium_name = venue_match.group(1).strip() if venue_match.group(1) else "Not available"
    city = venue_match.group(2).strip() if venue_match.group(2) else "Not available"
    match_data['match_venue_stadium'] = stadium_name
    match_data['match_venue_city'] = city
    match_data['match_venue_capacity'] = get_stadium_capacity(stadium_name)
    match_data['match_venue_host_teams'] = get_host_teams(stadium_name, city)


def _parse_result(soup, match_data):
    """
    Fill the toss, status, tie-breaker and winning-team fields.
    """
    # The toss element is looked up independently of the status element so
    # a missing status line does not also drop the toss information.
    toss_section = soup.select_one('.cb-col.cb-col-100.cb-font-12.cb-toss-sts')
    if toss_section:
        match_data['match_toss'] = toss_section.text.strip()

    match_info = soup.select_one('.cb-col.cb-col-100.cb-mini-col.cb-min-stts')
    if not match_info:
        return
    status_text = match_info.text.strip()
    lowered = status_text.lower()

    if 'abandoned' in lowered or 'no result' in lowered:
        match_data['match_status'] = 'abandoned'
    elif 'rescheduled' in lowered or 'postponed' in lowered:
        match_data['match_status'] = 'rescheduled'
    elif 'match tied' in lowered:
        match_data['match_status'] = 'tied'
        if 'super over' in lowered:
            match_data['match_tie_breaker'] = 'Super Over'

    # Patterns are tried in order; the first one that matches wins.
    for pattern in (r'(.*?) won by', r'(.*?) beat', r'(.*?) won'):
        winner_match = re.search(pattern, status_text)
        if winner_match:
            match_data['match_winning_team'] = winner_match.group(1).strip()
            break


def _parse_innings_header(header_text):
    """
    Split an innings header into (team name, score, wickets).

    Any part that cannot be found is returned as None. When a score is
    present without a "/wickets" suffix, wickets is reported as '0'.
    """
    name = score = wickets = None
    # The team name is everything before the first digit.
    name_match = re.search(r'^(.*?)(?=\s*\d)', header_text)
    if name_match:
        name = name_match.group(1).strip()
    score_match = re.search(r'(\d+)(?:/(\d+))?', header_text)
    if score_match:
        score = score_match.group(1)
        wickets = score_match.group(2) if score_match.group(2) else '0'
    return name, score, wickets


def _parse_scores(soup, match_data):
    """
    Fill team names, scores and wickets from the first two innings headers.
    """
    innings_headers = soup.select('.cb-col.cb-col-100.cb-scrd-hdr-rw')
    if len(innings_headers) < 2:
        return
    # zip() stops after two headers, so any further innings are ignored.
    for prefix, header in zip(('team1', 'team2'), innings_headers):
        name, score, wickets = _parse_innings_header(header.text.strip())
        match_data[f'{prefix}_name'] = name
        match_data[f'{prefix}_score'] = score
        match_data[f'{prefix}_wickets'] = wickets


def _parse_officials(soup, match_data):
    """
    Fill umpires, TV umpire and referee from the first info item that
    mentions 'Officials'.
    """
    officials_section = next(
        (item for item in soup.select('.cb-col.cb-col-100.cb-mtch-info-itm')
         if 'Officials' in item.text),
        None,
    )
    if officials_section is None:
        return
    officials_text = officials_section.text.strip()
    field_patterns = (
        ('umpires', r'Umpires:\s*(.*?)(?=\s*\||$)'),
        ('third_umpires', r'TV Umpire:\s*(.*?)(?=\s*\||$)'),
        ('match_referee', r'Referee:\s*(.*?)(?=\s*\||$)'),
    )
    for field, pattern in field_patterns:
        found = re.search(pattern, officials_text)
        if found:
            match_data[field] = found.group(1).strip()


def _parse_squad(players_section):
    """
    Return (playing XI as a comma-separated string, captain, bench) for one
    team's squad section. Captain and bench are None when not found.
    """
    bench_section = players_section.select_one('.cb-col.cb-col-100.cb-squadstd-bench')
    bench_nodes = bench_section.select('.cb-col.cb-col-60') if bench_section else []
    # The bench section is searched for inside players_section, so the
    # player selector below also returns bench entries; track them by
    # identity so they are not counted in the playing XI as well.
    bench_ids = {id(node) for node in bench_nodes}

    playing_xi = []
    captain = None
    for node in players_section.select('.cb-col.cb-col-60'):
        if id(node) in bench_ids:
            continue
        raw_name = node.text.strip()
        # Drop every parenthesised marker such as (c), (wk) or (c & wk).
        clean_name = re.sub(r'\([^)]*\)', '', raw_name).strip()
        if '(c)' in raw_name or '(c & wk)' in raw_name:
            captain = clean_name
        playing_xi.append(clean_name)

    bench = [node.text.strip() for node in bench_nodes]
    return ', '.join(playing_xi), captain, (', '.join(bench) if bench else None)


def _load_squads(match_url, timeout, match_data):
    """
    Fetch the squads page and fill team names, players, captains and bench.

    Best effort: if the squads page cannot be fetched, the fields are left
    untouched so the scorecard data already collected is kept.
    """
    teams_url = match_url.replace('live-cricket-scorecard', 'live-cricket-match-squads')
    try:
        response_teams = requests.get(teams_url, headers=_REQUEST_HEADERS, timeout=timeout)
        response_teams.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching squads {teams_url}: {e}")
        return

    soup_teams = BeautifulSoup(response_teams.text, 'html.parser')
    team_sections = soup_teams.select('.cb-col.cb-col-100.cb-minfo-tm-nm')
    teams_players = soup_teams.select('.cb-col.cb-col-100.cb-minfo-tm-nm + .cb-col.cb-col-100')
    if len(team_sections) < 2 or len(teams_players) < 2:
        return

    for prefix, name_node, players_section in zip(('team1', 'team2'), team_sections, teams_players):
        players, captain, bench = _parse_squad(players_section)
        # Names from the squads page replace the ones parsed from the scorecard.
        match_data[f'{prefix}_name'] = name_node.text.strip()
        match_data[f'{prefix}_players'] = players
        match_data[f'{prefix}_captain'] = captain
        match_data[f'{prefix}_bench'] = bench


def get_match_details(match_url, timeout=15):
    """
    Scrape comprehensive match details from Cricbuzz match scorecard URL

    Args:
        match_url: URL of the match scorecard page. The squads page URL is
            derived from it by replacing 'live-cricket-scorecard' with
            'live-cricket-match-squads'.
        timeout: Seconds passed to each HTTP request as its timeout, so a
            stalled connection cannot block the scraper indefinitely.

    Returns:
        A dict with one entry per match field. Fields that could not be
        found on the page are None; the support-staff fields are always
        "Not available". If scraping fails, every field is "Not available"
        except 'match_url', which keeps the requested URL so the failed
        row can still be identified.
    """
    try:
        response = requests.get(match_url, headers=_REQUEST_HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        match_data = _new_match_record(match_url)
        _parse_venue(soup, match_data)
        _parse_result(soup, match_data)
        _parse_scores(soup, match_data)
        _parse_officials(soup, match_data)
        _load_squads(match_url, timeout, match_data)

        # Support staff is not scraped from either page.
        match_data['team1_support_staff'] = "Not available"
        match_data['team2_support_staff'] = "Not available"
        return match_data
    except Exception as e:
        # Top-level boundary of the scraper: report the problem and return
        # a placeholder row instead of propagating the error to the caller.
        print(f"Error scraping {match_url}: {e}")
        fallback = dict.fromkeys(_MATCH_FIELDS, "Not available")
        fallback['match_url'] = match_url
        return fallback
def get_stadium_capacity(stadium_name):
    """
    Return stadium capacity based on stadium name

    The name is compared with a built-in table by substring containment in
    either direction, so both a longer string that contains a known name
    and a fragment of a known name resolve to that stadium. The first
    table entry that matches wins.

    Args:
        stadium_name: Stadium name to look up. Surrounding whitespace is
            ignored.

    Returns:
        The capacity as a string with thousands separators, or
        "Not available" when the name is blank, not a string, or matches
        no known stadium.
    """
    stadium_capacity = {
        "MA Chidambaram Stadium": "50,000",
        "Eden Gardens": "68,000",
        "Wankhede Stadium": "33,108",
        "M Chinnaswamy Stadium": "40,000",
        "Narendra Modi Stadium": "132,000",
        "Arun Jaitley Stadium": "41,820",
        "Rajiv Gandhi International Stadium": "55,000",
        "Punjab Cricket Association Stadium": "26,000",
        "Sawai Mansingh Stadium": "30,000",
        "Brabourne Stadium": "20,000",
        "DY Patil Stadium": "55,000"
    }
    query = stadium_name.strip() if isinstance(stadium_name, str) else ""
    # An empty string is a substring of every name, so without this guard a
    # blank input would be reported with the first stadium's capacity.
    if not query:
        return "Not available"
    for known_stadium, capacity in stadium_capacity.items():
        if known_stadium in query or query in known_stadium:
            return capacity
    return "Not available"
def get_host_teams(stadium_name, city):
    """
    Return host teams based on stadium and city

    The stadium is tried first, using substring containment in either
    direction against a built-in table; the first entry that matches wins.
    If no stadium matches, the city is looked up by exact name.

    Args:
        stadium_name: Stadium name to look up. Surrounding whitespace is
            ignored; a blank or non-string value skips the stadium lookup.
        city: City name used as the fallback key. Surrounding whitespace
            is ignored.

    Returns:
        The host team's name, or "Not available" when neither the stadium
        nor the city is recognised.
    """
    host_team_map = {
        "Chennai": "Chennai Super Kings",
        "Mumbai": "Mumbai Indians",
        "Bengaluru": "Royal Challengers Bangalore",
        "Kolkata": "Kolkata Knight Riders",
        "Delhi": "Delhi Capitals",
        "Hyderabad": "Sunrisers Hyderabad",
        "Punjab": "Punjab Kings",
        "Mohali": "Punjab Kings",
        "Jaipur": "Rajasthan Royals",
        "Ahmedabad": "Gujarat Titans",
        "Lucknow": "Lucknow Super Giants"
    }
    stadium_team_map = {
        "MA Chidambaram Stadium": "Chennai Super Kings",
        "Eden Gardens": "Kolkata Knight Riders",
        "Wankhede Stadium": "Mumbai Indians",
        "M Chinnaswamy Stadium": "Royal Challengers Bangalore",
        "Arun Jaitley Stadium": "Delhi Capitals",
        "Rajiv Gandhi International Stadium": "Sunrisers Hyderabad",
        "Punjab Cricket Association Stadium": "Punjab Kings",
        "Sawai Mansingh Stadium": "Rajasthan Royals",
        "Narendra Modi Stadium": "Gujarat Titans"
    }
    stadium_query = stadium_name.strip() if isinstance(stadium_name, str) else ""
    # An empty string is a substring of every name, so the stadium lookup
    # is skipped for blank input instead of matching the first table entry.
    if stadium_query:
        for known_stadium, team in stadium_team_map.items():
            if known_stadium in stadium_query or stadium_query in known_stadium:
                return team
    city_query = city.strip() if isinstance(city, str) else ""
    return host_team_map.get(city_query, "Not available")
def main():
    """
    Scrape the MI vs RCB scorecard and save it as a one-row CSV file.

    The CSV always contains the full, fixed list of columns in a fixed
    order; a column missing from the scraped record is filled with
    "Not available".
    """
    source_url = "https://www.cricbuzz.com/live-cricket-scorecard/35612/mi-vs-rcb-1st-match-indian-premier-league-2021"
    expected_columns = [
        'match_url', 'match_status', 'match_winning_team', 'match_tie_breaker',
        'match_toss', 'umpires', 'match_referee', 'third_umpires',
        'match_datetime', 'team1_name', 'team2_name', 'team1_score',
        'team1_wickets', 'team2_score', 'team2_wickets', 'team1_captain',
        'team1_players', 'team1_bench', 'team1_support_staff', 'team2_captain',
        'team2_players', 'team2_bench', 'team2_support_staff', 'match_venue_stadium',
        'match_venue_city', 'match_venue_capacity', 'match_venue_host_teams'
    ]

    print(f"Scraping: {source_url}")
    frame = pd.DataFrame([get_match_details(source_url)])

    # Add any column the record did not provide before fixing the order.
    absent = [name for name in expected_columns if name not in frame.columns]
    for name in absent:
        frame[name] = "Not available"

    ordered = frame[expected_columns]
    ordered.to_csv('mi_vs_rcb_match_details.csv', index=False, encoding='utf-8')
    print("Scraped MI vs RCB match. Data saved to mi_vs_rcb_match_details.csv")
# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Scraping: https://www.cricbuzz.com/live-cricket-scorecard/35612/mi-vs-rcb-1st-match-indian-premier-league-2021
Scraped MI vs RCB match. Data saved to mi_vs_rcb_match_details.csv
