In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
def _venue_next_span_text(label_tag):
    """Return the stripped text of the first <span> after ``label_tag``, or None if there is none."""
    value_tag = label_tag.find_next('span')
    return value_tag.text.strip() if value_tag is not None else None


def scrape_venue_details(match_url):
    """
    Scrape cricket match venue details from a given URL.

    The match page is searched for a ``div.match-info`` element (split on
    commas into stadium and city) and for ``span`` labels containing
    "capacity" / "home team".  If the page links to a ground page (any
    ``<a>`` whose href contains "ground"), that page is fetched as well and
    its "Capacity" / "Home team" values override the ones found on the
    match page.

    Args:
        match_url: URL of the match page to fetch.

    Returns:
        dict with the keys ``match_venue_stadium``, ``match_venue_city``,
        ``match_venue_capacity`` and ``match_venue_host_teams``.  Any value
        that could not be found is ``None``.  Scraping is best-effort:
        network and parsing errors are printed, never raised.
    """
    # Local import so this function stays self-contained in the notebook cell.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    venue_data = {
        'match_venue_stadium': None,
        'match_venue_city': None,
        'match_venue_capacity': None,
        'match_venue_host_teams': None
    }

    # Fetch inside a try so that a timeout / connection error / HTTP error
    # yields the all-None result instead of crashing the caller's loop.
    try:
        response = requests.get(match_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping venue data: {e}")
        return venue_data
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        venue_info = soup.find('div', class_='match-info')
        if venue_info:
            venue_parts = venue_info.text.strip().split(',')
            if len(venue_parts) >= 2:
                venue_data['match_venue_stadium'] = venue_parts[0].strip()
                venue_data['match_venue_city'] = venue_parts[1].strip()

        capacity_tag = soup.find('span', string=lambda x: x and 'capacity' in x.lower())
        if capacity_tag:
            venue_data['match_venue_capacity'] = _venue_next_span_text(capacity_tag)
        teams_tag = soup.find('span', string=lambda x: x and 'home team' in x.lower())
        if teams_tag:
            venue_data['match_venue_host_teams'] = _venue_next_span_text(teams_tag)

        ground_link = soup.find('a', href=lambda x: x and 'ground' in x)
        if ground_link:
            # urljoin copes with both site-relative and already-absolute hrefs,
            # which plain string concatenation does not.
            ground_url = urljoin('https://www.espncricinfo.com', ground_link['href'])
            ground_response = requests.get(ground_url, headers=headers, timeout=30)
            ground_response.raise_for_status()
            ground_soup = BeautifulSoup(ground_response.content, 'html.parser')

            # Values from the dedicated ground page take precedence, but only
            # when a value is actually present there.
            capacity_label = ground_soup.find('span', string='Capacity')
            if capacity_label:
                capacity = _venue_next_span_text(capacity_label)
                if capacity is not None:
                    venue_data['match_venue_capacity'] = capacity
            teams_label = ground_soup.find('span', string='Home team')
            if teams_label:
                host_teams = _venue_next_span_text(teams_label)
                if host_teams is not None:
                    venue_data['match_venue_host_teams'] = host_teams
    except Exception as e:
        # Best-effort boundary: keep whatever was collected before the failure.
        print(f"Error scraping venue data: {e}")
    return venue_data

def _scorecard_text(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _scorecard_split_score(score_text):
    """Split a 'runs/wickets ...' string into (runs, wickets); wickets is None when absent."""
    parts = score_text.split('/')
    wickets = None
    if len(parts) > 1:
        tail = parts[1].split()
        wickets = tail[0] if tail else None
    return parts[0], wickets


def scrape_scorecard(match_url):
    """
    Scrape cricket match scorecard details from a given URL.

    Each section of the page (result, toss, date/time, teams, squads) is
    parsed independently, so a missing element only leaves its own fields
    as ``None`` instead of discarding everything else.

    Args:
        match_url: URL of the match scorecard page to fetch.

    Returns:
        dict of match fields (status, winner, tie-breaker, toss, datetime,
        team names/scores/wickets, players and captains).  Fields that
        could not be found are ``None``.  The keys ``umpires``,
        ``match_referee``, ``third_umpires``, ``team*_bench`` and
        ``team*_support_staff`` are present but never populated by this
        function.  Scraping is best-effort: network and parsing errors are
        printed, never raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    match_data = {
        'match_status': None,
        'match_winning_team': None,
        'match_tie_breaker': None,
        'match_toss': None,
        'umpires': None,
        'match_referee': None,
        'third_umpires': None,
        'match_datetime': None,
        'team1_name': None,
        'team2_name': None,
        'team1_score': None,
        'team1_wickets': None,
        'team2_score': None,
        'team2_wickets': None,
        'team1_captain': None,
        'team1_players': None,
        'team1_bench': None,
        'team1_support_staff': None,
        'team2_captain': None,
        'team2_players': None,
        'team2_bench': None,
        'team2_support_staff': None
    }

    # Fetch inside a try so that a timeout / connection error / HTTP error
    # yields the all-None result instead of crashing the caller's loop.
    try:
        response = requests.get(match_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping data: {e}")
        return match_data
    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        # --- Result / status -------------------------------------------------
        result = _scorecard_text(soup.find('div', class_='status-text'))
        if result is not None:
            status = result.lower()
            if 'abandoned' in status:
                match_data['match_status'] = 'abandoned'
            elif 'rescheduled' in status:
                match_data['match_status'] = 'rescheduled'
            elif 'tied' in status:
                match_data['match_status'] = 'tied'
            else:
                # Any other status text is treated as a completed match.
                match_data['match_status'] = 'completed'

            if match_data['match_status'] in ('completed', 'tied'):
                if ' won ' in result:
                    match_data['match_winning_team'] = result.split(' won ')[0]
                if match_data['match_status'] == 'tied' and 'tied, ' in result:
                    match_data['match_tie_breaker'] = result.split('tied, ')[-1]

        # --- Toss --------------------------------------------------------------
        toss_info = soup.find('div', class_='match-info-item', string=lambda x: x and 'toss' in x.lower())
        if toss_info:
            match_data['match_toss'] = toss_info.text.strip().replace('Toss', '').strip()

        # --- Date / time -------------------------------------------------------
        date_time = _scorecard_text(soup.find('div', class_='match-info-time'))
        if date_time:
            try:
                match_data['match_datetime'] = datetime.strptime(date_time, '%b %d, %Y, %H:%M %Z')
            except ValueError as e:
                # An unexpected date format must not prevent the team and
                # squad sections below from being parsed.
                print(f"Error scraping data: {e}")

        # --- Team names and scores ---------------------------------------------
        teams = soup.find_all('div', class_='team')
        if len(teams) >= 2:
            # zip() stops after the two prefixes, so extra 'team' divs are ignored.
            for prefix, team in zip(('team1', 'team2'), teams):
                match_data[f'{prefix}_name'] = _scorecard_text(team.find('div', class_='name'))
                score_text = _scorecard_text(team.find('div', class_='score-detail'))
                if score_text is not None:
                    runs, wickets = _scorecard_split_score(score_text)
                    match_data[f'{prefix}_score'] = runs
                    match_data[f'{prefix}_wickets'] = wickets

        # --- Squads --------------------------------------------------------------
        squads = soup.find_all('div', class_='squad-players')
        if len(squads) >= 2:
            for prefix, squad in zip(('team1', 'team2'), squads):
                players = [p.text.strip() for p in squad.find_all('div', class_='player-name')]
                match_data[f'{prefix}_players'] = ', '.join(players)
                if players:
                    # NOTE(review): the first listed player is taken as the
                    # captain -- confirm this matches the page layout.
                    match_data[f'{prefix}_captain'] = players[0]
    except Exception as e:
        # Best-effort boundary: keep whatever was collected before the failure.
        print(f"Error scraping data: {e}")
    return match_data
def scrape_multiple_matches(match_urls):
    """
    Scrape scorecard and venue data for each URL and collect it in a DataFrame.

    For every URL the scorecard fields are scraped first, then the venue
    fields are merged into the same record (venue keys win on a clash).
    One progress line is printed per URL.  The returned DataFrame has one
    row per input URL, in input order.
    """
    rows = []
    for url in match_urls:
        record = scrape_scorecard(url)
        record.update(scrape_venue_details(url))
        rows.append(record)
        print(f"Scraped data for {url}")
    return pd.DataFrame(rows)
# Match pages to scrape; each entry is passed to scrape_multiple_matches below.
match_urls = [
    "https://www.espncricinfo.com/series/ipl-2023-1345038/chennai-super-kings-vs-gujarat-titans-final-1370353/full-scorecard",
]
# Scrape every listed match into one DataFrame (one row per URL) and preview it.
df = scrape_multiple_matches(match_urls)
print(df.head())

Error scraping data: 'NoneType' object has no attribute 'text'
Scraped data for https://www.espncricinfo.com/series/ipl-2023-1345038/chennai-super-kings-vs-gujarat-titans-final-1370353/full-scorecard
  match_status match_winning_team match_tie_breaker match_toss umpires  \
0         None               None              None       None    None   

  match_referee third_umpires match_datetime team1_name team2_name  ...  \
0          None          None           None       None       None  ...   

  team1_bench team1_support_staff team2_captain team2_players team2_bench  \
0        None                None          None          None        None   

  team2_support_staff match_venue_stadium match_venue_city  \
0                None                None             None   

  match_venue_capacity match_venue_host_teams  
0                 None                   None  

[1 rows x 26 columns]
