In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time

class CricketScraper:
    def __init__(self):
        self.base_url = "https://www.espncricinfo.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15'
        }

    def get_match_links(self, tournament_id: str) -> List[str]:
        """Collect all match summary links from the tournament page"""
        url = f'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
        
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all rows in the match results table
            match_rows = soup.select(table.ds-w-full ds-table ds-table-xs ds-table-auto ds-w-full ds-overflow-scroll ds-scrollbar-hide > tbody > tr.ds-bg-fill-content-alternate ds-text-left')
            
            # Extract match links
            match_links = []
            for row in match_rows:
                link = row.select_one('td:nth-child(7) a')
                if link and link.get('href'):
                    full_url = self.base_url + link['href']
                    match_links.append(full_url)
            
            return match_links
        
        except requests.exceptions.RequestException as e:
            print(f"Error fetching match links: {e}")
            return []

    def get_match_details(self, match_url: str) -> List[Dict]:
        """Extract batting statistics from a single match page"""
        try:
            response = requests.get(match_url, headers=self.headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find match details
            match_details = soup.find_all('div', string="Match Details")
            if not match_details:
                return []
            
            # Get team names
            innings_headers = soup.select('div.ds-text-tight-m')
            team1 = innings_headers[0].get_text().replace(' Innings', '').strip()
            team2 = innings_headers[1].get_text().replace(' Innings', '').strip()
            match_info = f'{team1} Vs {team2}'
            
            # Get scorecard tables
            tables = soup.select('table.ci-scorecard-table')
            if len(tables) < 2:
                return []
            
            batting_summary = []
            
            # Process both innings
            for inning, table in enumerate(tables[:2]):
                team = team1 if inning == 0 else team2
                rows = table.select('tbody > tr')
                
                batting_pos = 1
                for row in rows:
                    # Check if it's a batting row (has 8 or more columns)
                    cells = row.select('td')
                    if len(cells) >= 8:
                        try:
                            batting_summary.append({
                                'match': match_info,
                                'teamInnings': team,
                                'battingPos': batting_pos,
                                'batsmanName': cells[0].select_one('a span').get_text().replace(' ', ''),
                                'dismissal': cells[1].select_one('span').get_text().strip(),
                                'runs': cells[2].select_one('strong').get_text(),
                                'balls': cells[3].get_text(),
                                '4s': cells[5].get_text(),
                                '6s': cells[6].get_text(),
                                'SR': cells[7].get_text()
                            })
                            batting_pos += 1
                        except (AttributeError, IndexError) as e:
                            print(f"Error processing row: {e}")
                            continue
            
            return batting_summary
        
        except requests.exceptions.RequestException as e:
            print(f"Error processing match {match_url}: {e}")
            return []

    def scrape_tournament(self, tournament_id: str) -> pd.DataFrame:
        """Scrape entire tournament and return data as DataFrame"""
        match_links = self.get_match_links(tournament_id)
        all_batting_data = []
        
        for link in match_links:
            print(f"Processing match: {link}")
            match_data = self.get_match_details(link)
            all_batting_data.extend(match_data)
            # Be nice to the server
            time.sleep(1)
        
        # Convert to DataFrame
        return pd.DataFrame(all_batting_data)

# Usage example
if __name__ == "__main__":
    scraper = CricketScraper()
    # Example tournament ID
    tournament_id = "14450"
    
    # Scrape the tournament
    df = scraper.scrape_tournament(tournament_id)
    
    # Save to CSV
    df.to_csv('cricket_stats.csv', index=False)
    print("Scraping completed! Data saved to cricket_stats.csv")

Scraping completed! Data saved to cricket_stats.csv


In [5]:
df

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_match_links():
    url = "https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    links = []
    rows = soup.select('table.engineTable > tbody > tr.data1')
    for row in rows:
        link = "https://www.espncricinfo.com" + row.select_one('td:nth-child(7) a')['href']
        links.append(link)
    return links

def parse_match_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Get match details
    match_div = soup.find('div', string=lambda text: text and "Match Details" in text).parent.parent.parent
    innings_divs = match_div.find_next_siblings('div')[:2]
    team1 = innings_divs[0].find('span').text.replace(" Innings", "")
    team2 = innings_divs[1].find('span').text.replace(" Innings", "")
    match_info = f"{team1} Vs {team2}"
    
    batting_summary = []
    tables = soup.select('div > table.ci-scorecard-table')
    
    # Process both innings
    for inning, team in enumerate([team1, team2]):
        rows = tables[inning].select('tbody > tr')
        batting_pos = 1
        
        for row in rows:
            cols = row.select('td')
            if len(cols) >= 8:
                batting_summary.append({
                    "match": match_info,
                    "teamInnings": team,
                    "battingPos": batting_pos,
                    "batsmanName": cols[0].select_one('a > span > span').text.replace(' ', ''),
                    "dismissal": cols[1].select_one('span > span').text,
                    "runs": cols[2].select_one('strong').text,
                    "balls": cols[3].text,
                    "4s": cols[5].text,
                    "6s": cols[6].text,
                    "SR": cols[7].text
                })
                batting_pos += 1
    
    return batting_summary

def main():
    all_batting_data = []
    match_links = get_match_links()
    
    for link in match_links:
        try:
            match_data = parse_match_data(link)
            all_batting_data.extend(match_data)
        except Exception as e:
            print(f"Error processing {link}: {str(e)}")
    
    # Convert to DataFrame and save
    df = pd.DataFrame(all_batting_data)
    df.to_csv('cricket_batting_stats.csv', index=False)
    return df

if __name__ == "__main__":
    main()

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import List, Dict
import time
from datetime import datetime

class CricketScraper:
    def __init__(self):
        self.base_url = "https://www.espncricinfo.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15'
        }

    def get_match_links(self, tournament_id: str) -> List[str]:
        url = f'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id={tournament_id};type=tournament'
        
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            match_rows = soup.select('table.engineTable > tbody > tr.data1')
            match_links = []
            for row in match_rows:
                link = row.select_one('td:nth-child(7) a')
                if link and link.get('href'):
                    full_url = self.base_url + link['href']
                    match_links.append(full_url)
            
            return match_links
        
        except requests.exceptions.RequestException as e:
            print(f"Error fetching match links: {e}")
            return []

    def get_match_details(self, match_url: str) -> List[Dict]:
        try:
            response = requests.get(match_url, headers=self.headers)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            
            match_details = soup.find_all('div', string="Match Details")
            if not match_details:
                return []
            
            innings_headers = soup.select('div.ds-text-tight-m')
            team1 = innings_headers[0].get_text().replace(' Innings', '').strip()
            team2 = innings_headers[1].get_text().replace(' Innings', '').strip()
            match_info = f'{team1} Vs {team2}'
            
            tables = soup.select('table.ci-scorecard-table')
            if len(tables) < 2:
                return []
            
            batting_summary = []
            
            for inning, table in enumerate(tables[:2]):
                team = team1 if inning == 0 else team2
                rows = table.select('tbody > tr')
                
                batting_pos = 1
                for row in rows:
                    cells = row.select('td')
                    if len(cells) >= 8:
                        try:
                            batting_summary.append({
                                'match': match_info,
                                'teamInnings': team,
                                'battingPos': batting_pos,
                                'batsmanName': cells[0].select_one('a span').get_text().replace(' ', ''),
                                'dismissal': cells[1].select_one('span').get_text().strip(),
                                'runs': cells[2].select_one('strong').get_text(),
                                'balls': cells[3].get_text(),
                                '4s': cells[5].get_text(),
                                '6s': cells[6].get_text(),
                                'SR': cells[7].get_text(),
                                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                                'source_url': match_url
                            })
                            batting_pos += 1
                        except (AttributeError, IndexError) as e:
                            print(f"Error processing row: {e}")
                            continue
            
            return batting_summary
        
        except requests.exceptions.RequestException as e:
            print(f"Error processing match {match_url}: {e}")
            return []

    def scrape_tournament(self, tournament_id: str) -> pd.DataFrame:
        match_links = self.get_match_links(tournament_id)
        all_batting_data = []
        
        for link in match_links:
            print(f"Processing match: {link}")
            match_data = self.get_match_details(link)
            all_batting_data.extend(match_data)
            time.sleep(1)
        
        return pd.DataFrame(all_batting_data)

def save_cricket_data():
    scraper = CricketScraper()
    tournament_id = "14450"
    df = scraper.scrape_tournament(tournament_id)
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"cricket_stats_{timestamp}.csv"
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")


if __name__ == "__main__":
    save_cricket_data()

Data saved to cricket_stats_20241226_232250.csv


In [8]:

# Read the CSV file
df = pd.read_csv('cricket_stats_20241226_232250.csv')

# Display the first few rows and basic information
print("DataFrame Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nColumns:", list(df.columns))


EmptyDataError: No columns to parse from file

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

def scrape_cricket_data():
    url = "https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament"
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    match_rows = soup.select('table.engineTable > tbody > tr.data1')
    all_batting_data = []
    
    for row in match_rows:
        match_url = "https://www.espncricinfo.com" + row.select_one('td:nth-child(7) a')['href']
        print(f"Processing: {match_url}")
        
        match_response = requests.get(match_url, headers=headers)
        match_soup = BeautifulSoup(match_response.content, 'html.parser')
        
        innings = match_soup.select('div.ds-text-tight-m')
        if len(innings) < 2:
            continue
            
        team1 = innings[0].text.replace(' Innings', '').strip()
        team2 = innings[1].text.replace(' Innings', '').strip()
        match_info = f'{team1} Vs {team2}'
        
        tables = match_soup.select('table.ci-scorecard-table')
        for inning, table in enumerate(tables[:2]):
            team = team1 if inning == 0 else team2
            rows = table.select('tbody > tr')
            
            batting_pos = 1
            for player_row in rows:
                cells = player_row.select('td')
                if len(cells) >= 8:
                    all_batting_data.append({
                        'match': match_info,
                        'teamInnings': team,
                        'battingPos': batting_pos,
                        'batsmanName': cells[0].select_one('a > span').text.strip(),
                        'dismissal': cells[1].select_one('span').text.strip(),
                        'runs': cells[2].select_one('strong').text.strip(),
                        'balls': cells[3].text.strip(),
                        '4s': cells[5].text.strip(),
                        '6s': cells[6].text.strip(),
                        'SR': cells[7].text.strip()
                    })
                    batting_pos += 1
        time.sleep(1)
    
    df = pd.DataFrame(all_batting_data)
    filename = f"cricket_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    df.to_csv(filename, index=False)
    print(f"Data saved to {filename}")
    return df

if __name__ == "__main__":
    df = scrape_cricket_data()

Data saved to cricket_stats_20241226_232711.csv


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_cricket_stats():
    # URL for the cricket statistics
    url = 'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament'
    
    # Send HTTP request and get the page content
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the target table
    table = soup.find('table', class_='engineTable')
    if not table:
        print("Could not find the target table")
        return None
    
    # Initialize list to store match data
    match_summary = []
    
    # Find all data rows (excluding header)
    rows = table.find_all('tr', class_='data1')
    
    # Extract data from each row
    for row in rows:
        cells = row.find_all('td')
        if len(cells) >= 7:  # Ensure we have all required cells
            match_data = {
                'team1': cells[0].get_text(strip=True),
                'team2': cells[1].get_text(strip=True),
                'winner': cells[2].get_text(strip=True),
                'margin': cells[3].get_text(strip=True),
                'ground': cells[4].get_text(strip=True),
                'matchDate': cells[5].get_text(strip=True),
                'scorecard': cells[6].get_text(strip=True)
            }
            match_summary.append(match_data)
    
    return {
        "matchSummary": match_summary
    }

def main():
    # Scrape the data
    result = scrape_cricket_stats()
    
    if result:
        # Convert to DataFrame for easier viewing/manipulation
        df = pd.DataFrame(result['matchSummary'])
        print("Successfully scraped match data:")
        print(df)
        
        # Optionally save to CSV
        df.to_csv('cricket_matches.csv', index=False)
        print("\nData has been saved to 'cricket_matches.csv'")

if __name__ == "__main__":
    main()

Error fetching the webpage: 403 Client Error: Forbidden for url: https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id=14450;type=tournament


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

class CricketScraper:
    def __init__(self):
        self.base_url = "https://www.espncricinfo.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15'
        }
    
    def scrape_cricket_stats(self, tournament_id="14450"):
        # Construct the URL for the cricket statistics
        url = f'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id={tournament_id};type=tournament'
        
        # Send HTTP request and get the page content
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()  # Raise an exception for bad status codes
        except requests.RequestException as e:
            print(f"Error fetching the webpage: {e}")
            return None

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the target table
        table = soup.find('table', class_='ds-w-full ds-table ds-table-xs ds-table-auto ds-w-full ds-overflow-scroll ds-scrollbar-hide')
        if not table:
            print("Could not find the target table")
            return None
        
        # Initialize list to store match data
        match_summary = []
        
        # Find all data rows (excluding header)
        rows = table.find_all('tr', class_='$0')
        
        # Extract data from each row
        for row in rows:
            cells = row.find_all('td')
            if len(cells) >= 7:  # Ensure we have all required cells
                match_data = {
                    'team1': cells[0].get_text(strip=True),
                    'team2': cells[1].get_text(strip=True),
                    'winner': cells[2].get_text(strip=True),
                    'margin': cells[3].get_text(strip=True),
                    'ground': cells[4].get_text(strip=True),
                    'matchDate': cells[5].get_text(strip=True),
                    'scorecard': cells[6].get_text(strip=True)
                }
                match_summary.append(match_data)
        
        return {
            "matchSummary": match_summary
        }

    def to_dataframe(self, data):
        """Convert the match summary data to a pandas DataFrame"""
        if data and 'matchSummary' in data:
            return pd.DataFrame(data['matchSummary'])
        return pd.DataFrame()

    def save_to_csv(self, data, filename='cricket_matches.csv'):
        """Save the match summary data to a CSV file"""
        df = self.to_dataframe(data)
        if not df.empty:
            df.to_csv(filename, index=False)
            print(f"\nData has been saved to '{filename}'")
        else:
            print("No data to save")

def main():
    # Create scraper instance
    scraper = CricketScraper()
    
    # Scrape the data
    result = scraper.scrape_cricket_stats()
    
    if result:
        # Convert to DataFrame and display
        df = scraper.to_dataframe(result)
        print("Successfully scraped match data:")
        print(df)
        
        # Save to CSV
        scraper.save_to_csv(result)

if __name__ == "__main__":
    main()

SyntaxError: invalid syntax (2838398622.py, line 37)

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

class CricketScraper:
    def __init__(self):
        self.base_url = "https://www.espncricinfo.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15'
        }
    
    def scrape_cricket_stats(self, tournament_id="14450"):
        url = f'https://stats.espncricinfo.com/ci/engine/records/team/match_results.html?id={tournament_id};type=tournament'
        
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching the webpage: {e}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        
        table = soup.find('table', class_='ds-table')
        if not table:
            print("Could not find the target table")
            return None
        
        match_summary = []
        
        # First get rows with ds-bg-ui-fill-translucent class (first 3 rows)
        filled_rows = table.find_all('tr', class_='ds-bg-ui-fill-translucent')
        # Then get rows with empty class
        empty_rows = table.find_all('tr', class_='')
        
        # Combine all rows
        all_rows = filled_rows + empty_rows
        
        for row in all_rows:
            # Find all td elements with ds-min-w-max class
            cells = row.find_all('td', class_='ds-min-w-max')
            if len(cells) >= 7:
                match_data = {
                    'team1': cells[0].get_text(strip=True),
                    'team2': cells[1].get_text(strip=True),
                    'winner': cells[2].get_text(strip=True),
                    'margin': cells[3].get_text(strip=True),
                    'ground': cells[4].get_text(strip=True),
                    'matchDate': cells[5].get_text(strip=True),
                    'scorecard': cells[6].get_text(strip=True)
                }
                match_summary.append(match_data)
        
        return {
            "matchSummary": match_summary
        }

    def to_dataframe(self, data):
        if data and 'matchSummary' in data:
            return pd.DataFrame(data['matchSummary'])
        return pd.DataFrame()

    def save_to_csv(self, data, filename='cricket_matches.csv'):
        df = self.to_dataframe(data)
        if not df.empty:
            df.to_csv(filename, index=False)
            print(f"\nData has been saved to '{filename}'")
        else:
            print("No data to save")

def main():
    scraper = CricketScraper()
    result = scraper.scrape_cricket_stats()
    
    if result:
        df = scraper.to_dataframe(result)
        print("Successfully scraped match data:")
        print(df)
        scraper.save_to_csv(result)

if __name__ == "__main__":
    main()

Successfully scraped match data:
           team1         team2        winner      margin     ground  \
0        England      Pakistan       England   5 wickets  Melbourne   
1        England         India       England  10 wickets   Adelaide   
2    New Zealand      Pakistan      Pakistan   7 wickets     Sydney   
3          India      Zimbabwe         India     71 runs  Melbourne   
4     Bangladesh      Pakistan      Pakistan   5 wickets   Adelaide   
5    Netherlands  South Africa   Netherlands     13 runs   Adelaide   
6         Team 1        Team 2        Winner      Margin     Ground   
7        England     Sri Lanka       England   4 wickets     Sydney   
8      Australia   Afghanistan     Australia      4 runs   Adelaide   
9        Ireland   New Zealand   New Zealand     35 runs   Adelaide   
10      Pakistan  South Africa      Pakistan     33 runs     Sydney   
11    Bangladesh         India         India      5 runs   Adelaide   
12   Netherlands      Zimbabwe   Netherlands