In [2]:
import requests
import pandas as pd
import json
from io import StringIO

In [None]:
def get_teams_from_api(team_type):
    """
    Fetches cricket team data for a given team type from the Cricbuzz RapidAPI.

    The API key is read from the RAPIDAPI_KEY environment variable when it is
    set; otherwise the key embedded below is used.

    Args:
        team_type (str): The type of cricket teams to fetch (e.g., 'international').

    Returns:
        list: The value stored under the response's 'list' key (an empty list
        when that key is absent), or None if the request fails, the body is
        not valid JSON, or the decoded payload is not a JSON object.
    """
    import os  # local import: this cell's import block does not bring in os

    url = f"https://cricbuzz-cricket.p.rapidapi.com/teams/v1/{team_type}"

    # NOTE(review): a credential is committed in source here. Prefer exporting
    # RAPIDAPI_KEY; the literal is kept only as a backward-compatible fallback.
    api_key = os.environ.get(
        "RAPIDAPI_KEY",
        "0f0637916emsh7de39c796d105c9p1a18d5jsn6321acab484a",
    )
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com",
    }

    try:
        # Without a timeout, requests can wait on a stalled connection forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for 4xx / 5xx responses
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {team_type}: {e}")
        return None
    except ValueError as e:
        # json.JSONDecodeError is a ValueError subclass, so this also covers
        # decode failures that are not raised as a RequestException.
        print(f"Error decoding JSON for {team_type}: {e}")
        return None

    if not isinstance(data, dict):
        # A JSON array/scalar has no 'list' key to read.
        print(f"Error fetching data for {team_type}: unexpected response payload.")
        return None

    return data.get('list', [])

def create_csv_from_teams(teams, file_prefix):
    """
    Write the usable entries of a team list to '<file_prefix>_teams.csv'.

    An entry is usable only when it carries all of 'teamId', 'teamName' and
    'teamSName'; every other entry is dropped. When nothing usable remains, a
    message is printed and no file is written.

    Args:
        teams (list): A list of dictionaries, where each dictionary represents a team.
        file_prefix (str): The prefix for the output CSV file (e.g., 'international').
    """
    wanted_columns = ['teamId', 'teamName', 'teamSName']

    # Keep an entry only if none of the wanted columns is missing from it.
    usable = [
        entry for entry in teams
        if not any(column not in entry for column in wanted_columns)
    ]

    if not usable:
        print(f"No valid team data found for {file_prefix}.")
        return

    # Build the table and narrow it to the wanted columns, in that order.
    table = pd.DataFrame(usable)[wanted_columns]

    destination = f"{file_prefix}_teams.csv"
    table.to_csv(destination, index=False)

    print(f"Successfully created '{destination}' with {len(table)} teams.")


def main():
    """
    Fetch each supported team type from the API and write one CSV per type.

    For every entry in the hard-coded list of team types, the teams are
    fetched with get_teams_from_api and passed to create_csv_from_teams.
    A type is skipped only when the fetch reports failure by returning None.
    """
    team_types = ['international', 'league', 'domestic', 'women']

    print("Starting data fetching and CSV creation process...")

    for team_type in team_types:
        print(f"\nProcessing team type: '{team_type}'")

        teams_data = get_teams_from_api(team_type)

        # None is the fetcher's failure signal. An empty list is a successful
        # but empty response, so it is handed on and reported by the CSV
        # writer as "no valid team data" rather than mislabelled as an error.
        if teams_data is None:
            print(f"Skipping CSV creation for {team_type} due to an error.")
            continue

        create_csv_from_teams(teams_data, team_type)

    print("\nProcess completed.")


if __name__ == "__main__":
    main()


In [None]:
import requests
import csv
from datetime import datetime

# Define the list of series types to fetch
series_types = ['international', 'league', 'domestic', 'women']

# Base URL and headers for the API request
# NOTE(review): a credential is committed in source here; consider loading it
# from the environment instead.
base_url = "https://cricbuzz-cricket.p.rapidapi.com/series/v1/"
headers = {
    "x-rapidapi-key": "0f0637916emsh7de39c796d105c9p1a18d5jsn6321acab484a",
    "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com"
}


def _ms_timestamp_to_date(raw_value):
    """Convert an epoch-milliseconds value to 'YYYY-MM-DD', or None if unusable.

    The conversion uses datetime.fromtimestamp without a tz argument, i.e. the
    machine's local time zone. Falsy or unconvertible inputs yield None.
    """
    if not raw_value:
        return None
    try:
        return datetime.fromtimestamp(int(raw_value) / 1000).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError, OSError):
        # fromtimestamp raises OverflowError/OSError for out-of-range values;
        # treat those like any other invalid timestamp.
        return None


# List to store all series data from all types
all_series_data = []

print("Starting to fetch cricket series data...")

# Loop through each series type
for series_type in series_types:
    url = f"{base_url}{series_type}"
    print(f"\nFetching data for series type: '{series_type}' from URL: {url}")

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making the request for '{series_type}': {e}")
        continue

    # Anything other than HTTP 200 is reported and skipped.
    if response.status_code != 200:
        print(f"Error fetching data for '{series_type}'. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        continue

    try:
        data = response.json()
    except ValueError as e:
        # Covers JSON decode failures regardless of which exception class the
        # installed requests version raises for them.
        print(f"An error occurred while making the request for '{series_type}': {e}")
        continue

    # Count the series collected for the current type
    series_count = 0

    # The series data is nested inside 'seriesMapProto'; each entry may hold a
    # 'series' list. `or []` also tolerates explicit nulls in the payload.
    for series_map in data.get('seriesMapProto') or []:
        for series in series_map.get('series') or []:
            # startDt / endDt are handled as timestamps in milliseconds
            all_series_data.append({
                'id': series.get('id'),
                'name': series.get('name'),
                'start_date': _ms_timestamp_to_date(series.get('startDt')),
                'end_date': _ms_timestamp_to_date(series.get('endDt')),
                'series_type': series_type
            })
            series_count += 1

    print(f"Successfully fetched {series_count} series for '{series_type}'.")

print("\nData fetching complete.")

# Define the name of the output CSV file
output_filename = 'all_cricket_series.csv'

# Check if we have any data to write
if all_series_data:
    # Define the column names for the CSV file
    csv_headers = ['id', 'name', 'start_date', 'end_date', 'series_type']

    try:
        # newline='' lets the csv module control line endings itself
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
            writer.writeheader()
            writer.writerows(all_series_data)

        print(f"\nSuccessfully created and saved the file '{output_filename}'.")
        print(f"Total number of series written to the file: {len(all_series_data)}")

    except OSError as e:
        print(f"An I/O error occurred while writing the CSV file: {e}")
else:
    print("\nNo data was fetched to be written to a CSV file.")


In [None]:
import requests
import pandas as pd
import os

# Define the file paths
input_csv_file = 'all_cricket_series.csv'
output_csv_file = 'venue_details.csv'

# Set up the API headers.
# NOTE(review): a credential is committed in source here. The RAPIDAPI_KEY
# environment variable takes precedence when set; the literal remains only as
# a backward-compatible fallback.
headers = {
    "x-rapidapi-key": os.environ.get(
        "RAPIDAPI_KEY",
        "b87c7076d0msh0b7df52da025facp1e3f86jsn77fa64563887",
    ),
    "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com"
}

# Base URL for the API endpoint ('{}' is filled with a series ID)
base_url = "https://cricbuzz-cricket.p.rapidapi.com/series/v1/{}/venues"

# Check if the input CSV file exists
if not os.path.exists(input_csv_file):
    print(f"Error: The file '{input_csv_file}' was not found.")
else:
    try:
        # Read the CSV file into a pandas DataFrame
        print(f"Reading data from '{input_csv_file}'...")
        series_df = pd.read_csv(input_csv_file)

        # Venue rows collected across every series
        all_venues = []

        # Iterate over the 'id' column of the DataFrame
        for series_id in series_df['id']:
            print(f"Fetching venue data for series ID: {series_id}")
            api_url = base_url.format(series_id)

            try:
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(api_url, headers=headers, timeout=10)

                if response.status_code != 200:
                    print(f"Failed to fetch data for series ID {series_id}. Status code: {response.status_code}")
                    continue

                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decode failures, so one bad response
                # skips this series instead of aborting the whole run.
                print(f"An error occurred during the request for series ID {series_id}: {e}")
                continue

            venues = data.get('seriesVenue') if isinstance(data, dict) else None
            if not venues:
                print(f"No venue data found for series ID: {series_id}")
                continue

            # Keep only the fields of interest for each venue
            all_venues.extend(
                {
                    'id': venue.get('id'),
                    'ground': venue.get('ground'),
                    'city': venue.get('city'),
                    'country': venue.get('country')
                }
                for venue in venues
            )

        # Create a new DataFrame from the collected venue data
        if all_venues:
            venues_df = pd.DataFrame(all_venues)

            # Save the DataFrame to a new CSV file
            venues_df.to_csv(output_csv_file, index=False)
            print(f"\nSuccessfully saved all venue data to '{output_csv_file}'.")
        else:
            print("\nNo venue data was collected.")

    except pd.errors.EmptyDataError:
        print(f"Error: The file '{input_csv_file}' is empty.")
    except Exception as e:
        # Top-level boundary of this cell: report anything unexpected
        # (e.g. a missing 'id' column) rather than raising out of the cell.
        print(f"An unexpected error occurred: {e}")


In [None]:
import requests
import pandas as pd
import time
import os
from requests.exceptions import JSONDecodeError

# API request headers.
# NOTE(review): a credential is committed in source here. The RAPIDAPI_KEY
# environment variable takes precedence when set; the literal remains only as
# a backward-compatible fallback.
HEADERS = {
    "x-rapidapi-key": os.environ.get(
        "RAPIDAPI_KEY",
        "d08ad5ed80mshfa4f1be24cd6ec6p1e36b5jsn47875e9c26c2",
    ),
    "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com"
}

# Players endpoint; '{}' is filled with a team ID.
BASE_URL = "https://cricbuzz-cricket.p.rapidapi.com/teams/v1/{}/players"

# CSV files to read team data from (currently a single file).
CSV_FILES = [
    'all_teams.csv'
]

def get_teams_from_csvs(file_list, team_type='international'):
    """
    Reads team data from a list of CSV files and consolidates them,
    keeping only rows whose 'team_type' column equals `team_type`.

    Files that are missing, unreadable, or lack any of the 'teamId',
    'teamName' and 'team_type' columns are reported and skipped.

    Args:
        file_list (list): A list of CSV file paths.
        team_type (str): The category to keep. Defaults to 'international'.

    Returns:
        list: A list of dictionaries, each containing 'teamId', 'teamName', and
        'team_type', with at most one entry per teamId (the last row seen for
        a given teamId wins).
    """
    required_columns = ['teamId', 'teamName', 'team_type']
    all_teams = []
    print("Reading team data from CSV files...")
    for file_name in file_list:
        try:
            df = pd.read_csv(file_name)
        except FileNotFoundError:
            print(f"File not found: {file_name}. Skipping...")
            continue
        except Exception as e:
            # Best-effort loader: any other read/parse problem skips the file.
            print(f"An error occurred while reading {file_name}: {e}")
            continue

        if not all(column in df.columns for column in required_columns):
            print(f"Skipping {file_name}: Missing 'teamId', 'teamName', or 'team_type' columns.")
            continue

        # Keep only rows of the requested category, narrowed to the needed columns
        matching = df.loc[df['team_type'] == team_type, required_columns]
        teams = matching.to_dict('records')
        all_teams.extend(teams)
        print(f"Successfully read and filtered {len(teams)} {team_type} teams from {file_name}")

    # Remove duplicates based on teamId (later rows overwrite earlier ones)
    unique_teams = list({team['teamId']: team for team in all_teams}.values())
    print(f"\nFound a total of {len(unique_teams)} unique {team_type} teams across all files.")
    return unique_teams

def fetch_players_for_team(team_id):
    """
    Request the player listing for one team from the Cricbuzz API.

    The endpoint is BASE_URL with the team ID substituted in; the request is
    sent with HEADERS and a 10-second timeout.

    Args:
        team_id (int or str): The ID of the team.

    Returns:
        dict: The decoded JSON body, or None when the request fails, the
        server answers with an error status, or the body is not valid JSON.
    """
    endpoint = BASE_URL.format(team_id)
    try:
        reply = requests.get(endpoint, headers=HEADERS, timeout=10)
        reply.raise_for_status()  # error statuses become HTTPError
        payload = reply.json()
    except JSONDecodeError:
        # Checked before RequestException so a non-JSON body gets its own message.
        print(f"Error fetching data for team ID {team_id}: Invalid JSON response. The API might be returning an error page or no data.")
        payload = None
    except requests.exceptions.RequestException as err:
        # Every other request-level failure (connection, timeout, HTTP status).
        print(f"Error fetching data for team ID {team_id}: {err}")
        payload = None
    return payload

def process_players_data(players_list, team_name):
    """
    Flatten the API's player array into one record per player.

    Entries without an 'id' but with a 'name' are treated as role headers:
    their name becomes the role attached to the player entries that follow.
    Entries with an 'id' are players. Anything else is ignored.

    Args:
        players_list (list): The 'player' array from the API response.
        team_name (str): The name of the team.

    Returns:
        list: A list of formatted player dictionaries (empty when
        players_list is empty or None).
    """
    formatted = []
    active_role = None  # role header most recently seen, if any

    for entry in players_list or []:
        if 'id' in entry:
            # A player row: copy the fields of interest and tag role and team.
            formatted.append({
                'id': entry.get('id'),
                'name': entry.get('name'),
                'battingStyle': entry.get('battingStyle', ''),
                'bowlingStyle': entry.get('bowlingStyle', ''),
                'role': active_role,
                'team_name': team_name
            })
        elif 'name' in entry:
            # No 'id' but a 'name': this is a role header.
            active_role = entry['name']

    return formatted

def main():
    """
    Drive the run: load teams, pull each roster from the API, write one CSV.

    Teams come from get_teams_from_csvs(CSV_FILES). For each team the player
    payload is fetched and flattened, and all records are written to
    'cricket_player_data.csv'. Nothing is written when no records were collected.
    """
    teams = get_teams_from_csvs(CSV_FILES)
    collected = []

    print("\nStarting to fetch player data from API...")
    for entry in teams:
        team_id, team_name = entry['teamId'], entry['teamName']
        print(f"Fetching players for Team: {team_name} (ID: {team_id})...")

        payload = fetch_players_for_team(team_id)

        if not payload or 'player' not in payload:
            print(f"  -> No player data found or error for {team_name}.")
        else:
            roster = process_players_data(payload['player'], team_name)
            collected.extend(roster)
            print(f"  -> Found {len(roster)} players for {team_name}.")

        # Pause between teams to stay under the API's rate limit
        time.sleep(1)

    if not collected:
        print("\nNo player data was collected. The output file will not be created.")
        return

    # Everything goes into a single CSV file
    output_file = "cricket_player_data.csv"
    pd.DataFrame(collected).to_csv(output_file, index=False)
    print(f"\nData collection complete. Total players found: {len(collected)}")
    print(f"All data has been saved to '{output_file}'.")

if __name__ == "__main__":
    main()

In [None]:
import requests
import pandas as pd
import time
import os
from requests.exceptions import JSONDecodeError

# API request headers.
# NOTE(review): a credential is committed in source here. The RAPIDAPI_KEY
# environment variable takes precedence when set; the literal remains only as
# a backward-compatible fallback.
HEADERS = {
    "x-rapidapi-key": os.environ.get(
        "RAPIDAPI_KEY",
        "d08ad5ed80mshfa4f1be24cd6ec6p1e36b5jsn47875e9c26c2",
    ),
    "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com"
}

# Players endpoint; '{}' is filled with a team ID.
BASE_URL = "https://cricbuzz-cricket.p.rapidapi.com/teams/v1/{}/players"

# CSV files to read team data from (currently a single file).
CSV_FILES = [
    'all_teams.csv'
]

def get_teams_from_csvs(file_list, team_type='league'):
    """
    Reads team data from a list of CSV files and consolidates them,
    keeping only rows whose 'team_type' column equals `team_type`.

    Files that are missing, unreadable, or lack any of the 'teamId',
    'teamName' and 'team_type' columns are reported and skipped.

    Args:
        file_list (list): A list of CSV file paths.
        team_type (str): The category to keep. Defaults to 'league'.

    Returns:
        list: A list of dictionaries, each containing 'teamId', 'teamName', and
        'team_type', with at most one entry per teamId (the last row seen for
        a given teamId wins).
    """
    required_columns = ['teamId', 'teamName', 'team_type']
    all_teams = []
    print("Reading team data from CSV files...")
    for file_name in file_list:
        try:
            df = pd.read_csv(file_name)
        except FileNotFoundError:
            print(f"File not found: {file_name}. Skipping...")
            continue
        except Exception as e:
            # Best-effort loader: any other read/parse problem skips the file.
            print(f"An error occurred while reading {file_name}: {e}")
            continue

        if not all(column in df.columns for column in required_columns):
            print(f"Skipping {file_name}: Missing 'teamId', 'teamName', or 'team_type' columns.")
            continue

        # Keep only rows of the requested category, narrowed to the needed columns
        selected = df.loc[df['team_type'] == team_type, required_columns]
        teams = selected.to_dict('records')
        all_teams.extend(teams)
        print(f"Successfully read and filtered {len(teams)} {team_type} teams from {file_name}")

    # Remove duplicates based on teamId (later rows overwrite earlier ones)
    unique_teams = list({team['teamId']: team for team in all_teams}.values())
    print(f"\nFound a total of {len(unique_teams)} unique {team_type} teams across all files.")
    return unique_teams

def fetch_players_for_team(team_id):
    """
    Request the player listing for one team from the Cricbuzz API.

    The endpoint is BASE_URL with the team ID substituted in; the request is
    sent with HEADERS and a 10-second timeout.

    Args:
        team_id (int or str): The ID of the team.

    Returns:
        dict: The decoded JSON body, or None when the request fails, the
        server answers with an error status, or the body is not valid JSON.
    """
    result = None
    target = BASE_URL.format(team_id)
    try:
        resp = requests.get(target, headers=HEADERS, timeout=10)
        resp.raise_for_status()  # error statuses become HTTPError
        result = resp.json()
    except JSONDecodeError:
        # Checked before RequestException so a non-JSON body gets its own message.
        print(f"Error fetching data for team ID {team_id}: Invalid JSON response. The API might be returning an error page or no data.")
    except requests.exceptions.RequestException as exc:
        # Every other request-level failure (connection, timeout, HTTP status).
        print(f"Error fetching data for team ID {team_id}: {exc}")
    return result

def process_players_data(players_list, team_name):
    """
    Flatten the API's player array into one record per player.

    Entries without an 'id' but with a 'name' are treated as role headers:
    their name becomes the role attached to the player entries that follow.
    Entries with an 'id' are players. Anything else is ignored.

    Args:
        players_list (list): The 'player' array from the API response.
        team_name (str): The name of the team.

    Returns:
        list: A list of formatted player dictionaries (empty when
        players_list is empty or None).
    """
    if not players_list:
        return []

    records = []
    role_in_effect = None  # role header most recently seen, if any

    for item in players_list:
        is_player = 'id' in item
        if is_player:
            record = {
                'id': item.get('id'),
                'name': item.get('name'),
                'battingStyle': item.get('battingStyle', ''),
                'bowlingStyle': item.get('bowlingStyle', ''),
                'role': role_in_effect,
                'team_name': team_name
            }
            records.append(record)
            continue
        if 'name' in item:
            # No 'id' but a 'name': this is a role header.
            role_in_effect = item['name']

    return records

def main():
    """
    Drive the run: load teams, pull each roster from the API, write one CSV.

    Teams come from get_teams_from_csvs(CSV_FILES). For each team the player
    payload is fetched and flattened, and all records are written to
    'cricket_player_league_data.csv'. Nothing is written when no records were
    collected.
    """
    teams = get_teams_from_csvs(CSV_FILES)
    gathered = []

    print("\nStarting to fetch player data from API...")
    for row in teams:
        team_id, team_name = row['teamId'], row['teamName']
        print(f"Fetching players for Team: {team_name} (ID: {team_id})...")

        response_data = fetch_players_for_team(team_id)

        if not response_data or 'player' not in response_data:
            print(f"  -> No player data found or error for {team_name}.")
        else:
            squad = process_players_data(response_data['player'], team_name)
            gathered.extend(squad)
            print(f"  -> Found {len(squad)} players for {team_name}.")

        # Pause between teams to stay under the API's rate limit
        time.sleep(1)

    if not gathered:
        print("\nNo player data was collected. The output file will not be created.")
        return

    # Everything goes into a single CSV file
    output_file = "cricket_player_league_data.csv"
    pd.DataFrame(gathered).to_csv(output_file, index=False)
    print(f"\nData collection complete. Total players found: {len(gathered)}")
    print(f"All data has been saved to '{output_file}'.")

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import requests
from requests.exceptions import RequestException
import time

# This function reads the venue_details.csv and fetches venue data
# from the Cricbuzz API for each unique venue ID.
def get_venue_details(api_key, api_host, csv_path="venue_details.csv", timeout=10):
    """
    Fetches detailed venue information from the Cricbuzz API.

    Venue IDs are read from the 'id' column of `csv_path`; each distinct ID is
    requested once, with a one-second pause after every request.

    Args:
        api_key (str): The RapidAPI key for authentication.
        api_host (str): The RapidAPI host for the Cricbuzz API.
        csv_path (str): CSV file holding the venue IDs. Defaults to
            'venue_details.csv' in the current directory.
        timeout (float): Per-request timeout in seconds. Defaults to 10.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              the detailed information for a venue. Empty when the CSV is
              missing, empty, or has no 'id' column. Venues whose request
              fails are reported and left out.
    """
    try:
        df_venues = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: '{csv_path}' not found. Please ensure the file is in the correct path.")
        return []
    except pd.errors.EmptyDataError:
        print(f"Error: '{csv_path}' is empty.")
        return []

    if 'id' not in df_venues.columns:
        print(f"Error: '{csv_path}' has no 'id' column.")
        return []

    # Unique IDs only, to avoid redundant API calls.
    venue_ids = df_venues['id'].unique()

    # Define the API headers for authentication.
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": api_host
    }

    venue_info_list = []

    for venue_id in venue_ids:
        url = f"https://cricbuzz-cricket.p.rapidapi.com/venues/v1/{venue_id}"

        print(f"Fetching details for venue ID: {venue_id}...")

        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, headers=headers, timeout=timeout)

            # Raise an exception for bad status codes (4xx or 5xx).
            response.raise_for_status()

            venue_data = response.json()
        except (RequestException, ValueError) as e:
            # ValueError covers JSON decode failures that are not raised as a
            # RequestException by the installed requests version.
            print(f"Error fetching data for venue ID {venue_id}: {e}")
        else:
            if isinstance(venue_data, dict):
                # Missing keys come back as None via .get().
                venue_info_list.append({
                    "id": venue_id,
                    "ground": venue_data.get("ground"),
                    "city": venue_data.get("city"),
                    "country": venue_data.get("country"),
                    "timezone": venue_data.get("timezone"),
                    "capacity": venue_data.get("capacity"),
                    "ends": venue_data.get("ends"),
                    "homeTeam": venue_data.get("homeTeam")
                })
            else:
                print(f"Error fetching data for venue ID {venue_id}: unexpected response payload.")

        # Add a delay to avoid hitting the API rate limit.
        time.sleep(1)

    return venue_info_list

# Main execution block.
if __name__ == "__main__":
    import os  # local import: this cell's import block does not bring in os

    # NOTE(review): a credential is committed in source here. The RAPIDAPI_KEY
    # environment variable takes precedence when set; the literal remains only
    # as a backward-compatible fallback.
    RAPIDAPI_KEY = os.environ.get(
        "RAPIDAPI_KEY",
        "d08ad5ed80mshfa4f1be24cd6ec6p1e36b5jsn47875e9c26c2",
    )
    RAPIDAPI_HOST = "cricbuzz-cricket.p.rapidapi.com"

    # Call the function to get all venue details.
    all_venue_details = get_venue_details(RAPIDAPI_KEY, RAPIDAPI_HOST)

    # Check if any data was retrieved.
    if all_venue_details:
        # Create a pandas DataFrame from the list of dictionaries.
        df_venue_info = pd.DataFrame(all_venue_details)

        # Save the DataFrame to a new CSV file.
        # index=False prevents pandas from writing the DataFrame index as a column.
        df_venue_info.to_csv("venue_info.csv", index=False)

        print("\nSuccessfully created 'venue_info.csv' with the extracted venue data.")
    else:
        print("\nNo venue data was retrieved. The 'venue_info.csv' file was not created.")


In [19]:
# Load international_teams.csv and order its rows by teamId.
international_teams = pd.read_csv('international_teams.csv').sort_values('teamId')

In [20]:
# Load domestic_teams.csv and order its rows by teamId.
domestic_teams = pd.read_csv('domestic_teams.csv').sort_values('teamId')

In [21]:
# Load league_teams.csv and order its rows by teamId.
league_teams = pd.read_csv('league_teams.csv').sort_values('teamId')

In [22]:
# Load women_teams.csv and order its rows by teamId.
women_teams = pd.read_csv('women_teams.csv').sort_values('teamId')

In [23]:
# Add a constant 'team_type' column to each per-category frame, naming the
# category that frame was loaded for.
international_teams['team_type'] = 'international'
domestic_teams['team_type'] = 'domestic'
league_teams['team_type'] = 'league'
women_teams['team_type'] = 'women'

In [34]:
# Stack the four per-category frames (domestic, league, women, international)
# into one frame; ignore_index=True replaces the original row labels with a
# fresh 0..n-1 index. The result is then written to all_teams.csv without the index.
all_teams = pd.concat([domestic_teams, league_teams, women_teams,international_teams],ignore_index=True)
all_teams.to_csv('all_teams.csv',index=False)

In [67]:
# Show the first 20 distinct (teamId, teamName) pairs among rows tagged 'league'.
all_teams[all_teams['team_type'] == 'league'][['teamId','teamName']].drop_duplicates().head(20)

Unnamed: 0,teamId,teamName
355,58,Chennai Super Kings
356,59,Royal Challengers Bengaluru
357,61,Delhi Capitals
358,62,Mumbai Indians
359,63,Kolkata Knight Riders
360,64,Rajasthan Royals
361,65,Punjab Kings
362,84,Trinidad and Tobago
363,88,Sialkot Stallions
364,89,Titans


In [8]:
# Load venue_details.csv into a DataFrame.
venue_details = pd.read_csv('venue_details.csv')

In [9]:
# Load all_cricket_series.csv into a DataFrame.
all_cricket_series = pd.read_csv('all_cricket_series.csv')

In [10]:
# Load all_teams.csv into a DataFrame.
all_teams = pd.read_csv('all_teams.csv')

In [11]:
# Load venue_info.csv into a DataFrame.
venue_info = pd.read_csv('venue_info.csv')

In [13]:
# Display the venue_info DataFrame as the cell's output.
venue_info

Unnamed: 0,id,ground,city,country,timezone,capacity,ends,homeTeam
0,458,Tribhuvan University International Cricket Ground,Kirtipur,Nepal,+05:45,20000,"Pavillion End, Chobar End",Nepal
1,153,Dubai International Cricket Stadium,Dubai,United Arab Emirates,+04:00,25000,"Emirates Road End, Dubai Sports City End",United Arab Emirates
2,1437791,Sportpark Duivesteijn,Voorburg,Netherlands,+02:00,,,
3,139,Maple Leaf North-West Ground,King City,Canada,-04:00,,"Northern End, Southern End",Canada
4,915,Prairie View Cricket Complex,Houston,United States,-05:00,10000,"Forest End, Highway End",United States of America
...,...,...,...,...,...,...,...,...
121,1438123,Albert Park 2,Suva,Fiji,+12:00,,,
122,27,M.Chinnaswamy Stadium,Bengaluru,India,+05:30,40000,"Pavilion End, BEML End","Karnataka, Royal Challengers Bengaluru"
123,256,Colombo Cricket Club Ground,Colombo,Sri Lanka,+05:30,6000,"Press Box End, Pavilion End",Colombo Cricket Club
124,132,Dr DY Patil Sports Academy,Navi Mumbai,India,+05:30,60000,"Media End, Pavilion End",


In [15]:
# Load all_teams.csv into a DataFrame.
all_teams = pd.read_csv('all_teams.csv')

In [8]:
# Load all_cricket_series.csv into a DataFrame.
all_cricket_series = pd.read_csv('all_cricket_series.csv')

In [9]:
# Display the all_cricket_series DataFrame as the cell's output.
all_cricket_series

Unnamed: 0,id,name,start_date,end_date,series_type
0,7572,ICC Cricket World Cup League Two 2023-27,2024-02-15,2027-03-30,international
1,10267,"Sri Lanka tour of Zimbabwe, 2025",2025-08-29,2025-09-07,international
2,10642,United Arab Emirates T20I Tri-Series 2025,2025-08-29,2025-09-07,international
3,8788,"South Africa tour of England, 2025",2025-09-02,2025-09-14,international
4,10812,Sweden tour of Isle Of Man 2025,2025-09-06,2025-09-07,international
...,...,...,...,...,...
75,10251,"South Africa Women tour of New Zealand, 2026",2026-03-15,2026-04-04,women
76,10526,New Zealand Women tour of England 2026,2026-05-10,2026-05-25,women
77,10543,India Women tour of England 2026,2026-05-28,2027-07-13,women
78,10119,ICC Womens T20 World Cup 2026,2026-06-12,2026-07-05,women


In [16]:
# Display the all_teams DataFrame as the cell's output.
all_teams

Unnamed: 0,teamId,teamName,teamSName,team_type
0,18,India Senior,INDSR,domestic
1,20,India Green,INDGREEN,domestic
2,21,India Red,INDRED,domestic
3,36,India Blue,INDBLUE,domestic
4,41,Sussex,SUS,domestic
...,...,...,...,...
896,343,Fiji,FIJI,international
897,527,Italy,ITA,international
898,529,Botswana,BW,international
899,541,Belgium,BEL,international


In [23]:
# Load cricket_player_data.csv into a DataFrame.
cricket_player_data = pd.read_csv('cricket_player_data.csv')

In [None]:
import requests
import pandas as pd
import time
import os
import json
from requests.exceptions import JSONDecodeError, RequestException

# --- Global Constants and Counters ---
# NOTE(review): a credential is committed in source here. The RAPIDAPI_KEY
# environment variable takes precedence when set; the literal remains only as
# a backward-compatible fallback.
HEADERS = {
    "x-rapidapi-key": os.environ.get(
        "RAPIDAPI_KEY",
        "d08ad5ed80mshfa4f1be24cd6ec6p1e36b5jsn47875e9c26c2",
    ),
    "x-rapidapi-host": "cricbuzz-cricket.p.rapidapi.com"
}
# Players endpoint; '{}' is filled with a team ID.
BASE_URL = "https://cricbuzz-cricket.p.rapidapi.com/teams/v1/{}/players"
# Running count of successful API calls made in this session.
API_CALL_COUNT = 0
# Upper bound on retry attempts for a single request.
MAX_RETRIES = 5
# Directory that holds one cached JSON file per team ID.
CACHE_DIR = "api_cache"
# CSV files to read team data from.
CSV_FILES = [
    'all_teams.csv'
]

# --- Cache Management Functions ---

def setup_cache_dir():
    """
    Ensures the cache directory (CACHE_DIR) exists.

    Uses exist_ok=True so the call is safe when the directory is already
    there, without a separate check-then-create step that another process
    could race against.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)

def get_cached_data(team_id, cache_dir=None):
    """
    Tries to retrieve data from the local cache.

    The cache entry for a team is the file '<team_id>.json' inside the cache
    directory. A missing entry is a normal cache miss; an entry that cannot be
    read or parsed is reported and also treated as a miss.

    Args:
        team_id (int or str): The ID of the team.
        cache_dir (str or None): Directory to look in. Defaults to CACHE_DIR.

    Returns:
        dict or None: The cached data if found and readable, otherwise None.
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR

    cache_path = os.path.join(cache_dir, f"{team_id}.json")
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as e:
        # json.JSONDecodeError is a ValueError: a truncated or corrupt cache
        # file should fall back to a fresh fetch, not crash the run.
        print(f"  -> Cached data for team {team_id} could not be read ({e}). Ignoring cache.")
        return None

    print(f"  -> Data for team {team_id} found in cache. Using cached data.")
    return data

def save_data_to_cache(team_id, data, cache_dir=None):
    """
    Saves API response data to the local cache.

    The data is written as indented JSON to '<team_id>.json' inside the cache
    directory, which is created if it does not exist yet.

    Args:
        team_id (int or str): The ID of the team.
        data (dict): The JSON data to save.
        cache_dir (str or None): Directory to write to. Defaults to CACHE_DIR.

    Raises:
        TypeError: If `data` is not JSON-serializable (raised before any file
            is touched).
    """
    if cache_dir is None:
        cache_dir = CACHE_DIR

    # Serialize first so a failure cannot leave a truncated cache file behind.
    serialized = json.dumps(data, indent=4)

    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{team_id}.json")
    with open(cache_path, 'w', encoding='utf-8') as f:
        f.write(serialized)
    print(f"  -> Data for team {team_id} saved to cache.")

# --- API & Data Handling Functions ---

def get_teams_from_csvs(file_list):
    """
    Collects the 'league' teams listed in one or more CSV files.

    Each file must provide the columns 'teamId', 'teamName' and 'team_type';
    files that are missing, unreadable, or lack those columns are reported and
    skipped. Teams are de-duplicated by 'teamId', keeping the last record seen
    for each ID.

    Args:
        file_list (list): A list of CSV file paths.

    Returns:
        list: A list of dictionaries, each containing 'teamId', 'teamName', and 'team_type'.
    """
    required_columns = ('teamId', 'teamName', 'team_type')
    collected = []
    print("Reading team data from CSV files...")
    for path in file_list:
        try:
            frame = pd.read_csv(path)
            if not all(column in frame.columns for column in required_columns):
                print(f"Skipping {path}: Missing 'teamId', 'teamName', or 'team_type' columns.")
                continue
            is_league = frame['team_type'] == 'league'
            records = frame.loc[is_league, list(required_columns)].to_dict('records')
            collected += records
            print(f"Successfully read and filtered {len(records)} league teams from {path}")
        except FileNotFoundError:
            print(f"File not found: {path}. Skipping...")
        except Exception as e:
            print(f"An error occurred while reading {path}: {e}")

    # Later records overwrite earlier ones that share a teamId.
    by_team_id = {}
    for record in collected:
        by_team_id[record['teamId']] = record
    unique_teams = list(by_team_id.values())
    print(f"\nFound a total of {len(unique_teams)} unique league teams across all files.")
    return unique_teams

def fetch_players_for_team(team_id, retries=0):
    """
    Requests the player list of one team from the Cricbuzz API.

    An HTTP 429 response is retried with exponential backoff (2 ** attempt
    seconds) until MAX_RETRIES attempts have been used. Any other HTTP error,
    a request failure, or an undecodable body is reported and yields None.

    Args:
        team_id (int or str): The ID of the team.
        retries (int): Number of retries already used when this call starts.

    Returns:
        dict: The JSON response data, or None if the request fails.
    """
    global API_CALL_COUNT
    endpoint = BASE_URL.format(team_id)
    attempt = retries

    # Each pass of the loop is one request attempt; only a 429 loops again.
    while True:
        try:
            reply = requests.get(endpoint, headers=HEADERS, timeout=10)
            reply.raise_for_status()
            API_CALL_COUNT += 1
            print(f"  -> API call successful. Total calls made this session: {API_CALL_COUNT}")
            return reply.json()
        except requests.exceptions.HTTPError as http_err:
            if http_err.response.status_code != 429:
                print(f"  -> HTTP error fetching data for team ID {team_id}: {http_err}")
                return None
            if attempt >= MAX_RETRIES:
                print("  -> Max retries reached for 429 error. Skipping this team.")
                return None
            # Exponential backoff (1s, 2s, 4s, etc.)
            delay = 2 ** attempt
            print(f"  -> Received 429 Too Many Requests. Waiting {delay}s before retry {attempt + 1}/{MAX_RETRIES}.")
            time.sleep(delay)
            attempt += 1
        except (JSONDecodeError, RequestException) as err:
            print(f"  -> An error occurred for team ID {team_id}: {err}")
            return None

def process_players_data(players_list, team_name):
    """
    Flattens the API's 'player' array into one record per player.

    Entries without an 'id' but with a 'name' act as section headings: their
    name becomes the role of every player entry that follows, until the next
    heading. Entries with an 'id' are emitted as player records.

    Args:
        players_list (list): The 'player' array from the API response.
        team_name (str): The name of the team.

    Returns:
        list: A list of formatted player dictionaries.
    """
    if not players_list:
        return []

    formatted = []
    active_role = None
    for entry in players_list:
        if 'id' in entry:
            formatted.append({
                'id': entry.get('id'),
                'name': entry.get('name'),
                'battingStyle': entry.get('battingStyle', ''),
                'bowlingStyle': entry.get('bowlingStyle', ''),
                'role': active_role,
                'team_name': team_name,
            })
            continue
        if 'name' in entry:
            # A heading row: remember it as the role for the players below.
            active_role = entry['name']

    return formatted

def main():
    """
    Builds 'cricket_player_league_data.csv' from the league teams in CSV_FILES.

    For every team the cached API response is used when available; otherwise
    the API is queried and a non-empty response is cached. The players of all
    teams are combined and written to a single CSV file.
    """
    setup_cache_dir()
    league_teams = get_teams_from_csvs(CSV_FILES)
    collected_players = []

    print("\nStarting to fetch player data from API or cache...")
    for team in league_teams:
        team_id, team_name = team['teamId'], team['teamName']
        print(f"Processing Team: {team_name} (ID: {team_id})...")

        # Cache first; fall back to the API and remember a successful answer.
        payload = get_cached_data(team_id)
        if payload is None:
            payload = fetch_players_for_team(team_id)
            if payload:
                save_data_to_cache(team_id, payload)

        if not payload or 'player' not in payload:
            print(f"  -> No player data found or error for {team_name}. Skipping.")
            continue

        roster = process_players_data(payload['player'], team_name)
        collected_players.extend(roster)
        print(f"  -> Found {len(roster)} players for {team_name}.")

    if not collected_players:
        print("\nNo player data was collected. The output file will not be created.")
        return

    output_file = "cricket_player_league_data.csv"
    pd.DataFrame(collected_players).to_csv(output_file, index=False)
    print(f"\nData collection complete. Total players found: {len(collected_players)}")
    print(f"All data has been saved to '{output_file}'.")

if __name__ == "__main__":
    main()


In [55]:
venue_info['timezone'].unique()

array(['+05:45', '+04:00', '+02:00', '-04:00', '-05:00', '+01:00',
       '+13:00', '+05:30', '+05:00', '+08:00', '+10:30', '+11:00',
       '+10:00', '+03:00', '+12:00'], dtype=object)

In [59]:
venue_info.columns

Index(['id', 'ground', 'city', 'country', 'timezone', 'capacity', 'ends',
       'homeTeam'],
      dtype='object')

In [60]:
venue_info['city'].unique()

array(['Kirtipur', 'Dubai', 'Voorburg', 'King City', 'Houston', 'Dallas',
       'Al Amerat', 'Lauderhill, Florida', 'Harare', 'Sharjah', 'Leeds',
       'London', 'Southampton', 'Cardiff', 'Manchester', 'Nottingham',
       'Tromode', 'Abu Dhabi', 'Malkerns', 'Bulawayo', 'Mount Maunganui',
       'Ahmedabad', 'Delhi', 'Lahore', 'Rawalpindi', 'Faisalabad',
       'Christchurch', 'Auckland', 'Hamilton', 'Wellington', 'Perth',
       'Adelaide', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
       'Gold Coast', 'Brisbane', 'Nelson', 'Dunedin', 'Napier', 'Lincoln',
       'Kolkata', 'Guwahati', 'Ranchi', 'Raipur', 'Visakhapatnam',
       'Cuttack', 'New Chandigarh', 'Dharamsala', 'Lucknow', 'Vadodara',
       'Rajkot', 'Indore', 'Thiruvananthapuram', 'Colombo', 'Pallekele',
       'Paarl', 'Cape Town', 'East London', 'Centurion', 'Johannesburg',
       'Chester-le-Street', 'Bristol', 'Birmingham', 'Leicester',
       'Taunton', 'Worcester', 'Chelmsford', 'Northampton', 'Hove',
       'Derb

In [62]:
venue_info['timezone'].value_counts()

Unnamed: 0_level_0,count
timezone,Unnamed: 1_level_1
+01:00,27
+05:30,23
+02:00,18
+13:00,16
+11:00,9
-04:00,8
+10:00,5
+04:00,4
+08:00,3
+12:00,3


In [70]:
# Remove every comma from the capacity strings (e.g. thousands separators),
# leaving missing values as they are.
venue_info['capacity'] = venue_info['capacity'].str.replace(',','')

In [72]:
venue_info['capacity'].unique()

array(['20000', '25000', nan, '10000', '7000', '16000', '17000', '30000',
       '6500 (20000 with temporary seating)',
       '5500 (15000 after redevelopment)', '19000', '15350 (to be 17000)',
       '9000', '132000', '48000', '27000', '15000', '41000',
       '10000 with flexibility to 30 000',
       '37000 (With temporary seating)', '60000',
       '53583 (including standing room)', '13550',
       '100000 (approx including standing room)', '42000 (approx)',
       '5000', '3500 (Increased to 6000 by temporary seating)', '22500',
       '11600', '63000', '40000', '65000', '28000', '45000', '38000',
       '23000', '50000', '35000', '22000', '34000', '23500',
       '5000 (17000 for internationals)', '7000 (15000 ODIs)', '21000',
       '12000', '6500', '4500', '4000', '9500',
       '8000 (10000 incl temporary seating)',
       '10000 (20000 for 2007 World Cup)', '15000 (increasing to 20000)',
       '12000 to 70000 (Cricket: 48000)', '1000', '11500', '6000', '3000',
       '22000

In [85]:
# Manual overrides for capacity entries that carry free-text annotations.
# Each key is the exact annotated string; the value is the single figure kept.
capacity_overrides = {
    '6500 (20000 with temporary seating)': '6500',
    '5500 (15000 after redevelopment)': '15000',
    '15350 (to be 17000)': '17000',
    '10000 with flexibility to 30 000': '10000',
    '37000 (With temporary seating)': '37000',
    '53583 (including standing room)': '53583',
    '100000 (approx including standing room)': '100000',
    '42000 (approx)': '42000',
    '3500 (Increased to 6000 by temporary seating)': '3500',
    '5000 (17000 for internationals)': '5000',
    '7000 (15000 ODIs)': '7000',
    '8000 (10000 incl temporary seating)': '8000',
    '10000 (20000 for 2007 World Cup)': '10000',
    '15000 (increasing to 20000)': '20000',
    '12000 to 70000 (Cricket: 48000)': '48000',
    '22000 (24500 with temporary seating)': '22000',
    '8000 (approx)': '8000',
}
for annotated_value, plain_value in capacity_overrides.items():
    venue_info.loc[venue_info['capacity'] == annotated_value, 'capacity'] = plain_value


In [88]:
# Treat a missing capacity as 0. The filled column is assigned back rather than
# filled in place on the selected column, because in-place fills on a selection
# are deprecated and do not modify the frame under pandas Copy-on-Write.
venue_info['capacity'] = venue_info['capacity'].fillna(0)

In [89]:
# Convert the capacity column to integers. This raises if any value is still
# not a plain number.
venue_info['capacity'] = venue_info['capacity'].astype(int)

Unnamed: 0,capacity
0,20000
1,25000
2,0
3,0
4,10000
...,...
121,0
122,40000
123,6000
124,60000


In [91]:
venue_info.to_csv('venue_info.csv',index=False)

In [3]:
pd.read_csv('cricket_player_data.csv')

Unnamed: 0,id,name,battingStyle,bowlingStyle,role,team_name
0,11808,Shubman Gill,Right-hand bat,Right-arm offbreak,BATSMEN,India
1,13940,Yashasvi Jaiswal,Left-hand bat,Right-arm legbreak,BATSMEN,India
2,13866,Sai Sudharsan,Left-hand bat,Right-arm legbreak,BATSMEN,India
3,576,Rohit Sharma,Right-hand bat,Right-arm offbreak,BATSMEN,India
4,1413,Virat Kohli,Right-hand bat,Right-arm medium,BATSMEN,India
...,...,...,...,...,...,...
601,11833,Twinkal Bhandari,Right-hand bat,,WICKET KEEPER,Oman
602,10458,Ajay Lalcheta,Left-hand bat,Left-arm orthodox,BOWLER,Oman
603,10461,Sufyan Mehmood,Left-hand bat,Right-arm medium,BOWLER,Oman
604,10460,Rajeshkumar Ranpura,Left-hand bat,Right-arm fast-medium,BOWLER,Oman


In [None]:
import pandas as pd
import requests
import time
import sys
import os

def get_batsmen_ids_from_csv(file_path: str) -> pd.DataFrame:
    """
    Loads the player CSV at *file_path* and keeps the rows whose 'role' is 'BATSMEN'.

    Returns a DataFrame with the 'id' and 'name' columns of those rows. If the
    file does not exist or cannot be processed, the problem is reported on
    stderr and an empty DataFrame is returned.
    """
    if os.path.exists(file_path):
        try:
            players = pd.read_csv(file_path)
            is_batsman = players['role'] == 'BATSMEN'
            return players.loc[is_batsman, ['id', 'name']]
        except Exception as e:
            print(f"Error processing CSV file: {e}", file=sys.stderr)
            return pd.DataFrame()

    print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
    return pd.DataFrame()

def fetch_player_stats(player_id: str, api_key: str, api_host: str, timeout: float = 10.0) -> dict:
    """
    Fetches batting statistics for a single player from the Cricbuzz API.

    Args:
        player_id: The player's ID, as used in the API URL.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        api_host: RapidAPI host sent in the ``x-rapidapi-host`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON response, or an empty dict if the request fails or
        the body is not valid JSON.
    """
    url = f"https://cricbuzz-cricket.p.rapidapi.com/stats/v1/player/{player_id}/batting"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": api_host
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for player ID {player_id}: {e}", file=sys.stderr)
        return {}
    except ValueError as e:
        # Some requests versions report an undecodable body as a plain ValueError.
        print(f"Error decoding data for player ID {player_id}: {e}", file=sys.stderr)
        return {}

def parse_stats_json(json_data: dict, player_id: str, player_name: str) -> list[dict]:
    """
    Parses a batting-stats JSON response into one row per format.

    ``json_data['headers']`` is read as a label column followed by the format
    names. Each item of ``json_data['values']`` is read as a ``'values'`` list
    holding a stat label followed by one value per format. Items without a
    non-empty ``'values'`` list are skipped instead of raising.

    Args:
        json_data: The decoded API response.
        player_id: ID stored in every output row.
        player_name: Name stored in every output row.

    Returns:
        A list with one dict per format. A stat that is missing for a format
        gets its default ('0' or '0.0'). Empty when the response lacks
        'headers' or 'values'.
    """
    parsed_stats = []
    if not json_data or 'headers' not in json_data or 'values' not in json_data:
        return parsed_stats

    # (output column, API row label, default), in output column order.
    columns = (
        ('matches', 'Matches', '0'),
        ('innings', 'Innings', '0'),
        ('runs', 'Runs', '0'),
        ('balls_faced', 'Balls', '0'),
        ('highest_score', 'Highest', '0'),
        ('average', 'Average', '0.0'),
        ('strike_rate', 'SR', '0.0'),
        ('not_out', 'Not Out', '0'),
        ('fours', 'Fours', '0'),
        ('sixes', 'Sixes', '0'),
        ('fifty_plus', '50s', '0'),
        ('hundreds', '100s', '0'),
        ('double_hundreds', '200s', '0'),
    )

    formats = (json_data['headers'] or [])[1:]
    stats_map = {}
    for item in json_data['values'] or []:
        cells = item.get('values') if isinstance(item, dict) else None
        if not cells:
            continue  # malformed row: nothing to index
        stats_map[cells[0]] = cells[1:]

    for i, fmt in enumerate(formats):
        row = {'player_id': player_id, 'player_name': player_name, 'format': fmt}
        for column, label, default in columns:
            per_format = stats_map.get(label, ())
            row[column] = per_format[i] if i < len(per_format) else default
        parsed_stats.append(row)
    return parsed_stats

def main():
    """
    Fetches batting statistics for every batsman and saves them to a CSV file.

    Batsmen are read from 'cricket_player_data.csv'. The per-format rows of all
    players are written to 'all_batsmen_stats.csv' when at least one row was
    retrieved.
    """
    csv_file_path = "cricket_player_data.csv"

    print(f"Starting data extraction from '{csv_file_path}'...")

    # Get player IDs for batsmen from the CSV file
    batsmen_df = get_batsmen_ids_from_csv(csv_file_path)

    if batsmen_df.empty:
        print("No batsmen found in the CSV file or file not accessible. Exiting.", file=sys.stderr)
        return

    all_players_stats = []

    # Prefer the key from the RAPIDAPI_KEY environment variable; the literal is
    # only a backward-compatible fallback.
    # NOTE(review): the fallback key is committed in plain text; rotate it and
    # drop the fallback once RAPIDAPI_KEY is configured.
    api_key = os.environ.get("RAPIDAPI_KEY", "0fec7b9425mshbc167bba6885159p1b85d7jsna612b835a4f0")
    api_host = "cricbuzz-cricket.p.rapidapi.com"

    # Iterate through each batsman and fetch their data
    for player_id, player_name in zip(batsmen_df['id'], batsmen_df['name']):
        print(f"Fetching data for {player_name} (ID: {player_id})...")

        json_data = fetch_player_stats(str(player_id), api_key, api_host)

        if json_data:
            parsed_data = parse_stats_json(json_data, str(player_id), player_name)
            all_players_stats.extend(parsed_data)

        # Pause to prevent hitting API rate limits
        time.sleep(0.5)

    if all_players_stats:
        # Create a DataFrame from the collected data
        df_final = pd.DataFrame(all_players_stats)

        # Save to CSV
        output_file = 'all_batsmen_stats.csv'
        df_final.to_csv(output_file, index=False)
        print(f"Data extraction complete. Results saved to {output_file}")
    else:
        print("No statistics were successfully retrieved. The output CSV will not be created.", file=sys.stderr)

if __name__ == '__main__':
    main()

In [12]:
batsmen_stats = pd.read_csv('all_batsmen_stats.csv')

In [14]:
batsmen_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 680 entries, 0 to 679
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   player_id        680 non-null    int64  
 1   player_name      680 non-null    object 
 2   format           680 non-null    object 
 3   matches          680 non-null    int64  
 4   innings          680 non-null    int64  
 5   runs             680 non-null    int64  
 6   balls_faced      680 non-null    int64  
 7   highest_score    680 non-null    int64  
 8   average          680 non-null    float64
 9   strike_rate      680 non-null    float64
 10  not_out          680 non-null    int64  
 11  fours            680 non-null    int64  
 12  sixes            680 non-null    int64  
 13  fifty_plus       680 non-null    int64  
 14  hundreds         680 non-null    int64  
 15  double_hundreds  680 non-null    int64  
dtypes: float64(2), int64(12), object(2)
memory usage: 85.1+ KB


In [None]:
import pandas as pd
import requests
import time
import sys
import os

def get_bowlers_ids_from_csv(file_path: str) -> pd.DataFrame:
    """
    Returns the 'id' and 'name' of every player in the CSV whose 'role' is 'BOWLER'.

    A missing file or any processing error is reported on stderr and results
    in an empty DataFrame.
    """
    nothing = pd.DataFrame
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
        return nothing()

    try:
        roster = pd.read_csv(file_path)
        bowler_rows = roster[roster['role'].eq('BOWLER')]
        return bowler_rows[['id', 'name']]
    except Exception as e:
        print(f"Error processing CSV file: {e}", file=sys.stderr)
        return nothing()

def fetch_player_bowling_stats(player_id: str, api_key: str, api_host: str, timeout: float = 10.0) -> dict:
    """
    Fetches bowling statistics for a single player from the Cricbuzz API.

    Args:
        player_id: The player's ID, as used in the API URL.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        api_host: RapidAPI host sent in the ``x-rapidapi-host`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON response, or an empty dict if the request fails or
        the body is not valid JSON.
    """
    url = f"https://cricbuzz-cricket.p.rapidapi.com/stats/v1/player/{player_id}/bowling"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": api_host
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for player ID {player_id}: {e}", file=sys.stderr)
        return {}
    except ValueError as e:
        # Some requests versions report an undecodable body as a plain ValueError.
        print(f"Error decoding data for player ID {player_id}: {e}", file=sys.stderr)
        return {}

def parse_bowling_stats_json(json_data: dict, player_id: str, player_name: str) -> list[dict]:
    """
    Parses a bowling-stats JSON response into one row per format.

    ``json_data['headers']`` is read as a label column followed by the format
    names. Each item of ``json_data['values']`` is read as a ``'values'`` list
    holding a stat label followed by one value per format. Items without a
    non-empty ``'values'`` list are skipped instead of raising.

    Args:
        json_data: The decoded API response.
        player_id: ID stored in every output row.
        player_name: Name stored in every output row.

    Returns:
        A list with one dict per format. A stat that is missing for a format
        gets its default ('0', '0.0' or '-/-'). Empty when the response lacks
        'headers' or 'values'.
    """
    parsed_stats = []
    if not json_data or 'headers' not in json_data or 'values' not in json_data:
        return parsed_stats

    # (output column, API row label, default), in output column order.
    columns = (
        ('matches', 'Matches', '0'),
        ('innings', 'Innings', '0'),
        ('balls', 'Balls', '0'),
        ('runs', 'Runs', '0'),
        ('maidens', 'Maidens', '0'),
        ('wickets', 'Wickets', '0'),
        ('avg', 'Avg', '0.0'),
        ('eco', 'Eco', '0.0'),
        ('sr', 'SR', '0.0'),
        ('bbi', 'BBI', '-/-'),
        ('bbm', 'BBM', '-/-'),
        ('4w', '4w', '0'),
        ('5w', '5w', '0'),
        ('10w', '10w', '0'),
    )

    formats = (json_data['headers'] or [])[1:]
    stats_map = {}
    for item in json_data['values'] or []:
        cells = item.get('values') if isinstance(item, dict) else None
        if not cells:
            continue  # malformed row: nothing to index
        stats_map[cells[0]] = cells[1:]

    for i, fmt in enumerate(formats):
        row = {'player_id': player_id, 'player_name': player_name, 'format': fmt}
        for column, label, default in columns:
            per_format = stats_map.get(label, ())
            row[column] = per_format[i] if i < len(per_format) else default
        parsed_stats.append(row)
    return parsed_stats

def main():
    """
    Fetches bowling statistics for every bowler and saves them to a CSV file.

    Bowlers are read from 'cricket_player_data.csv'. The per-format rows of all
    players are written to 'all_bowlers_stats.csv' when at least one row was
    retrieved.
    """
    csv_file_path = "cricket_player_data.csv"

    print(f"Starting bowling stats extraction from '{csv_file_path}'...")

    # Get player IDs for bowlers from the CSV file
    bowlers_df = get_bowlers_ids_from_csv(csv_file_path)

    if bowlers_df.empty:
        print("No bowlers found in the CSV file or file not accessible. Exiting.", file=sys.stderr)
        return

    all_players_stats = []

    # Prefer the key from the RAPIDAPI_KEY environment variable; the literal is
    # only a backward-compatible fallback.
    # NOTE(review): the fallback key is committed in plain text; rotate it and
    # drop the fallback once RAPIDAPI_KEY is configured.
    api_key = os.environ.get("RAPIDAPI_KEY", "db628101fcmshed571bf2e07f01ap1e0399jsn24eba9e2505c")
    api_host = "cricbuzz-cricket.p.rapidapi.com"

    # Iterate through each bowler and fetch their data
    for player_id, player_name in zip(bowlers_df['id'], bowlers_df['name']):
        print(f"Fetching data for {player_name} (ID: {player_id})...")

        json_data = fetch_player_bowling_stats(str(player_id), api_key, api_host)

        if json_data:
            parsed_data = parse_bowling_stats_json(json_data, str(player_id), player_name)
            all_players_stats.extend(parsed_data)

        # Pause to prevent hitting API rate limits
        time.sleep(2)

    if all_players_stats:
        # Create a DataFrame from the collected data
        df_final = pd.DataFrame(all_players_stats)

        # Save to CSV
        output_file = 'all_bowlers_stats.csv'
        df_final.to_csv(output_file, index=False)
        print(f"Data extraction complete. Results saved to {output_file}")
    else:
        print("No statistics were successfully retrieved. The output CSV will not be created.", file=sys.stderr)

if __name__ == '__main__':
    main()


In [4]:
pd.read_csv('all_bowlers_stats.csv')

Unnamed: 0,player_id,player_name,format,matches,innings,balls,runs,maidens,wickets,avg,eco,sr,bbi,bbm,4w,5w,10w
0,10808,Mohammed Siraj,Test,41,76,6419,3820,190,123,31.06,3.57,52.19,6/15,9/190,7,5,0
1,10808,Mohammed Siraj,ODI,44,43,1975,1708,32,71,24.06,5.19,27.82,6/21,6/21,2,1,0
2,10808,Mohammed Siraj,T20,16,16,348,452,2,14,32.29,7.79,24.86,4/17,4/17,1,0,0
3,10808,Mohammed Siraj,IPL,108,108,2300,3349,4,109,30.72,8.74,21.10,4/17,4/17,3,0,0
4,9311,Jasprit Bumrah,Test,48,91,9348,4341,365,219,19.82,2.79,42.68,6/27,9/86,7,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,10460,Rajeshkumar Ranpura,IPL,0,0,0,0,0,0,0.00,0.00,0.00,-/-,-/-,0,0,0
756,10457,Munis Ansari,Test,0,0,0,0,0,0,0.00,0.00,0.00,-/-,-/-,0,0,0
757,10457,Munis Ansari,ODI,0,0,0,0,0,0,0.00,0.00,0.00,-/-,-/-,0,0,0
758,10457,Munis Ansari,T20,10,10,227,334,0,8,41.75,8.83,28.38,3/37,3/37,0,0,0


In [None]:
import pandas as pd
import requests
import time
import sys
import os

def get_players_by_role_from_csv(file_path: str, role: str) -> pd.DataFrame:
    """
    Selects the players with the given *role* from the CSV at *file_path*.

    The result holds the 'id' and 'name' columns of the rows whose 'role'
    equals *role*. When the file is absent, or reading/filtering fails, a
    message is printed to stderr and an empty DataFrame is returned.
    """
    file_is_present = os.path.exists(file_path)
    if not file_is_present:
        print(f"Error: The file '{file_path}' was not found.", file=sys.stderr)
        return pd.DataFrame()

    wanted_columns = ['id', 'name']
    try:
        table = pd.read_csv(file_path)
        role_matches = table['role'] == role
        return table.loc[role_matches, wanted_columns]
    except Exception as e:
        print(f"Error processing CSV file: {e}", file=sys.stderr)
        return pd.DataFrame()

def fetch_player_bowling_stats(player_id: str, api_key: str, api_host: str, timeout: float = 10.0) -> dict:
    """
    Fetches bowling statistics for a single player from the Cricbuzz API.

    Args:
        player_id: The player's ID, as used in the API URL.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        api_host: RapidAPI host sent in the ``x-rapidapi-host`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON response, or an empty dict if the request fails or
        the body is not valid JSON.
    """
    url = f"https://cricbuzz-cricket.p.rapidapi.com/stats/v1/player/{player_id}/bowling"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": api_host
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching bowling data for player ID {player_id}: {e}", file=sys.stderr)
        return {}
    except ValueError as e:
        # Some requests versions report an undecodable body as a plain ValueError.
        print(f"Error decoding bowling data for player ID {player_id}: {e}", file=sys.stderr)
        return {}

def fetch_player_batting_stats(player_id: str, api_key: str, api_host: str, timeout: float = 10.0) -> dict:
    """
    Fetches batting statistics for a single player from the Cricbuzz API.

    Args:
        player_id: The player's ID, as used in the API URL.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        api_host: RapidAPI host sent in the ``x-rapidapi-host`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON response, or an empty dict if the request fails or
        the body is not valid JSON.
    """
    url = f"https://cricbuzz-cricket.p.rapidapi.com/stats/v1/player/{player_id}/batting"
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": api_host
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching batting data for player ID {player_id}: {e}", file=sys.stderr)
        return {}
    except ValueError as e:
        # Some requests versions report an undecodable body as a plain ValueError.
        print(f"Error decoding batting data for player ID {player_id}: {e}", file=sys.stderr)
        return {}

def parse_bowling_stats_json(json_data: dict, player_id: str, player_name: str) -> list[dict]:
    """
    Parses a bowling-stats JSON response into one row per format.

    ``json_data['headers']`` is read as a label column followed by the format
    names. Each item of ``json_data['values']`` is read as a ``'values'`` list
    holding a stat label followed by one value per format. Items without a
    non-empty ``'values'`` list are skipped instead of raising.

    Args:
        json_data: The decoded API response.
        player_id: ID stored in every output row.
        player_name: Name stored in every output row.

    Returns:
        A list with one dict per format. A stat that is missing for a format
        gets its default ('0', '0.0' or '-/-'). Empty when the response lacks
        'headers' or 'values'.
    """
    parsed_stats = []
    if not json_data or 'headers' not in json_data or 'values' not in json_data:
        return parsed_stats

    # (output column, API row label, default), in output column order.
    columns = (
        ('matches', 'Matches', '0'),
        ('innings', 'Innings', '0'),
        ('balls', 'Balls', '0'),
        ('runs', 'Runs', '0'),
        ('maidens', 'Maidens', '0'),
        ('wickets', 'Wickets', '0'),
        ('avg', 'Avg', '0.0'),
        ('eco', 'Eco', '0.0'),
        ('sr', 'SR', '0.0'),
        ('bbi', 'BBI', '-/-'),
        ('bbm', 'BBM', '-/-'),
        ('4w', '4w', '0'),
        ('5w', '5w', '0'),
        ('10w', '10w', '0'),
    )

    formats = (json_data['headers'] or [])[1:]
    stats_map = {}
    for item in json_data['values'] or []:
        cells = item.get('values') if isinstance(item, dict) else None
        if not cells:
            continue  # malformed row: nothing to index
        stats_map[cells[0]] = cells[1:]

    for i, fmt in enumerate(formats):
        row = {'player_id': player_id, 'player_name': player_name, 'format': fmt}
        for column, label, default in columns:
            per_format = stats_map.get(label, ())
            row[column] = per_format[i] if i < len(per_format) else default
        parsed_stats.append(row)
    return parsed_stats

def parse_batting_stats_json(json_data: dict, player_id: str, player_name: str) -> list[dict]:
    """
    Parses a batting-stats JSON response into one row per format.

    ``json_data['headers']`` is read as a label column followed by the format
    names. Each item of ``json_data['values']`` is read as a ``'values'`` list
    holding a stat label followed by one value per format. Items without a
    non-empty ``'values'`` list are skipped instead of raising.

    Several stats are looked up under more than one label: the spelling this
    function has always used is tried first, then the spelling that
    ``parse_stats_json`` reads from the same batting endpoint ('Not Out',
    'Highest', 'Average', 'SR', 'Fours', 'Sixes'). Without the alternatives
    those columns fall back to their defaults whenever the response uses the
    other spelling.
    NOTE(review): confirm against a live response which spelling the API
    returns, then drop the unused aliases.

    Args:
        json_data: The decoded API response.
        player_id: ID stored in every output row.
        player_name: Name stored in every output row.

    Returns:
        A list with one dict per format. A stat that is missing for a format
        gets its default ('0' or '0.0'). Empty when the response lacks
        'headers' or 'values'.
    """
    parsed_stats = []
    if not json_data or 'headers' not in json_data or 'values' not in json_data:
        return parsed_stats

    # (output column, accepted API row labels in priority order, default).
    columns = (
        ('matches', ('Matches',), '0'),
        ('innings', ('Innings',), '0'),
        ('not_out', ('Not Outs', 'Not Out'), '0'),
        ('runs', ('Runs',), '0'),
        ('high_score', ('Highest Score', 'Highest'), '0'),
        ('avg', ('Avg', 'Average'), '0.0'),
        ('strike_rate', ('Strike Rate', 'SR'), '0.0'),
        ('hundreds', ('100s',), '0'),
        ('fifties', ('50s',), '0'),
        ('fours', ('4s', 'Fours'), '0'),
        ('sixes', ('6s', 'Sixes'), '0'),
        ('ducks', ('Ducks',), '0'),
    )

    formats = (json_data['headers'] or [])[1:]
    stats_map = {}
    for item in json_data['values'] or []:
        cells = item.get('values') if isinstance(item, dict) else None
        if not cells:
            continue  # malformed row: nothing to index
        stats_map[cells[0]] = cells[1:]

    for i, fmt in enumerate(formats):
        row = {'player_id': player_id, 'player_name': player_name, 'format': fmt}
        for column, labels, default in columns:
            # Use the first label that is present in the response.
            per_format = next(
                (stats_map[label] for label in labels if label in stats_map),
                (),
            )
            row[column] = per_format[i] if i < len(per_format) else default
        parsed_stats.append(row)
    return parsed_stats

def _save_stats_csv(rows: list, output_file: str, label: str) -> None:
    """
    Write collected stat rows to a CSV file, or report that none were retrieved.

    Args:
        rows: Parsed stat dictionaries accumulated across all players.
        output_file: Path of the CSV file to write.
        label: Lower-case stat category ('batting' or 'bowling') used in messages.
    """
    if not rows:
        print(f"No {label} statistics were successfully retrieved.", file=sys.stderr)
        return
    pd.DataFrame(rows).to_csv(output_file, index=False)
    print(f"{label.capitalize()} data extraction complete. Results saved to {output_file}")


def main():
    """
    Main function to execute the data extraction process for all-rounders.

    Reads the players with role 'ALL ROUNDER' from "cricket_player_data.csv",
    fetches and parses bowling and batting stats for each of them, and writes
    the results to 'all_rounders_batting_stats.csv' and
    'all_rounders_bowling_stats.csv'. Sleeps 5 seconds after each player to
    avoid hitting API rate limits.

    The API key is taken from the RAPIDAPI_KEY environment variable when set.
    """
    import os  # local import: only needed for the credential lookup below

    csv_file_path = "cricket_player_data.csv"

    print(f"Starting stats extraction for ALL ROUNDERs from '{csv_file_path}'...")

    # Get player IDs for all-rounders from the CSV file
    all_rounders_df = get_players_by_role_from_csv(csv_file_path, 'ALL ROUNDER')

    if all_rounders_df.empty:
        print("No all-rounders found in the CSV file or file not accessible. Exiting.", file=sys.stderr)
        return

    all_batting_stats = []
    all_bowling_stats = []

    # Prefer a key supplied via the environment so credentials need not live in
    # source control. NOTE(review): the literal fallback is kept only for
    # backward compatibility; it should be rotated and removed.
    api_key = os.environ.get(
        "RAPIDAPI_KEY",
        "a7d620e600msh1fd5e2619340345p161cb2jsn56f917c5665b",
    )
    api_host = "cricbuzz-cricket.p.rapidapi.com"

    # Iterate through each all-rounder and fetch their data
    for _, row in all_rounders_df.iterrows():
        player_id = row['id']
        player_name = row['name']
        player_key = str(player_id)  # the fetch/parse helpers are called with a string ID

        print(f"Fetching data for {player_name} (ID: {player_id})...")

        # Fetch and parse bowling stats
        bowling_json_data = fetch_player_bowling_stats(player_key, api_key, api_host)
        if bowling_json_data:
            all_bowling_stats.extend(
                parse_bowling_stats_json(bowling_json_data, player_key, player_name)
            )

        # Fetch and parse batting stats
        batting_json_data = fetch_player_batting_stats(player_key, api_key, api_host)
        if batting_json_data:
            all_batting_stats.extend(
                parse_batting_stats_json(batting_json_data, player_key, player_name)
            )

        # Pause to prevent hitting API rate limits
        time.sleep(5)

    _save_stats_csv(all_batting_stats, 'all_rounders_batting_stats.csv', 'batting')
    _save_stats_csv(all_bowling_stats, 'all_rounders_bowling_stats.csv', 'bowling')

# Run the extraction only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()
