In [5]:
import polars as pl
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to extract player profile URL and ID
def get_player_profile(player_name):
    """Look up a player's ESPNcricinfo profile through the site's search page.

    Args:
        player_name: Name to search for.

    Returns:
        A ``(player_name, profile_url, player_id)`` tuple. ``profile_url`` is
        built from the first anchor on the search page whose ``href`` contains
        ``/cricketers/``; ``player_id`` is the text after the last ``-`` in
        that URL. If the name is empty, the request fails, or no such link is
        present, the last two items are both ``"Not found"``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import quote, urljoin

    # Nothing to search for: report "Not found" instead of crashing on None.
    if not player_name:
        return player_name, "Not found", "Not found"

    # Percent-encode the whole name, not just spaces, so characters such as
    # "&", "#", "+" or non-ASCII letters cannot corrupt the query string.
    search_name = quote(str(player_name), safe="")
    search_url = f"https://search.espncricinfo.com/ci/content/site/search.html?search={search_name}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # A timeout stops a stalled connection from blocking the calling
        # thread forever; requests applies no timeout by default.
        response = requests.get(search_url, headers=headers, timeout=10)
        # Surface HTTP errors instead of parsing an error page as results.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Use the first link that points at a player profile.
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/cricketers/" in href:
                # urljoin leaves an already-absolute href untouched and
                # resolves a site-relative one against the main host.
                profile_url = urljoin("https://www.espncricinfo.com", href)
                player_id = profile_url.split("-")[-1]  # Trailing URL segment
                return player_name, profile_url, player_id
    except Exception as e:
        # Best-effort lookup: log the failure and fall through to "Not found".
        print(f"Error fetching {player_name}: {e}")

    return player_name, "Not found", "Not found"

# Read the player list (expects a 'Player' column), keep the first 20 rows,
# and append empty string columns for the profile URL and the player ID.
df = (
    pl.read_csv("player_names.csv")
    .head(20)
    .with_columns(
        profile_url=pl.lit(""),
        player_id=pl.lit(""),
    )
)

# Define a function to process the DataFrame in parallel batches
def process_in_batches(player_names, batch_size=10):
    """Run ``get_player_profile`` for every name on a thread pool.

    Args:
        player_names: Iterable of player names to look up.
        batch_size: Maximum number of worker threads used concurrently.

    Returns:
        A list with one ``(player_name, profile_url, player_id)`` tuple per
        input name, in the same order as ``player_names``. If a lookup raises,
        its entry is ``(player_name, "Not found", "Not found")`` so that no
        player is dropped from the output.
    """
    player_names = list(player_names)
    # Pre-sized so each result can be stored at its input position regardless
    # of which request finishes first.
    results = [None] * len(player_names)

    with ThreadPoolExecutor(max_workers=batch_size) as executor:
        # Map every future back to the position of the name it was created for.
        future_to_index = {
            executor.submit(get_player_profile, name): index
            for index, name in enumerate(player_names)
        }

        # Collect results as they complete, but store them in input order.
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:
                print(f"Error in future: {e}")
                # Keep a placeholder row so the failed player is not dropped.
                results[index] = (player_names[index], "Not found", "Not found")

    return results

# Number of worker threads used for the concurrent lookups.
batch_size = 10  # Adjust batch size according to your system capabilities

# Get player names from the DataFrame
player_names = df["Player"].to_list()

# Process player names in parallel and get results
parallel_results = process_in_batches(player_names, batch_size=batch_size)

# Convert the parallel results back to a Polars DataFrame.
# Each result is one row (name, url, id); stating orient="row" explicitly
# avoids relying on polars' orientation inference and the warning it emits.
updated_df = pl.DataFrame(
    parallel_results,
    schema=["Player", "profile_url", "player_id"],
    orient="row",
)

# Save the updated DataFrame back to a CSV file
updated_df.write_csv("players_with_ids_20.csv")

print("Player profiles and IDs have been added to the CSV.")


Player profiles and IDs have been added to the CSV.


  return dispatch(args[0].__class__)(*args, **kw)


In [4]:
# Keep only the first five rows of df.
df = df[:5]

Player
str
"""Parvez Rasool"""
"""M Shumba"""
"""Mominul Haque"""
"""DI Allan"""
"""Mohammad Naim"""
