In [20]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

def _normalize_table_rows(rows):
    """Split scraped rows into ``(header, data_rows)`` of one uniform width.

    The first row is treated as the header. Rows shorter than the widest row
    are right-padded with empty strings, and a header shorter than the widest
    row is extended with generated ``column_<n>`` names, so the result can
    always be passed to ``pd.DataFrame(data_rows, columns=header)``.
    """
    width = max(len(row) for row in rows)
    header = list(rows[0])
    # Name the columns the header row does not cover (e.g. a colspan title row)
    header += [f"column_{idx + 1}" for idx in range(len(header), width)]
    data_rows = [list(row) + [""] * (width - len(row)) for row in rows[1:]]
    return header, data_rows


def scrape_and_save_table(url: str, file_name: str, output_dir: str):
    """Fetch ``url``, extract one HTML table from it and write it to a CSV file.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. The table is looked up by trying a list of ``class`` values in
    order and taking the first ``<table>`` that matches. The first non-empty
    row supplies the column names; the remaining rows are the data.

    Args:
        url: Address of the page containing the table.
        file_name: File stem of the CSV (``.csv`` is appended).
        output_dir: Directory to write into; created if it does not exist.

    Returns:
        The path of the written CSV on success, otherwise ``None``. Failures
        (non-200 status, no table, fewer than two rows, or any exception) are
        reported with ``print`` and never raised to the caller.
    """
    print(f"🔄 Fetching: {url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"❌ Failed to load page. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try multiple table class variants, most specific first
        possible_classes = [
            "table table-responsive table-hover",
            "table table-hover",
            "table"
        ]

        table = None
        for class_name in possible_classes:
            table = soup.find("table", class_=class_name)
            if table is not None:
                break

        if table is None:
            print(f"❌ No valid table found for {file_name}")
            return None

        # Extract the text of every cell, skipping rows that have no cells
        rows = []
        for tr in table.find_all("tr"):
            cols = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
            if cols:
                rows.append(cols)

        if len(rows) < 2:
            print(f"❌ Not enough data to parse for {file_name}")
            return None

        # Ragged rows would make the DataFrame constructor raise and the whole
        # table would be lost, so even out the widths first.
        headers_row, data_rows = _normalize_table_rows(rows)

        df = pd.DataFrame(data_rows, columns=headers_row)

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{file_name}.csv")
        df.to_csv(file_path, index=False)

        print(f"✅ Saved: {file_path}")
        return file_path

    except Exception as e:
        # Best effort: one failing page must not abort a batch of scrapes
        print(f"❌ Error scraping {file_name}: {e}")
        return None


In [21]:
# Folder that receives the CSVs for the combined (ODI + T20 + Test) records
output_directory = "combined odi+ t20 + test stats"

# (file stem, page URL) pairs; each stem becomes the name of the saved CSV
record_pages = dict([
    ("combined_results_summary", "https://www.cricwindow.com/combined-records/most-matches-played-by-team.html"),
    ("combined_most_runs", "https://www.cricwindow.com/combined-records/most-runs-career.html"),
    ("combined_most_wickets", "https://www.cricwindow.com/combined-records/most-wickets-career.html"),
    ("combined_most_WK_dismissals", "https://www.cricwindow.com/combined-records/most-dismissals-career.html"),
    ("combined_most_catches", "https://www.cricwindow.com/combined-records/most-catches-career.html"),
    ("combined_most_matches_as_captain", "https://www.cricwindow.com/combined-records/most-matches-as-captain.html"),
    ("combined_most_player_of_the_matches", "https://www.cricwindow.com/combined-records/most-player-of-the-match-awards.html"),
    ("combined_most_player_of_the_series", "https://www.cricwindow.com/combined-records/most-player-of-the-series-awards.html"),
])

# Scrape the pages one by one, in the order they are listed above
for file_key in record_pages:
    url = record_pages[file_key]
    scrape_and_save_table(url, file_key, output_directory)


🔄 Fetching: https://www.cricwindow.com/combined-records/most-matches-played-by-team.html
✅ Saved: combined odi+ t20 + test stats\combined_results_summary.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-runs-career.html
✅ Saved: combined odi+ t20 + test stats\combined_most_runs.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-wickets-career.html
✅ Saved: combined odi+ t20 + test stats\combined_most_wickets.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-dismissals-career.html
✅ Saved: combined odi+ t20 + test stats\combined_most_WK_dismissals.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-catches-career.html
✅ Saved: combined odi+ t20 + test stats\combined_most_catches.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-matches-as-captain.html
✅ Saved: combined odi+ t20 + test stats\combined_most_matches_as_captain.csv
🔄 Fetching: https://www.cricwindow.com/combined-records/most-player-of-the-match-awards.h

In [22]:
# Folder that receives the CSVs for the ODI records
output_directory = "odi stats"

# (file stem, page URL) pairs; each stem becomes the name of the saved CSV
record_pages = dict([
    ("ODI_results_summary", "https://www.cricwindow.com/odi-records/most-matches-played-by-team.html"),
    ("ODI_most_runs", "https://www.cricwindow.com/odi-records/most-runs-career.html"),
    ("ODI_most_wickets", "https://www.cricwindow.com/odi-records/most-wickets-career.html"),
    ("ODI_most_WK_dismissals", "https://www.cricwindow.com/odi-records/most-dismissals-career.html"),
    ("ODI_most_catches", "https://www.cricwindow.com/odi-records/most-catches-career.html"),
    ("ODI_most_matches_as_captain", "https://www.cricwindow.com/odi-records/most-matches-as-captain.html"),
    ("ODI_most_player_of_the_matches", "https://www.cricwindow.com/odi-records/most-player-of-the-match-awards.html"),
    ("ODI_most_player_of_the_series", "https://www.cricwindow.com/odi-records/most-player-of-the-series-awards.html"),
])

# Scrape the pages one by one, in the order they are listed above
for file_key in record_pages:
    url = record_pages[file_key]
    scrape_and_save_table(url, file_key, output_directory)


🔄 Fetching: https://www.cricwindow.com/odi-records/most-matches-played-by-team.html
✅ Saved: odi stats\ODI_results_summary.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-runs-career.html
✅ Saved: odi stats\ODI_most_runs.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-wickets-career.html
✅ Saved: odi stats\ODI_most_wickets.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-dismissals-career.html
✅ Saved: odi stats\ODI_most_WK_dismissals.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-catches-career.html
✅ Saved: odi stats\ODI_most_catches.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-matches-as-captain.html
✅ Saved: odi stats\ODI_most_matches_as_captain.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-player-of-the-match-awards.html
✅ Saved: odi stats\ODI_most_player_of_the_matches.csv
🔄 Fetching: https://www.cricwindow.com/odi-records/most-player-of-the-series-awards.html
✅ Saved: odi stats\ODI_most_player_of_the_se

In [23]:
# Folder that receives the CSVs for the T20I records
output_directory = "t20 stats"

# (file stem, page URL) pairs; each stem becomes the name of the saved CSV
record_pages = dict([
    ("T20_results_summary", "https://www.cricwindow.com/t20i-records/most-matches-played-by-team.html"),
    ("T20_most_runs", "https://www.cricwindow.com/t20i-records/most-runs-career.html"),
    ("T20_most_wickets", "https://www.cricwindow.com/t20i-records/most-wickets-career.html"),
    ("T20_most_WK_dismissals", "https://www.cricwindow.com/t20i-records/most-dismissals-career.html"),
    ("T20_most_catches", "https://www.cricwindow.com/t20i-records/most-catches-career.html"),
    ("T20_most_matches_as_captain", "https://www.cricwindow.com/t20i-records/most-matches-as-captain.html"),
    ("T20_most_player_of_the_matches", "https://www.cricwindow.com/t20i-records/most-player-of-the-match-awards.html"),
    ("T20_most_player_of_the_series", "https://www.cricwindow.com/t20i-records/most-player-of-the-series-awards.html"),
])

# Scrape the pages one by one, in the order they are listed above
for file_key in record_pages:
    url = record_pages[file_key]
    scrape_and_save_table(url, file_key, output_directory)


🔄 Fetching: https://www.cricwindow.com/t20i-records/most-matches-played-by-team.html
✅ Saved: t20 stats\T20_results_summary.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-runs-career.html
✅ Saved: t20 stats\T20_most_runs.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-wickets-career.html
✅ Saved: t20 stats\T20_most_wickets.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-dismissals-career.html
✅ Saved: t20 stats\T20_most_WK_dismissals.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-catches-career.html
✅ Saved: t20 stats\T20_most_catches.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-matches-as-captain.html
✅ Saved: t20 stats\T20_most_matches_as_captain.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-player-of-the-match-awards.html
✅ Saved: t20 stats\T20_most_player_of_the_matches.csv
🔄 Fetching: https://www.cricwindow.com/t20i-records/most-player-of-the-series-awards.html
✅ Saved: t20 stats\T20_most_player_o

In [25]:
# Directory that receives the CSVs for the Test records
output_directory = "test stats"

# Map each output file stem to the page it is scraped from.
# Every stem carries the "Test_" prefix so the files cannot be mistaken for
# the T20 exports written by the T20 cell.
record_pages = {
    "Test_results_summary": "https://www.cricwindow.com/test-records/most-matches-played-by-team.html",
    "Test_most_runs": "https://www.cricwindow.com/test-records/most-runs-career.html",
    "Test_most_wickets": "https://www.cricwindow.com/test-records/most-wickets-career.html",
    "Test_most_WK_dismissals": "https://www.cricwindow.com/test-records/most-dismissals-career.html",
    "Test_most_catches": "https://www.cricwindow.com/test-records/most-catches-career.html",
    "Test_most_matches_as_captain": "https://www.cricwindow.com/test-records/most-matches-as-captain.html",
    "Test_most_player_of_the_matches": "https://www.cricwindow.com/test-records/most-player-of-the-match-awards.html",
    "Test_most_player_of_the_series": "https://www.cricwindow.com/test-records/most-player-of-the-series-awards.html",
}

# Run scraper for each record page
for file_key, url in record_pages.items():
    scrape_and_save_table(url, file_key, output_directory)


🔄 Fetching: https://www.cricwindow.com/test-records/most-matches-played-by-team.html
✅ Saved: test stats\Test_results_summary.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-runs-career.html
✅ Saved: test stats\Test_most_runs.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-wickets-career.html
✅ Saved: test stats\T20_most_wickets.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-dismissals-career.html
✅ Saved: test stats\Test_most_WK_dismissals.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-catches-career.html
✅ Saved: test stats\Test_most_catches.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-matches-as-captain.html
✅ Saved: test stats\Test_most_matches_as_captain.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-player-of-the-match-awards.html
✅ Saved: test stats\Test_most_player_of_the_matches.csv
🔄 Fetching: https://www.cricwindow.com/test-records/most-player-of-the-series-awards.html
✅ Saved: test stats\Tes