In [11]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO  # Import StringIO to handle HTML as a file-like object

def scrape_districts(wiki_url, output_csv="districts.csv", timeout=30):
    """
    Scrapes the district table from the given Wikipedia URL and saves it to a CSV file.

    Every <table> element on the page is parsed with pandas. A table counts as
    relevant when at least two of its column headers contain one of the
    district-related keywords; among the relevant tables, the one with the
    most rows is written to ``output_csv``.

    Failures are reported with ``print`` instead of being raised, so a batch
    run over many pages keeps going when a single page cannot be processed.

    Parameters:
    - wiki_url (str): The URL of the Wikipedia page containing the districts table.
    - output_csv (str): The filename where the data will be stored (default: 'districts.csv').
    - timeout (float or tuple): Seconds to wait for the server before giving up
      (default: 30). Passed unchanged to ``requests.get``.

    Returns:
    - None. The outcome is reported on stdout and, on success, in ``output_csv``.
    """
    # Header fragments that mark a table as district-related (loop-invariant).
    relevant_keywords = ("District", "Code", "Headquarters", "Population", "Area")

    try:
        # Get page content. Without a timeout a stalled connection would
        # block forever and freeze every remaining page of a batch run.
        response = requests.get(wiki_url, timeout=timeout)
        response.raise_for_status()  # Raise an error if request fails

        # Parse the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all tables on the page
        tables = soup.find_all("table")
        if not tables:
            print(f"No tables found on {wiki_url}")
            return

        # Identify the largest relevant table
        selected_table = None
        max_rows = 0

        for table in tables:
            try:
                # Wrap the table HTML in StringIO to avoid FutureWarning
                table_html = StringIO(str(table))
                df = pd.read_html(table_html)[0]
            except ImportError:
                # A missing HTML parser backend affects every table alike;
                # surface it instead of reporting "no relevant table".
                raise
            except Exception:
                continue  # Best effort: skip tables pandas cannot parse

            # Count the headers that mention a keyword. str() also covers
            # multi-level headers, whose labels are tuples.
            match_count = sum(
                1 for col in df.columns
                if any(key in str(col) for key in relevant_keywords)
            )

            # Strict '>' keeps the earliest table when row counts tie.
            if match_count >= 2 and len(df) > max_rows:
                max_rows = len(df)
                selected_table = df

        # Save the correct table
        if selected_table is not None:
            selected_table.to_csv(output_csv, index=False, encoding="utf-8")
            print(f"✅ Data saved successfully to {output_csv}")
        else:
            print(f"⚠ No relevant district table found on {wiki_url}")

    except Exception as e:
        print(f"❌ Error processing {wiki_url}: {e}")


In [19]:
# Dictionary mapping states to their Wikipedia URLs.
# Every page shares the same "List_of_districts_of_" prefix, so only the
# page-title suffix is spelled out per entry. The suffix is usually the name
# with underscores, but not always (see Punjab and Dadra Nagar Haveli).
state_urls = {
    state_name: "https://en.wikipedia.org/wiki/List_of_districts_of_" + page_suffix
    for state_name, page_suffix in (
        ("Andhra Pradesh", "Andhra_Pradesh"),
        ("Arunachal Pradesh", "Arunachal_Pradesh"),
        ("Assam", "Assam"),
        ("Bihar", "Bihar"),
        ("Chhattisgarh", "Chhattisgarh"),
        ("Goa", "Goa"),
        ("Gujarat", "Gujarat"),
        ("Haryana", "Haryana"),
        ("Himachal Pradesh", "Himachal_Pradesh"),
        ("Jharkhand", "Jharkhand"),
        ("Karnataka", "Karnataka"),
        ("Kerala", "Kerala"),
        ("Madhya Pradesh", "Madhya_Pradesh"),
        ("Maharashtra", "Maharashtra"),
        ("Manipur", "Manipur"),
        ("Meghalaya", "Meghalaya"),
        ("Mizoram", "Mizoram"),
        ("Nagaland", "Nagaland"),
        ("Odisha", "Odisha"),
        ("Punjab", "Punjab,_India"),
        ("Rajasthan", "Rajasthan"),
        ("Sikkim", "Sikkim"),
        ("Tamil Nadu", "Tamil_Nadu"),
        ("Telangana", "Telangana"),
        ("Tripura", "Tripura"),
        ("Uttar Pradesh", "Uttar_Pradesh"),
        ("Uttarakhand", "Uttarakhand"),
        ("West Bengal", "West_Bengal"),
        ("Andaman and Nicobar Islands", "Andaman_and_Nicobar_Islands"),
        ("Dadra Nagar Haveli, Daman and Diu", "Dadra_and_Nagar_Haveli_and_Daman_and_Diu"),
        ("Jammu and Kashmir", "Jammu_and_Kashmir"),
        ("Ladakh", "Ladakh"),
        ("Delhi", "Delhi"),
        ("Puducherry", "Puducherry"),
    )
}

In [21]:
# Scrape each page in turn, writing one "<state name>.csv" file per entry.
# scrape_districts prints its own success/failure line for every page.
for state, url in state_urls.items():
    scrape_districts(url, f"{state}.csv")


✅ Data saved successfully to Andhra Pradesh.csv
✅ Data saved successfully to Arunachal Pradesh.csv
✅ Data saved successfully to Assam.csv
✅ Data saved successfully to Bihar.csv
✅ Data saved successfully to Chhattisgarh.csv
✅ Data saved successfully to Goa.csv
✅ Data saved successfully to Gujarat.csv
✅ Data saved successfully to Haryana.csv
✅ Data saved successfully to Himachal Pradesh.csv
✅ Data saved successfully to Jharkhand.csv
✅ Data saved successfully to Karnataka.csv
✅ Data saved successfully to Kerala.csv
✅ Data saved successfully to Madhya Pradesh.csv
✅ Data saved successfully to Maharashtra.csv
✅ Data saved successfully to Manipur.csv
✅ Data saved successfully to Meghalaya.csv
✅ Data saved successfully to Mizoram.csv
✅ Data saved successfully to Nagaland.csv
✅ Data saved successfully to Odisha.csv
✅ Data saved successfully to Punjab.csv
✅ Data saved successfully to Rajasthan.csv
✅ Data saved successfully to Sikkim.csv
✅ Data saved successfully to Tamil Nadu.csv
✅ Data saved su