In [1]:
import os  # For file and folder operations
import requests  # To send HTTP requests to websites
from bs4 import BeautifulSoup as bs  # To parse HTML content of a webpage
import pandas as pd  # To handle tabular data
from io import StringIO  # Helps in handling data as strings (for converting tables)
 
# Report pages to scrape: one "app_issue" URL per block.
# Only two blocks are listed here; append further URLs to cover more blocks.
app_issue_links = [
    'https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&district_code=3215&block_code=3215001&block_name=AMDANGA&fin_year=2023-2024&source=national&Digest=+fye1dKGV8VZ/Al5kDRPHg',
    'https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&district_code=3215&block_code=3215002&block_name=BARASAT-I&fin_year=2023-2024&source=national&Digest=/R2LPS3tUVSRferckJTH5A',
]
 
# Fetch one report page and return its data table as a DataFrame
def scrape_app_issue_data(url, timeout=30):
    """Download *url* and return its first ``border="1"`` table as a DataFrame.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block forever on a stalled
            connection and hang the whole scraping run.

    Returns:
        The parsed table as a ``pandas.DataFrame``, or ``None`` when the
        request fails or times out, the status code is not 200, the page has
        no ``<table border="1">``, or parsing the table raises.
    """
    print(f"Fetching data from {url}...")

    # Deliberately broad: this is a best-effort scraper, so any failure for a
    # single URL is reported and turned into ``None`` instead of aborting the
    # remaining URLs.
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
            return None

        soup = bs(response.text, 'lxml')

        # Only the first table carrying border="1" is considered; other
        # tables on the page are skipped.
        table = soup.find('table', {'border': '1'})
        if not table:
            print(f"No table found on {url}")
            return None

        # read_html expects a file-like object for literal HTML, so wrap the
        # table markup in StringIO before parsing.
        table_io = StringIO(str(table))
        return pd.read_html(table_io)[0]
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None
 
# Turn a URL-derived name into something usable as a filename
def sanitize_filename(filename):
    """Return *filename* with spaces, ``+``, ``=`` and ``/`` replaced by ``_``.

    All other characters are left untouched.
    """
    # One translation table handles all four characters in a single pass.
    to_underscore = str.maketrans(" +=/", "____")
    return filename.translate(to_underscore)
 
# Scrape every URL in a list and collect the successful results
def scrape_data_from_links(links):
    """Scrape each URL in *links* and return ``{url: DataFrame}``.

    URLs for which ``scrape_app_issue_data`` returns ``None`` are left out of
    the result.
    """
    results = {}

    for link in links:
        print(f"Scraping data from {link}...")

        frame = scrape_app_issue_data(link)
        if frame is None:
            continue  # Nothing usable came back for this URL

        results[link] = frame

    return results
 
# Create the folder that will hold one CSV file per scraped URL
output_dir = "scraped_data"
os.makedirs(output_dir, exist_ok=True)  # No error if the folder already exists

# Scrape every URL; the result maps each URL to its DataFrame
scraped_data = scrape_data_from_links(app_issue_links)

# Preview each scraped table and save it to its own CSV file
for url, data in scraped_data.items():
    if data is None:
        continue

    print(f"Data from {url}:")
    print(data.head())  # Print first few rows of the data

    # Name the file after the URL's Digest value. Everything after "Digest="
    # is used rather than the text after the last "=": a digest that ends in
    # "=" padding would otherwise yield an empty suffix, and every such URL
    # would be written to the same file. URLs without a Digest parameter fall
    # back to the text after the last "=".
    _, marker, digest = url.rpartition("Digest=")
    if not marker:
        digest = url.split("=")[-1]
    sanitized_filename = sanitize_filename(f"scraped_data_from_{digest}.csv")

    # Full path where the CSV file will be saved
    output_path = os.path.join(output_dir, sanitized_filename)

    # A failed write is reported but does not stop the remaining files
    try:
        data.to_csv(output_path, index=False)  # Save to CSV without row numbers
        print(f"Saved data to {output_path}")
    except Exception as e:
        print(f"Error saving data to {output_path}: {e}")

Scraping data from https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&district_code=3215&block_code=3215001&block_name=AMDANGA&fin_year=2023-2024&source=national&Digest=+fye1dKGV8VZ/Al5kDRPHg...
Fetching data from https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&district_code=3215&block_code=3215001&block_name=AMDANGA&fin_year=2023-2024&source=national&Digest=+fye1dKGV8VZ/Al5kDRPHg...
Scraping data from https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&district_code=3215&block_code=3215002&block_name=BARASAT-I&fin_year=2023-2024&source=national&Digest=/R2LPS3tUVSRferckJTH5A...
Fetching data from https://nreganarep.nic.in/netnrega/app_issue.aspx?page=b&lflag=&state_name=WEST+BENGAL&state_code=32&district_name=24+PARGANAS+(NORTH)&distric