In [52]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize WebDriver and open the listing page.
driver = webdriver.Chrome()
try:
    driver.get("https://wbhealthscheme.gov.in/Home/wbhs_doc_details.aspx")  # Replace with the actual URL

    # Wait (up to 20 seconds) for the results table to be present in the DOM
    wait = WebDriverWait(driver, 20)
    wait.until(EC.presence_of_element_located((By.ID, "ContentPlaceHolder1_gd1")))
except Exception:
    # The page failed to load or the table never appeared: close the browser
    # so a failed run does not leave a Chrome session behind, then re-raise
    # so the failure is still visible to the caller.
    driver.quit()
    raise

# Rows collected across all pages; extract_data() appends to this list
data = []

# Function to extract data from the current page
def extract_data():
    """Append the data rows of the currently displayed table to ``data``.

    Parses the page source held by the module-level ``driver``, looks up the
    table with id ``ContentPlaceHolder1_gd1`` and, for every ``<tr>`` that
    contains exactly six ``<td>`` cells, appends the list of stripped cell
    texts to the module-level ``data`` list. Rows with any other cell count
    are ignored. Does nothing when the table is not found.
    """
    parsed = BeautifulSoup(driver.page_source, "html.parser")
    grid = parsed.find("table", {"id": "ContentPlaceHolder1_gd1"})  # Replace with the actual table ID

    if not grid:
        return

    for tr in grid.find_all("tr"):
        cells = [td.text.strip() for td in tr.find_all("td")]
        # Keep only rows with the expected number of columns
        if len(cells) != 6:
            continue
        data.append(cells)

# Scrape the first page, then walk the remaining pager links.
# The try/finally guarantees the browser is closed even when something
# outside the per-page error handler raises (e.g. the pager lookup).
try:
    # Extract data from the first page
    extract_data()

    # Find total number of pages from the numeric pager links.
    # NOTE(review): this only counts the page links currently rendered. If the
    # pager truncates long listings (for example behind a "..." link), pages
    # beyond the last visible number would be missed -- confirm against the site.
    pagination_links = driver.find_elements(By.XPATH, "//td/a[contains(@href, 'doPostBack')]")
    link_labels = (link.text.strip() for link in pagination_links)
    page_numbers = [label for label in link_labels if label.isdigit()]
    total_pages = int(page_numbers[-1]) if page_numbers else 1  # Get the last valid page number

    print(f"Total Pages: {total_pages}")

    # Loop through pagination by clicking pagination links
    for page_num in range(2, total_pages + 1):
        print(f"Extracting Page {page_num}...")

        try:
            # Grab a handle on the table BEFORE clicking. Looking it up after
            # the click races with the postback: the lookup could return the
            # already-refreshed table, which never goes stale, and the wait
            # below would time out.
            old_table = driver.find_element(By.ID, "ContentPlaceHolder1_gd1")

            # Find the pagination link for the current page
            pagination_link = driver.find_element(By.XPATH, f"//td/a[contains(@href, 'doPostBack') and text()='{page_num}']")
            # Scroll the link into view (optional but recommended)
            driver.execute_script("arguments[0].scrollIntoView();", pagination_link)
            # Click the pagination link
            pagination_link.click()

            # Wait for the old table to be replaced, then for the new one to appear
            wait.until(EC.staleness_of(old_table))
            wait.until(EC.presence_of_element_located((By.ID, "ContentPlaceHolder1_gd1")))

            # Extract data from the new page
            extract_data()
        except Exception as e:
            # Best effort: report the problem and keep whatever was collected so far
            print(f"Error navigating to page {page_num}: {e}")
            break  # Stop pagination if an error occurs
finally:
    # Close the browser
    driver.quit()

# Convert extracted data into a DataFrame (columns are positional; no header
# names are captured by extract_data)
df = pd.DataFrame(data)  # Adjust column names

# Save DataFrame to CSV
df.to_csv("scraped_data.csv", index=False)

Total Pages: 10
Extracting Page 2...
Extracting Page 3...
Extracting Page 4...
Extracting Page 5...
Extracting Page 6...
Extracting Page 7...
Extracting Page 8...
Extracting Page 9...
Extracting Page 10...
