# Bihar Web Scrapping

In [15]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-3.7.1-py2.py3-none-any.whl (25 kB)
Collecting python-dotenv
  Downloading python_dotenv-0.20.0-py3-none-any.whl (17 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-0.20.0 webdriver-manager-3.7.1
Note: you may need to restart the kernel to use updated packages.


### Working on Button Pagination

In [37]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait  # Import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC  # Import expected conditions
import time



# Set up the Selenium WebDriver
driver = webdriver.Chrome(executable_path='./chrome_driver/chromedriver.exe')
url = "https://vidhansabha.bihar.gov.in/Knowyourmla.html"
driver.get(url)

# Function to handle pagination
def handle_pagination():
    while True:
        try:
            # Find the current and next buttons
            next_button = driver.find_element(By.ID, "example_next")
            current_page = driver.find_element(By.CSS_SELECTOR, ".paginate_button.current")
            
            print(f"Current page: {current_page.text}")
            
            # Perform actions on the current page if needed
            # Scrape data or perform specific tasks here
            
            # Check if the 'Next' button is disabled
            if "disabled" in next_button.get_attribute("class"):
                print("Reached the last page.")
                break
            
            # Scroll to the next button to ensure visibility
            ActionChains(driver).move_to_element(next_button).perform()
            
            # Click the next button
            next_button.click()
            print("Clicked 'Next' button.")
            
            # Wait for the page to load
            WebDriverWait(driver, 10).until(
                EC.staleness_of(current_page)  # Wait until the current page element becomes stale
            )
        except Exception as e:
            print(f"Error handling pagination: {e}")
            break

# Call the function to iterate through pagination
handle_pagination()

# Close the browser
driver.quit()

Current page: 1
Clicked 'Next' button.
Current page: 2
Clicked 'Next' button.
Current page: 3
Clicked 'Next' button.
Current page: 4
Clicked 'Next' button.
Current page: 5
Clicked 'Next' button.
Current page: 6
Clicked 'Next' button.
Current page: 7
Clicked 'Next' button.
Current page: 8
Clicked 'Next' button.
Current page: 9
Clicked 'Next' button.
Current page: 10
Clicked 'Next' button.
Current page: 11
Clicked 'Next' button.
Current page: 12
Clicked 'Next' button.
Current page: 13
Clicked 'Next' button.
Current page: 14
Clicked 'Next' button.
Current page: 15
Clicked 'Next' button.
Current page: 16
Clicked 'Next' button.
Current page: 17
Clicked 'Next' button.
Current page: 18
Clicked 'Next' button.
Current page: 19
Clicked 'Next' button.
Current page: 20
Clicked 'Next' button.
Current page: 21
Clicked 'Next' button.
Current page: 22
Clicked 'Next' button.
Current page: 23
Clicked 'Next' button.
Current page: 24
Clicked 'Next' button.
Current page: 25
Reached the last page.


### Combining Data Extractor with Pagination

In [44]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Set up the Selenium WebDriver
driver = webdriver.Chrome(executable_path='./chrome_driver/chromedriver.exe')
url = "https://vidhansabha.bihar.gov.in/Knowyourmla.html"
driver.get(url)

# Initialize list to store data
data = []

# Pagination and Data Extraction
while True:
    try:
        # Parse page content with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.select('tr[role="row"]')

        for row in rows:
            # Extract data from cells
            cells = row.find_all('td')
            print(cells)
            if len(cells) > 1:  # Ensure it's a valid row with data
                constituency = cells[2].text.strip()
                name = cells[3].text.strip()
                gender = cells[4].text.strip()
                party = cells[5].text.strip()
                contact = cells[6].text.strip()
                email = cells[7].text.strip()

                data.append({
                    "Constituency": constituency,
                    "Name": name,
                    "Gender": gender,
                    "Party": party,
                    "Contact": contact,
                    "Email": email,
                    'state': 'bihar'
                })

        # Click the 'Next' button if available
        next_button = driver.find_element(By.ID, "example_next")
        if "disabled" in next_button.get_attribute("class"):
            print("Reached the last page.")
            break
        next_button.click()
    except Exception as e:
        print(f"Error during scraping: {e}")
        break

# Save data to CSV
df = pd.DataFrame(data)
df.to_csv('bihar_mla_contact_details.csv', index=False)
print("Data saved to bihar_mla_contact_details.csv")

# Close the browser
driver.quit()


[]
[<td class="sorting_1">01 </td>, <td><img class="img-thumbnail img-responsive" src="image/final%20mla%20photo%202020/01.jpg" style="width:100px; height:100px;"/> </td>, <td>Valmiki Nagar </td>, <td style="font-family:Times New Roman;font-size:16px;color:#3366FF;font-weight:bold;">Dhirendra Pratap Singh alias Rinku singh </td>, <td>Male_ </td>, <td>JD(U) </td>, <td>9798105666 </td>, <td>mla-vnagar-bih@nic.in</td>]
[<td class="sorting_1">02 </td>, <td><img class="img-thumbnail img-responsive" src="image/final%20mla%20photo%202020/02.jpg" style="width:100px; height:100px;"/> </td>, <td>Ramnagar (SC) </td>, <td style="font-family:Times New Roman;font-size:16px;color:#3366FF;font-weight:bold;">Bhagirathi Devi </td>, <td>Female </td>, <td>BJP </td>, <td>9931378642 </td>, <td>mla-ramnagar-bih@nic.in</td>]
[<td class="sorting_1">03 </td>, <td><img class="img-thumbnail img-responsive" src="image/final%20mla%20photo%202020/03.jpg" style="width:100px; height:100px;"/> </td>, <td>Narkatiaganj <