In [None]:
!apt-get update
!apt install -y firefox
!wget https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-linux64.tar.gz
!tar -xvzf geckodriver-v0.34.0-linux64.tar.gz
!mv geckodriver /usr/local/bin/
!pip install selenium
!pip install geckodriver
!pip install python_dotenv
!pip install opencv-python-headless
!pip install Pillow
!pip install pdfkit

from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException 
from dotenv import load_dotenv
from PIL import Image,ImageEnhance
import pytesseract
from io import BytesIO
import requests
import pdfkit
import os
import csv
import re
import time
import os
import cv2
import numpy as np
from pytesseract import image_to_string
from selenium.webdriver.common.keys import Keys

In [None]:
class DriverManager:
    """Own the Selenium Firefox session used by the rest of the notebook.

    The constructor builds a ``FirefoxOptions`` object and starts Firefox with
    it. ``get_driver`` hands the live driver to callers and ``close_driver``
    shuts the browser down.
    """

    def __init__(self):
        # Initialize Firefox options
        self.options = webdriver.FirefoxOptions()
        # NOTE(review): `--start-maximized` is usually documented as a
        # Chromium switch; confirm Firefox honours it, otherwise call
        # `driver.maximize_window()` after start-up.
        self.options.add_argument('--start-maximized')  # Start maximized
        # The options object must be handed to the driver explicitly;
        # creating it alone has no effect on the launched browser.
        self.driver = webdriver.Firefox(options=self.options)

    def get_driver(self):
        """Return the WebDriver instance created in ``__init__``."""
        return self.driver

    def close_driver(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

In [None]:
class DistrictHandler:
    """Iterate over the district dropdown and process every district."""

    def __init__(self, driver):
        # Selenium WebDriver shared with the other handlers.
        self.driver = driver

    def select_district(self, district_index):
        """Open the district dropdown and click one of its options.

        Args:
            district_index: value interpolated into the option's ``value``
                attribute selector.
        """
        dropdown = self.driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ddlDist")
        dropdown.click()
        option_selector = f"select[id='ctl00_ContentPlaceHolder1_ddlDist'] option[value='{district_index}']"
        district_option = self.driver.find_element(By.CSS_SELECTOR, option_selector)
        district_option.click()

    def process_districts(self):
        """Select each district after the first option and process its constituencies."""
        dropdown = self.driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ddlDist")
        district_count = len(Select(dropdown).options)
        # Index 0 is skipped, exactly as the dropdown's first entry was before.
        district_index = 1
        while district_index < district_count:
            self.select_district(district_index)
            ConstituencyHandler(self.driver).handle_constituency()
            district_index += 1

In [None]:
class ConstituencyHandler:
    """Iterate over the constituency dropdown and process each valid entry.

    For every option whose text matches the ``<number>-<name>`` pattern the
    handler selects it, presses the submit button and delegates to
    ``PollingStationHandler``. Stale-element and timeout errors are retried a
    bounded number of times instead of forever.
    """

    # Maximum tries per constituency before it is skipped. The previous
    # `while True` loop never terminated when the error was persistent.
    MAX_RETRIES = 5
    # Seconds to pause between retries.
    RETRY_DELAY = 1

    def __init__(self, driver):
        # Selenium WebDriver shared with the other handlers.
        self.driver = driver

    def select_constituency(self, constituency_index):
        """Open the constituency dropdown and return the option at an index.

        The dropdown is looked up again after the click so the returned option
        belongs to a freshly located ``Select``.

        Args:
            constituency_index: position of the wanted option.

        Returns:
            The option element at ``constituency_index``.
        """
        self.driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ddlAC").click()
        refreshed_constituency = Select(self.driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ddlAC"))
        return refreshed_constituency.options[constituency_index]

    def _process_constituency(self, constituency_index, pattern):
        """Select one constituency, submit the form and process its stations."""
        option = self.select_constituency(constituency_index)
        if not pattern.search(option.text):
            # Placeholder or malformed entry: nothing to do.
            return
        # Click the option we already hold instead of re-locating it through
        # an XPath built from its text (fragile with quotes, and it could
        # match a same-named option elsewhere on the page).
        option.click()
        submit_button = WebDriverWait(self.driver, 2).until(
            EC.presence_of_element_located((By.ID, 'ctl00_ContentPlaceHolder1_btnlogin'))
        )
        submit_button.click()
        polling_station_handler = PollingStationHandler(self.driver)
        polling_station_handler.process_polling_stations()

    def handle_constituency(self):
        """Process every constituency option after the first one.

        Each constituency is attempted up to ``MAX_RETRIES`` times when a
        ``StaleElementReferenceException`` or ``TimeoutException`` is raised;
        after that it is reported and skipped so the run can continue.
        """
        constituency = Select(self.driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_ddlAC"))
        total_constituency = len(constituency.options)
        # NOTE(review): `\S+` rejects names containing spaces - confirm this
        # is intended for the site's data.
        pattern = re.compile(r"^(?!-Select-$|\d-$)(\d+-\S+)$")
        for j in range(1, total_constituency):
            for _attempt in range(self.MAX_RETRIES):
                try:
                    self._process_constituency(j, pattern)
                    break
                except StaleElementReferenceException:
                    print("Caught StaleElementReferenceException, retrying...")
                    time.sleep(self.RETRY_DELAY)
                except TimeoutException:
                    print("Caught TimeoutException, retrying...")
                    time.sleep(self.RETRY_DELAY)
            else:
                # All attempts failed: skip rather than spin forever.
                print(f"Giving up on constituency index {j} after {self.MAX_RETRIES} attempts")


In [None]:
class PdfDownloader:
    """Save the page currently shown by the driver as a PDF via wkhtmltopdf.

    The page source is written to ``page.html`` inside the output directory
    and then converted with ``pdfkit``. Each call writes to a PDF name that
    does not exist yet, so earlier downloads are not overwritten.
    """

    # Defaults match the locations the notebook has always used.
    DEFAULT_OUTPUT_DIR = '/root/Desktop/Voter_Roll_Downloader'
    WKHTMLTOPDF_PATH = '/usr/bin/wkhtmltopdf'

    def __init__(self, driver, output_dir=None):
        """
        Args:
            driver: Selenium WebDriver whose ``page_source`` is saved.
            output_dir: optional directory for ``page.html`` and the PDFs;
                defaults to ``DEFAULT_OUTPUT_DIR``.
        """
        self.driver = driver
        self.output_dir = output_dir or self.DEFAULT_OUTPUT_DIR
        # Next numeric suffix to try when picking a free PDF file name.
        self._pdf_index = 0

    def _next_pdf_path(self):
        """Return a PDF path in the output directory that does not exist yet.

        The first candidate is ``voter_roll.pdf``; later ones are
        ``voter_roll_1.pdf``, ``voter_roll_2.pdf`` and so on.
        """
        while True:
            if self._pdf_index == 0:
                file_name = 'voter_roll.pdf'
            else:
                file_name = f'voter_roll_{self._pdf_index}.pdf'
            self._pdf_index += 1
            candidate = os.path.join(self.output_dir, file_name)
            if not os.path.exists(candidate):
                return candidate

    def download_pdf(self):
        """Write the current page to disk and convert it to a PDF.

        Returns:
            The path of the PDF that was written, or ``None`` when saving or
            converting failed (the error is printed, not raised).
        """
        try:
            page_html = self.driver.page_source
            os.makedirs(self.output_dir, exist_ok=True)
            output_html_path = os.path.join(self.output_dir, 'page.html')
            with open(output_html_path, 'w', encoding='utf-8') as file:
                file.write(page_html)
            # Convert only after the `with` block has closed the file, so the
            # buffered HTML is fully on disk when the converter reads it.
            config = pdfkit.configuration(wkhtmltopdf=self.WKHTMLTOPDF_PATH)
            output_pdf_path = self._next_pdf_path()
            pdfkit.from_file(output_html_path, output_pdf_path, configuration=config)
            return output_pdf_path
        except Exception as e:
            print(f"Failed to download PDF: {e}")
            return None

In [None]:
class CaptchaProcessor:
    """Read the captcha image on the current page, OCR it and submit it."""

    def __init__(self, driver):
        # Selenium WebDriver, expected to be focused on the captcha window.
        self.driver = driver

    def captcha_screenshot(self):
        """Wait for the captcha image (id ``Image2``) and return it as PNG bytes."""
        captcha_element = WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.ID, "Image2"))
        )
        return captcha_element.screenshot_as_png

    def process_captcha_image(self):
        """Pre-process the captcha screenshot for OCR.

        Steps: raise the contrast, convert to grayscale, double the size,
        apply a morphological close and binarise with Otsu's threshold.
        The enhanced image is written to ``enhanced_captcha.png`` in the
        current working directory as an intermediate file.

        Returns:
            The thresholded image array produced by ``cv2.threshold``.
        """
        captcha_image = self.captcha_screenshot()
        image = Image.open(BytesIO(captcha_image))  # Convert the screenshot to a PIL image
        enhancer = ImageEnhance.Contrast(image)  # enhance the image contrast
        enhanced_image = enhancer.enhance(2.0)
        enhanced_image.save("enhanced_captcha.png")  # save the enhanced image for processing
        img = cv2.imread("enhanced_captcha.png")  # load image with opencv
        gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert it into grayscale
        (h, w) = gry.shape[:2]
        gry = cv2.resize(gry, (w*2, h*2))  # double the size to help text extraction
        # NOTE(review): the kernel argument is None - presumably OpenCV falls
        # back to its default structuring element; confirm.
        cls = cv2.morphologyEx(gry, cv2.MORPH_CLOSE, None)
        return cv2.threshold(cls, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    def extract_captcha(self):
        """OCR the processed captcha and return the text without whitespace.

        The raw OCR string can carry whitespace such as a trailing newline or
        form feed. Typing a newline into the input would act as an Enter key
        press, so all whitespace is removed before the text is returned.
        """
        thr = self.process_captcha_image()
        raw_text = image_to_string(thr)
        return "".join(raw_text.split())

    def enter_captcha(self):
        """Type the OCR result into ``txtVerificationCode`` and click ``btnSubmit``."""
        self.verification_code = WebDriverWait(self.driver, 2).until(
            EC.presence_of_element_located((By.ID, "txtVerificationCode"))
        )
        captcha_text = self.extract_captcha()
        self.verification_code.send_keys(captcha_text)
        self.driver.find_element(By.ID, "btnSubmit").click()

    def handle_captcha_failure(self):
        """Detect a rejected captcha and, if found, close the current window.

        Waits up to 10 seconds for the element ``lblCaptchaMessage``. When it
        appears, the current window is closed and focus returns to the first
        window handle.

        Returns:
            ``True`` when the captcha error element was found (window closed),
            ``False`` when the wait timed out, i.e. no error was shown.
        """
        try:
            # Wait until error message is present
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "lblCaptchaMessage"))
            )
        except TimeoutException:
            print("Timed out waiting for captcha error message")
            return False
        # NOTE(review): presence alone is treated as failure; confirm the
        # label is absent from the page when the captcha is accepted.
        print("Found captcha error")
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        return True

In [None]:
class PollingStationHandler:
    """Open every polling-station link of the current grid and save its roll.

    For each "English" link in the detected grid the handler clicks the link,
    switches to the popup it opens, solves the captcha, saves the page as a
    PDF and closes the popup again. Each station is tried a bounded number of
    times.
    """

    # Maximum tries per polling station (captcha OCR can fail repeatedly).
    # The previous `while not success` loop never ended on persistent errors.
    MAX_ATTEMPTS = 5
    # Seconds to wait for the popup window to appear after the click.
    POPUP_TIMEOUT = 10

    def __init__(self, driver):
        # Selenium WebDriver shared with the other handlers.
        self.driver = driver

    def check_view(self):
        """Return the name of the grid view present on the page, or ``None``.

        The grids are probed in the order GridView1, GridView3, GridView2 by
        looking for the ``ctl02`` "English" link of each one.
        """
        #to handle dynamic grid view
        grid_views = ["GridView1", "GridView3", "GridView2"]
        for grid_view in grid_views:
            try:
                self.driver.find_element(By.XPATH, f"//a[@id='ctl00_ContentPlaceHolder1_{grid_view}_ctl02_lnkEnglish']")
                return grid_view
            except Exception as e:
                print(f"{grid_view} not found:{e}")
        return None

    def _english_link_ids(self, grid_view):
        """Collect the ids of all "English" links inside the given grid.

        Reading the ids from the page avoids rebuilding them from a row
        counter, which produced wrong ids (``ctl010``) from the tenth control
        on and counted rows of ``GridView1`` even when another grid was shown.
        """
        prefix = f"ctl00_ContentPlaceHolder1_{grid_view}_ctl"
        links = self.driver.find_elements(
            By.XPATH,
            f"//a[starts-with(@id,'{prefix}') and contains(@id,'_lnkEnglish')]",
        )
        return [link.get_attribute("id") for link in links]

    def _restore_main_window(self, main_window):
        """Close every window except ``main_window`` and focus the main one."""
        try:
            for handle in list(self.driver.window_handles):
                if handle != main_window:
                    self.driver.switch_to.window(handle)
                    self.driver.close()
            self.driver.switch_to.window(main_window)
        except Exception as e:
            print(f"Error occurred:{e}")

    def _download_station(self, link_id, captcha_processor, pdf_downloader):
        """Open one station popup, solve the captcha and save the page.

        Returns:
            ``True`` when the PDF step ran in the popup, ``False`` when the
            popup was closed by the captcha-failure handling.
        """
        handles_before = set(self.driver.window_handles)
        view_element = self.driver.find_element(By.XPATH, f"//a[@id='{link_id}']")
        self.driver.execute_script("arguments[0].scrollIntoView(true);", view_element)
        view_element.click()
        # Wait for the window this click opened instead of assuming it is
        # `window_handles[1]`, which may be an older popup or not exist yet.
        new_handles = WebDriverWait(self.driver, self.POPUP_TIMEOUT).until(
            lambda d: set(d.window_handles) - handles_before
        )
        popup = next(iter(new_handles))
        self.driver.switch_to.window(popup)
        self.driver.maximize_window()
        captcha_processor.enter_captcha()
        captcha_processor.handle_captcha_failure()
        # The failure handler closes the popup when the captcha was rejected;
        # in that case there is nothing to save and the station is retried.
        if popup not in self.driver.window_handles:
            return False
        pdf_downloader.download_pdf()
        return True

    def process_polling_stations(self):
        """Download the roll of every polling station listed in the grid.

        Does nothing when no known grid view is found. Errors while handling a
        station are printed and the station is retried up to ``MAX_ATTEMPTS``
        times; after that it is reported and skipped.
        """
        captcha_processor = CaptchaProcessor(self.driver)
        pdf_downloader = PdfDownloader(self.driver)
        grid_view = self.check_view()
        if not grid_view:
            return
        main_window = self.driver.window_handles[0]
        for link_id in self._english_link_ids(grid_view):
            for _attempt in range(self.MAX_ATTEMPTS):
                try:
                    downloaded = self._download_station(link_id, captcha_processor, pdf_downloader)
                except Exception as e:
                    print(f"Error occurred:{e}")
                    downloaded = False
                # Always return to the station list with popups closed so the
                # next lookup runs against the main page.
                self._restore_main_window(main_window)
                if downloaded:
                    # NOTE(review): implicitly_wait sets a session-wide lookup
                    # timeout, it does not pause; kept to preserve behaviour.
                    self.driver.implicitly_wait(15)
                    self.driver.execute_script("window.scrollBy(0,500)")
                    time.sleep(2)
                    break
            else:
                print(f"Giving up on {link_id} after {self.MAX_ATTEMPTS} attempts")

In [None]:
class VoterRollDownloader:
    """Entry point that opens the voter-roll site and starts the crawl.

    The target URL is read from the ``VOTER_URL`` environment variable after
    loading a ``.env`` file.
    """

    # Location of the .env file the notebook has always loaded.
    DEFAULT_ENV_PATH = '/root/Desktop/Voter_Roll_Downloader/cred.env'

    def __init__(self, driver_manager, env_path=None):
        """
        Args:
            driver_manager: object exposing ``get_driver()``.
            env_path: optional path of the ``.env`` file to load; defaults to
                ``DEFAULT_ENV_PATH``.
        """
        load_dotenv(env_path or self.DEFAULT_ENV_PATH)
        self.url = os.environ.get('VOTER_URL')
        self.driver = driver_manager.get_driver()

    def initialize_driver(self):
        """Navigate the browser to the configured URL.

        Raises:
            ValueError: if ``VOTER_URL`` was not set, so the problem is
                reported clearly instead of passing ``None`` to the driver.
        """
        if not self.url:
            raise ValueError(
                "VOTER_URL is not set; define it in the environment or in the .env file"
            )
        self.driver.get(self.url)

    def extract_data(self):
        """Open the site and process every district."""
        self.initialize_driver()
        district_handler = DistrictHandler(self.driver)
        district_handler.process_districts()

    def close_driver(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

In [None]:
if __name__ == "__main__":
    # Start the browser, run the full crawl and always shut the browser down,
    # even when set-up or the crawl raises, so no Firefox process is left behind.
    driver_manager = DriverManager()
    try:
        roll_extractor = VoterRollDownloader(driver_manager)
        roll_extractor.extract_data()
    finally:
        driver_manager.close_driver()