# In [None]:
import os
import time
import re
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# ------------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------------
# Landing page that get_sector_cards() loads to find the sector cards.
# NOTE(review): the host part of this URL appears to be redacted ("www.........");
# restore the real address before running.
BASE_URL = "https://www.........Pages/Browse.aspx"
# Root folder for downloads; download_pdf() creates one sub-folder per sector under it.
DOWNLOAD_DIR = "C:/Users/Siddhant/Desktop/Folder1"  # Change this to your preferred path

# Setup Selenium (Chrome)
# NOTE: everything below runs at module load, so the Chrome session is started
# as soon as this file is executed or imported.
# Path to the local chromedriver binary -- machine-specific, adjust as needed.
service = Service("D:/chromedriver-win64/chromedriver-win64/chromedriver.exe")
chrome_options = webdriver.ChromeOptions()
# Chrome preference that sets the browser's own default download folder.
prefs = {"download.default_directory": DOWNLOAD_DIR}
chrome_options.add_experimental_option("prefs", prefs)
# Single shared browser session used by every helper below; main() quits it.
driver = webdriver.Chrome(service=service, options=chrome_options)

# ------------------------------------------------------------------------
# HELPER FUNCTIONS
# ------------------------------------------------------------------------
def sanitize_filename(filename):
    """Return ``filename`` with every character in ``\\ / * ? : " < > |`` replaced by ``_``.

    All other characters are left untouched, so the result has the same
    length as the input.
    """
    forbidden_chars = r'[\\/*?:"<>|]'
    cleaned = re.sub(forbidden_chars, '_', filename)
    return cleaned

def get_sector_cards():
    """Load ``BASE_URL`` and collect ``(sector_name, sector_url)`` pairs.

    Waits up to 20 seconds for the sector card anchors to be present. The
    name is taken from the ``alt`` attribute of the card's ``<img>``; if that
    lookup raises, the anchor's visible text is used instead. Cards missing
    either a name or an ``href`` are skipped.

    Returns:
        A list of ``(name, url)`` tuples, or an empty list when the wait
        times out.
    """
    driver.get(BASE_URL)
    card_wait = WebDriverWait(driver, 20)
    card_locator = (By.CSS_SELECTOR, "div.col-sm-6.col-md-3.item.marginTop30 a.sectImg.lazy")
    try:
        anchors = card_wait.until(EC.presence_of_all_elements_located(card_locator))
    except TimeoutException:
        print("Sector cards not found.")
        return []

    sectors = []
    for anchor in anchors:
        try:
            label = anchor.find_element(By.TAG_NAME, "img").get_attribute("alt")
        except Exception:
            # Best-effort fallback: use the link text when the image lookup fails.
            label = anchor.text.strip()
        href = anchor.get_attribute("href")
        if not (label and href):
            continue
        sectors.append((label, href))
    return sectors

def get_links_in_sector(sector_url):
    """Collect ``(link_text, href)`` pairs from a sector page, following pagination.

    On each page, waits up to 10 seconds for the job-role anchors, records
    them, then looks for the "next" arrow link. If one is found it is
    scrolled into view and clicked, followed by a fixed 5-second pause;
    otherwise the loop ends. Any exception is printed and also ends the
    loop, so links gathered up to that point are still returned.

    Args:
        sector_url: URL of the sector page to open.

    Returns:
        A list of ``(text, href)`` tuples in the order encountered.
    """
    collected = []
    role_locator = (By.CSS_SELECTOR, "div.padding5.bdrGray.marginLeftRight5.marginTop5 a")
    next_arrow_xpath = "//a[span/img[contains(@class, 'ms-promlink-button-right')]]"

    driver.get(sector_url)
    while True:
        try:
            page_wait = WebDriverWait(driver, 10)
            anchors = page_wait.until(EC.presence_of_all_elements_located(role_locator))

            # Append one at a time so a failure midway keeps what was already read.
            for anchor in anchors:
                entry = (anchor.text.strip(), anchor.get_attribute('href'))
                collected.append(entry)

            pager = driver.find_elements(By.XPATH, next_arrow_xpath)
            if not pager:
                break

            arrow = pager[0]
            driver.execute_script("arguments[0].scrollIntoView(true);", arrow)
            arrow.click()
            time.sleep(5)
        except Exception as e:
            print(f"Error: {e}")
            break
    return collected

# def download_pdf(title, url, sector_name):
#     """Download and rename PDFs for each job role."""
#     sector_folder = os.path.join(DOWNLOAD_DIR, sanitize_filename(sector_name))
#     os.makedirs(sector_folder, exist_ok=True)

#     driver.get(url)
#     wait = WebDriverWait(driver, 10)
#     try:
#         pdf_button = wait.until(EC.presence_of_element_located(
#             (By.XPATH, "//input[@type='submit' and @value='Download PDF']")))
#         link = pdf_button.get_attribute('href')

#         filename = sanitize_filename(title) + ".pdf"
#         filepath = os.path.join(sector_folder, filename)

#         pdf_data = requests.get(link)
#         if pdf_data.status_code == 200:
#             with open(filepath, 'wb') as f:
#                 f.write(pdf_data.content)
#             print(f"✅ Downloaded PDF: {filepath}")
#         else:
#             print(f"❌ Failed to download {title}")
#     except TimeoutException:
#         print(f"❌ No PDF found for {title}")

def download_pdf(job_title, url, sector_name):
    """Download the PDF for one job role into its sector's folder.

    Opens ``url`` in the shared Selenium ``driver``, waits up to 10 seconds
    for the "Download PDF" submit button, extracts the PDF address from the
    button's ``onclick`` handler (expected form ``window.open('<url>')``) and
    streams it to ``DOWNLOAD_DIR/<sector_name>/<job_title>.pdf``. Folder and
    file names are passed through ``sanitize_filename``.

    Problems while locating the button, extracting the link or downloading
    (missing button, no usable link, non-200 response, network or file
    errors) are reported with ``print`` and the function returns normally,
    so the caller can move on to the next job role.

    Args:
        job_title: Title of the job role; used for messages and the file name.
        url: Address of the job-role page that hosts the download button.
        sector_name: Sector the role belongs to; used as the sub-folder name.

    Returns:
        None.
    """
    sector_folder = os.path.join(DOWNLOAD_DIR, sanitize_filename(sector_name))
    os.makedirs(sector_folder, exist_ok=True)

    driver.get(url)
    wait = WebDriverWait(driver, 10)

    try:
        # Locate Download PDF button
        pdf_button = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//input[@type='submit' and contains(@value, 'Download PDF')]")))

        # Extract JavaScript onclick event
        onclick_text = pdf_button.get_attribute("onclick")
        print(f"🔍 onclick attribute: {onclick_text}")

        # Extract PDF URL using regex. get_attribute() returns None when the
        # button has no onclick handler, so fall back to an empty string.
        match = re.search(r"window\.open\('([^']+)'\)", onclick_text or "")
        if not match:
            print(f"❌ No valid link found for {job_title}. Skipping.")
            return
        link = match.group(1)
        print(f"🔗 Extracted PDF URL: {link}")

        # Validate URL
        if not link.startswith("http"):
            print(f"❌ Extracted URL is not valid: {link}. Skipping.")
            return

        # Download the PDF. The timeout keeps a stalled server from hanging
        # the whole run; the context manager releases the streamed connection.
        with requests.get(link, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"❌ Failed to download PDF for {job_title}, Status Code: {response.status_code}")
                return

            filename = sanitize_filename(job_title) + ".pdf"
            filepath = os.path.join(sector_folder, filename)

            with open(filepath, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

        print(f"✅ Downloaded PDF: {filepath}")

    except Exception as e:
        # Per-item boundary: report and continue with the next job role.
        print(f"⚠️ Error processing {job_title}: {e}")

# ------------------------------------------------------------------------
# MAIN LOGIC
# ------------------------------------------------------------------------
# def main():
#     try:
#         sectors = get_sector_cards()
#         print(f"Found {len(sectors)} sectors.")

#         for name, url in sectors:
#             print(f"\nProcessing sector: {name}")
#             job_links = get_job_links_in_sector(url)
#             print(f"  Found {len(job_links)} job links.")

#             for title, url in links:
#                 download_pdf(title, url, name)
#                 time.sleep(10)
#     finally:
#         driver.quit()


def main():
    """Scrape the sector list and download the PDFs of the last five sectors.

    Only the final five entries returned by ``get_sector_cards`` are
    processed (all of them when there are five or fewer), so an interrupted
    run can be resumed near the end. A 10-second pause follows every
    download attempt. The shared ``driver`` is always quit on exit.
    """
    try:
        all_sectors = get_sector_cards()
        print(f"Found {len(all_sectors)} sectors.")

        # Resume point: restrict the run to the trailing five sectors.
        pending_sectors = all_sectors[-5:]

        for sector_name, sector_url in pending_sectors:
            print(f"\nProcessing sector: {sector_name}")
            role_links = get_links_in_sector(sector_url)
            print(f"  Found {len(role_links)} links.")

            for role_title, role_url in role_links:
                download_pdf(role_title, role_url, sector_name)
                time.sleep(10)
    finally:
        driver.quit()

# Call main() only when this file is executed directly.
if __name__ == "__main__":
    main()


########     MAIN WORKING SELENIUM SCRAPING SCRIPT       #########