In [1]:
# Importing the necessary Libraries
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from sqlalchemy.orm import sessionmaker
from db_setup import get_database_engine
from dotenv import load_dotenv
import os
import time

In [2]:
# Import your models
from models import Workday_Company_Data  

# Load environment variables
load_dotenv()

# Establish a connection to your PostgreSQL database
engine = get_database_engine()
Session = sessionmaker(bind=engine)
session = Session()

# Fetching the company_info data from the database
company_data = session.query(Workday_Company_Data).all()

# Converting the fetched data into a DataFrame
company_df = pd.DataFrame([(c.Company_ID, c.Company_Name, c.Company_URL) for c in company_data],
                          columns=['company_id', 'company_name', 'company_url'])

In [8]:
company_df=company_df.head(1)
# Implement your scraping logic here for the current company
job_data = []
# Assuming you have a DataFrame named 'company_df' containing company information
for index, company_row in company_df.iterrows():
    # Assuming you have retrieved the company_id, company_name, and company_url from the DataFrame
    company_id = company_row['company_id']
    company_name = company_row['company_name']
    company_url = company_row['company_url']

    # Extract the base URL from the company URL
    # if company_url.endswith('/en-US/Careers'):
    #     base_company_url = company_url[:-len('/en-US/Careers')]
    # elif company_url.endswith('/en-US'):  # Check if company_url ends with "/en-US"
    #     base_company_url = company_url[:-len('/en-US')]  # Remove "/en-US" if it exists
    # elif company_url.endswith('/'):
    #     base_company_url = company_url.rsplit('/', 1)[0]
    # else :
    #     base_company_url=company_url

    base_company_url = company_url.split('.com')[0] + '.com'
    print("base_company_url",base_company_url)

    
    # Navigate to the company's website
    driver = webdriver.Chrome()
    driver.get(company_url)


    page_count = 0
    while page_count < 3:
        # Get the page source
        page_source = driver.page_source
        
        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all job postings on the current page
        job_postings = soup.find_all("li", class_="css-1q2dra3")

        # Extract job data from each posting
        for job in job_postings:
            try:
                job_title_element = job.find('a', class_='css-19uc56f')
                job_title = job_title_element.text.strip()
                print('jobTitle',job_title)
                job_url = job_title_element['href']
                

                location_element = job.find('dt', string='locations').find_next_sibling('dd')
                location = location_element.text.strip()
                
                
                posting_date_element = job.find('dt', string=lambda text: text in ['posted on', 'postedOn']).find_next_sibling('dd')
                posting_date = posting_date_element.text.strip()
                

                job_id_element = job.find('li', class_='css-h2nt8k')
                job_req_id = job_id_element.text.strip()
                

                
                if posting_date in ['Posted Today', 'Posted Yesterday']:
                    full_job_url = f"{base_company_url}{job_url}"
            
                    job_data.append({
                        'company_id': company_id,
                        'company_name': company_name,
                        'company_url': company_url,
                        'job_title': job_title,
                        'job_url':  full_job_url,
                        'location': location,
                        'posting_date': posting_date,
                        'job_req_id': job_req_id
                    })
                    
                else:
                    continue
            except AttributeError as e:
                print("Error occurred while extracting job data:", e)
                continue

        # Increment page count
        page_count += 1
        # Look for the next page button
        # next_button_xpath = f'//li[contains(@class, "css-1j096s0")]/button[text()="{page_count + 1}"]'
        try: 
            next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'css-1j096s0')]/button[contains(@aria-label, 'page')]")))
            print(next_button)
            # next_button = driver.find_element(By.XPATH, next_button)

            # If there is no next page button, break the loop
            if not next_button or page_count >= 3:
                break
            # Click the next page button
            next_button.click()
            
            # Wait for the page to load
            WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.ID, "loading-indicator")))  # Replace "loading-indicator" with actual ID if applicable
        except Exception as e:
            print(f"Error navigating to the next page: {e}")
            continue  # Break the loop if unable to find the next button or any other exception occurs

    # Quit the driver
    driver.quit()
    


base_company_url https://pru.wd5.myworkdayjobs.com
<selenium.webdriver.remote.webelement.WebElement (session="e9820f2cfafa43aa0deb9cc02e60ef59", element="f.B5275E248B8DEEB9FA0FBEBFF41D6F46.d.EEC9C1ABBBC5D1A91A53B2C59FDECF9E.e.19")>
jobTitle Senior DevSecOps Engineer
jobTitle PGIM Quantitative Solutions, Investment VP, Multi-Asset Portfolio Manager
jobTitle PGIM Fixed Income: Project Manager (Hybrid/Newark, NJ)
jobTitle Actuarial Associate
jobTitle Associate Actuary
jobTitle PGIM Private Capital - Manager, Business Group Risk Management
jobTitle Sales Support Administrator
jobTitle PGIM Director, Third-Party Risk Management (Hybrid-Newark, NJ/Tampa,FL)
jobTitle Business Analyst - Financial Management Transformation (Hybrid)
jobTitle Sr Business Analyst - Financial Management Transformation (Hybrid)
jobTitle Lead, Infrastructure Developer
jobTitle PGIM Fixed Income: Sr. Investment Analyst – Securitized Products (Hybrid/Newark, NJ)
jobTitle Vice President, Tech Lead - API Platform
jobTitl

In [19]:
df_job = pd.DataFrame(job_data)

In [21]:
df_job.shape


(4, 8)