# In [2]:
from bs4 import BeautifulSoup
import logging
import csv
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from pathlib import Path
import pandas as pd

# HTTP header set carrying a desktop-Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
}

# Base URL of each supported job site, keyed by the site name.
URLS = {"indeed": "https://ie.indeed.com"}

def extract_site(site: str, skill_name: str, location="Ireland", num_page=0) -> BeautifulSoup:
    """
    Fetch one page of job-search results and return its parsed HTML.

    Parameters:
    - site (str): Key of the website to query. Only "indeed" is supported.
    - skill_name (str): The skill or job title to search for.
    - location (str): The location where the job search should be conducted. Defaults to "Ireland".
    - num_page (int): Zero-based index of the results page to fetch. It is sent
      as the `start` query parameter, computed as `num_page * 10`. Defaults to 0
      (the first page).

    Returns:
    - soup (BeautifulSoup): The BeautifulSoup object containing the parsed HTML.

    Raises:
    - ValueError: If `site` is not a supported site.
    """
    from urllib.parse import urlencode

    # Reject unsupported sites before a browser is launched; otherwise Chrome
    # would start only to fail on an empty URL.
    if site != "indeed":
        raise ValueError(f"Unsupported site: {site!r}")

    # urlencode escapes every value (spaces become '+'), so multi-word skills
    # and locations containing commas or '&' produce a valid query string.
    query = urlencode({"q": skill_name, "l": location, "start": num_page * 10})
    url = f"{URLS[site]}/jobs?{query}"

    options = Options()
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(5)  # Let the page load (adjust this time according to your needs)
        return BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always close the browser, even if loading or parsing raised.
        driver.quit()


def _text_or_na(card, tag, css_class):
    """Return the stripped text of the first `tag` with `css_class` in `card`, or "N/A" if absent."""
    elem = card.find(tag, class_=css_class)
    return elem.text.strip() if elem is not None else "N/A"


if __name__ == "__main__":
    # Search parameters: what to look for, where, and how many result pages to fetch.
    job_data = []
    skill_name = "sustainability"
    location = "Ireland"
    num_pages = 7

    # Fetch each results page and collect one record per job card.
    for page in range(num_pages):
        soup = extract_site(site="indeed", skill_name=skill_name, location=location, num_page=page)
        job_cards_div = soup.find("div", attrs={"id": "mosaic-provider-jobcards"})
        if not job_cards_div:
            print("No job cards found on this page.")
            continue

        for job in job_cards_div.find_all("li", class_="css-5lfssm eu4oa1w0"):
            job_link_elem = job.find('a')
            if not job_link_elem:
                # List items without any anchor are skipped.
                continue

            # Without a 'data-jk' id no valid link can be built, so fall back
            # to "N/A" rather than emitting a broken "...?jk=None" URL.
            job_id = job_link_elem.get('data-jk')
            job_link = f"{URLS['indeed']}/viewjob?jk={job_id}" if job_id else "N/A"

            job_data.append({
                'Job Title': _text_or_na(job, "div", "css-dekpa e37uo190"),
                'Company': _text_or_na(job, "span", "css-92r8pb eu4oa1w0"),
                'Location': _text_or_na(job, "div", "css-1p0sjhy eu4oa1w0"),
                'Link': job_link,
            })
            
            


# In [5]:
# Build the results table. The columns are listed explicitly (same names and
# order as the keys of each job_data record) so that an empty scrape still
# produces a CSV with a header row.
df = pd.DataFrame(job_data, columns=["Job Title", "Company", "Location", "Link"])

csv_filename = "sustainability1.csv"

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(csv_filename, index=False)