# In [None]:
import time
import random
import re
import csv
import os
import pickle
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from bs4 import BeautifulSoup

# Pool of browser User-Agent strings; one is picked at random for each new driver.
USER_AGENTS = [
    # Chrome 91 on Windows (NT 10.0)
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    # Safari 14.0.3 on macOS 10.15.7
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/14.0.3 Safari/605.1.15"
    ),
    # Chrome 91 on Linux x86_64
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    ),
]

def get_random_user_agent():
    """Return one User-Agent string chosen at random from ``USER_AGENTS``."""
    chosen_agent = random.choice(USER_AGENTS)
    return chosen_agent

def random_sleep(min_seconds=2, max_seconds=5):
    """Pause for a random duration and announce it on stdout.

    Args:
        min_seconds: One bound of the delay range, in seconds.
        max_seconds: The other bound of the delay range, in seconds.
    """
    # Draw the delay first so the printed value is exactly what is slept.
    delay = random.uniform(min_seconds, max_seconds)
    print(f"Waiting for {delay:.2f} seconds...")
    time.sleep(delay)

def _split_proxy(proxy):
    """Split a ``[scheme://]host:port`` proxy string into ``(scheme, host, port)``."""
    from urllib.parse import urlsplit

    # urlsplit only parses host/port after "//", so add it for a bare "host:port".
    parts = urlsplit(proxy if "://" in proxy else f"//{proxy}")
    if not parts.hostname or parts.port is None:
        raise ValueError(f"Proxy must look like [scheme://]host:port, got {proxy!r}")
    return (parts.scheme or "http").lower(), parts.hostname, parts.port


def setup_driver(proxy=None, browser='chrome'):
    """Create a headless, private-mode WebDriver with a random User-Agent.

    Args:
        proxy: Optional proxy as ``host:port`` or ``scheme://host:port``.
            Chrome receives it verbatim via ``--proxy-server``; Firefox
            receives it through its ``network.proxy.*`` preferences.
        browser: ``'chrome'`` or ``'firefox'``.

    Returns:
        The started WebDriver instance.

    Raises:
        ValueError: If ``browser`` is not supported, or if a Firefox proxy
            string has no usable host and port.
    """
    if browser == 'chrome':
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--incognito")
        options.add_argument(f"user-agent={get_random_user_agent()}")
        if proxy:
            options.add_argument(f'--proxy-server={proxy}')
        driver = webdriver.Chrome(options=options)
    elif browser == 'firefox':
        from selenium.webdriver.firefox.options import Options as FirefoxOptions
        options = FirefoxOptions()
        options.add_argument("--headless")
        options.add_argument("--private")
        options.set_preference("general.useragent.override", get_random_user_agent())
        if proxy:
            # Previously the proxy was silently dropped for Firefox, so traffic
            # went out directly even though the caller asked for a proxy.
            scheme, host, port = _split_proxy(proxy)
            options.set_preference("network.proxy.type", 1)  # 1 = manual configuration
            if scheme.startswith("socks"):
                options.set_preference("network.proxy.socks", host)
                options.set_preference("network.proxy.socks_port", port)
                options.set_preference("network.proxy.socks_version", 4 if scheme == "socks4" else 5)
            else:
                # Route both plain and TLS traffic through the same proxy.
                options.set_preference("network.proxy.http", host)
                options.set_preference("network.proxy.http_port", port)
                options.set_preference("network.proxy.ssl", host)
                options.set_preference("network.proxy.ssl_port", port)
        driver = webdriver.Firefox(options=options)
    else:
        raise ValueError("Unsupported browser!")
    return driver

def get_page(driver, url, retries=3):
    """Load ``url`` and wait until a job card is present, retrying on failure.

    Each attempt navigates to ``url`` and waits up to 60 seconds for an
    element with class ``job_seen_beacon``. After a failed attempt that is
    not the last one, the function sleeps 5-10 seconds before retrying.

    Args:
        driver: The WebDriver used for navigation.
        url: Address of the search results page.
        retries: Maximum number of attempts.

    Returns:
        True as soon as one attempt succeeds, False if every attempt fails
        (or if ``retries`` allows no attempt at all).
    """
    final_attempt = retries - 1
    for attempt in range(retries):
        try:
            driver.get(url)
            job_card_present = EC.presence_of_element_located((By.CLASS_NAME, "job_seen_beacon"))
            WebDriverWait(driver, 60).until(job_card_present)
        except (TimeoutException, WebDriverException) as e:
            print(f"Error loading {url}: {e}. Attempt {attempt + 1} of {retries}.")
            if attempt >= final_attempt:
                return False
            # Back off before the next attempt.
            random_sleep(5, 10)
        else:
            return True
    return False

def get_job_details(driver, job_url):
    """Open ``job_url`` in a temporary tab and extract the job's details.

    The page is parsed with BeautifulSoup after a short random pause. Any
    error while opening or parsing the page is printed and replaced by
    placeholder values, so one bad posting does not abort the caller's loop.

    Args:
        driver: The WebDriver whose current window holds the results page.
        job_url: Address of the job posting.

    Returns:
        A ``(location, salary, description, job_type)`` tuple of strings.
        Missing fields fall back to ``"Unknown"``, ``"Not specified"`` and
        ``"No full description"``; on error the description is
        ``"No description"``.
    """
    # Remember where we came from so cleanup never touches the caller's window.
    main_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)
    try:
        driver.execute_script("window.open('');")
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(job_url)
        random_sleep(2, 4)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        location_div = soup.find("div", attrs={"data-testid": "inlineHeader-companyLocation"})
        location = location_div.get_text(strip=True) if location_div else "Unknown"

        job_info_container = soup.find("div", id="salaryInfoAndJobType")
        if job_info_container:
            # Only direct child spans: the first is read as salary, the second as job type.
            span_list = job_info_container.find_all("span", recursive=False)
            salary = span_list[0].get_text(strip=True) if len(span_list) >= 1 else "Not specified"
            job_type = span_list[1].get_text(strip=True) if len(span_list) >= 2 else "Not specified"
        else:
            salary = "Not specified"
            job_type = "Not specified"

        desc_div = soup.find("div", id="jobDescriptionText")
        description = desc_div.get_text(separator="\n").strip() if desc_div else "No full description"
    except Exception as e:
        print(f"Error in get_job_details: {e}")
        location, salary, description, job_type = "Unknown", "Not specified", "No description", "Not specified"
    finally:
        # Close only tabs created by this call. If opening the tab failed, the
        # current window is still the results page and must stay open.
        for handle in driver.window_handles:
            if handle not in handles_before:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_handle)
    return location, salary, description, job_type

def save_cookies(driver, filename="cookies.pkl"):
    """Pickle the driver's current cookies into ``filename``, overwriting it.

    Args:
        driver: Object exposing ``get_cookies()``.
        filename: Destination path for the pickled cookies.
    """
    with open(filename, mode="wb") as cookie_file:
        current_cookies = driver.get_cookies()
        pickle.dump(current_cookies, cookie_file)

def load_cookies(driver, filename="cookies.pkl"):
    """Restore cookies previously pickled to ``filename`` into ``driver``.

    Restoring cookies is best-effort: a missing file is ignored, an
    unreadable (empty or corrupt) file is reported and skipped, and a cookie
    the driver rejects is reported without stopping the remaining ones.

    Args:
        driver: Object exposing ``add_cookie(cookie)``.
        filename: Path of the pickled cookie list.
    """
    try:
        with open(filename, "rb") as f:
            # SECURITY: unpickling can execute arbitrary code. Only load a
            # cookie file that this script wrote itself.
            cookies = pickle.load(f)
    except FileNotFoundError:
        # First run: nothing saved yet.
        return
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError) as e:
        # An interrupted save can leave an empty or truncated file; do not
        # let that abort the whole run.
        print(f"Error reading cookie file {filename}: {e}")
        return

    for cookie in cookies:
        try:
            driver.add_cookie(cookie)
        except Exception as e:
            print(f"Error adding cookie: {e}")

def _build_search_url(job_title, loc, start):
    """Return the Indeed search URL for one page of results."""
    from urllib.parse import urlencode

    # urlencode escapes spaces as "+" and also handles characters such as "&".
    query = urlencode({"q": job_title, "l": loc, "start": start})
    return f"https://www.indeed.com/jobs?{query}"


def _write_job_rows(driver, job_cards, writer, csvfile):
    """Write one CSV row per job card, reporting and skipping cards that fail."""
    for card in job_cards:
        try:
            title = card.find_element(By.CSS_SELECTOR, "h2.jobTitle span").text.strip()
            company = card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text.strip()
            job_url = card.find_element(By.CSS_SELECTOR, "h2.jobTitle a").get_attribute("href")
            location, salary, description, job_type = get_job_details(driver, job_url)

            writer.writerow({"Job Title": title, "Location": location, "Company": company, "Job URL": job_url, "Salary": salary, "Description": description, "Job Type": job_type})
            # Flush after every row so scraped jobs survive a later crash.
            csvfile.flush()
        except Exception as e:
            print(f"Error extracting job data: {e}")


def main():
    """Scrape Indeed listings for each job title and location into a CSV file.

    For every title/location pair, up to five result pages are visited. Each
    job card's detail page is opened and one row is appended to
    ``jobs_data.csv``. Paging for a pair stops at the first page that fails
    to load or shows no job cards. Cookies are restored at start and saved
    after a complete run; the browser is always shut down.
    """
    job_titles = ["data scientist", "machine learning engineer", "product manager"]
    locations = ["New York", "San Francisco", "London"]
    filename = "jobs_data.csv"
    fieldnames = ['Job Title', 'Location', 'Company', 'Job URL', 'Salary', 'Description', 'Job Type']

    driver = setup_driver(browser='chrome')
    try:
        # Selenium only accepts a cookie while a page of the cookie's domain is
        # loaded, so open the site before restoring cookies.
        try:
            driver.get("https://www.indeed.com")
        except WebDriverException as e:
            print(f"Error opening Indeed before loading cookies: {e}")
        load_cookies(driver)

        # The file is opened in append mode: write the header only once,
        # otherwise every run would insert another header row.
        write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
        with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()

            for job_title in job_titles:
                for loc in locations:
                    for page in range(5):
                        # Indeed paginates in steps of 10 results.
                        url = _build_search_url(job_title, loc, page * 10)
                        if not get_page(driver, url):
                            break

                        job_cards = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
                        if not job_cards:
                            break

                        _write_job_rows(driver, job_cards, writer, csvfile)
                        random_sleep(5, 10)

        save_cookies(driver)
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
