In [25]:
#imports
import os
from dotenv import load_dotenv
from serpapi import Client
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import threading
import time
from multiprocessing import Process, Queue
import signal

In [2]:
# Load settings via python-dotenv before reading the key from the environment.
load_dotenv()

# The key is None when SERP_API_KEY is not set; the client is still constructed.
serp_api_key = os.environ.get("SERP_API_KEY")
serp_api_client = Client(api_key=serp_api_key)

In [19]:
# Phrases that mark a page as a "not found" page even when it was served normally.
error_messages = [
    "We can't find that page",
    "404 Not Found",
    "The requested URL was not found on this server",
    "Page not found",
    "Looks like the page you’re looking for doesn’t exist",
]

# Tags removed from the body before its text is extracted.
irrelevant_body_tags = ["script", "style", "img", "input"]

# HTTP status codes for which check_job_link_status returns "Unknown" right away.
# 403 is not listed, so such responses fall through to the Selenium check there.
error_codes = [
    204, 301,
    400, 401, 402,
    *range(404, 419),  # 404-418
    *range(421, 427),  # 421-426
    428, 429, 431, 451,
    *range(500, 509),  # 500-508
    510, 511,
]

In [17]:
def search_job(job_role, location="India", next_page_token=""):
    """Run a Google Jobs search through the module-level ``serp_api_client``.

    Args:
        job_role: Search text, sent as the ``q`` parameter.
        location: Sent as the ``location`` parameter; defaults to "India".
        next_page_token: Pagination token. When empty (the default) the
            parameter is left out of the request instead of being sent blank.

    Returns:
        The response converted with ``as_dict()`` on success. If anything
        raises while building or running the search, the error is printed and
        an empty string is returned (kept for compatibility with existing
        callers, which must therefore check the result before indexing it).
    """
    try:
        params = {
            "engine": "google_jobs",
            "google_domain": "google.co.in",
            "q": f"{job_role}",
            "hl": "en",
            "gl": "in",
            "location": f"{location}",
            "no_cache": "true",
        }
        # Only paginate when a token was actually supplied.
        if next_page_token:
            params["next_page_token"] = f"{next_page_token}"

        search = serp_api_client.search(params)
        return search.as_dict()
    except Exception as e:
        # Plain {e}: the previous "${e}" printed a literal "$" before the error.
        print(f"Error while trying to search jobs with exception: {e}")
        return ""

In [26]:
class TimeoutException(Exception):
    """Notebook-local timeout error type.

    Nothing in the cells visible here raises or catches it.
    """

def selenium_get_content(link, timeout=10):
    """Render ``link`` in headless Chrome and classify it as "Working" or "Unknown".

    The browser work runs in a daemon thread so the caller waits at most
    ``timeout`` seconds. The browser is always shut down: by the worker itself
    when it finishes (successfully or not), and by the caller on timeout.

    Args:
        link: URL to open.
        timeout: Seconds used both as the Selenium page-load timeout and as the
            maximum time to wait for the worker thread.

    Returns:
        "Working" when the rendered body has text and none of
        ``error_messages`` occurs in it; otherwise "Unknown" (also on any
        Selenium error or on timeout).
    """
    driver = None
    result = ["Unknown"]  # one-slot list so the worker thread can hand back its verdict

    def quit_driver():
        """Best-effort browser shutdown; does nothing if no browser was started."""
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as exc:
            print(f"Selenium cleanup error: {exc}")

    def worker():
        nonlocal driver
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36")

            driver = webdriver.Chrome(options=chrome_options)
            driver.set_page_load_timeout(timeout)
            driver.get(link)

            # Fixed pause so client-side JavaScript can populate the page.
            # NOTE(review): this pause counts against the same `timeout` the
            # caller waits for, so slow pages can time out here — confirm intended.
            time.sleep(3)

            body_element = driver.find_element(By.TAG_NAME, "body")
            body_html = body_element.get_attribute("innerHTML")

            soup = BeautifulSoup(body_html, 'html.parser')
            for irrelevant in soup(irrelevant_body_tags):
                irrelevant.decompose()
            text = soup.get_text(separator="\n", strip=True)

            if text and not any(msg in text for msg in error_messages):
                result[0] = "Working"
        except Exception as e:
            print(f"Selenium error: {e}")
            result[0] = "Unknown"
        finally:
            # Previously the browser was only closed on timeout, so every
            # completed check left a Chrome/chromedriver process running.
            quit_driver()

    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()

    # Wait for the worker, but never longer than `timeout` seconds.
    t.join(timeout)

    if t.is_alive():
        print("Selenium operation timed out")

        # Closing the browser from here also unblocks the stuck worker.
        quit_driver()

        # Windows fallback for processes that survive quit().
        # NOTE(review): taskkill by image name ends every chromedriver.exe and
        # chrome.exe on the machine, not only the ones started here.
        if os.name == 'nt':
            try:
                os.system("taskkill /f /im chromedriver.exe")
                os.system("taskkill /f /im chrome.exe")
            except Exception as exc:
                print(f"Selenium cleanup error: {exc}")

    return result[0]

def check_job_link_status(link):
    """Classify a job-apply link as "Working" or "Unknown".

    A plain HTTP GET is tried first; the Selenium check is used only when that
    request is inconclusive.

    Args:
        link: URL to check. Anything that is not an http(s) URL string is
            reported as "Unknown" without any network or browser activity.

    Returns:
        * "Working" when the GET returns 200 and the body text is non-empty
          and contains none of ``error_messages``.
        * "Unknown" when the status code is listed in ``error_codes``.
        * Otherwise the result of ``selenium_get_content(link, timeout=10)``
          (request raised, other status code, or a 200 page whose text is
          empty or looks like an error page).
    """
    # A missing link used to trigger a failing request and then a full browser
    # launch before ending up as "Unknown" anyway; short-circuit it instead.
    if not isinstance(link, str) or not link.strip().lower().startswith(("http://", "https://")):
        return "Unknown"

    try:
        # Some sites refuse requests that do not look like a regular browser.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
        }
        response = requests.get(link, headers=headers, timeout=10)
        status = response.status_code
        print(f"Status: {status} -- {link}")
        if status == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            text = ""
            if soup.body:
                for irrelevant in soup.body(irrelevant_body_tags):
                    irrelevant.decompose()
                text = soup.body.get_text(separator="\n", strip=True)

            if text and not any(msg in text for msg in error_messages):
                return "Working"
        elif status in error_codes:
            return "Unknown"

    except Exception as e:
        print(f"Requests error: {e}")

    # Every conclusive outcome returned above, so the browser check is next.
    return selenium_get_content(link, timeout=10)

In [None]:
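# NOTE: Superseded draft, kept commented out for reference only; the active
# implementations are selenium_get_content / check_job_link_status above.
# Known problem if revived: check_job_link_status below starts
# Process(target=selenium_check, args=(link, queue)), but selenium_check takes
# only `link` and returns its status instead of putting it on the queue.
# def selenium_check(link):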
#     try:
#         chrome_options = Options()
#         chrome_options.add_argument("--headless")
#         chrome_options.add_argument("--disable-gpu")
#         chrome_options.add_argument("--no-sandbox")
#         chrome_options.add_argument("--window-size=1920,1080")
#         chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36")

#         driver = webdriver.Chrome(options=chrome_options)
#         driver.get(link)

#         time.sleep(3)  # Optional: wait for page content

#         body_element = driver.find_element(By.TAG_NAME, "body")
#         body_html = body_element.get_attribute("innerHTML")
#         soup = BeautifulSoup(body_html, 'html.parser')

#         for irrelevant in soup(irrelevant_body_tags):
#             irrelevant.decompose()

#         text = soup.get_text(separator="\n", strip=True)

#         driver.quit()

#         if text and not any(msg in text for msg in error_messages):
#             return "Working"
#         else:
#             return "Unknown"

#     except Exception as e:
#         print(f"Error using Selenium to check job link status: {e}")
#         return "Unknown"

# def check_job_link_status(link):
#     # Try with Beautiful soup first
#     is_selenium_needed = True
#     try:
#         # Some websites need you to use proper headers when fetching them:
#         headers = {
#             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
#         }

#         response = requests.get(link, headers=headers)
#         status = response.status_code
#         if status == 200:
#             body = response.text
#             soup = BeautifulSoup(body, 'html.parser')
#             text = ""
#             if soup.body:
#                 for irrelevant in soup.body(irrelevant_body_tags):
#                     irrelevant.decompose()
#                 text = soup.body.get_text(separator="\n", strip=True)

#             if text != "":
#                 if text and not any(msg in text for msg in error_messages):
#                     is_selenium_needed = False
#                     # print("Request Library Check Success...")
#         elif status in error_codes:
#             return "Unknown"

#     except Exception as e:
#         print(f"Error using Request library to check job link status: ${e}")

#     if is_selenium_needed:
#         queue = Queue()
#         process = Process(target=selenium_check, args=(link, queue))
#         process.start()
#         process.join(timeout=10)
#         if process.is_alive():
#             process.terminate()
#             process.join()
#             print("Selenium timeout reached, process killed.")
#             return "Unknown"
#         else:
#             return queue.get()

#     return "Working"

In [6]:
# Ask for the role, then search using search_job's default location.
job_title = input("Enter the job you are searching for: ")
results = search_job(job_title)

In [None]:
# Link for the "Remote" filter, or None when the response has no such filter
# (also when the search failed and `results` is not a dict).
_filters = results.get("filters", []) if isinstance(results, dict) else []
remote_results_serp_link = next(
    (f.get("serpapi_link") for f in _filters if f.get("name") == "Remote"),
    None,
)
remote_results_serp_link

In [None]:
# Choices of the "Date posted" filter; an empty list when the filter (or its
# "options" entry) is missing, so later cells can iterate over it safely.
_filters = results.get("filters", []) if isinstance(results, dict) else []
date_posted_options = next(
    (f.get("options") for f in _filters if f.get("name") == "Date posted"),
    None,
) or []
date_posted_options

In [15]:
# Collect the link of each "Date posted" choice by its display name.
# Choices that are not present keep the empty-string default.
date_links = {
    "Yesterday": "",
    "Last 3 days": "",
    "Last week": "",
    "Last month": "",
}
for option in date_posted_options:
    label = option.get("name")
    if label in date_links:
        date_links[label] = option.get("serpapi_link")

yesterday_results_serp_link = date_links["Yesterday"]
last_three_days_results_serp_link = date_links["Last 3 days"]
last_week_results_serp_link = date_links["Last week"]
last_month_results_serp_link = date_links["Last month"]

In [28]:
# List each job followed by the apply links that check_job_link_status
# reports as "Working". Missing fields are shown as "Unknown" rather than
# raising, and a failed search (non-dict `results`) yields an empty listing.
jobs_results = results.get("jobs_results", []) if isinstance(results, dict) else []

for job in jobs_results:
    company_name = job.get("company_name", "Unknown")
    title = job.get("title", "Unknown")
    location = job.get("location", "Unknown")
    # "detected_extensions" may be absent; treat that like an empty mapping.
    schedule_type = (job.get("detected_extensions") or {}).get("schedule_type", "Unknown")
    print(f"{company_name}: {title} ({location}) [{schedule_type}]")

    for option in job.get("apply_options") or []:
        link = option.get("link")
        if not link:
            # Nothing to verify or print for an option without a link.
            continue
        if check_job_link_status(link) == "Working":
            print(f"{option.get('title', 'Unknown')}: {link}")
    print()

    

Wits Innovation Lab: Sr. Full Stack Java Developer (India) [Full–time]
Status: 200 -- https://in.linkedin.com/jobs/view/sr-full-stack-java-developer-at-wits-innovation-lab-4210468457?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic
LinkedIn: https://in.linkedin.com/jobs/view/sr-full-stack-java-developer-at-wits-innovation-lab-4210468457?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic
Status: 200 -- https://www.jobaaj.com/job/wits-innovation-lab-sr-java-full-stack-developer-mohali-punjab-0-to-1-years-675723?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic
Jobaaj: https://www.jobaaj.com/job/wits-innovation-lab-sr-java-full-stack-developer-mohali-punjab-0-to-1-years-675723?utm_campaign=google_jobs_apply&utm_source=google_jobs_apply&utm_medium=organic

Assignedge Network Pvt Ltd: Full Stack Developer (Nanded, Maharashtra) [Full–time]
Status: 403 -- https://www.glassdoor.co.in/job-listing/full-stack-d