# AI Job Board Scraper

Code authored by: Shaw Talebi

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json

### 1) Grab Job Urls

In [2]:
# Step 1: Define the URL of the job board
url = "https://aijobs.net/"

# Step 2: Make a request to get the HTML content of the page
response = requests.get(url)

# Step 3: Parse the HTML using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Step 4: Regex pattern to match job URLs like /job/12345-job-title/
pattern = re.compile(r"^/job/\d{6,}-")

# Step 5: Extract only matching job links
job_link_list = []

for link in soup.find_all('a', href=True):
    href = link['href']
    if pattern.match(href):
        full_url = "https://aijobs.net" + href
        job_link_list.append(full_url)

# Step 6: Print valid job URLs
for job_url in job_link_list:
    print(job_url)

# Step 7: Dedupelicate
job_link_list = list(set(job_link_list))

https://aijobs.net/job/1285095-manager-data-science/
https://aijobs.net/job/1234499-staff-data-scientist/
https://aijobs.net/job/1319182-product-manager-api-platform-developer-experience/
https://aijobs.net/job/1319181-integrations-project-manager/
https://aijobs.net/job/1319180-senior-manager-software-engineering-13-years-java-backend-spring-boot-microservices/
https://aijobs.net/job/1319179-staff-officer-civil-military-cooperation/
https://aijobs.net/job/1319178-institutional-capital-client-reporting-associate/
https://aijobs.net/job/1319177-sales-operations-intern/
https://aijobs.net/job/1319176-data-processing-specialist-ci/
https://aijobs.net/job/1319175-capital-risk-engineer-associate-warsaw/
https://aijobs.net/job/1319174-product-owner/
https://aijobs.net/job/1319173-ai-fes-working-student/
https://aijobs.net/job/1319172-grupo-quintoandar-ai-and-productivity-sr-manager/
https://aijobs.net/job/1319171-senior-full-stack-developer/
https://aijobs.net/job/1319144-development-intern/

In [4]:
len(job_link_list)

50

### 2) Scrape Job Details

In [11]:
def scrape_job_details(job_url):
    
    response = requests.get(job_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract structured data from the JSON-LD script
    job_data = {}
    try:
        json_ld = soup.find('script', type='application/ld+json')
        job_data = json.loads(json_ld.string)
    except Exception as e:
        print("Failed to parse structured data:", e)
    
    # Extract fields
    title = job_data.get("title", "N/A")
    company = job_data.get("hiringOrganization", {}).get("name", "N/A")
    location = job_data.get("jobLocationType", "N/A")
    salary_info = job_data.get("baseSalary", {}).get("value", {})
    salary_min = salary_info.get('minValue', None)
    salary_max = salary_info.get('maxValue', None)
    
    
    description_div = soup.find('div', id='job-description')
    description = description_div.get_text(strip=True) if description_div else "N/A"    

    return {
        "title": title,
        "company": company,
        "location": location,
        "salary_min": salary_min,
        "salary_max": salary_max,
        "description": description
    }, job_data

In [12]:
temp, jd = scrape_job_details(job_link)

In [13]:
jd

{'@context': 'https://schema.org/',
 '@type': 'JobPosting',
 'title': 'Staff Officer (Civil - Military Cooperation)',
 'description': 'Staff Officer (Civil - Military Cooperation)-250953\xa0Primary Location\xa0Belgium-MonsNATO Body\xa0Supreme Headquarters Allied Powers Europe (SHAPE)Schedule\xa0Full-timeApplication Deadline\xa024-Jun-2025, 9:59:00 PMSalary (Pay Basis)\xa0:\xa06,467.30Euro (EUR)\xa0MonthlyGrade\xa0NATO Grade G15\xa0Description:\xa0\xa0Post Context/Post Summary Supreme Headquarters Allied Powers Europe (SHAPE) provides an integrated Strategic Effects framework, employing a multi-domain and multi-region focus to create a 360-degree approach, with the flexibility to enable, upon direction, a seamless transition from Baseline Activities and Current Operations (BACO) up to the Maximum Level of Effort (MLE).\xa0 SHAPE supports SACEUR in fulfilling his terms of reference, as directed by the North Atlantic Council. The Partnerships Directorate (PD) enhances Military Cooperation

In [8]:
job_data_list = []
for job_link in job_link_list:
    try:
        job_data_list.append(scrape_job_details(job_link))
    except:
        print(f"Failed to scrape: {job_link}")

In [9]:
len(job_data_list)

50

In [10]:
job_data_list

[{'title': 'Senior Application Developer',
  'company': 'iLink Digital',
  'location': 'N/A',
  'salary_min': 128107,
  'salary_max': 237914,
  'description': "About\n            The  Company:iLink is a Global\n          Software Solution Provider and Systems Integrator, delivers\n          next-generation technology solutions to help clients solve\n          complex business challenges, improve organizational\n          effectiveness, increase business productivity, realize\n          sustainable enterprise value and transform your business\n          inside-out. iLink integrates software systems and develops\n          custom applications, components, and frameworks on the latest\n          platforms for IT departments, commercial accounts, application\n          services providers (ASP) and independent software vendors\n          (ISV). iLink solutions are used in a broad range of industries\n          and functions, including healthcare, telecom, government, oil\n          and gas,