# Example 1: Scraping AI Job Board
## ABB #8 - Session 1

Code authored by: Shaw Talebi

### imports

In [1]:
import os
import re
import time

import json5
import pandas as pd
import requests
from bs4 import BeautifulSoup

### 0) get job urls for one page

In [2]:
url = "https://aijobs.ai/engineer?location=United%20States"

# get request
# - timeout: a stalled connection raises instead of hanging the notebook
# - raise_for_status: fail loudly on 4xx/5xx instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

# parse html with beautiful soup
soup = BeautifulSoup(response.text, "html.parser")

# grab all job urls (set comprehension drops duplicate links within the page,
# sorted() gives a stable, reproducible order)
job_cards = soup.find_all("a", class_="jobcardStyle1")
job_url_list = sorted({a["href"] for a in job_cards if a.get("href")})

# print job urls from list
for job_url in job_url_list:
    print(job_url)

https://aijobs.ai/job/associate-machine-learning-engineer
https://aijobs.ai/job/associate-machine-learning-engineer-1
https://aijobs.ai/job/director-of-engineering-ai-ml-data-collections
https://aijobs.ai/job/director-of-engineering-ai-ml-data-collections-1
https://aijobs.ai/job/machine-learning-engineer-2480
https://aijobs.ai/job/machine-learning-engineer-2481
https://aijobs.ai/job/machine-learning-engineer-ll
https://aijobs.ai/job/machine-learning-perception-software-engineer
https://aijobs.ai/job/senior-ai-engineer-ai-labs
https://aijobs.ai/job/senior-machine-learning-engineer-1804
https://aijobs.ai/job/senior-machine-learning-engineer-1808
https://aijobs.ai/job/senior-post-silicon-validation-engineer-bringup
https://aijobs.ai/job/senior-software-engineer-go-llm-team
https://aijobs.ai/job/senior-software-engineer-ml-platform
https://aijobs.ai/job/software-engineer-i-ai-platform
https://aijobs.ai/job/software-engineer-ii-ai-enablement
https://aijobs.ai/job/sr-machine-learning-enginee

### 1) get list of (unique) job urls from 5 pages

In [3]:
# initialize list to store job urls, plus a set to track urls already collected
# so the same posting appearing on more than one page is only kept once
job_url_list = []
seen_job_urls = set()

for i in range(5):
    # construct url (pages are 1-indexed)
    url = f"https://aijobs.ai/engineer?location=United%20States&page={i+1}"

    # perform get request
    # - timeout: a stalled connection raises instead of hanging the notebook
    # - raise_for_status: fail loudly on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # parse html
    soup = BeautifulSoup(response.text, "html.parser")

    # grab all job urls on this page (unique and sorted within the page)
    job_cards = soup.find_all("a", class_="jobcardStyle1")
    job_urls_temp = sorted({a["href"] for a in job_cards if a.get("href")})

    # add only urls not seen on an earlier page, keeping page order
    for job_url in job_urls_temp:
        if job_url not in seen_job_urls:
            seen_job_urls.add(job_url)
            job_url_list.append(job_url)

In [4]:
# sanity check: number of job urls collected by the loop above
len(job_url_list)

100

### 2) scrape job data from url

Data extracted (from each page's JSON-LD metadata):
- Job Title
- Org
- Salary (min, max, and currency)
- Job Description

Not extracted yet:
- Location
- Job Type

In [5]:
def extract_job_data(soup: BeautifulSoup) -> dict:
    """
    Extracts job data from an AI job posting page using JSON-LD metadata.

    Args:
        soup: parsed HTML of a single job posting page.

    Returns:
        dict with keys: title, org, min_salary, max_salary, currency, description

    Raises:
        ValueError: if the page has no <script type="application/ld+json"> tag.
        KeyError: if the parsed metadata is missing one of the expected fields
            (hiringOrganization.name, title, description, baseSalary.*).
        Parse errors from json5 propagate when the metadata cannot be repaired
        by the pre-processing below.
    """
    # scrape JSON-LD data (first matching <script> tag on the page)
    script = soup.find("script", type="application/ld+json")
    if script is None:
        # explicit error instead of an opaque AttributeError on None
        raise ValueError("no JSON-LD <script> tag found on page")

    # extract raw text
    raw = script.get_text()

    # Do various pre-processing to fix some JSON failure modes
    # normalize whitespace (non-breaking space, unicode line/paragraph separators)
    raw = raw.replace("\u00a0", " ").replace("\u2028", " ").replace("\u2029", " ")
    # flatten raw line breaks to spaces
    raw = raw.replace("\r", " ").replace("\n", " ")
    # remove trailing commas before } or ]
    raw = re.sub(r",\s*([}\]])", r"\1", raw)
    # insert missing commas between a string value and the next key
    # (a closing quote, whitespace, then an opening quote followed by a letter or @)
    raw = re.sub(r'"\s+"(?=[A-Za-z@])', '","', raw)

    # Format data in dictionary
    data = json5.loads(raw)

    # Extract relevant fields
    company_name = data['hiringOrganization']['name']
    job_title = data['title']
    job_description = data['description']
    base_salary = data['baseSalary']
    salary_min = base_salary['value']['minValue']
    salary_max = base_salary['value']['maxValue']
    salary_currency = base_salary['currency']

    return {
        "title": job_title,
        "org": company_name,
        "min_salary": salary_min,
        "max_salary": salary_max,
        "currency": salary_currency,
        "description": job_description,
    }

In [7]:
job_data_list = []

# create ONE session up front and attach browser-like header metadata to it;
# every request made through the session sends these headers and reuses the
# underlying connection
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
})

for job_url in job_url_list:
    # get job data with error handling: a failed download or an unparseable
    # page is reported and skipped so one bad url does not abort the whole run
    try:
        # fetch through the session (a bare requests.get would ignore its headers)
        response = session.get(job_url, timeout=30)
        response.raise_for_status()

        # parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        job_data = extract_job_data(soup)
        job_data_list.append(job_data)
    except Exception as e:
        print(e, job_url)

    # pause between requests to avoid hammering the site
    time.sleep(2.5)

<string>:1 Unexpected "P" at column 742 https://aijobs.ai/job/machine-learning-engineer-intern-3d-generative-ai-q1-2026-1
<string>:1 Unexpected "m" at column 308 https://aijobs.ai/job/senior-machine-learning-engineer-mandarin-speaking
<string>:1 Unexpected "A" at column 9012 https://aijobs.ai/job/senior-software-engineer-machine-learning
<string>:1 Unexpected "a" at column 2461 https://aijobs.ai/job/staff-ai-software-engineer-and-technical-educator
<string>:1 Unexpected "d" at column 430 https://aijobs.ai/job/senior-machine-learning-application-engineer-computer-vision-1
<string>:1 Unexpected "m" at column 287 https://aijobs.ai/job/staff-machine-learning-engineer-434


### 3) data cleaning

In [8]:
# build a table with one row per successfully scraped job posting
df = pd.DataFrame(data=job_data_list)

In [9]:
# display the scraped jobs as a table
df

Unnamed: 0,title,org,min_salary,max_salary,currency,description
0,Associate Machine Learning Engineer,PitchBook Data,0,0,USD,"<div><p><span>At PitchBook, a Morningstar comp..."
1,Associate Machine Learning Engineer,PitchBook Data,0,0,USD,"<div><p><span>At PitchBook, a Morningstar comp..."
2,"Director of Engineering, AI &amp; ML, Data Col...",PitchBook Data,0,0,USD,"<div><p><span>At PitchBook, a Morningstar comp..."
3,"Director of Engineering, AI &amp; ML, Data Col...",PitchBook Data,0,0,USD,"<div><p><span>At PitchBook, a Morningstar comp..."
4,Machine Learning Engineer,PitchBook Data,0,0,USD,"<div><p><span>At PitchBook, a Morningstar comp..."
...,...,...,...,...,...,...
89,Senior Software Engineer - AI,StubHub,250000,250000,USD,<div><p>StubHub is on a mission to redefine th...
90,"Software Engineer, ML Infra",NewsBreak,225000,225000,USD,<div><p><strong>About NewsBreak</strong></p> <...
91,"Staff Software Engineer, AI",Spot AI,0,0,USD,<div><h2><strong>Who we are.</strong></h2> <p>...
92,Staff Software Engineer - AI SDK,Temporal Technologies,3600,3600,USD,<div><div><strong>Who We Are</strong></div> <d...


### 4) save it as a .csv

In [10]:
# make sure the output folder exists so this cell also works on a fresh checkout
os.makedirs('data', exist_ok=True)

# NOTE: with the default settings the DataFrame index is written as the first
# (unnamed) column of the csv
df.to_csv('data/job_data.csv')