# Scraping AI Job Board with Python
## ABB #5 - Session 1

Code authored by: Shaw Talebi

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

### 1) extract job listing links

In [2]:
# URL of the website
job_board_url = "https://aijobs.net"
query = "/?reg=5" # north america jobs

# Send a GET request to the website
response = requests.get(job_board_url + query)

# Check if the request was successful
if response.status_code == 200:
    # Get the HTML content
    html_content = response.text
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [3]:
# Parse the HTML with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
# Find all job links within the <ul> list
job_links = soup.select('ul#job-list a.col.py-2[href]')

# explanation from ChatGPT:
# This selects all <a> tags with class col py-2 inside the <ul> element with id="job-list"

In [5]:
# Extract href attributes and create full URLs
job_url_list = [job_board_url + link['href'] for link in job_links]

for job_url in job_url_list:
    print(job_url)

https://aijobs.net/job/1066507-senior-ai-engineer/
https://aijobs.net/job/1140156-scientifique-des-donnees-senior/
https://aijobs.net/job/1140154-director-consult-partner-manufacturing-ai/
https://aijobs.net/job/1140153-director-consult-partner-utilities-ai/
https://aijobs.net/job/1140150-business-analyst-intern-summer-2025/
https://aijobs.net/job/1140149-clinical-trials-feasibility-analyst/
https://aijobs.net/job/1140148-svp-full-stack-product-manager-hybrid/
https://aijobs.net/job/1140147-creative-product-manager-shopping-remote-eligible/
https://aijobs.net/job/1140145-senior-manager-pricing-data/
https://aijobs.net/job/1140142-actuarial-specialist/
https://aijobs.net/job/1140141-banamex-customer-data-sr-mgr-kafka-and-adobe-personalization-tech-lead-c13/
https://aijobs.net/job/1140140-integrated-insights-strategy-analyst/
https://aijobs.net/job/1140139-avp-quantitative-model-developer-economic-forecasting-hybrid/
https://aijobs.net/job/1140137-director-of-product-management-endpoint-

### 2) extract info from one listing

In [6]:
# extract html from job listing (same as cell 2)
job_url = job_url_list[0]
response = requests.get(job_url)
html_content = response.text

#### pull json data

In [8]:
# Find the script tag containing JSON-LD
script_tag = soup.find('script', type='application/ld+json')

# Load the JSON content
if script_tag:
    job_data = json.loads(script_tag.string)

    # Extract relevant fields
    company_name = job_data['hiringOrganization']['name']
    job_title = job_data['title']
    job_description = job_data['description']
    salary_min = job_data['baseSalary']['value']['minValue']
    salary_max = job_data['baseSalary']['value']['maxValue']

    # Print extracted data
    print(f"Company Name: {company_name}")
    print(f"Job Title: {job_title}")
    print(f"Job Description: {job_description[:500]}...")
    print(f"Salary Range: {salary_min} - {salary_max} USD")

Company Name: Lemon.io
Job Title: Senior AI Engineer
Job Description: Are you a talented Senior AI Engineer looking for a remote job that lets you show your skills and get decent compensation? Look no further than Lemon.io — the marketplace that connects you with hand-picked startups in the US and Europe. What we offer:   The rate depends on your skills and experience. We&#x27;ve already paid out over $11M to our engineers.   No more hunting for clients or negotiating rates — let us handle the business side of things so you can focus on what you do best.   We&#x27...
Salary Range: 57000 - 230000 USD


### 3) extract info from all listings

In [9]:
# write function to implement way 2

def extract_job_info(url):
    """
    Extracts job information from a given job listing URL.

    Args:
        url (str): The URL of the job listing.

    Returns:
        dict: A dictionary containing the following key-value pairs:
            - 'company_name' (str): Name of the hiring organization.
            - 'job_title' (str): Title of the job.
            - 'job_description' (str): Detailed description of the job.
            - 'salary_min' (float or str): Minimum salary offered for the job.
            - 'salary_max' (float or str): Maximum salary offered for the job.
               Returns 'N/A' if salary information is unavailable.
    """
    try:
        # Fetch the HTML content of the job listing
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        html_content = response.text
        
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find the script tag containing JSON-LD
        script_tag = soup.find('script', type='application/ld+json')
        
        if script_tag:
            job_data = json.loads(script_tag.string)
            
            # Extract relevant fields with default values if not present
            company_name = job_data.get('hiringOrganization', {}).get('name', 'N/A')
            job_title = job_data.get('title', 'N/A')
            job_description = job_data.get('description', 'N/A')
            salary_data = job_data.get('baseSalary', {}).get('value', {})
            salary_min = salary_data.get('minValue', 'N/A')
            salary_max = salary_data.get('maxValue', 'N/A')
            
            return {
                'company_name': company_name,
                'job_title': job_title,
                'job_description': job_description,
                'salary_min': salary_min,
                'salary_max': salary_max
            }
        else:
            return {'error': 'No JSON-LD script found in the page'}
    
    except requests.RequestException as e:
        return {'error': f"Request failed: {e}"}
    
    except json.JSONDecodeError:
        return {'error': 'Failed to parse JSON-LD content'}
    
    except Exception as e:
        return {'error': f"An unexpected error occurred: {e}"}

In [10]:
# extract job info from all job urls
job_info_list = []

for job_url in job_url_list:
    # extract job info
    job_info = extract_job_info(job_url)

    # store results in list if no errors occured
    try:
        print(job_info["job_title"])
        job_info_list.append(job_info)
    except:
        print(f"Could not extract info from: {job_url}")
        continue

Senior AI Engineer
Scientifique des données senior
Director, Consult Partner - Manufacturing / AI
Director, Consult Partner - Utilities / AI
Business Analyst Intern - Summer 2025
Clinical Trials Feasibility Analyst
SVP, Full-Stack Product Manager (Hybrid)
Creative Product Manager, Shopping (Remote-Eligible)
Senior Manager, Pricing Data
Actuarial Specialist
BANAMEX - Customer Data Sr Mgr – (Kafka and Adobe) Personalization Tech Lead C13
Integrated Insights &amp; Strategy Analyst
AVP Quantitative Model Developer – Economic Forecasting (Hybrid)
Director of Product Management - Endpoint Technology
Manager, Product Management - Customer Services &amp; Strategy (US Card)
Engineering Senior Manager
Algorithm Engineer (Image Processing)
Lead, Patient Access and Navigation Analytics and Innovation
Data Quality Scientist
VP, Data Quality Lead Analyst - C13 (Hybrid)
Data Quality Senior Analyst - AVP - IRVING
Analytics Engineer
SVP, Data Integration Sr Lead Analyst - C14 (Hybrid)
Research Data Ana

### 4) Store data in Pandas dataframe

In [11]:
df = pd.DataFrame(job_info_list)
df.head()

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max
0,Lemon.io,Senior AI Engineer,Are you a talented Senior AI Engineer looking ...,57000,230000
1,Aviva,Scientifique des données senior,"Individuellement, nous sommes des personnes, m...",82368,192192
2,Kyndryl,"Director, Consult Partner - Manufacturing / AI","Who We AreAt Kyndryl, we design, build, manage...",151560,327240
3,Kyndryl,"Director, Consult Partner - Utilities / AI","Who We AreAt Kyndryl, we design, build, manage...",151560,327240
4,CACI International Inc,Business Analyst Intern - Summer 2025,Business Analyst Intern - Summer 2025Job Categ...,43900,87800


In [12]:
# save to file
df.to_csv("data/ai_job_data.csv", index=False)

#### Future directions
- extract other fields from job listings e.g. tags, key skills
- add filters to job search e.g. remote, Product, salary