# Scraping AI Job Board

Code authored by: Shaw Talebi

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd

### 1) get list of (unique) job urls

In [2]:
job_url_list = []

for i in range(5):
    # construct url
    url = f"https://aijobs.ai/engineer?location=United%20States&page={i+1}"

    # perform get request
    response = requests.get(url)

    # parse html
    soup = BeautifulSoup(response.text, "html.parser")

    # grab all job urls
    job_cards = soup.find_all("a", class_="jobcardStyle1")
    job_urls_temp = sorted({a["href"] for a in job_cards if a.get("href")})

    # add new urls to list
    job_url_list = job_url_list + job_urls_temp

### 2) scrape job data from url

Data extracted:
- Job Title
- Org
- Salary
- Location
- Job Description
- Job Type

In [4]:
def extract_job_data(soup: BeautifulSoup):
    """Extract job data using direct HTML scraping."""
    
    job_data = {
        "Job Title": None,
        "Org": None,
        "Job Type": None,
        "Date Posted": None,
        "Job Description": None,
        "Salary": None,
        "Location": None,
        "Remote": None,
    }
    
    # Extract Job Title
    title_elem = soup.find("div", class_="post-main-title2")
    if title_elem:
        job_data["Job Title"] = title_elem.get_text(strip=True)
    
    # Extract Company Name
    # Look for the company link or text
    company_elem = soup.find("span", string=lambda x: x and "at" in x)
    if company_elem:
        # Get the next span which contains company name
        company_span = company_elem.find_next_sibling("span")
        if company_span:
            job_data["Org"] = company_span.get_text(strip=True)
    
    # Alternative: look for company link
    if not job_data["Org"]:
        company_link = soup.find("a", href=re.compile(r"/company/"))
        if company_link:
            company_name = company_link.find("span", class_="tw-card-title")
            if company_name:
                job_data["Org"] = company_name.get_text(strip=True)
    
    # Extract Job Type (Full Time, Part Time, etc.)
    job_type_elem = soup.find("span", class_=re.compile(r"tw-bg-\[#0BA02C\]"))
    if job_type_elem:
        job_data["Job Type"] = job_type_elem.get_text(strip=True)
    
    # Extract Remote status
    remote_elem = soup.find("span", class_=re.compile(r"tw-bg-\[#FFEDED\]"))
    if remote_elem:
        job_data["Remote"] = remote_elem.get_text(strip=True)
    
    # Extract Location
    location_elem = soup.find("div", class_="remote")
    if location_elem:
        location_p = location_elem.find("p", class_="tw-mb-0")
        if location_p:
            job_data["Location"] = location_p.get_text(strip=True)
    
    # Extract Date Posted
    date_elem = soup.find("div", string=re.compile(r"Job Posted:"))
    if date_elem:
        date_span = date_elem.find_next_sibling("span")
        if date_span:
            job_data["Date Posted"] = date_span.get_text(strip=True)
    
    # Extract Job Description
    desc_elem = soup.find("div", class_="job-description-container")
    if desc_elem:
        job_data["Job Description"] = desc_elem.get_text(strip=True)
    
    # Extract Salary (if available)
    # Look in job overview section
    salary_section = soup.find("div", string=re.compile(r"Salary", re.IGNORECASE))
    if salary_section:
        salary_value = salary_section.find_next("span")
        if salary_value:
            job_data["Salary"] = salary_value.get_text(strip=True)
    
    # Alternative salary location
    if not job_data["Salary"]:
        for div in soup.find_all("div"):
            text = div.get_text()
            if "Salary:" in text:
                # Extract the salary portion
                salary_match = re.search(r'Salary:\s*(.+?)(?=\n|\Z)', text)
                if salary_match:
                    job_data["Salary"] = salary_match.group(1).strip()
                    break
    
    return job_data

In [5]:
job_data_list = []

for job_url in job_url_list:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    job_data = extract_job_data(soup)

    job_data_list.append(job_data)

In [6]:
len(job_data_list)

105

### 3) data cleaning

In [7]:
df = pd.DataFrame(job_data_list)

In [8]:
df

Unnamed: 0,Job Title,Org,Job Type,Date Posted,Job Description,Salary,Location,Remote
0,,,,,,"$169,500 - $291,500",,
1,,,,,,"$169,500 - $291,500",,
2,,,,,,"$169,500 - $291,500",,
3,,,,,,"$169,500 - $291,500",,
4,,,,,,"$169,500 - $291,500",,
...,...,...,...,...,...,...,...,...
100,,,,,,,,
101,,,,,,,,
102,,,,,,,,
103,,,,,,,,


### 4) save it as a .csv