In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import yaml
import pandas as pd
import numpy as np
import datetime as dt
import re

# Listing-page URL template; the "{}" placeholder is filled with the page
# number via BASE_URL.format(page).
BASE_URL = "https://cityjobs.nyc.gov/jobs?options=&page={}"
# Site root that relative "/job/..." hrefs are appended to in order to build
# absolute job detail URLs.
JOB_BASE_URL = "https://cityjobs.nyc.gov"

In [3]:


def get_job_links(listing_url, timeout=10):
    """Collect the job detail page URLs linked from one listing page.

    Args:
        listing_url: Full URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of absolute job URLs (``JOB_BASE_URL`` + the relative
        ``/job/...`` href), de-duplicated and in first-seen order. The list is
        empty when the server answers with an HTTP error status or when no
        anchor on the page matches the selector.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    res = requests.get(listing_url, timeout=timeout)
    if not res.ok:
        # Report the status instead of silently parsing an error page, which
        # would otherwise just look like a listing with zero jobs.
        print(f"Listing request failed: {listing_url} - HTTP {res.status_code}")
        return []
    soup = BeautifulSoup(res.text, 'html.parser')
    # Find job links (update selector as needed). The selector itself already
    # restricts matches to hrefs that start with "/job/".
    hrefs = (a['href'] for a in soup.select('a[href^="/job/"]'))
    # dict.fromkeys drops repeated hrefs while preserving their order, so the
    # same job is not fetched more than once per listing page.
    return [JOB_BASE_URL + href for href in dict.fromkeys(hrefs)]

def parse_job_detail(job_url, timeout=10):
    """Fetch one job detail page and extract the fields of interest.

    Args:
        job_url: Absolute URL of the job detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A dict with the keys ``business_title``, ``agency``, ``posting_date``,
        ``preferred_skills``, ``#_of_positions``, ``career_level``, ``level``,
        ``post_until``, ``job_category`` and ``job_url``. Any field that cannot
        be located on the page is an empty string.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status (so error pages are not
            recorded as blank job rows).
    """
    res = requests.get(job_url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    def get_text_by_label(label):
        """Return the text node following the element that contains `label`, or ""."""
        # `string=` is the current name of the deprecated `text=` argument.
        tag = soup.find(string=lambda t: t and label in t)
        if tag:
            # The value is taken from the first text node that is a following
            # sibling of the label's parent element.
            next_tag = tag.parent.find_next_sibling(string=True)
            if next_tag:
                return next_tag.strip()
        return ""

    # Business Title
    title_tag = soup.find('h1')
    business_title = title_tag.text.strip() if title_tag else ""

    # Preferred Skills (may need to search in Job Description section)
    preferred_skills = ""
    job_desc = soup.find('div', {'id': 'job-description'})
    if job_desc is not None:
        ps_tag = job_desc.find(string=lambda t: t and "Preferred Skills" in t)
        if ps_tag:
            ps_para = ps_tag.parent.find_next_sibling('p')
            # The heading may have no following <p>; leave the field empty
            # rather than raising and losing the whole job record.
            if ps_para is not None:
                preferred_skills = ps_para.text.strip()

    # Return as dict
    return {
        'business_title': business_title,
        'agency': get_text_by_label("DEPARTMENT"),
        'posting_date': get_text_by_label("Posted On:"),
        'preferred_skills': preferred_skills,
        '#_of_positions': get_text_by_label("Number of positions:"),
        'career_level': get_text_by_label("Experience level:"),
        'level': get_text_by_label("Job level"),
        'post_until': get_text_by_label("Posted until"),
        'job_category': get_text_by_label("Category:"),
        'job_url': job_url
    }

def scrape_city_jobs(max_pages=5, delay=0.5):
    """Scrapes multiple pages of NYC job postings.

    Args:
        max_pages: Number of listing pages to visit, starting from page 1.
            A value of 0 or less visits no pages.
        delay: Seconds to pause after each request attempt, whether it
            succeeded or failed, to avoid hammering the server.

    Returns:
        A pandas DataFrame with one row per successfully parsed job. It is
        empty when nothing could be scraped.

    Failures on a listing page or on an individual job page are printed and
    skipped, so jobs collected before the failure are still returned.
    """
    jobs = []
    for page in range(1, max_pages + 1):
        listing_url = BASE_URL.format(page)
        try:
            job_links = get_job_links(listing_url)
        except Exception as e:
            # One unreachable listing page must not discard earlier results.
            print(f"Error scraping listing page: {listing_url} - {e}")
            time.sleep(delay)
            continue
        print(f"Scraping page {page}, found {len(job_links)} jobs.")
        for job_url in job_links:
            try:
                jobs.append(parse_job_detail(job_url))
            except Exception as e:
                print(f"Error scraping job: {job_url} - {e}")
            finally:
                # Be polite! Pause after failures too, so a run of errors
                # does not turn into back-to-back requests.
                time.sleep(delay)
    return pd.DataFrame(jobs)



In [4]:
# --- Run scraper for first N pages ---
# Fetches listing pages 1 and 2 plus every job detail page linked from them,
# so running this cell performs live network requests.
# NOTE(review): the recorded output of this cell reports 0 jobs on both pages,
# so df_jobs is empty in this run. Presumably the 'a[href^="/job/"]' selector
# does not match the served HTML, or the listing is rendered client-side.
# TODO confirm against the live page source.
df_jobs = scrape_city_jobs(max_pages=2)
df_jobs.head()

Scraping page 1, found 0 jobs.
Scraping page 2, found 0 jobs.
