In [2]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

In [3]:
def retrieve_job_urls(job_search_url, timeout=10):
    """Collect the job-posting links found on a job search results page.

    Args:
        job_search_url: URL of the search results page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the cell.

    Returns:
        A list of ``href`` strings, one per search-result card that carries a
        link, in page order. Empty if the page contains no matching cards.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Make an HTTP GET request to get the HTML of the page
    response = requests.get(job_search_url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    job_url_elements = soup.select('[data-tracking-control-name="public_jobs_jserp-result_search-card"]')
    job_urls = []
    for job_url_element in job_url_elements:
        # .get() avoids a KeyError when a matched element has no href attribute.
        job_url = job_url_element.get("href")
        if job_url:
            job_urls.append(job_url)
    return job_urls


In [4]:
def _text_or_none(element):
    """Return the stripped text of a parsed element, or None if it is missing."""
    return element.get_text().strip() if element is not None else None


def scrape_job(job_url, timeout=10):
    """Download one job posting page and extract its details.

    Every field is optional: when the page lacks the element a field is read
    from, that field is ``None`` (the same way a missing salary is reported)
    rather than aborting the whole scrape with an ``AttributeError``.

    Args:
        job_url: URL of the job posting page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``url``, ``title``, ``company`` (a dict with
        ``name`` and ``url``), ``location``, ``applications``, ``salary``,
        ``description`` and ``criteria`` (a list of ``{"name", "value"}``
        dicts, one per entry in the job-criteria list).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(job_url, timeout=timeout)
    # Surface blocked/rate-limited responses as a clear HTTP error instead of
    # a confusing "NoneType has no attribute get_text" further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract job details
    company_element = soup.select_one('[data-tracking-control-name="public_jobs_topcard-org-name"]')
    # .get() tolerates a company element that has no href attribute.
    company_url = company_element.get("href") if company_element is not None else None

    criteria = [
        {
            "name": _text_or_none(criteria_element.select_one(".description__job-criteria-subheader")),
            "value": _text_or_none(criteria_element.select_one(".description__job-criteria-text")),
        }
        for criteria_element in soup.select(".description__job-criteria-list li")
    ]

    job = {
        "url": job_url,
        "title": _text_or_none(soup.select_one("h1")),
        "company": {"name": _text_or_none(company_element), "url": company_url},
        "location": _text_or_none(soup.select_one(".topcard__flavor--bullet")),
        "applications": _text_or_none(soup.select_one(".num-applicants__caption")),
        "salary": _text_or_none(soup.select_one(".salary")),
        "description": _text_or_none(soup.select_one(".description__text .show-more-less-html")),
        "criteria": criteria,
    }
    return job


In [5]:
# Driver cell: collect job URLs from one search page, scrape the first few
# postings, and build the DataFrame `df` that later cells read.
if __name__ == "__main__":
    # Example: Cyber Security jobs in United States
    public_job_search_url = (
        "https://www.linkedin.com/jobs/search?"
        "keywords=Cyber%20Security&location=United%20States"
        "&geoId=103644278&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0"
    )

    print("Starting job retrieval...")
    job_urls = retrieve_job_urls(public_job_search_url)
    print(f"Retrieved {len(job_urls)} job URLs\n")

    scraping_limit = 5  # keep small for demo
    jobs = []
    for job_url in job_urls[:scraping_limit]:
        print(f"Scraping job: {job_url}")
        jobs.append(scrape_job(job_url))

    # Instead of writing to file, show JSON
    print(json.dumps(jobs, indent=2, ensure_ascii=False))

    # Convert to DataFrame for analysis
    df = pd.DataFrame(jobs)
    print("\nDataFrame preview:")
    # A bare expression inside an `if` block is not displayed by the notebook,
    # so the preview has to be printed explicitly.
    print(df.head())

Starting job retrieval...
Retrieved 60 job URLs

Scraping job: https://www.linkedin.com/jobs/view/security-analyst-1-at-uber-4335779142?position=1&pageNum=0&refId=m602lRfTmz10Feq0jtmX0w%3D%3D&trackingId=3T4XQ58ed4l79HSqAVA1hQ%3D%3D
Scraping job: https://www.linkedin.com/jobs/view/information-security-officer-at-long-beach-unified-school-district-4335261144?position=2&pageNum=0&refId=m602lRfTmz10Feq0jtmX0w%3D%3D&trackingId=sCC52KgTQ%2F5ZT3MWs%2BPXow%3D%3D
Scraping job: https://www.linkedin.com/jobs/view/cybersecurity-manager-full-time-days-in-office-position-at-concord-hospital-health-system-4335356472?position=3&pageNum=0&refId=m602lRfTmz10Feq0jtmX0w%3D%3D&trackingId=aa0aTdeRCvLEpJ2hx3qRTg%3D%3D
Scraping job: https://www.linkedin.com/jobs/view/soc-tier-1-at-lensa-4305445387?position=4&pageNum=0&refId=m602lRfTmz10Feq0jtmX0w%3D%3D&trackingId=9%2B27mx39ZL%2BjkiHfGipFQA%3D%3D
Scraping job: https://www.linkedin.com/jobs/view/cybersecurity-analyst-at-mclane-global-4334627012?position=5&pageN

In [6]:
# Show the headline columns for the first five scraped jobs.
df.loc[:, ["title", "company", "location", "description"]].head(n=5)

Unnamed: 0,title,company,location,description
0,Security Analyst 1,"{'name': 'Uber', 'url': 'https://www.linkedin....","Seattle, WA",About The RoleThe CyberSecurity Incident Respo...
1,Information Security Officer,"{'name': 'Long Beach Unified School District',...","Lakewood, CA",To be considered for this position you will ne...
2,Cybersecurity Manager | Full Time | Days | In ...,"{'name': 'Concord Hospital Health System', 'ur...","Concord, NH",This position is an in-office position.Summary...
3,SOC Tier 1,"{'name': 'Lensa', 'url': 'https://www.linkedin...","Washington, DC",Lensa is a career site that helps job seekers ...
4,Cybersecurity Analyst,"{'name': 'McLane Global', 'url': 'https://www....","Houston, TX",As one of the world's largest food-grade logis...


In [7]:
# Persist the scraped jobs as a UTF-8 CSV file in the working directory
# (needs a writable filesystem). The DataFrame index is left out of the file.
df.to_csv(path_or_buf="jobs.csv", encoding="utf-8", index=False)
print("Jobs successfully saved to jobs.csv")

Jobs successfully saved to jobs.csv
