# USAJobs.gov Webscraping
#### June 17th, 2024
A step by step script to retrieve job listing data from the US federal government's official jobs site. Scrape conducted on 06/17/2024 for Pursuit.org by Youssef Agour for internal use purposes. 

Source site: https://www.usajobs.gov/

In [1]:
# Library Install
!pip install selenium 
!pip install webdriver_manager



### Step 1: Import Necessary Libraries

In [2]:
# Selenium for webpage navigation
from selenium import webdriver 
from selenium.webdriver.common.by import By

# Beautiful Soup for Website HTML Parsing
from bs4 import BeautifulSoup

# Requests for calling website
import urllib.request

# Pandas, Numpy, and datetime for data manipulation & storage
import time
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date

# record when the data is crawled
date_today = date.today()

### Step 2: Initialize Website for Data Source

In [31]:
# Preparing Query url
url = 'https://www.usajobs.gov/search/results/?g=0&g=1&g=2&g=3&g=4&g=5&g=6&g=7&g=8&g=9&g=10&j=0357&j=1560&j=2210&j=1550&l=&hp=public&p=6&smin=0&smax=85844&gs=true&k='

# Jobs to crawl
num_jobs = 20

### Step 3: Investigate HTML structure and understand variables that can be scraped

In [27]:
# Empty lists for data to be scraped
jl = [] # Job Listing
title = [] # Job Title
agency = [] # Hiring Entity
salary = [] # Salary Range Text
grade = [] # Federal Grade Level
timestamp = [] # When data is collected

### Step 4: Initialize Driver and collect data

In [32]:
# Initialize the driver
driver = webdriver.Chrome()
driver.set_window_size(1120, 1300)
driver.get(url)

# Number of Jobs crawled
jobcounts = 0

# Initializing Scrape
while num_jobs > jobcounts:
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    job_elements = soup.find_all('a', {'class': 'usajobs-search-result--core__title search-joa-link'})
    
    for i in range(len(job_elements)):
        if num_jobs > jobcounts:
            
            s1 = job_elements[i]
            
            # Job Link
            job_link = 'https://www.usajobs.gov/' + s1.get('href')
            jl.append(job_link)
            
            # Go into specific job link
            driver.get(job_link)
            
            # If job counts > 2 may need human clicks
            if jobcounts > 2:
                time.sleep(1)
                
            s_l2 = BeautifulSoup(driver.page_source, 'html.parser')
            
            # job title
            try:
                title.append(s_l2.find('h1', {'class': "usajobs-joa-banner__title"}).text)
            except Exception as e:
                title.append('N/A')
                print(f"Failed to scrape job title: {e}")
            
            # agency
            try:
                agency.append(s_l2.find('a', {'class': "usajobs-joa-banner__agency usajobs-joa-banner--v1-3__agency"}).text)
            except Exception as e:
                agency.append('N/A')
                print(f"Failed to scrape exam requirement: {e}")
            
            try:
                salary_widget_div = s_l2.find('p', {'class': 'usajobs-joa-summary__salary salary-text-normal'})
    
                if salary_widget_div:
                    salary_text = salary_widget_div.get_text(strip=True)
                    salary.append(salary_text)
                else:
                    salary.append('N/A')

            except Exception as e:
                salary.append('N/A')
                print(f"Failed to scrape salary: {e}")
            
                   
            # Grade
            try:
                grade.append(s_l2.find('p', {'class':'usajobs-joa-summary__grades'}).text)
            except Exceptions as e:
                grade.append('N/A')
                print(f"Failed to scrape number of open positions: {e}")
                
            # timestamp
            timestamp.append(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            
            # Increment job count
            jobcounts += 1
            
            time.sleep(1)
            
            
        else: 
            break
    
    # filp page
    # click to the next page
    
    driver.get(url)
    
    # wait for 2 seconds to click the button
    time.sleep(2)
    
    try:
        nextpage_path = '/html/body/section/section/div/main/div[5]/div[12]/ul/li[8]/a'
        driver.find_element(By.XPATH,nextpage_path).click()
        time.sleep(10)
    except:
        break 
        
    # update main page url
    url = driver.current_url
    
# Close the driver
driver.close()

### Step 5: Pool Data into Dataframe format and ensure output is useable

In [33]:
# Assemble dataframe
df = pd.DataFrame(list(zip(jl, title, salary, agency, grade)),
                 columns = ['Job Link', 'Job Title', 'Salary', 'Agnecy', 'Pay & Qualification Grade'])

In [34]:
df.shape

(145, 5)

In [35]:
df.tail()

Unnamed: 0,Job Link,Job Title,Salary,Agnecy,Pay & Qualification Grade
140,https://www.usajobs.gov//job/795837300,\n ICAM Specialist \n ...,"$68,405- $88,926 per year",\n Federal Bureau of In...,GS 9
141,https://www.usajobs.gov//job/795878000,\n Information Technolo...,$18.77- $21.77 per hour,\n Air Force Global Str...,NF 3
142,https://www.usajobs.gov//job/795889800,\n COMPUTER SCIENTIST \...,"$79,735- $103,658 per year",\n United States Fleet ...,GG 7
143,https://www.usajobs.gov//job/795939400,\n Information Technolo...,"$82,764- $113,106 per year",\n U.S. Army Cyber Comm...,GG 11
144,https://www.usajobs.gov//job/795971000,\n IT Specialist INFOSE...,"$66,732- $86,750 per year",\n U.S. Pacific Fleet\n...,GS 9


### Step 6: Export to CSV

In [36]:
# save to csv
df.to_csv('USAJOBS.govjobs_dataframe_final_0617.csv')