# JobsNYC.gov Web Scraping
#### June 11th, 2024
A step-by-step script to retrieve job listing data from the City of New York's official government jobs site. Scrape conducted on 06/11/2024 for Pursuit.org by Youssef Agour, for internal use only.

Source site: https://cityjobs.nyc.gov/

### Necessary Libraries

In [1]:
!pip install selenium
!pip install webdriver_manager



In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import urllib.request

import time
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date

# Calendar date (local time) on which this crawl is executed
date_today = datetime.now().date()

### Establishing Scrape Parameters

In [61]:
# Search-results page the crawl starts from (query string kept exactly as given)
url = (
    'https://cityjobs.nyc.gov/jobs'
    '?q=cyber&options=&page=2&size=24'
)

# Maximum number of job postings to visit
num_jobs = 44

In [4]:
# One independent, initially empty accumulator per scraped field:
# job link, title, salary, exam requirement, experience level,
# number of open positions, and the time each posting was scraped.
jl, title, salary, exam_req, exp, num_open, timestamp = ([] for _ in range(7))

In [63]:
# Initialize the driver and open the first results page
driver = webdriver.Chrome()
driver.set_window_size(1120, 1300)
driver.get(url)

# Number of jobs crawled so far
jobcounts = 0

# try/finally guarantees the browser session is shut down even when the crawl
# is interrupted or an unexpected error escapes the loop.
try:
    # Keep paging through the search results until num_jobs postings have been
    # collected or the next-page button can no longer be clicked.
    while num_jobs > jobcounts:

        # Every posting on a results page exposes a "Learn more" anchor
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        job_elements = soup.find_all('a', {'aria-label': 'Learn more'})

        for s1 in job_elements:
            # Stop as soon as the requested number of jobs has been reached
            if jobcounts >= num_jobs:
                break

            # Job link: the anchor's href appended to the site root
            job_link = 'https://cityjobs.nyc.gov' + s1.get('href')
            jl.append(job_link)

            # Go into specific job link
            driver.get(job_link)

            # Author's note: past the first few postings the site may need
            # human clicks, so pause briefly before reading the page.
            if jobcounts > 2:
                time.sleep(1)

            s_l2 = BeautifulSoup(driver.page_source, 'html.parser')

            # Each field below appends exactly one value per posting: the
            # scraped text, or 'N/A' when the lookup fails. This keeps all
            # lists the same length.

            # job title
            try:
                title.append(s_l2.find('span', {'class': "header__text"}).text)
            except Exception as e:
                title.append('N/A')
                print(f"Failed to scrape job title: {e}")

            # salary estimates
            try:
                salary_widget_div = s_l2.find('div', {'data-type': 'SalaryWidget'})
                salary_range_span = salary_widget_div.find_all('span')[0]
                # Not used further, but the lookup raises when the expected
                # nested span is absent, which sends this entry to 'N/A'.
                salary_text_span = salary_range_span.find_all('span')[0]
                salary_text = salary_range_span.get_text(strip=True)
                salary.append(salary_text)
            except Exception as e:
                salary.append('N/A')
                print(f"Failed to scrape salary: {e}")

            # exam requirement
            try:
                exam_req.append(s_l2.find('li', {'class': "TitleClassification-wrapper"}).text)
            except Exception as e:
                exam_req.append('N/A')
                print(f"Failed to scrape exam requirement: {e}")

            # experience level
            try:
                exp.append(s_l2.find('li', {'class': 'Experiencelevel-wrapper'}).text)
            except Exception as e:
                exp.append('N/A')
                print(f"Failed to scrape experience level: {e}")

            # Number of open positions
            try:
                num_open.append(s_l2.find('div', {'class':'attrax-job-information-widget__dynamic-field number-of-positions'}).text)
            except Exception as e:
                # Was `Exceptions` (undefined name): a failed lookup raised
                # NameError and aborted the whole crawl.
                num_open.append('N/A')
                print(f"Failed to scrape number of open positions: {e}")

            # timestamp of when this posting was scraped
            timestamp.append(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

            # Increment job count
            jobcounts += 1

            time.sleep(1)

        # Flip page: reload the current results page, then click through to
        # the next one.
        driver.get(url)

        # wait for 2 seconds before clicking the button
        time.sleep(2)

        try:
            nextpage_path = '//*[@id="left-column"]/div[2]/div/button'
            driver.find_element(By.XPATH, nextpage_path).click()
            time.sleep(10)
        except Exception as e:
            # The next-page button could not be found or clicked: stop paging.
            # `Exception` (not a bare except) lets Ctrl-C propagate.
            print(f"Stopping pagination: {e}")
            break

        # update main page url so the next reload lands on the new page
        url = driver.current_url
finally:
    # quit() ends the whole WebDriver session, not just the current window
    driver.quit()

In [64]:
# Build the results table: one row per posting, one column per scraped field.
# zip() stops at the shortest input, so the row count follows the shortest list.
# Note: the `timestamp` list is not part of this table.
column_names = [
    'Job Link',
    'Job Title',
    'Salary',
    'Exam Requirement',
    'Experience Level',
    'Number of Positions Open',
]
rows = list(zip(jl, title, salary, exam_req, exp, num_open))
df = pd.DataFrame(rows, columns=column_names)

In [65]:
# Dimensions of the assembled table as (rows, columns)
df.shape

(605, 6)

In [66]:
# Preview the last rows of the table as a sanity check on the final scraped postings
df.tail()

Unnamed: 0,Job Link,Job Title,Salary,Exam Requirement,Experience Level,Number of Positions Open
600,https://cityjobs.nyc.gov/job/project-director-...,\n PROJECT DIRECTOR\n,"Salary range:$108,071.00 – $124,282.00",\n\n Title Classificati...,\n\n Experience level:\...,\nNumber of positions\n1\n
601,https://cityjobs.nyc.gov/job/cloud-engineer-in...,\n Cloud Engineer\n,"Salary range:$115,854.00 – $130,701.00",\n\n Title Classificati...,\n\n Experience level:\...,\nNumber of positions\n1\n
602,https://cityjobs.nyc.gov/job/database-administ...,\n Database Administrator\n,"Salary range:$115,854.00 – $130,701.00",\n\n Title Classificati...,\n\n Experience level:\...,\nNumber of positions\n1\n
603,https://cityjobs.nyc.gov/job/devops-support-en...,\n DevOps Support Engineer\n,"Salary range:$107,281.00 – $120,190.00",\n\n Title Classificati...,\n\n Experience level:\...,\nNumber of positions\n1\n
604,https://cityjobs.nyc.gov/job/senior-software-e...,\n Senior Software Engineer (MS-Dyn...,"Salary range:$115,854.00 – $130,701.00",\n\n Title Classificati...,\n\n Experience level:\...,\nNumber of positions\n1\n


In [67]:
# Persist the table as a CSV file (relative path, so it lands in the working directory)
output_path = 'nyc_govjobs_dataframe_final_0611.csv'
df.to_csv(output_path)