In [1]:
import csv
from time import sleep
import json
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# salary.com URL pattern with two placeholders, filled below via
# template.format(position, city): first the job-title slug, then the city slug
template = 'https://www.salary.com/research/salary/alternate/{}-salary/{}'

In [3]:
# search criteria, written as the URL slugs that fill the template
position = 'senior-accountant'
city = 'charlotte-nc'

url = template.format(position, city)

# request the raw html; the timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of letting the parsing cells fail on an error page
response = requests.get(url, timeout=10)
response.raise_for_status()

In [4]:
# parse the downloaded page text with Python's built-in 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# find the JSON-LD <script> tag whose text mentions "Occupation";
# `string=` is the current name of the filter formerly spelled `text=`
pattern = re.compile(r'Occupation')
script = soup.find('script', {'type': 'application/ld+json'}, string=pattern)

In [6]:
# display the matched <script> tag to inspect the embedded JSON-LD
script

<script type="application/ld+json">
 {
    "@context": "http://schema.org",
    "@type": "Occupation",
    "name": "Senior Accountant",
    "mainEntityOfPage": {
        "@type": "WebPage",
        "lastReviewed": "2022-04-26T00:00:00Z"
    },
    "description": "The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects of a proje

In [7]:
# take the first child of the <script> tag; it is decoded as JSON in the next cell
json_raw = script.contents[0]

In [8]:
# decode the JSON text into Python objects
json_data = json.loads(json_raw)

In [9]:
# display the decoded data to see which keys are available
json_data

{'@context': 'http://schema.org',
 '@type': 'Occupation',
 'name': 'Senior Accountant',
 'mainEntityOfPage': {'@type': 'WebPage',
  'lastReviewed': '2022-04-26T00:00:00Z'},
 'description': 'The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects of a project. Work is generally independent and collaborative in nature. Working as 

In [10]:
# descriptive fields; the location name comes from the first occupationLocation entry
job_title = json_data['name']
location = json_data['occupationLocation'][0]['name']
description = json_data['description']

# every salary figure is read from the first estimatedSalary entry
salary_stats = json_data['estimatedSalary'][0]
ntile_10, ntile_25, ntile_50, ntile_75, ntile_90 = (
    salary_stats[key]
    for key in ('percentile10', 'percentile25', 'median', 'percentile75', 'percentile90')
)

# bundle everything into one record and show it
salary_data = (job_title, location, description, ntile_10, ntile_25, ntile_50, ntile_75, ntile_90)
print(salary_data)

('Senior Accountant', 'Charlotte, NC', 'The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects of a project. Work is generally independent and collaborative in nature. Working as a Senior Accountant typically requires 4 to 7 years of related experience.', '67797', '74990', '82890', '91790', '99893')


In [11]:
def extract_salary_info(job_title, job_city):
    """Extract and return salary information for one job title in one city.

    Builds the salary.com URL from the two slugs, downloads the page and reads
    the figures from the page's JSON-LD ``<script>`` block that mentions
    ``Occupation``.

    Args:
        job_title: URL slug of the position, e.g. ``'senior-accountant'``.
        job_city: URL slug of the city, e.g. ``'charlotte-nc'``.

    Returns:
        A tuple ``(title, location, description, percentile10, percentile25,
        median, percentile75, percentile90)`` with the values as they appear in
        the JSON (HTML entities in the description are decoded), or ``None``
        when the request fails, the response status is not 200, the JSON-LD
        block is missing or malformed, or an expected field is absent.
    """
    # local import so the function does not depend on a notebook-level import
    from html import unescape

    template = 'https://www.salary.com/research/salary/alternate/{}-salary/{}'

    # build the url based on search criteria
    url = template.format(job_title, job_city)

    # request the raw html; the timeout prevents one stalled connection from
    # blocking a whole crawl, and RequestException also covers timeouts
    try:
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        return None
    if response.status_code != 200:
        return None

    # parse the html and locate the JSON-LD block
    soup = BeautifulSoup(response.text, 'html.parser')
    pattern = re.compile(r'Occupation')
    script = soup.find('script', {'type': 'application/ld+json'}, string=pattern)
    # pages without salary data have no matching tag: report "no data", don't crash
    if script is None or not script.string:
        return None

    # decode the JSON and pull out the fields; any structural surprise
    # (bad JSON, missing key, empty list, wrong type) means "no usable data"
    try:
        json_data = json.loads(script.string)
        salary = json_data['estimatedSalary'][0]
        return (
            json_data['name'],
            json_data['occupationLocation'][0]['name'],
            # the site embeds HTML entities such as &#39; in the text
            unescape(json_data['description']),
            salary['percentile10'],
            salary['percentile25'],
            salary['median'],
            salary['percentile75'],
            salary['percentile90'],
        )
    except (ValueError, KeyError, IndexError, TypeError):
        return None

In [20]:
# open() only reads local paths, and the github.com /blob/ address is an HTML
# page rather than the file itself, so download the raw file over HTTP instead
cities_url = 'https://raw.githubusercontent.com/israel-dryer/Salary-Dot-Com-Scraper/37ad88e98413b04aec98bf17649a5c2654ebd727/largest_cities.csv'
cities_response = requests.get(cities_url, timeout=10)
cities_response.raise_for_status()

# csv.reader accepts any iterable of lines, so feed it the downloaded text
reader = csv.reader(cities_response.text.splitlines())
# a reader typically returns each row as a list... so I need to flatten the list to make a single list
# (blank cells are skipped and surrounding whitespace is trimmed)
cities = [city.strip() for row in reader for city in row if city.strip()]

In [None]:
# preview the first 10 entries of the city list
print(cities[:10])

In [None]:
# collect one record per city; cities whose lookup returns nothing are skipped
salary_data = []

for city in cities:
    result = extract_salary_info('senior-accountant', city)
    if result:
        salary_data.append(result)
    # pause after every request, failed ones included, so a run of
    # unsuccessful lookups does not hit the site back-to-back
    sleep(0.5)

In [16]:
# header labels, listed in the same order as the fields of each record
columns = ['Title','Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90']

# write the header followed by every collected record in a single call
with open('salary-results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([columns, *salary_data])

In [17]:
# print the first 5 records
for row in salary_data[:5]:
    print(row)

In [18]:
def main(job_title, cities_file='largest_cities.csv', output_file='salary-results.csv', delay=0.5):
    """Extract salary data for one job title across a list of cities.

    Reads city slugs from a CSV file, looks each one up with
    ``extract_salary_info`` and writes the successful results to a CSV file.

    Args:
        job_title: URL slug of the position, e.g. ``'senior-accountant'``.
        cities_file: Path of the CSV file holding the city slugs. Every
            non-blank cell of every row is treated as one city.
        output_file: Path of the CSV file to write (overwritten if present).
        delay: Seconds to wait after each lookup.

    Returns:
        The list of result tuples, one per city that produced data, in the
        order the cities were read.
    """
    # get the list of cities; 'utf-8-sig' reads plain UTF-8 and also drops a
    # leading byte-order mark that would otherwise stick to the first slug
    with open(cities_file, newline='', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        # a reader returns each row as a list, so flatten the rows into a single
        # list; blank cells are dropped and surrounding whitespace is trimmed
        cities = [city.strip() for row in reader for city in row if city.strip()]

    # extract salary data for each city
    salary_data = []
    for city in cities:
        result = extract_salary_info(job_title, city)
        if result:
            salary_data.append(result)
        # throttle after every request, failed ones included, so a run of
        # unsuccessful lookups does not hit the site back-to-back
        sleep(delay)

    # save data to csv file
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title','Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'])
        writer.writerows(salary_data)

    return salary_data