In [1]:
import re
import csv
import json
from time import sleep
from bs4 import BeautifulSoup
import requests

In [2]:
# URL pattern for a salary.com report. The two placeholders are filled with the
# job-title slug and the city slug, in that order (see the .format call below).
template = 'https://www.salary.com/research/salary/alternate/{}-salary/{}'

In [3]:
# Job-title and city slugs used to build the example salary.com URL.
position = 'senior-accountant'
city = 'charlotte-nc'

url = template.format(position, city)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx response instead of letting the
# following cells try to parse an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

In [4]:
# Parse the downloaded HTML using the lxml parser.
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
# Select the JSON-LD <script> tag whose text matches "Occupation".
pattern = re.compile(r'Occupation')
# `string=` is the current name of this filter; the older `text=` spelling is
# deprecated in Beautiful Soup.
script = soup.find('script', {'type': 'application/ld+json'}, string=pattern)
print(script)

<script type="application/ld+json">
 {
    "@context": "http://schema.org",
    "@type": "Occupation",
    "name": "Senior Accountant",
    "mainEntityOfPage": {
        "@type": "WebPage",
        "lastReviewed": "2020-12-28T00:00:00Z"
    },
    "description": "The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects o

In [6]:
# The first child of the <script> tag is the raw JSON text it embeds.
json_raw = script.contents[0]

In [7]:
# Decode the JSON-LD text into Python dicts and lists.
json_data = json.loads(json_raw)

In [8]:
# Display the decoded structure to locate the fields extracted in the next cell.
json_data

{'@context': 'http://schema.org',
 '@type': 'Occupation',
 'name': 'Senior Accountant',
 'mainEntityOfPage': {'@type': 'WebPage',
  'lastReviewed': '2020-12-28T00:00:00Z'},
 'description': 'The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects of a project. Work is generally independent and collaborative in nature. Working as 

In [9]:
# Descriptive fields of the Occupation record.
job_title, location, description = (
    json_data['name'],
    json_data['occupationLocation'][0]['name'],
    json_data['description'],
)

# Salary figures, all read from the first entry of 'estimatedSalary'.
ntile_10, ntile_25, ntile_50, ntile_75, ntile_90 = (
    json_data['estimatedSalary'][0][key]
    for key in ('percentile10', 'percentile25', 'median', 'percentile75', 'percentile90')
)

# One flat record per (job, city) page.
salary_data = (
    job_title, location, description,
    ntile_10, ntile_25, ntile_50, ntile_75, ntile_90,
)
print(salary_data)

('Senior Accountant', 'Charlotte, NC', 'The Senior Accountant ensures the accuracy of entries to ledger accounts and reconciles subsidiary ledger accounts to the general ledger. Maintains financial records and ensures that financial transactions are properly recorded. Being a Senior Accountant analyzes current costs, revenues, financial commitments, and obligations incurred to predict future revenues and expenses. Prepares complex balance sheets, profit and loss statements and other financial reports. In addition, Senior Accountant may supervise and guide lower-level accountants. Requires a bachelor&#39;s degree. May require CPA certification. Typically reports to a supervisor or manager. Being a Senior Accountant contributes to moderately complex aspects of a project. Work is generally independent and collaborative in nature. Working as a Senior Accountant typically requires 4 to 7 years of related experience.', '64962', '71790', '79290', '87890', '95720')


In [10]:
def extract_salary_info(job_title, job_city):
    """Scrape the salary.com summary for one job title in one city.

    Args:
        job_title: Job-title slug as it appears in the URL,
            e.g. 'senior-accountant'.
        job_city: City slug as it appears in the URL, e.g. 'charlotte-nc'.

    Returns:
        A tuple ``(title, location, description, percentile10, percentile25,
        median, percentile75, percentile90)``, or ``None`` when the page
        cannot be fetched, does not answer with HTTP 200, contains no
        matching JSON-LD block, or that block lacks the expected fields.
    """
    import html  # local import: only needed to decode entities in the description

    template = 'https://www.salary.com/research/salary/alternate/{}-salary/{}'
    url = template.format(job_title, job_city)

    try:
        # The timeout prevents one stalled connection from blocking a whole run.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        # Base class of ConnectionError, Timeout, etc.: any transport failure
        # means "no data for this city" rather than a crash.
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    # `string=` replaces the deprecated `text=` keyword.
    script = soup.find(
        'script',
        {'type': 'application/ld+json'},
        string=re.compile(r'Occupation'),
    )
    if script is None:
        # Page loaded but carries no Occupation record.
        return None

    try:
        json_data = json.loads(script.contents[0])
        salary = json_data['estimatedSalary'][0]
        return (
            json_data['name'],
            json_data['occupationLocation'][0]['name'],
            # The embedded text carries HTML entities such as "&#39;"; decode
            # them so the stored description is plain text.
            html.unescape(json_data['description']),
            salary['percentile10'],
            salary['percentile25'],
            salary['median'],
            salary['percentile75'],
            salary['percentile90'],
        )
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON (JSONDecodeError is a ValueError) or an unexpected schema.
        return None

In [11]:
# Flatten every cell of largest_cities.csv into one list of city slugs.
# The encoding is pinned so the read does not depend on the platform default
# (the results file is written as UTF-8 as well). Cells are stripped and
# blank ones dropped, because each value is interpolated straight into a URL.
with open('largest_cities.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    cities = [city.strip() for row in reader for city in row if city.strip()]

In [12]:
# Peek at the first ten entries to confirm the file was read as expected.
print(cities[:10])

['New-York-NY', 'Los-Angeles-CA', 'Chicago-IL', 'Houston-TX', 'Phoenix-AZ', 'Philadelphia-PA', 'San-Antonio-TX', 'San-Diego-CA', 'Dallas-TX', 'San-Jose-CA']


In [None]:
# Collect one result tuple per city; cities for which the lookup returned
# nothing are skipped.
salary_data = []

for city in cities:
    result = extract_salary_info(position, city)
    if result:
        salary_data.append(result)
    # Pause after every request, not only after successful ones, so a run of
    # failed lookups does not become a burst of back-to-back requests.
    sleep(0.5)

# Show a small sample of what was scraped.
for row in salary_data[:5]:
    print(row)

In [None]:
# Persist the scraped rows: a header line followed by one line per city.
with open('salary-results.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([
        ['Title', 'Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'],
        *salary_data,
    ])

In [None]:
def main(job_title, cities_file='largest_cities.csv', output_file='salary-results.csv', delay=0.5):
    """Scrape salary data for one job title across every listed city.

    Args:
        job_title: Job-title slug passed to ``extract_salary_info`` for
            each city.
        cities_file: CSV file whose cells are city slugs; all rows are
            flattened into a single list.
        output_file: Path of the CSV file the results are written to
            (overwritten on each run).
        delay: Seconds to wait after each request.

    Returns:
        The list of result tuples that were written to ``output_file``;
        cities whose lookup returned nothing are omitted.
    """
    with open(cities_file, newline='') as f:
        cities = [city for row in csv.reader(f) for city in row]

    salary_data = []
    for city in cities:
        # Use the caller's job title, not a notebook-level global.
        result = extract_salary_info(job_title, city)
        if result:
            salary_data.append(result)
        # Throttle every request, including failed ones.
        sleep(delay)

    with open(output_file, 'w', newline='', encoding='utf-8') as g:
        writer = csv.writer(g)
        writer.writerow(['Title', 'Location', 'Description', 'nTile10', 'nTile25', 'nTile50', 'nTile75', 'nTile90'])
        writer.writerows(salary_data)
    return salary_data