# Indeed Job Scraper
Create a general-purpose job scraper.

In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

### Set up the query and URL

In [2]:
# Build the search URL from the job title and location supplied by the user
def get_url(position, location):
    """Build the Indeed search URL for a job title and a location.

    Args:
        position: Free-text job title or keywords, e.g. ``'senior accountant'``.
        location: Free-text location, e.g. ``'charlotte nc'``.

    Returns:
        The search URL with both values encoded for use in a query string
        (spaces become ``+``, reserved characters are percent-escaped).
    """
    # Imported here so the function works with the notebook's existing imports.
    from urllib.parse import quote_plus

    # the link to connect to indeed site:
    template = 'https://www.indeed.com/jobs?q={}&l={}'
    # Without encoding, a space or a character such as '&' or '#' in the
    # input would produce a malformed or truncated query string.
    return template.format(quote_plus(position), quote_plus(location))

In [3]:
# Build the search URL for the example query and show it.
url = get_url(position='senior accountant', location='charlotte nc')
print(url)

https://www.indeed.com/jobs?q=senior+accountant&l=charlotte+nc


### Extract the HTML data

In [4]:
# Request the first results page. The timeout keeps a stalled connection from
# hanging the cell, and raise_for_status() reports an HTTP error (e.g. a blocked
# request) here instead of letting an error page be parsed as if it held results.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [5]:
# Parse the HTML text of the response with the 'html.parser' backend.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [6]:
# Each job posting on the page is a <div class="jobsearch-SerpJobCard"> element.
cards = soup.find_all('div', class_='jobsearch-SerpJobCard')

### Prototype the model with a single record

In [7]:
# The page holds several job cards (15 in this run); take the first one as
# the working example for prototyping the field extraction below.
card = cards[0]

In [8]:
# The job title is stored in the `title` attribute of the link inside the card's <h2>.
'''<h2 class='title'
<a

get method never throws an error if the property not found
we can also use card.h2.a['title'] but it throws error for the same'''

# .get() yields None when the attribute is missing, whereas indexing with
# ['title'] would raise KeyError.
job_title = card.find('h2').find('a').get('title')

In [23]:
'''<div>
<span class="company"'''
# The company name is the whitespace-stripped text of the <span class="company"> element.
company = card.find('span', class_='company').get_text().strip()

In [10]:
'''<div id="recJobLoc_1b771b99161b3911" class="recJobLoc" data-rc-loc="Charlotte, NC" style="display: none"></div>'''
# The location is read from the data-rc-loc attribute of the recJobLoc <div>.
job_location = card.find('div', class_='recJobLoc').get('data-rc-loc')

In [11]:
'''<span class="date ">21 days ago</span> 
we will extract text from the tag'''
# post_date is the relative age text shown on the card; today is the
# extraction date formatted as YYYY-MM-DD.
post_date = card.find('span', class_='date').get_text()
today = f'{datetime.today():%Y-%m-%d}'

In [12]:
'''<div class="summary">
<ul style="list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;"> 
 <li style="margin-bottom:0px;">Ability to work flexible schedules and hours in completion of routine work and special assignments.</li>
 <li>Under general supervisions this position will assist with…</li>
</ul></div>
we will extract text from the tag'''
# Strip the surrounding whitespace, then put the summary on a single line by
# turning every newline into a space.
summary = ' '.join(card.find('div', class_='summary').get_text().strip().split('\n'))

In [13]:
# Not every job card has a salary element, so fall back to an empty string.
'''<span class="salaryText">
$60,806 - $76,007 a year</span>'''
salary_tag = card.find('span', class_='salaryText')
salary = salary_tag.get_text().strip() if salary_tag else ''

In [14]:
'''<a target="" id="jl_1b771b99161b3911" href="/rc/clk?jk=1b771b99161b3911&amp;fccid=d7e0d7cc61aa08f2&amp;vjs=3" onmousedown="return rclk(this,jobmap[0],1);" onclick="setRefineByCookie([]); return rclk(this,jobmap[0],true,1);" rel="noopener nofollow" title="Accountant II (Senior Budget Officer)" class="jobtitle turnstileLink visited" data-tn-element="jobTitle">
<b>Accountant</b> II (<b>Senior</b> Budget Officer)</a>'''
# The link's href is site-relative, so prefix it with the Indeed origin.
job_url = 'https://www.indeed.com' + card.find('h2').find('a').get('href')

In [15]:
# Bundle the extracted fields into a single tuple.
record = (
    job_title,
    company,
    job_location,
    post_date,
    today,
    summary,
    salary,
    job_url,
)

In [16]:
# Display the assembled record to check the extracted values.
record

('Accountant II (Senior Budget Officer)',
 'City of Charlotte and Mecklenburg County',
 'Charlotte, NC',
 '21 days ago',
 '2021-02-02',
 'Ability to work flexible schedules and hours in completion of routine work and special assignments. Under general supervisions this position will assist with…',
 '$60,806 - $76,007 a year',
 'https://www.indeed.com/rc/clk?jk=1b771b99161b3911&fccid=d7e0d7cc61aa08f2&vjs=3')

### Generalize the model with a function

In [17]:
def get_record(card):
    """Pull the fields of interest out of one job-card element.

    Args:
        card: A parsed job card exposing the tag API used below
            (``card.h2.a`` and ``card.find(name, css_class)``).

    Returns:
        An 8-tuple ``(job_title, company, job_location, post_date, today,
        summary, salary, job_url)``. ``today`` is the extraction date as
        ``YYYY-MM-DD``; ``salary`` is ``''`` when the card has no salary
        element.
    """
    title_link = card.h2.a

    # Not every card carries a salary element, so this lookup may find nothing.
    salary_tag = card.find('span', 'salaryText')
    salary = salary_tag.text.strip() if salary_tag else ''

    return (
        title_link.get('title'),
        card.find('span', 'company').text.strip(),
        card.find('div', 'recJobLoc').get('data-rc-loc'),
        card.find('span', 'date').text,
        f'{datetime.today():%Y-%m-%d}',
        # Flatten the summary to one line: newlines become spaces.
        ' '.join(card.find('div', 'summary').text.strip().split('\n')),
        salary,
        'https://www.indeed.com' + title_link.get('href'),
    )

In [18]:
# Build the list of records, one per job card found on the parsed page.
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

### Get the next page

In [19]:
# Follow the "Next" pagination link until the last results page is reached,
# appending the records of every page to `records`.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        # The last page has no "Next" arrow, so there is nothing more to fetch.
        break
    url = 'https://www.indeed.com' + next_link.get('href')

    # Bound each request and fail loudly on HTTP errors. `records` is a
    # module-level list, so anything collected so far survives an exception.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')

    for card in cards:
        record = get_record(card)
        records.append(record)

### Putting it all together

In [20]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup


def get_url(position, location):
    """Generate the Indeed search URL from a position and a location.

    Args:
        position: Free-text job title or keywords, e.g. ``'senior accountant'``.
        location: Free-text location, e.g. ``'charlotte nc'``.

    Returns:
        The search URL with both values encoded for use in a query string
        (spaces become ``+``, reserved characters are percent-escaped).
    """
    # Imported here so the function works with the file's existing imports.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q={}&l={}'
    # Replacing only spaces leaves characters such as '&', '+' or '#'
    # unescaped, which splits or alters the query; quote_plus handles them all.
    position = quote_plus(position)
    location = quote_plus(location)
    url = template.format(position, location)
    return url


def get_record(card):
    """Pull the fields of interest out of one job-card element.

    Args:
        card: A parsed job card exposing the tag API used below
            (``card.h2.a`` and ``card.find(name, css_class)``).

    Returns:
        An 8-tuple ``(job_title, company, job_location, post_date, today,
        summary, salary, job_url)``. ``today`` is the extraction date as
        ``YYYY-MM-DD``; ``salary`` is ``''`` when the card has no salary
        element.
    """
    title_link = card.h2.a

    # Not every card carries a salary element, so this lookup may find nothing.
    salary_tag = card.find('span', 'salaryText')
    salary = salary_tag.text.strip() if salary_tag else ''

    return (
        title_link.get('title'),
        card.find('span', 'company').text.strip(),
        card.find('div', 'recJobLoc').get('data-rc-loc'),
        card.find('span', 'date').text,
        f'{datetime.today():%Y-%m-%d}',
        # Flatten the summary to one line: newlines become spaces.
        ' '.join(card.find('div', 'summary').text.strip().split('\n')),
        salary,
        'https://www.indeed.com' + title_link.get('href'),
    )


def main(position, location, output_path='results.csv'):
    """Scrape every results page for a search and save the jobs to a CSV file.

    Args:
        position: Job title or keywords to search for.
        location: Location to search in.
        output_path: Destination CSV file; defaults to ``'results.csv'``.

    Returns:
        None. The collected records are written to ``output_path``,
        overwriting any existing file.
    """
    records = []
    url = get_url(position, location)

    # extract the job data, one results page per iteration
    while True:
        try:
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as err:
            # Stop paginating, but still save what has been collected so far
            # instead of losing it to an unhandled network error.
            print(f'Stopped at {url}: {err}')
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
        records.extend(get_record(card) for card in cards)

        # The last page has no "Next" link, which ends the pagination.
        next_link = soup.find('a', {'aria-label': 'Next'})
        if next_link is None:
            break
        url = 'https://www.indeed.com' + next_link.get('href')

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', 'ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

In [21]:
# Run the scraper for the example query; the results are written to results.csv.
main(position='senior accountant', location='charlotte nc')