In [303]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [304]:
# Indeed Canada search URL pattern: the two {} slots are the q (query) and
# l (location) parameters; jt=fulltime, fromage=14 and sort=date are fixed.
template = 'https://ca.indeed.com/jobs?q={}&l={}&jt=fulltime&fromage=14&sort=date'

In [305]:
def get_url(position, location):
    """Generate an Indeed Canada search URL from a position and a location.

    Both values are percent-encoded before being placed in the query string,
    so characters such as '&', '+', '#' or '=' inside a search term cannot
    split or corrupt the ``q`` and ``l`` parameters.

    Args:
        position: Search query, e.g. 'title:"business analyst"'.
        location: Location filter, e.g. 'Toronto, ON'.

    Returns:
        The search URL as a string.
    """
    # Imported locally so this cell does not depend on edits to the import cell.
    from urllib.parse import quote_plus

    template = 'https://ca.indeed.com/jobs?q={}&l={}&jt=fulltime&fromage=14&sort=date'
    return template.format(quote_plus(position), quote_plus(location))

In [306]:
# Build the search URL for postings titled "business analyst" in Toronto, ON.
url = get_url('title:"business analyst"','Toronto, ON')

## Extract raw HTML

In [307]:
# Download the first search-results page (plain GET, no custom headers passed).
response = requests.get(url)

In [308]:
# Display the response object to check the HTTP status code.
response

<Response [200]>

In [309]:
# Reason phrase sent with the status line; the recorded run shows an empty string.
response.reason

''

In [310]:
# Parse the returned HTML using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [311]:
# Collect every <a> element carrying the 'tapItem' CSS class
# (presumably one per job posting on the page -- verify against the live markup).
cards = soup.find_all('a', 'tapItem')

In [312]:
# Number of cards found on this results page.
len(cards)

15

## Prototype the model with a single record

In [375]:
# Use the first card as the sample record for prototyping the field extraction.
card = cards[0]

In [348]:
# Title: stripped text of the first <span> inside the card that has a title attribute.
job_title = card.select_one("span[title]").text.strip()

In [376]:
# Full posting URL: the card's href appended to the site root
# (presumably the href is site-relative -- confirm against the page source).
job_url = 'https://ca.indeed.com' + card.get('href')

In [349]:
# Company: stripped text of the <span> with class 'companyName'.
company = card.find('span', 'companyName').text.strip()

In [328]:
# Location: stripped text of the <div> with class 'companyLocation'.
job_location = card.find('div', 'companyLocation').text.strip()

In [329]:
# Summary: text of the 'job-snippet' <div>, with each newline turned into '. '
# so the snippet stays on a single line.
job_summary = card.find('div','job-snippet').text.strip().replace('\n', '. ')

In [330]:
# Posting age exactly as displayed on the card (stripped text of the 'date' <span>).
posted_date = card.find('span', 'date').text.strip()

In [331]:
# Extraction date as a YYYY-MM-DD string, taken from the local clock.
today = datetime.today().strftime('%Y-%m-%d')

In [373]:
# Salary is treated as optional: find() returns None when no matching <div>
# exists, so the .text access raises AttributeError and the value falls back to ''.
try:
    job_salary = card.find('div','metadata salary-snippet-container').text.strip()
except AttributeError:
    job_salary = ''

In [382]:
# Fetch the posting's own page and parse it to reach the full description.
response2 = requests.get(job_url)
soup2 = BeautifulSoup(response2.text, 'html.parser')
# find() takes (name, attrs, recursive, ...). The class dict previously passed
# as a third positional argument was bound to `recursive` and never filtered
# anything, so the lookup is written as what it always was: a match on id.
job_description = soup2.find('div', id='jobDescriptionText').text.strip().replace('\n', '. ')

## Generalize the model with a function

In [384]:
def get_record(card):
    """Extract job data from a single search-result card.

    Every field is optional: an element that is missing from the card (or
    from the posting page) yields '' instead of raising, so one incomplete
    card cannot abort a whole scrape.

    Args:
        card: Parsed <a> element for one posting, as returned by
            ``soup.find_all('a', 'tapItem')``.

    Returns:
        A 9-tuple ``(job_title, company, job_location, job_salary,
        job_summary, job_description, posted_date, today, job_url)`` where
        ``today`` is the extraction date formatted as YYYY-MM-DD.
    """
    def text_of(node):
        # find()/select_one() return None when nothing matches.
        return node.text.strip() if node is not None else ''

    job_title = text_of(card.select_one("span[title]"))

    # A card without an href would make the string concatenation fail.
    href = card.get('href')
    job_url = 'https://ca.indeed.com' + href if href is not None else ''

    company = text_of(card.find('span', 'companyName'))
    job_location = text_of(card.find('div', 'companyLocation'))
    job_summary = text_of(card.find('div', 'job-snippet')).replace('\n', '. ')
    posted_date = text_of(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    job_salary = text_of(card.find('div', 'metadata salary-snippet-container'))

    # The full description lives on the posting's own page; skip the request
    # when there is no link to follow.
    job_description = ''
    if job_url:
        response2 = requests.get(job_url)
        soup2 = BeautifulSoup(response2.text, 'html.parser')
        # find() is (name, attrs, recursive, ...): a third positional dict is
        # taken as `recursive`, not as a class filter, so match on id only,
        # which is what the lookup effectively did before.
        description_node = soup2.find('div', id='jobDescriptionText')
        job_description = text_of(description_node).replace('\n', '. ')

    record = (job_title, company, job_location, job_salary, job_summary,
              job_description, posted_date, today, job_url)

    return record

In [355]:
# Accumulator for the scraped postings; later cells keep appending to it.
records = []

# Turn every card on the first results page into a record tuple.
for card in cards:
    record = get_record(card)
    records.append(record)

In [360]:
# Inspect the first scraped record as a sanity check.
records[0]

('Business Analyst',
 'Total Credit Recovery',
 'North York, ON',
 '$16 - $20 an hour',
 'Provide consultation to key clients on strategies and business trends.. Conduct analysis and present insights to business leaders in monthly meetings.',
 'PostedJust posted',
 '2022-03-11',
 'https://ca.indeed.com/company/Total-credit-recovery/jobs/Business-Analyst-869a7b988f69b16a?fccid=fbd36ea7a8fc690f&vjs=3')

## Getting the next page

In [364]:
# Walk the result pages by following the "Next" pagination link, appending
# each page's postings to `records`, until a page has no such link.
while True:
    next_link = soup.find('a', {'aria-label': 'Next'})
    if next_link is None:
        # No "Next" anchor: this was the last page.
        break
    url = 'https://ca.indeed.com' + next_link.get('href')

    # Load and parse the following page, then collect its cards.
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('a', 'tapItem')

    for card in cards:
        records.append(get_record(card))

In [365]:
# Total number of records collected across all result pages.
len(records)

117

In [369]:
# Inspect the last scraped record. A negative index is used because the
# number of postings changes from run to run, so a fixed position such as
# 116 raises IndexError whenever fewer rows come back.
records[-1]

('Senior Salesforce Business Analyst',
 'Salesforce',
 'Toronto, ON',
 '$70,600 a year',
 'Document future state business processes.. Write user stories for future state business processes and get client sign-off.. 3+ years of relevant work experience.',
 'Posted14 days ago',
 '2022-03-11',
 'https://ca.indeed.com/rc/clk?jk=f5f53cbd554587c3&fccid=4027cfd917e1ee29&vjs=3')

## Putting it all together

In [13]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from time import sleep

# Browser-like request headers sent with the search-page requests.
# 'accept-encoding' lists only gzip and deflate: requests decodes those out of
# the box, whereas a brotli ('br') body can only be decoded when an optional
# brotli package is installed; without it the response text is unreadable and
# the parser silently finds no postings.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47'
}

def get_url(position, location):
    """Generate an Indeed Canada search URL from a position and a location.

    Both values are percent-encoded before being placed in the query string,
    so characters such as '&', '+', '#' or '=' inside a search term cannot
    split or corrupt the ``q`` and ``l`` parameters.

    Args:
        position: Search query, e.g. 'title:"data analyst"'.
        location: Location filter, e.g. 'Toronto, ON'.

    Returns:
        The search URL as a string.
    """
    # Imported locally so the function is self-contained within this cell.
    from urllib.parse import quote_plus

    template = 'https://ca.indeed.com/jobs?q={}&l={}&jt=fulltime&fromage=14&sort=date'
    return template.format(quote_plus(position), quote_plus(location))

def _text_or_empty(node):
    """Return the stripped text of a parsed element, or '' when it is missing."""
    return node.text.strip() if node is not None else ''


def get_record(card):
    """Extract job data from a single search-result card.

    Every field is optional: an element that is missing from the card yields
    '' instead of raising, so one incomplete card cannot abort the scrape.

    Args:
        card: Parsed <a> element for one posting, as returned by
            ``soup.find_all('a', 'tapItem')``.

    Returns:
        A 9-tuple ``(job_title, company, job_location, job_salary,
        job_summary, job_description, posted_date, today, job_url)`` where
        ``today`` is the extraction date formatted as YYYY-MM-DD.
    """
    job_title = _text_or_empty(card.select_one("span[title]"))

    # card.get() returns None for a missing href; concatenating None raises
    # TypeError, which an `except AttributeError` does not catch.
    href = card.get('href')
    job_url = 'https://ca.indeed.com' + href if href is not None else ''

    company = _text_or_empty(card.find('span', 'companyName'))
    job_location = _text_or_empty(card.find('div', 'companyLocation'))
    job_summary = _text_or_empty(card.find('div', 'job-snippet')).replace('\n', '. ')
    posted_date = _text_or_empty(card.find('span', 'date'))
    today = datetime.today().strftime('%Y-%m-%d')
    job_salary = _text_or_empty(card.find('div', 'metadata salary-snippet-container'))

    # Without a link there is no posting page to fetch, and requesting an
    # empty URL would raise.
    job_description = get_description(job_url) if job_url else ''

    record = (job_title, company, job_location, job_salary, job_summary,
              job_description, posted_date, today, job_url)
    return record

def get_description(job_url, timeout=30, delay=3):
    """Extract the job description from a job posting page.

    Args:
        job_url: Absolute URL of the posting, or an empty value when the
            search card had no usable link.
        timeout: Seconds to wait for the server before giving up.
        delay: Seconds to pause after the request, to throttle the scrape.

    Returns:
        The text of the ``jobsearch-jobDescriptionText`` div with newlines
        replaced by '. ', or '' when the URL is empty, the request fails,
        or the div is not present.
    """
    # get_record's fallback for a card without a link is ''; requests rejects
    # an empty URL, so return early instead of raising.
    if not job_url:
        return ''

    try:
        # Send the same browser-like headers as the search-page requests.
        response2 = requests.get(job_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best effort: one unreachable posting should not discard the
        # records collected so far.
        return ''
    sleep(delay)

    soup2 = BeautifulSoup(response2.text, 'html.parser')
    description_node = soup2.find('div', 'jobsearch-jobDescriptionText')
    if description_node is None:
        return ''
    return description_node.text.strip().replace('\n', '. ')
    
def main(position, location, output_path='results.csv'):
    """Scrape every result page for a search and save the postings to CSV.

    Args:
        position: Search query passed to ``get_url``.
        location: Location filter passed to ``get_url``.
        output_path: Destination CSV file; defaults to 'results.csv' in the
            current working directory.

    The file is written once, after the last page has been processed, with a
    header row followed by one row per posting.
    """
    records = []
    url = get_url(position, location)

    # extract the job data, one results page per iteration
    while True:
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        cards = soup.find_all('a', 'tapItem')

        for card in cards:
            records.append(get_record(card))

        # Stop when there is no "Next" link. An anchor without an href is
        # treated the same way rather than failing on the concatenation.
        next_link = soup.find('a', {'aria-label': 'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        url = 'https://ca.indeed.com' + next_href
        sleep(3)  # pause between result pages

    # save the job data
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'Salary', 'Summary', 'Description', 'PostDate', 'ExtractDate', 'JobUrl'])
        writer.writerows(records)

In [14]:
# Run the full scrape for postings titled "data analyst" in Toronto, ON;
# main() writes the collected rows to results.csv.
main('title:"data analyst"', 'Toronto, ON')