### Indeed Job Scraper

In [1]:
import csv 
from datetime import datetime
import requests 
from bs4 import BeautifulSoup

In [2]:
template = 'https://www.indeed.co.uk/jobs?q={}&l={}'

In [3]:
def get_url(position, location):
    """Generate a url from position and location"""
    template = 'https://www.indeed.co.uk/jobs?q={}&l={}'
    url = template.format(position,location)
    return url

In [4]:
url = get_url('junior developer', 'London' )

### Extract raw html

In [5]:
response = requests.get(url)

In [6]:
response

<Response [200]>

In [7]:
response.reason

'OK'

In [8]:
soup = BeautifulSoup(response.text, "html.parser")

In [9]:
cards = soup.find_all("div", "jobsearch-SerpJobCard")

In [10]:
len(cards)

15

### Prototype the model with a single record 

In [11]:
card = cards[0]

In [12]:
atag = card.h2.a

In [13]:
job_title = atag.get('title')

In [14]:
job_url = 'https://www.indeed.co.uk'+ atag.get('href')

In [15]:
company = card.find('span', 'company').text.strip()

In [16]:
job_location = card.find('div', 'recJobLoc').get('data-rc-loc')

In [17]:
Job_summary = card.find('div', 'summary').text.strip()

In [18]:
post_date = card.find('span', 'date').text

In [19]:
todays_date = datetime.today().strftime('%d-%m-%Y')

In [20]:
job_salary = card.find('span', 'salaryText')

In [21]:
try: 
    job_salary = card.find('span', 'salaryText').text.strip()
except AttributeError:
    job_salary = ''

### Generalise the model with a function 

In [30]:
def get_record(card):
    """Extract job data from a single record"""
    atag = card.h2.a
    job_title = atag.get('title')
    job_url = 'https://www.indeed.co.uk' + atag.get('href')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    post_date = card.find('span', 'date').text
    todays_date = datetime.today().strftime('%d-%m-%Y')
    job_salary = card.find('span', 'salaryText')
    try:
        job_salary = card.find('span', 'salaryText')
    except AttributeError:
        job_salary = ''
        
    record = (job_title, company, job_location, post_date, todays_date, job_summary, job_salary, job_url)
    
    return record

In [31]:
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [32]:
records[0]

('Graduate Developer',
 'Solirius',
 'London',
 '30+ days ago',
 '11-01-2021',
 'After training, working as a Java, Python, Ruby or C# developer on one of our client projects.\nAn independent software and technology consultancy who have…',
 None,
 'https://www.indeed.co.uk/rc/clk?jk=027a189440bb5eee&fccid=c44218d8c2035787&vjs=3')

### Getting the next page of results

In [36]:
while True:
    try:
        url = 'https://www.indeed.co.uk' + soup.find('a', {'aria-label': 'Next'}).get('href')
    except AttributeError:
        break
        
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    
    for card in cards: 
        record = get_record(card)
        records.append(record)

### Combining everything 

In [37]:
import csv 
from datetime import datetime 
import requests 
from bs4 import BeautifulSoup

def get_url(position, location):
    """Generate a url from position and location"""
    template = 'https://www.indeed.co.uk/jobs?q={}&l={}'
    url = template.format(position,location)
    return url

def get_record(card):
    """Extract job data from a single record"""
    atag = card.h2.a
    job_title = atag.get('title')
    job_url = 'https://www.indeed.co.uk' + atag.get('href')
    company = card.find('span', 'company').text.strip()
    job_location = card.find('div', 'recJobLoc').get('data-rc-loc')
    job_summary = card.find('div', 'summary').text.strip()
    post_date = card.find('span', 'date').text
    todays_date = datetime.today().strftime('%d-%m-%Y')
    job_salary = card.find('span', 'salaryText')
    try:
        job_salary = card.find('span', 'salaryText')
    except AttributeError:
        job_salary = ''
        
    record = (job_title, company, job_location, post_date, todays_date, job_summary, job_salary, job_url)
    
    return record

def main(position, location):
    """Run the main program routine"""
    records = []
    url = get_url(position, location)
    
    while True:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.find_all('div', 'jobsearch-SerpJobCard')
    
        for card in cards: 
            record = get_record(card)
            records.append(record)
            
        try:
            url = 'https://www.indeed.co.uk' + soup.find('a', {'aria-label': 'Next'}).get('href')
        except AttributeError:
            break
    
    # Save job data
    with open('results_10_01.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['JobTitle', 'Company', 'Location', 'PostDate', ' ExtractDate', 'Summary', 'Salary', 'JobUrl'])
        writer.writerows(records)

In [38]:
# run the main program

main('junior developer', 'London')