In [162]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import random
import time

In [73]:
URL = "https://www.seek.com.au/data-%2B-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW"
# Download the search-result page. The explicit timeout (seconds) stops a
# stalled connection from hanging the notebook indefinitely.
page = requests.get(URL, timeout=30)
# Parse the returned HTML so the anchors can be queried below.
soup = BeautifulSoup(page.text, 'html.parser')

The purpose of this exercise is to learn what job listings are asking for and to identify the listings that are interesting and worth applying for.
The first step is to extract the links from a simple search, so that we can look at them and compare them to the relevant links on the page when it is viewed in a browser.

In [44]:
# Gather the href of every anchor on the page, in document order.
# Anchors that carry no href attribute contribute None.
links = [anchor.get('href') for anchor in soup.find_all('a')]

This website has a lot of unnecessary information.
What we are actually looking for here are the links containing /job/.
We then need to concatenate seek.com.au with the link extension.
The easiest way to do this is with a regex and list comprehensions.

In [72]:
# Keep only the hrefs that point at a job listing. Anchors without an href
# come back as None, and re.search raises TypeError on None, so skip them.
job_links = [
    link for link in links
    if link is not None and re.search("job/", link) is not None
]
# The hrefs are site-relative, so prepend the host name.
job_links = ["seek.com.au" + link for link in job_links]
print(job_links)


['seek.com.au/job/38416533?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38392994?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38349302?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38419631?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38408991?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38371276?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38325029?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38414985?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38396882?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38331270?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38

Next:
 - we will want to use pagination to get all of our links
 - we need to extract the useful information of each job listing

In [177]:
#assuming that the structure of the next page is always the same we can find the link we want by:
#1 - find all the links with "page" in the structure
#2 - selecting the last one (Next)
# this may cause an issue on the last link so we should keep track of the link numbers so that we don't end up in some kind of loop

#should probably define a scraper class but still learning OOP

def extract_all_links(URL):
    """Download a seek page and return the target of every link found on it.

    Parameters
    ----------
    URL : str
        Address of the page to download.

    Returns
    -------
    list of str
        The ``href`` value of each ``<a>`` tag that has one, in document order.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or exceeds the timeout.
    """
    # An explicit timeout (seconds) stops a stalled connection from blocking
    # the scraping loop forever.
    page = requests.get(URL, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    # href=True restricts the search to anchors that actually have an href;
    # without it, href-less anchors would add None entries that break the
    # regex filters applied to this list downstream.
    all_links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    return all_links

def extract_job_links(all_links):
    """Return absolute URLs for the job listings among ``all_links``.

    Parameters
    ----------
    all_links : list
        Site-relative hrefs taken from a search-result page. ``None`` entries
        (anchors that had no href) are ignored.

    Returns
    -------
    list of str
        ``"http://seek.com.au"`` + href for every href containing ``"job/"``,
        in the original order.
    """
    # re.search raises TypeError when given None, so filter those out first.
    job_paths = [
        link for link in all_links
        if link is not None and re.search("job/", link) is not None
    ]
    return ["http://seek.com.au" + path for path in job_paths]

def extract_page_link(all_links):
    """Return the absolute URL of the last pagination link in ``all_links``.

    The last link containing ``"page"`` is taken to be the one pointing at
    the next page of results.

    Parameters
    ----------
    all_links : list
        Site-relative hrefs taken from a search-result page. ``None`` entries
        (anchors that had no href) are ignored.

    Returns
    -------
    str
        ``"http://seek.com.au"`` + the last href containing ``"page"``.

    Raises
    ------
    IndexError
        If no link contains ``"page"`` (e.g. a single-page result).
    """
    # re.search raises TypeError when given None, so filter those out first.
    page_links = [
        link for link in all_links
        if link is not None and re.search("page", link) is not None
    ]
    if not page_links:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError("no pagination links found")
    return "http://seek.com.au" + page_links[-1]

def extract_page_number(next_link):
    """Return the page number embedded in a pagination link.

    Parameters
    ----------
    next_link : str
        A URL containing a ``page=<digits>`` query parameter.

    Returns
    -------
    str
        The digits following ``page=`` (kept as a string).

    Raises
    ------
    ValueError
        If ``next_link`` contains no ``page=<digits>`` component.
    """
    # Match digits only: a greedy ".*" would also swallow any query
    # parameters that follow the page number (e.g. "2&sortmode=...").
    match = re.search(r"page=(\d+)", next_link)
    if match is None:
        raise ValueError("no page number found in link: %r" % (next_link,))
    return match.group(1)

def extract_useful_information(all_links):
    """Split the links of one search-result page into jobs and pagination.

    Parameters
    ----------
    all_links : list
        Hrefs already extracted from the page (see ``extract_all_links``).

    Returns
    -------
    tuple
        ``(job_links, next_link, page_number)``: the absolute job-listing
        URLs, the absolute URL of the last pagination link, and the page
        number found in that link.
    """
    # Work on the links that were passed in. Re-downloading the page from the
    # global URL here would ignore the argument and double the number of
    # requests sent to the server.
    job_links = extract_job_links(all_links)
    next_link = extract_page_link(all_links)
    page_number = extract_page_number(next_link)
    return job_links, next_link, page_number
    

# Accumulators shared with later cells: the page numbers queued so far and
# every job-listing URL collected.
pages_visited = []
jobs = []
URL = "https://www.seek.com.au/data-%2B-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW"

while True:
    # Pull every link off the current results page.
    all_links = extract_all_links(URL)
    # Separate out this page's job listings, the next-page link and its number.
    page_jobs, next_url, next_page = extract_useful_information(all_links)
    jobs.extend(page_jobs)

    # Seeing a page number we have already queued means there is nothing new
    # to follow, so the crawl is finished.
    if next_page in pages_visited:
        print("scraping complete")
        break

    # Otherwise record the page, show progress, and move on to it.
    pages_visited.append(next_page)
    print(pages_visited)
    URL = next_url
    print(URL)
    # Pause 1-10 seconds (chosen at random) to avoid upsetting the server.
    time.sleep(random.randint(1,10))

['2']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=2
['2', '3']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=3
['2', '3', '4']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=4
['2', '3', '4', '5']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=5
['2', '3', '4', '5', '6']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=6
['2', '3', '4', '5', '6', '7']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=7
['2', '3', '4', '5', '6', '7', '8']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=8
['2', '3', '4', '5', '6', '7', '8', '9']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Easter

In [184]:
# Next we will want to look at the actual job listings themselves.
# Use the first collected job URL as the example; note that this rebinds the
# global URL that the scraping loop also assigns to.
URL = jobs[0]
URL

'http://seek.com.au/job/38416533?type=standard&searchrequesttoken=985ab00d-1e8b-41cf-8b11-c6d78fceb298'

In [181]:
# Download whatever page URL currently points at. The explicit timeout
# (seconds) stops a stalled connection from hanging the notebook indefinitely.
page = requests.get(URL, timeout=30)
# Parse the returned HTML so its elements can be inspected.
soup = BeautifulSoup(page.text, 'html.parser')

In [186]:
# Show the text of the <title> element of the page parsed into `soup`.
soup.title.text

'Financial Crime Analysts - SAS, SQL, R or Python Job in Sydney - SEEK'