In [162]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import random
import time

In [73]:
# Search-results page on seek.com.au that the exploration below starts from.
URL = "https://www.seek.com.au/data-%2B-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW"
# Download the page and parse the returned HTML so it can be queried.
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')

The purpose of this exercise is to learn what job listings are asking for and to identify the listings that are interesting and worth applying for.
The first step is to extract the links from a simple search. We should then have a look at them and compare them to the relevant links on the page when viewed in a browser.

In [44]:
# Collect the href attribute of every anchor tag on the parsed page.
links = [anchor.get('href') for anchor in soup.find_all('a')]

This website has a lot of unnecessary information.
What we are actually looking for here are the links containing /job/.
We then need to concatenate seek.com.au with the link extension.
The easiest way is with some regex and list comprehensions.

In [72]:
# Keep only the hrefs that contain "job/" and prefix each one with the site host.
job_links = [
    "seek.com.au" + href
    for href in links
    if re.search("job/", href) is not None
]
print(job_links)


['seek.com.au/job/38416533?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38392994?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38349302?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38419631?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38408991?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38371276?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38325029?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38414985?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38396882?type=standout&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38331270?type=standard&searchrequesttoken=6f547c9b-fd57-40fd-8772-ce0be77df16e', 'seek.com.au/job/38

Next:
 - we will want to use pagination to get all of our links
 - we need to extract the useful information of each job listing

In [177]:
#assuming that the structure of the next page is always the same we can find the link we want by:
#1 - find all the links with "page" in the structure
#2 - selecting the last one (Next)
# this may cause an issue on the last link so we should keep track of the link numbers so that we don't end up in some kind of loop

#should probably define a scraper class but still learning OOP

def extract_all_links(URL, timeout=30):
    """Download a seek URL and return the href of every anchor tag on the page.

    Args:
        URL: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The list 'all_links', one entry per ``<a>`` tag in document order.
        Anchors without an ``href`` attribute contribute ``None``.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    page = requests.get(URL, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html.parser')
    all_links = [anchor.get('href') for anchor in soup.find_all('a')]
    return all_links

def extract_job_links(all_links):
    """Return absolute URLs for the job listings found among a page's links.

    Args:
        all_links: iterable of href values taken from a search result page.
            Entries may be ``None`` (anchors without an href); these are
            skipped instead of crashing the regex search.

    Returns:
        The list 'job_links': every href containing "job/", prefixed with
        "http://seek.com.au", in the original order.
    """
    job_links = [
        "http://seek.com.au" + link
        for link in all_links
        # re.search raises TypeError on None, so filter those out first
        if link is not None and re.search("job/", link) is not None
    ]
    return job_links

def extract_page_link(all_links):
    """Return the absolute URL of the last pagination link on a results page.

    All hrefs containing "page" are collected and the last one is taken to be
    the link pointing at the 'next page'.

    Args:
        all_links: iterable of href values taken from a search result page.
            ``None`` entries (anchors without an href) are ignored.

    Returns:
        'next_link': the last matching href prefixed with "http://seek.com.au".

    Raises:
        IndexError: if none of the links contains "page".
    """
    page_links = [
        link
        for link in all_links
        # re.search raises TypeError on None, so filter those out first
        if link is not None and re.search("page", link) is not None
    ]
    if not page_links:
        raise IndexError("no pagination links found in all_links")
    next_link = "http://seek.com.au" + page_links[-1]
    return next_link

def extract_page_number(next_link):
    """Return the page number embedded in a pagination link, as a string.

    Only the digits directly after "page=" are captured, so any further query
    parameters following the page number are not included in the result.

    Args:
        next_link: URL containing a "page=<number>" query parameter.

    Returns:
        'next_page_number': the digits of the page number, e.g. "2".

    Raises:
        ValueError: if the link contains no "page=<number>" parameter.
    """
    match = re.search(r"page=(\d+)", next_link)
    if match is None:
        raise ValueError(f"no page number found in link: {next_link!r}")
    next_page_number = match.group(1)
    return next_page_number

def extract_useful_information(all_links):
    """Split the links of a seek search result page into the parts we care about.

    The links passed in are used directly; the page is not downloaded again.

    Args:
        all_links: list of href values from one search result page, as
            returned by extract_all_links.

    Returns:
        A tuple (job_links, next_link, page_number): the job listing URLs on
        the page, the URL of the next results page, and that page's number.
    """
    job_links = extract_job_links(all_links)
    next_link = extract_page_link(all_links)
    page_number = extract_page_number(next_link)
    return job_links, next_link, page_number
    

pages_visited = []
jobs = []
URL = "https://www.seek.com.au/data-%2B-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW"

while True:
    # Pull every link off the current results page.
    all_links = extract_all_links(URL)
    # Reduce them to the job listings, the next-page link and its page number.
    jl, nl, pn = extract_useful_information(all_links)
    # Record this page's job listings.
    jobs.extend(jl)

    # A page number we have already queued is taken as the signal that there
    # are no new pages left, so stop here.
    if pn in pages_visited:
        print("scraping complete")
        break

    # Otherwise remember the page number, show progress and move on to that page.
    pages_visited.append(pn)
    print(pages_visited)
    URL = nl
    # Show the URL that will be fetched next.
    print(URL)
    # Sleep for a random amount of time to avoid upsetting the server.
    time.sleep(random.randint(1,10))

['2']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=2
['2', '3']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=3
['2', '3', '4']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=4
['2', '3', '4', '5']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=5
['2', '3', '4', '5', '6']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=6
['2', '3', '4', '5', '6', '7']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=7
['2', '3', '4', '5', '6', '7', '8']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Eastern-Suburbs-Sydney-NSW?page=8
['2', '3', '4', '5', '6', '7', '8', '9']
http://seek.com.au/data-+-R-or-Python-jobs/in-Sydney-CBD,-Inner-West-&-Easter

In [184]:
#next we will want to look at the actual job listings themselves
URL = jobs[0]
URL

'http://seek.com.au/job/38416533?type=standard&searchrequesttoken=985ab00d-1e8b-41cf-8b11-c6d78fceb298'

In [181]:
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'html.parser')

In [186]:
soup.title.text

'Financial Crime Analysts - SAS, SQL, R or Python Job in Sydney - SEEK'

In [198]:
soup.find_all("h1")

[<h1>Financial Crime Analysts - SAS, SQL, R or Python</h1>,
 <h1 id="jobDescription">Job Description</h1>,
 <h1>Financial Crime Analysts - SAS, SQL, R or Python</h1>,
 <h1 id="jobInfoHeader">Job Information</h1>,
 <h1 class="jobtitle">Financial Crime Analysts - SAS, SQL, R or Python </h1>,
 <h1 id="jobApplyHeader">Job Apply</h1>,
 <h1 id="jobInfoHeader">Job Information</h1>]

In [202]:
soup.find_all("p")

[<p>A number of opportunities exist for Financial Crime Analysts within a leading financial institution in Sydney.</p>,
 <p>In this role you will be involved in collecting, analysing and modelling data which relates to financial crime, information security &amp; security risks. This includes involvement in investigating incidents through data mining and analysis, researching
           financial crime, information security and security related trends and emerging issues, and identifying triggers to alert suspicious behaviour. </p>,
 <p>Duties would include;</p>,
 <p>Requirements: </p>,
 <p>Please apply online or for more information please contact Alex Slocombe on <a class="_2ZKuACf" data-contact-match="true" href="tel:02 9270 2602">02 9270 2602</a>.</p>,
 <p>Only candidates with valid Australian work rights will be considered.</p>,
 <p style="text-align:center;">IMPORTANT: By submitting your email address and any other personal information when you APPLY to a job, you consent to such 

In [203]:
soup.find_all("ul")

[<ul class="_2Q_vgSl"><li class="_2jJSFKy _2njvnpA"><a class="_3mf0vNI" href="/" target="_self" url="https://www.seek.com.au/job/38416533?type=standard&amp;searchrequesttoken=985ab00d-1e8b-41cf-8b11-c6d78fceb298"><span>Job Search</span><span class="_1_mILXg _3sCZNzW _2PNGBpo"><svg class="_1BytwqT _3aMWH1P" focusable="false" height="16" viewbox="0 0 1024 1024" width="16" xmlns="http://www.w3.org/2000/svg"><path d="M807 631q25-45 38.5-96.5T859 428q0-89-34-167.5T733 124 596.5 32 430-2q-89 0-167.5 34T126 124 34 260.5 0 428t34 167 92 136 136.5 92T430 857q55 0 106.5-13.5T633 805q7-3 35-20.5t33-20.5l257 225 66-66-257-225 19.5-32 20.5-35zM66 428q0-76 28.5-142t78-115T288 93t142-29q75 0 141 29t115.5 78 78 115T793 428q0 75-28.5 141t-78 115.5-115.5 78T430 791q-76 0-142-28.5t-115.5-78-78-115.5T66 428z"></path></svg></span></a></li><li class=""><a class="_3mf0vNI" href="/profile/" target="_self" url="https://www.seek.com.au/job/38416533?type=standard&amp;searchrequesttoken=985ab00d-1e8b-41cf-8b11-c6

In [205]:
soup.body.h1

<h1>Financial Crime Analysts - SAS, SQL, R or Python</h1>

In [219]:
soup.body

<body>
<div id="app"><div class="_3xZ0K-X"><div class="Eadjc1o"><h1>Financial Crime Analysts - SAS, SQL, R or Python</h1></div><div><a class="_17vdsLW" href="#start-of-content">Skip to content</a><div class="_2zT6XU7 _3gx5K5X _1XypYuB v6HVT-3 _2njvnpA"><div class="_3EP6gLk"><div class="Pdwn1mb"><svg class="vwOmdmr" height="60" viewbox="0 0 60 60" width="60"><defs><path d="M0 4.007C0 1.794 1.796 0 4.006 0h51.988C58.204 0 60 1.796 60 4.007v51.986C60 58.206 58.203 60 55.994 60H4.006C1.794 60 0 58.204 0 55.993V4.007z" id="a"></path></defs><clippath id="b"><use overflow="visible" xlink:href="#a"></use></clippath><g clip-path="url(#b)"><defs><path d="M-250-105h720v305h-720z" id="c"></path></defs><clippath id="d"><use overflow="visible" xlink:href="#c"></use></clippath><path clip-path="url(#d)" d="M-5-5h70v70H-5z" fill="#E40C79"></path></g><g><defs><circle cx="30" cy="30" id="e" r="22.5"></circle></defs><clippath id="f"><use overflow="visible" xlink:href="#e"></use></clippath><g clip-path="ur

We can make this a little more readable, and when we do we find that a lot of the useful information is in the div <div class="job-template__wrapper">. This is the box with the job description and other information. There are other divs that we may want to extract data from, but for now we will focus on this one.

In [237]:
# Print the first CSS class of every div that carries a class attribute.
for div in soup.find_all("div"):
    if not div.has_attr("class"):
        continue
    print(div["class"][0])

_3xZ0K-X
Eadjc1o
_2zT6XU7
_3EP6gLk
Pdwn1mb
Pdwn1mb
_10UF3wx
_3xzTyTV
_3vSDecy

_3vAiMK3
_3q1w4kH
_1zpmjC8
_1OrVNW3
_3d7SR1c
_3d7SR1c
_3d7SR1c
_3d7SR1c
_3QLrnJD
_1br_n0E
_2Pz6NLx
_2Uivm2d
BAYwSAl
YcmbIzn
_3unTpv-

_2NO2gPd

BAYwSAl
_3ck-hby

BAYwSAl
_2NBcXnG
EzWN_sJ
_3394qVu
_3qq-KJp
JyFVSRZ
Pdwn1mb
JyFVSRZ
_2njvnpA
JyFVSRZ
Pdwn1mb
_28sXRcp
job-template__wrapper
tempborder
templogo
temptoptext
tempmargin
templatetext
details
tempweb
_2e4Pi2B
JyFVSRZ
_1pia1SL
_1pia1SL
_1MsES6s
JyFVSRZ
_2Rte-wV
JyFVSRZ
_3SVJ3E7
_2XvHhDP

_34LLxI9
JyFVSRZ
Pdwn1mb
_33vNfUX
dSewnZr
dSewnZr
K1Fdmkw
Pdwn1mb

_2_weaBw
_1-21IxK
_2oexuIf
_2oL63Bg
_2bUAXYO
_2zNkD9I
_2EwTw6Q
_22fl1wY
A6VFv1T


In [238]:
relevant = "job-template__wrapper"

# NOTE(review): `relevant` is assigned but not used by the loop below, which
# prints the first class of every div that has a class attribute.
for tag in filter(lambda t: t.has_attr("class"), soup.find_all("div")):
    print(tag.attrs["class"][0])

In [254]:
# find_all returns a result set (a list-like collection of matching tags); we only want the 0th one.
# NOTE(review): a plain string passed as `attrs` appears to be matched against the CSS class — confirm against the Beautiful Soup docs.
interesting_section = soup.find_all(attrs="job-template__wrapper")[0]

In [257]:
#that looks about right. We should probably extract some of the key information from this and create a dict, we can then make a list of dicts (one for each job listing we scraped earlier and then join them together into a dataframe for analysis)
interesting_section.text

'\n\n\nData Analytics Recruitment Solutions\n\nFinancial Crime Analysts - SAS, SQL, R or Python \n\nLeading Australian Bank \nCBD location \nInnovative and forward thinking organisation \n\n\nA number of opportunities exist for Financial Crime Analysts within a leading financial institution in Sydney.\nIn this role you will be involved in collecting, analysing and modelling data which relates to financial crime, information security & security risks. This includes involvement in investigating incidents through data mining and analysis, researching\n          financial crime, information security and security related trends and emerging issues, and identifying triggers to alert suspicious behaviour. \nDuties would include;\n\nIdentify patterns & characteristics within data to drive the development of fraud mitigation strategies\nOptimisation and development of new detection scenarios\nUtilise statistical modelling/machine learning to drive predictive capability\nWork with key business p

In [265]:
other_section = re.search('window.SK_DL = (.*);',soup.text).group(0)

In [266]:
other_section

'window.SK_DL = {"country":"au","isLoggedIn":false,"experiments":[],"jobId":38416533,"jobTitle":"Financial Crime Analysts - SAS, SQL, R or Python","jobListingDate":"2019-02-22T02:57:44.000Z","jobClassification":"Banking & Financial Services","jobClassificationId":1203,"jobSubClassification":"Analysis & Reporting","jobSubClassificationId":6175,"advertiserName":"Bluefin Resources Pty Limited","advertiserId":27705684,"jobIsLinkOut":false,"jobHasScreen":false,"jobHasRoleRequirements":false,"contactMatches":["Phone"],"jobLocation":"Sydney","jobArea":"CBD, Inner West & Eastern Suburbs","appBanner":{"type":"no-impression","deviceType":"Other"}};'

In [269]:
output_dict = {'main_text': interesting_section, 'meta_data': other_section}

In [285]:
all_listings = []

# Visit every job listing collected earlier and keep its main section plus
# the embedded 'window.SK_DL' metadata string.
for i, job in enumerate(jobs):
    page = requests.get(job)
    soup = BeautifulSoup(page.text, 'html.parser')
    try:
        interesting_section = soup.find_all(attrs="job-template__wrapper")[0]
    except IndexError:
        # No matching element on this page: fall back to the whole page text.
        interesting_section = soup.text
    try:
        other_section = re.search('window.SK_DL = (.*);',soup.text).group(0)
    except AttributeError:
        # re.search returned None (pattern not found): fall back to the page text.
        other_section = soup.text
    output_dict = {'url': job,'main_text': interesting_section, 'meta_data': other_section}
    all_listings.append(output_dict)
    print(f"job number {i} complete")
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.randint(1,5))


job number 0 complete
job number 1 complete
job number 2 complete
job number 3 complete
job number 4 complete
job number 5 complete
job number 6 complete
job number 7 complete
job number 8 complete
job number 9 complete
job number 10 complete
job number 11 complete
job number 12 complete
job number 13 complete
job number 14 complete
job number 15 complete
job number 16 complete
job number 17 complete
job number 18 complete
job number 19 complete
job number 20 complete
job number 21 complete
job number 22 complete
job number 23 complete
job number 24 complete
job number 25 complete
job number 26 complete
job number 27 complete
job number 28 complete
job number 29 complete
job number 30 complete
job number 31 complete
job number 32 complete
job number 33 complete
job number 34 complete
job number 35 complete
job number 36 complete
job number 37 complete
job number 38 complete
job number 39 complete
job number 40 complete
job number 41 complete
job number 42 complete
job number 43 complet

job number 346 complete
job number 347 complete
job number 348 complete
job number 349 complete
job number 350 complete
job number 351 complete
job number 352 complete
job number 353 complete
job number 354 complete
job number 355 complete
job number 356 complete
job number 357 complete
job number 358 complete
job number 359 complete
job number 360 complete
job number 361 complete
job number 362 complete
job number 363 complete
job number 364 complete
job number 365 complete
job number 366 complete
job number 367 complete
job number 368 complete
job number 369 complete
job number 370 complete
job number 371 complete
job number 372 complete
job number 373 complete
job number 374 complete
job number 375 complete
job number 376 complete
job number 377 complete
job number 378 complete
job number 379 complete
job number 380 complete
job number 381 complete
job number 382 complete
job number 383 complete
job number 384 complete
job number 385 complete
job number 386 complete
job number 387 c

In [288]:
df = pd.DataFrame(all_listings)

In [292]:
df.to_csv("listings_df.csv")