# Australia 'Data' Salary Web Scrapping

Data Collection will be any jobs with the world data inside

Each of these groups will be further divied into different groups based on pay

| Pay Group | From ('000) | To ('000)|
| --- | --- | --- |
| Sixty | 0 | 60 |
| Eight | 60 | 80 |
| Hundred | 80 | 100 |
| One Hundred Twenty | 100 | 120 |
| One Hundred Fifty | 120 | 150 |
| Two Hundred | 150 | 200 |
| Two Hundred Fifty | 200 | 250+ |



Import python related libraries for data processing and web scraping

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
from bs4 import BeautifulSoup
import requests
import re

# import sleep
from time import sleep

In [3]:
baseurl = "https://www.seek.com.au/"

In [4]:
def getting_data(url):
    ###
    ###  This function send request to the web server
    ###  Get the response back, Check Status Code
    ###  Grab HTML and turns it to Beautiful Soup
    ###
    
    #send request to the url given
    response = requests.get(url)
    
    if response.status_code == 200:
        #received the data basck
        html = response.text
    
        #parse HTML to Beautiful Soup
        soup = BeautifulSoup(html, 'lxml')
    
        return soup
    
    else:
        return None

In [5]:
def get_job_summary(link):
    
    ###
    ###  This function get the Link to the Job Ad
    ###  Get the response back and Extract Job Summary
    ###  Turns it to Beautiful Soup and Extract Text
    ###
    
    # open that page and get the job summary
    url = baseurl + link
    soup = getting_data(url)
    
    try:
        jobDescription = soup.find('div', {'data-automation':'mobileTemplate'})
        
        return jobDescription.get_text()
        
    except:
        
        return None

In [6]:
def get_job_details(job_article):
    
    ###
    ###  Each job advertised on Seek on the list displayed is called Article
    ###  This function receive one article at a time to extract job related info
    ###  Then return it as a list of data
    ###
    
    
    try:
        #send one job article at a time to extract each job data
        job_ad = job_article.find('a', {'data-automation':'jobTitle'})
        title = job_ad.text
        link = job_ad['href']
        #print ( 'title:', title)
        #print ( 'link:', link)

        job_ad = job_article.find('span', {'data-automation':'jobSalary'})
        if (job_ad is None):
            salary = 'NA'
        else:
            salary = job_ad.text
        #print ('salary:', salary)

        job_ad = job_article.find('a', {'data-automation':'jobCompany'})
        if (job_ad is None):
            company = 'NA'
        else:
            company = job_ad.text
        #print ('company:', company)

        job_ad = job_article.find('a', {'data-automation':'jobLocation'})
        if (job_ad is None):
            location = 'NA'
        else:
            location = job_ad.text
        #print ('location:', location)


        job_ad = job_article.find('a', {'data-automation':'jobClassification'})
        if (job_ad is None):
            classification = 'NA'
        else:
            classification = job_ad.text
        #print ('classification:', classification)
        
        
        jobsummary = get_job_summary(link)
        
        return [title, link, salary, location, company, classification, jobsummary]

    except:
        
        return None


In [7]:
def get_job_ads_no(jobSearch):
    
    job_listings = []
    
    
    #send the html request for this job search
    html = getting_data(jobSearch + '1')
    
    # get no of toal jobs found
    jobcounts = html.find('strong',{'data-automation':'totalJobsCount'})
    
    # extract the jobs no
    jobsfound = int(jobcounts.text.replace(",", ""))

    #calculate how many pages need to collect the job listing
    pages =  int(jobsfound / 20) + 1
    
    print (jobsfound, pages)
    
    
    return pages
    

In [8]:
def get_job_articles(jobSearch, startPage, endPage):
    
    articleList = []
    # getting job listing for each page
    # each job is an article
    for p in np.arange(startPage,endPage):

        pageUrl = jobSearch + str(p + 1)
        print (pageUrl)
        
        # wait one second before getting a page
        sleep(1)
        html = getting_data(pageUrl)
        
        # find each job listing
        articleListPreminum = html.find_all('article',{'data-automation':'premiumJob'})
        articleList.extend(articleListPreminum)
        
        articleListNormal = html.find_all('article',{'data-automation':'normalJob'})
        articleList.extend(articleListNormal)
    
    return articleList
    

In [9]:
def get_df_Article(articleList):

    ###
    ###  This function recieved list of articles, then split the article list into half
    ###  Then feed an article at a time into another function to extract job info
    ###  Once processed return dataframes
    ###
    # ------------------------------------------------
    # Process the articles
    # ------------------------------------------------   
    job_postings = []
    
    for i in range(len(articleList)):
        new_postings = get_job_details(articleList[i])
        
        if new_postings is not None:
            job_postings.append(new_postings)    
    
    #print (len(job_postings))
    
    # now create a dataframe frome the data
    df = pd.DataFrame(job_postings, columns=['title', 'link', 'salary', 'location', 'company', 'classification', 'summary'])
    
    print ('finish process article')
    
    
    return df

In [10]:
### 
### Define variables related to the URL for web scraping on SEEK
### Each sets are group based on Salary and to make it scrape in smaller chunks
###
baseurl = "https://www.seek.com.au/"

sixty_suffix = '?salaryrange=0-60000&salarytype=annual&page='
eighty_suffix = '?salaryrange=60000-80000&salarytype=annual&page='
hundred_suffix = '?salaryrange=80000-100000&salarytype=annual&page='
onetwenty_suffix = '?salaryrange=100000-120000&salarytype=annual&page='
onefifty_suffix = '?salaryrange=120000-150000&salarytype=annual&page='
twohundred_suffix = '?salaryrange=150000-200000&salarytype=annual&page='
twofifty_suffix = '?salaryrange=200000-999999&salarytype=annual&page='

dataUrl = baseurl + "data-jobs/"

dataSixty = dataUrl + sixty_suffix
dataEighty = dataUrl + eighty_suffix
dataHundred = dataUrl + hundred_suffix
dataOnetwenty = dataUrl + onetwenty_suffix
dataOnefifty = dataUrl + onefifty_suffix
dataTwohundred = dataUrl + twohundred_suffix
dataTwofifty = dataUrl + twofifty_suffix


### Getting 0 - 60K data related jobs posting info

In [None]:
# For those salary between 0 - 60K

# get job ads numbers
pages = get_job_ads_no(dataSixty)

# get articles in chucks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataSixty, startPage, endPage)
    
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        #break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))
            

5872 294
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=11
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=12
https

https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=100
finish process article
2010
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=103
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=104
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=105
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=106
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=107
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=108
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=109
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=110
finish process article
2218
https://www.seek.com.au/data-jobs/?salaryr

https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=197
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=198
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=199
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=200
finish process article
4020
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=201
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=202
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=203
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=204
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=205
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=206
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=207
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annu

https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=296
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=297
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=298
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=299
https://www.seek.com.au/data-jobs/?salaryrange=0-60000&salarytype=annual&page=300
List is empty


In [None]:
df.to_csv('./datasets/dataSixty.csv')
df.shape

### Getting 60 - 80K data related jobs posting info

In [11]:
# For those salary between 60K - 80K

# get job ads numbers
pages = get_job_ads_no(dataEighty)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataEighty, startPage, endPage)
    
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))
    

6115 306
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=11
https://www.seek.com.au/data-jobs/?salaryr

https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=95
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=96
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=97
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=98
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=99
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=100
finish process article
2028
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=103
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=104
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=105
https://www.seek.com.au/data-jo

https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=188
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=189
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=190
finish process article
3854
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=191
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=192
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=193
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=194
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=195
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=196
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=197
https://www.seek.com.au/data-jobs/?salaryrange=60000-80000&salarytype=annual&page=198
https://www.seek.com.au/da

In [12]:
df.to_csv('./datasets/dataEighty.csv')
df.shape

(4056, 7)

### Getting 80K - 100K data related jobs posting info

In [13]:
# For those salary between 80K - 100K

# get job ads numbers
pages = get_job_ads_no(dataHundred)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataHundred, startPage, endPage)
    
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))
            

4991 250
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=11
https://www.seek.com.au/data-jo

https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=94
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=95
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=96
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=97
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=98
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=99
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=100
finish process article
2018
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=103
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=104
https://www.seek.com.

https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=186
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=187
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=188
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=189
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=190
finish process article
3834
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=191
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=192
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=193
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=194
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=195
https://www.seek.com.au/data-jobs/?salaryrange=80000-100000&salarytype=annual&page=196
https://www.see

In [14]:
df.to_csv('./datasets/dataHundred.csv')
df.shape

(4036, 7)

### Getting 100K - 120K data related jobs posting info

In [15]:
# For those salary between 100K - 120K

# get job ads numbers
pages = get_job_ads_no(dataOnetwenty)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataOnetwenty, startPage, endPage)
    
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))


4611 231
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=11
https://www.seek.com

https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=93
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=94
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=95
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=96
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=97
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=98
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=99
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=100
finish process article
2015
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=103
https://www

https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=184
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=185
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=186
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=187
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=188
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=189
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=190
finish process article
3828
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=191
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=192
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=193
https://www.seek.com.au/data-jobs/?salaryrange=100000-120000&salarytype=annual&page=194
http

In [16]:
df.to_csv('./datasets/dataOnetwenty.csv')
df.shape

(4030, 7)

### Getting 120K - 150K data related jobs posting info

In [17]:
# For those salary between 120K - 150K

# get job ads numbers
pages = get_job_ads_no(dataOnefifty)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataOnefifty, startPage, endPage)
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))
            

3705 186
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=11
https://www.seek.com

https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=93
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=94
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=95
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=96
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=97
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=98
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=99
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=100
finish process article
2007
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=103
https://www

https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=184
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=185
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=186
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=187
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=188
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=189
https://www.seek.com.au/data-jobs/?salaryrange=120000-150000&salarytype=annual&page=190
finish process article
3712


In [18]:
df.to_csv('./datasets/dataOnefifty.csv')
df.shape

(3712, 7)

### Getting 150K - 200K data related jobs posting info

In [19]:
# For those salary between 150K - 200K

# get job ads numbers
pages = get_job_ads_no(dataTwohundred)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataTwohundred, startPage, endPage)
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))

2652 133
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=11
https://www.seek.com

https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=93
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=94
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=95
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=96
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=97
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=98
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=99
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=100
finish process article
2006
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=101
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=102
https://www.seek.com.au/data-jobs/?salaryrange=150000-200000&salarytype=annual&page=103
https://www

In [20]:
df.to_csv('./datasets/dataTwohundred.csv')
df.shape

(2676, 7)

### Getting 200K - 250K and above data related jobs posting info

In [21]:
# For those salary between 200K - 250K and Above

# get job ads numbers
pages = get_job_ads_no(dataTwofifty)

# get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    endPage = startPage + 10
    articleList = get_job_articles(dataTwofifty, startPage, endPage)
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))
            

1204 61
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=10
finish process article
https://www.seek.com.au/data-jobs/?salaryrange=200000-999999&salarytype=annual&page=11
https://www.seek.com.

In [22]:
df.to_csv('./datasets/dataTwofifty.csv')
df.shape

(1204, 7)