# Australia 'Data' Salary Web Scrapping based on Job Title

Data Collection will be any jobs with the world data inside

Each of these groups will be further divied into different groups based on pay

Job Title and If the pay is less than 100K or higher than 100K
* Data Analyst
* Data Engineering
* Data Science
* Data Scienst

Import python related libraries for data processing and web scraping

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from bs4 import BeautifulSoup
import requests
import re

# import sleep
from time import sleep

In [3]:
baseurl = "https://www.seek.com.au/"

In [4]:
def getting_data(url):
    ###
    ###  This function send request to the web server
    ###  Get the response back, Check Status Code
    ###  Grab HTML and turns it to Beautiful Soup
    ###
    
    #send request to the url given
    response = requests.get(url)
    
    if response.status_code == 200:
        #received the data basck
        html = response.text
    
        #parse HTML to Beautiful Soup
        soup = BeautifulSoup(html, 'lxml')
    
        return soup
    
    else:
        return None

In [5]:
def get_job_summary(link):
    
    ###
    ###  This function get the Link to the Job Ad
    ###  Get the response back and Extract Job Summary
    ###  Turns it to Beautiful Soup and Extract Text
    ###
    
    # open that page and get the job summary
    url = baseurl + link
    soup = getting_data(url)
    
    try:
        jobDescription = soup.find('div', {'data-automation':'mobileTemplate'})
        
        return jobDescription.get_text()
        
    except:
        
        return None

In [6]:
def get_job_details(job_article):
    
    ###
    ###  Each job advertised on Seek on the list displayed is called Article
    ###  This function receive one article at a time to extract job related info
    ###  Then return it as a list of data
    ###
    
    
    try:
        #send one job article at a time to extract each job data
        job_ad = job_article.find('a', {'data-automation':'jobTitle'})
        title = job_ad.text
        link = job_ad['href']
        #print ( 'title:', title)
        #print ( 'link:', link)

        job_ad = job_article.find('span', {'data-automation':'jobSalary'})
        if (job_ad is None):
            salary = 'NA'
        else:
            salary = job_ad.text
        #print ('salary:', salary)

        job_ad = job_article.find('a', {'data-automation':'jobCompany'})
        if (job_ad is None):
            company = 'NA'
        else:
            company = job_ad.text
        #print ('company:', company)

        job_ad = job_article.find('a', {'data-automation':'jobLocation'})
        if (job_ad is None):
            location = 'NA'
        else:
            location = job_ad.text
        #print ('location:', location)


        job_ad = job_article.find('a', {'data-automation':'jobClassification'})
        if (job_ad is None):
            classification = 'NA'
        else:
            classification = job_ad.text
        #print ('classification:', classification)
        
        
        jobsummary = get_job_summary(link)
        
        return [title, link, salary, location, company, classification, jobsummary]

    except:
        
        return None


In [7]:
def get_job_ads_no(jobSearch):
    
    job_listings = []
    
    
    #send the html request for this job search
    html = getting_data(jobSearch + '1')
    
    # get no of toal jobs found
    jobcounts = html.find('strong',{'data-automation':'totalJobsCount'})
    
    # extract the jobs no
    jobsfound = int(jobcounts.text.replace(",", ""))

    #calculate how many pages need to collect the job listing
    pages =  int(jobsfound / 20) + 1
    
    print (jobsfound, pages)
    
    
    return pages
    

In [8]:
def get_job_articles(jobSearch, startPage, endPage):
    
    articleList = []
    # getting job listing for each page
    # each job is an article
    for p in np.arange(startPage,endPage):

        pageUrl = jobSearch + str(p + 1)
        print (pageUrl)
        
        # wait one second before getting a page
        sleep(1)
        html = getting_data(pageUrl)
        
        # find each job listing
        articleListPreminum = html.find_all('article',{'data-automation':'premiumJob'})
        articleList.extend(articleListPreminum)
        
        articleListNormal = html.find_all('article',{'data-automation':'normalJob'})
        articleList.extend(articleListNormal)
    
    return articleList
    

In [9]:
def get_df_Article(articleList):

    ###
    ###  This function recieved list of articles, then split the article list into half
    ###  Then feed an article at a time into another function to extract job info
    ###  Once processed return dataframes
    ###
    # ------------------------------------------------
    # Process the articles
    # ------------------------------------------------   
    job_postings = []
    
    for i in range(len(articleList)):
        new_postings = get_job_details(articleList[i])
        
        if new_postings is not None:
            job_postings.append(new_postings)    
    
    #print (len(job_postings))
    
    # now create a dataframe frome the data
    df = pd.DataFrame(job_postings, columns=['title', 'link', 'salary', 'location', 'company', 'classification', 'summary'])
    
    print ('finish process article')
    
    
    return df

In [10]:
### 
### Define variables related to the URL for web scraping on SEEK
### Each sets are group based on Salary and to make it scrape in smaller chunks
###
baseurl = "https://www.seek.com.au/"

low_suffix = '?salaryrange=0-100000&salarytype=annual&page='
high_suffix = '?salaryrange=100000-999999&salarytype=annual&page='

dataUrl = baseurl + "data-jobs/"

analyst = "data-analyst-jobs"
dataAnalystLow = dataUrl + analyst + low_suffix
dataAnalystHigh  = dataUrl + analyst + high_suffix


engineer = "data-engineer-jobs"
dataEngineerLow = dataUrl + engineer + low_suffix
dataEngineerHigh = dataUrl + engineer + high_suffix

science = "data-science-jobs"
dataScienceLow = dataUrl + science + low_suffix
dataScienceHigh = dataUrl + science + high_suffix


scientist = "data-scientist-jobs"
dataScientistLow = dataUrl + scientist + low_suffix
dataScientistHigh = dataUrl + scientist + high_suffix


In [11]:
def get_jobs_data(url):
    
    # get job ads numbers
    pages = get_job_ads_no(url)
    

    # get articles in chunks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
    for startPage in np.arange(0, pages, 10):

        endPage = startPage + 10
        articleList = get_job_articles(url, startPage, endPage)

        # sometimes Seek cuts off the link - no more articles are returned
        if not articleList:
            print("List is empty")
            break
        else:
            new_df = get_df_Article(articleList)
            if startPage == 0:
                df = new_df
            else:
                df = pd.concat([df, new_df])
                print(len(df))
                
                
    return df


### Getting data analyst jobs posting

In [12]:
df = get_jobs_data(dataAnalystLow)
df.to_csv('./datasets/dataAnalystLow.csv')
df.shape

396 20
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=10
finish proce

(396, 7)

In [13]:
df = get_jobs_data(dataAnalystHigh)
df.to_csv('./datasets/dataAnalystHigh.csv')
df.shape

435 22
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=100000-999999&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=10

(435, 7)

### Getting data engineer jobs posting

In [14]:
df = get_jobs_data(dataEngineerLow )
df.to_csv('./datasets/dataEngineerLow.csv')
df.shape

506 26
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-engineer-jobs?salaryrange=0-100000&salarytype=annual&page=10
fi

(506, 7)

In [15]:
df = get_jobs_data(dataAnalystLow)
df.to_csv('./datasets/dataEngineerHigh.csv')
df.shape

396 20
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-analyst-jobs?salaryrange=0-100000&salarytype=annual&page=10
finish proce

(396, 7)

### Getting data science jobs posting

In [16]:
df = get_jobs_data(dataScienceLow)
df.to_csv('./datasets/dataScienceLow.csv')
df.shape

393 20
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=0-100000&salarytype=annual&page=10
finish proce

(393, 7)

In [17]:
df = get_jobs_data(dataScienceHigh)
df.to_csv('./datasets/dataScienceHigh.csv')
df.shape

335 17
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=100000-999999&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-science-jobs?salaryrange=10

(355, 7)

### Getting data scientist jobs posting

In [18]:
df = get_jobs_data(dataScientistLow)
df.to_csv('./datasets/dataScientistLow.csv')
df.shape

61 4
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=0-100000&salarytype=annual&pa

(61, 7)

In [19]:
df = get_jobs_data(dataScientistHigh)
df.to_csv('./datasets/dataScientistHigh.csv')
df.shape

73 4
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=1
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=2
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=3
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=4
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=5
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=6
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=7
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=8
https://www.seek.com.au/data-jobs/data-scientist-jobs?salaryrange=100000-999999&salarytype=annual&page=9
https://www.seek.com.au/data-jobs/data-scientist-j

(73, 7)