In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
# from scrapy.selector import Selector
# from scrapy.http import HtmlResponse
import os

In [2]:
# create the URL
baseurl = 'https://au.indeed.com/jobs?q='
search = ['Data+Scientist','Data+Analyst','Business+Analyst','Data+Engineer',]
location =['Sydney+NSW','Melbourne+VIC','Brisbane+QLD','Perth+WA','Adelaide+SA']
limit='50'

## Function Definitions

In [3]:
# Function to get the Job URL and Job Title
def Job_Title_URL(jobs,joburl=[],jobtitle=[]):
    for i in range(len(jobs)):
        joburl.append('https://au.indeed.com/'+jobs[i]['href'])
        jobtitle.append(jobs[i]['title'])
    return joburl,jobtitle

In [4]:
# Function to get Company name
def company_name(soup, company_names=[]):
    for row in soup.find_all('div', {'id':True}):
        for header in row.find_all('span', {'class':'company'}):
            company_names.append(header.text.replace('\n',''))
    return company_names

In [5]:
# Function to get Salary if Exists otherwise replace as Null
def get_salary(soup, salary_list=[]):
    for row in soup.find_all('div', {'id':True}):
        for block in row.find_all('td', {'class':'snip'}):
            try:
                salary =  block.find('span', {'class':'no-wrap'}).text
                salary_list.append(salary)
            except:
                salary_list.append(np.nan)
                
    return salary_list

In [6]:
def sal_newline(row):
    try:
        # Replace \n with nothing
        new_row = row.replace('\n', '')
    except:
        new_row = np.nan    
    
    return new_row

In [7]:
driver = webdriver.Chrome(executable_path="./chromedriver/chromedriver.exe")

In [8]:
for i in range(len(search)):
    for j in range(len(location)):
        # Get the Soup for the First Page
        url = baseurl+search[i]+'&l='+location[j]
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html,'lxml') 
        
        # Get the number of pages for a particular search term at a particular location
        num_add = soup.find(name = 'div',attrs = {'id' : 'searchCount'})
        num_jobs=int(re.findall('(\d+) jobs',num_add.text)[0])
        num_pages = int(np.floor(num_jobs /(int(limit))))
        page = list(range(int(limit),(num_pages)*int(limit),int(limit)))
        npage = [str(x) for x in page]
        npage.insert(0,'')
        
        # Go to these pages and start scraping Individual Ads
        for k in range(len(npage)):
            url_pages = url+'&start='+npage[k]
            
            # Create the sub page soup
            driver.get(url_pages)
            sub_html = driver.page_source
            sub_soup = BeautifulSoup(sub_html,'lxml') 
            
            # Find the Individual Jobs
            jobs = sub_soup.find_all('a', attrs={"data-tn-element":"jobTitle"})
            
            joburl,jobtitle = Job_Title_URL(jobs)     # JobURL and JobTitle
            salary = get_salary(sub_soup)             # Get the Salary list
            company = company_name(sub_soup)          # Get the name of the company
            job_loc = np.repeat(location[j],len(joburl)) # Create an array for Job Location
            job_search = np.repeat(search[i],len(joburl))# Create an array for Job Search Term
            
            # Create the dataframe from this data
            diction= dict({'jobtitle':jobtitle,'joburl':joburl,'company':company,
               'job_loc':job_loc,'job_search_term':job_search,'Salary':salary})
            df = pd.DataFrame(diction)
            df.Salary = df.Salary.map(sal_newline)
            # Save / Append the information in a CSV file
            # if file does not exist write header 
        if not os.path.isfile('JobData.csv'):
            df.to_csv('JobData.csv')
        else: # else it exists so append without writing the header
            df.to_csv('JobData.csv', mode='a', header=False)   
       
            
        
                   

In [9]:
driver.close()

In [10]:
# Get the Saved Dataframe from the saved file

df = pd.read_csv('JobData.csv')

In [11]:
df.shape

(16098, 7)

In [13]:
df.Salary.isnull().sum() 

13051

In [26]:
len(df.joburl)

16098

We have Salary information on about 3047 jobs.  

In [30]:
# Now we pull summary for each of the job
def job_summary(df,summary=[]):
    for m in range(len(df.joburl)):
        url = df.joburl[m]
        html_job = requests.get(url)
        soup_job = BeautifulSoup(html_job.content,'lxml')
        x= soup_job.find_all('div',{'class':'jobsearch-JobComponent-description'})
        summary.append(x[0].text)
    return summary

In [35]:
# Now we pull summary for each of the job
def job_summary(df_joburl):
    try:
        html_job = requests.get(df_joburl)
        soup_job = BeautifulSoup(html_job.content,'lxml')
        x= soup_job.find_all('div',{'class':'jobsearch-JobComponent-description'})
        return x[0].text
    except:
        return None

In [36]:
df['job_desc'] = df['joburl'].apply(job_summary)

In [38]:
df.job_desc.isnull().sum()

36

In [39]:
df.to_csv('SummarizedData.csv')