### ***Project 4: Web Scraping Job Postings***

### Section 1 - *Web Scrapping Indeed*

In [2]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

In [3]:
# you will need the requests library in order to fully utilize bs4
import requests
from bs4 import BeautifulSoup


# target web page
url = "https://au.indeed.com/jobs?q=data+scientist&l=Australia&radius=100"
# establishing the connection to the webpage
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
#Ex. 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found
print('Status Code: ',response.status_code)

# Pull HTML string out of requests and convert to a python string
soup = BeautifulSoup(response.text,"html.parser")

# print("\nFirst part of HTML document fetched as string:\n")
# print(soup.prettify())

Status Code:  200


### Extracting *Job Title*

In [78]:
def extract_job_title_from_result(soup): 
        jobs = []
        for div in soup.find_all(name='div', attrs={'class':'title'}):
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                jobs.append(a['title'])
        return(jobs)

### Extracting *Company Name*

In [80]:
def extract_company_from_result(soup): 
    companies = []
    
    for div in soup.find_all(name='div', attrs={'class':'row'}):
        company = div.find_all(name='span', attrs={'class':'company'})
        if len(company) > 0:
            for b in company:
                companies.append(b.text.strip())
        else:
            sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
            for span in sec_try:
                companies.append(span.text.strip())
    
    return(companies)

### Extracting *Location*

In [80]:
# extract_location(soup)
def extract_location(soup):
    location=[]

    for div in soup.find_all('div', attrs={'class':'sjcl'}):
        div_one=div.find('div',attrs={'class':'location'})
        if div_one==None:
            div_two=div.find('span',attrs={'class':'location'})
            location.append(div_two.text.strip())
        else:
            location.append(div_one.text.strip()) 
            
    return(location)

In [30]:
job_location=extract_location(soup)

### Summary Link Extraction

In [31]:
def extract_link_from_result(soup): 
    links = []
    summary=[]
    for div in soup.find_all('div', attrs={'class':'title'}):
        div_one=div.find("a", {"class":"jobtitle turnstileLink "})['href']
        if div_one==None:
            links.append('Nothing_found')
        else:
            link='https://au.indeed.com/'+div_one
            response = requests.get(link)
            summary_soup = BeautifulSoup(response.text,"html.parser")
            summary_soup.find_all('div', attrs={'class':'jobsearch-ViewJobLayout'})
            for div in summary_soup.find_all('div', attrs={'id':'jobDescriptionText'}):
                summary.append(div.text.strip())    
            links.append(link.strip())
    df_links=pd.DataFrame(links)
    df_summary=pd.DataFrame(summary)
    df=pd.concat([df_links,df_summary], axis=1)
    df.columns=['Links','Summary']
    return df

In [32]:
%%time
extract_link_from_result(soup)

CPU times: user 953 ms, sys: 41.1 ms, total: 994 ms
Wall time: 5.41 s


Unnamed: 0,Links,Summary
0,https://au.indeed.com//pagead/clk?mo=r&ad=-6NY...,Flexible work arrangements to meet your needs\...
1,https://au.indeed.com//pagead/clk?mo=r&ad=-6NY...,"First a bit about ANZ\n\nAt ANZ, everything we..."
2,https://au.indeed.com//pagead/clk?mo=r&ad=-6NY...,The Team\n\nThe Customer Service Operations Au...
3,https://au.indeed.com//pagead/clk?mo=r&ad=-6NY...,An Sydney based company is currently looking f...
4,https://au.indeed.com//rc/clk?jk=f3ce2c76078a9...,Job Description\nData Assimilation Scientist\n...
5,https://au.indeed.com//rc/clk?jk=f3d88d86aa03a...,We are a diverse and highly regarded collectiv...
6,https://au.indeed.com//rc/clk?jk=95d3c3ee99b0f...,":\n\nAs the Customer Data Scientist, you will ..."
7,https://au.indeed.com//rc/clk?jk=324e727f49bb6...,The Opportunity\nImmerse Yourself in an Inclus...
8,https://au.indeed.com//rc/clk?jk=903d5a34c9071...,"Deliver components of the design, implementati..."
9,https://au.indeed.com//rc/clk?jk=03e20b28ddf9f...,Be part of an iconic Australian business\nGrea...


### Extracting *Salaries*

In [12]:
def extract_salary_from_result(soup): 
    salaries = []
    for div in soup.find_all('div', attrs={'class':'title'}):
        div_one=div.find('a', attrs={'class':'salary no-wrap'})
        if div_one==None:
            salaries.append('Nothing_found')
        else:
            salaries.append(div_one.text.strip())

    return salaries


In [14]:
extract_salary_from_result(soup)

['Nothing_found',
 'Nothing_found',
 'Nothing_found',
 '$110,000 - $150,000 a year',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 '$98,000 - $106,000 a year',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 '$84,019 - $95,329 a year',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 'Nothing_found',
 '$800 - $900 a day']

In [214]:
def extract_summary_from_result(soup): 
    summaries = []
    all_links = soup.find_all("a")
    for link in all_links:
        print(link.get("href"))
        summaries.append(span.text.strip())
    return summaries


### URL construction using different key words

In [10]:
columns = ['job_title', 'company_name', 'location','summary'  ,'salary']

In [9]:
urls=[]
job_titles=['data+scientist', 'intelligence+analyst','quantitative+analyst','data+engineer','business+intelligence','analytics']
clean_search_results=[]
for title in job_titles:
    
    url ='https://au.indeed.com/jobs?q='+title+'&l=Australia'+'&start=0'
    response = requests.get(url)
    if response.status_code==200:
        urls.append(url)
        soup = BeautifulSoup(response.text,"html.parser")
        search_results=soup.find('div', attrs={'id': 'searchCount'}).text.strip()
        search_results=search_results.split()
        clean_search_results.append(str(search_results[3]))
        
clean_search_results=[clean_search_results[i].replace(',','') for i in range(0,len(clean_search_results),1)]
print(urls)
print (clean_search_results)
b=enumerate(urls)

['https://au.indeed.com/jobs?q=data+scientist&l=Australia&start=0', 'https://au.indeed.com/jobs?q=intelligence+analyst&l=Australia&start=0', 'https://au.indeed.com/jobs?q=quantitative+analyst&l=Australia&start=0', 'https://au.indeed.com/jobs?q=data+engineer&l=Australia&start=0', 'https://au.indeed.com/jobs?q=business+intelligence&l=Australia&start=0', 'https://au.indeed.com/jobs?q=analytics&l=Australia&start=0']
['625', '543', '287', '2882', '1740', '5155']


<enumerate object at 0x11a3bf048>


In [42]:
def urls_list():
    urls=[]
    clean_search_results=[]
    job_titles=['data+scientist', 'intelligence+analyst','quantitative+analyst','data+engineer','business+intelligence','analytics',
                'performance+analyst','statistics']
    columns = ['job_title', 'company_name', 'location','summary'  ,'salary']
    
    for title in job_titles:
        url ='https://au.indeed.com/jobs?q='+title+'&l=Australia'+'&start='
        #search for different Url combinations
        response = requests.get(url)
        if response.status_code==200:
            #make sure the url work 
            urls.append(url)
            soup = BeautifulSoup(response.text,"html.parser")
            search_results=soup.find('div', attrs={'id': 'searchCount'}).text.strip()
            search_results=search_results.split()
            clean_search_results.append(str(search_results[3]))
            
    clean_search_results=[clean_search_results[i].replace(',','') for i in range(0,len(clean_search_results),1)]
    
    return clean_search_results,urls

for i in range(0,len(urls)):
    job_title = urls[i]
    search_items=clean_search_results[i]
    print(job_title)
    print(search_items)

https://au.indeed.com/jobs?q=data+scientist&l=Australia&start=0
625
https://au.indeed.com/jobs?q=intelligence+analyst&l=Australia&start=0
543
https://au.indeed.com/jobs?q=quantitative+analyst&l=Australia&start=0
287
https://au.indeed.com/jobs?q=data+engineer&l=Australia&start=0
2882
https://au.indeed.com/jobs?q=business+intelligence&l=Australia&start=0
1740
https://au.indeed.com/jobs?q=analytics&l=Australia&start=0
5155


In [37]:
import sys
sys.setrecursionlimit(1500)

### Final Scrapper

In [41]:
%%time

def urls_list(jobs):
    urls=[]
    clean_search_results=[]
    job_titles=['data+scientist', 'intelligence+analyst','quantitative+analyst','data+engineer','business+intelligence+analyst',
                'performance+analyst']
    columns = ['job_title', 'company_name', 'location','summary'  ,'salary']
    
    for title in job_titles:
        
        #search for different Url combinations
        url ='https://au.indeed.com/jobs?q='+title+'&l=Australia'+'&start='
        response = requests.get(url)
        if response.status_code==200:
            #make sure the url work 
            urls.append(url)
            soup = BeautifulSoup(response.text,"html.parser")
            search_results=soup.find('div', attrs={'id': 'searchCount'}).text.strip()
            search_results=search_results.split()
            clean_search_results.append(str(search_results[3]))
            
    clean_search_results=[clean_search_results[i].replace(',','') for i in range(0,len(clean_search_results),1)]
    
    return clean_search_results,urls
#----------------------------------------------------------------------------------------------------------------------------------------#

for i in range(0,len(urls)):
    job_title = urls[i]
    search_items=clean_search_results[i]

    #Start scrappring the  working url
    db=[]

    summary_soup=[]
    for page in np.arange(0, int(search_items), 10):
        page = requests.get(job_title+str(page))

        time.sleep(2)  #ensuring at least 1 second between page grabs        
        soup = BeautifulSoup(page.text,"html.parser")
        summary_soup.append(soup)
        
        for div in soup.find_all(name='div', attrs={'class':'row'}):
            
            #creating an empty list to hold the data for each posting
            job_post = [] 
            
            #grabbing job title
            for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
                job_post.append(a['title'])

            #grabbing company name
            company = div.find_all(name='span', attrs={'class':'company'})
            if len(company) > 0:
                for b in company:
                     job_post.append(b.text.strip())
            else:
                sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
                for span in sec_try:
                    job_post.append(span.text.strip())

            #grabbing location name
            div_one=div.find('div',attrs={'class':'location'})
            if div_one==None:
                div_two=div.find('span',attrs={'class':'location'})
                job_post.append(div_two.text.strip())
            else:
                job_post.append(div_one.text.strip()) 
                
            #appending our main list with the results           
        db.append(job_post)
    
    #converting results to DF
    df_data=pd.DataFrame(db)

#----------------------------------------------------------------------------------------------------------------------------------------#
   
 #saving DF as a local csv file

df_data.to_csv("/Users/sherf/Desktop/GA Units/GA_P4/Indeed_Scrapper_data.csv",index=False)

KeyboardInterrupt: 

In [71]:
df.shape

(6920, 5)

### *Part 1 Comments:*

---
In this section I managed to scrap Indeed website extracting 6920 jobs and saved these jobs to a CSV file to be used in further processing.
The scrapping code took 45 minutes to run. I didn't scrap complete job description only the job summary and will try to make the best use of that.