# Indeed Job Scraper

### How it works:

You provide a set of standard input parameters: 
- **search query**
- **location**
- **miles** (search radius)

in addition to two non-standard parameters: 
- **ordered_keywords**: A list of keywords to search for in job descriptions, given in order of preference. Each job is rated against this ordered list, with earlier keywords counting for more.
- **exclude_keywords**: A list of keywords to search for in a job _title_ which renders the rating of that job zero. E.g. if you really hate roles as a recruiter you would include: "Recruitment" or "Headhunter"

The web scraper searches through 100 pages of Indeed job listings with those parameters and returns a DataFrame containing all the listings, ordered by the "rating" metric based on the ordered list of keywords.

You can then also output this DataFrame as an Excel sheet for convenience. 

In [4]:
# Imports
from bs4 import BeautifulSoup
import requests, json
import pandas as pd

In [5]:
# Default search settings passed to the scraper
default_parameters = dict(
    search_query='Graduate Investment',
    location='Enter_Your_Postcode_Here',
    miles=15,
    # Keywords looked for in job descriptions, most preferred first
    ordered_keywords=[
        'Investment', 'Banking', 'Finance', 'Hedge', 'Python',
        'Fintech', 'SQL', 'Analysis', 'Modelling',
    ],
    # Keywords in a job title that force the job's rating to zero
    exclude_keywords=['Recruitment', 'Headhunter'],
)

In [6]:
def create_url(parameters):
    """Build the Indeed search URL that every results page is derived from.

    Args:
        parameters: Mapping with 'search_query', 'location' and 'miles' keys.

    Returns:
        The search URL as a string, with query and location URL-encoded.
    """
    # Local import keeps this cell self-contained
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into "+", and additionally escapes
    # characters such as "&", "#" and "+" that would otherwise be read as
    # query-string syntax and corrupt the search
    what = quote_plus(parameters['search_query'])
    where = quote_plus(parameters['location'])
    return "https://www.indeed.co.uk/jobs?q={}&l={}&radius={}".format(
        what, where, parameters['miles'])

In [7]:
def rate_job(j_title, j_soup, parameters):
    """Rate a job by the keywords found in its description.

    Every keyword of parameters['ordered_keywords'] that occurs in the
    description adds a weight to the score; earlier keywords weigh more
    (the first adds len(keywords), the last adds 1). The score is divided
    by the maximum achievable score, so the rating lies between 0 and 1.
    If any keyword of parameters['exclude_keywords'] occurs in the job
    title, the rating is forced to zero. Matching is a case-sensitive
    substring test.

    Args:
        j_title: Job title text.
        j_soup: Parsed job page; queried with find(id="jobDescriptionText").
        parameters: Mapping with 'ordered_keywords' and 'exclude_keywords'.

    Returns:
        Tuple of (description, rating, keywords_present).
    """
    # find() yields None when the element is absent; fall back to empty text
    # instead of raising AttributeError
    description_tag = j_soup.find(id="jobDescriptionText")
    description = description_tag.get_text() if description_tag is not None else ""

    keywords = parameters['ordered_keywords']
    exclude_keywords = parameters['exclude_keywords']
    total_keywords = len(keywords)
    keywords_present = []
    score = 0

    # Check for keyword, add value to score depending on ranking
    for index, keyword in enumerate(keywords):
        if keyword in description:
            score += total_keywords - index
            keywords_present.append(keyword)

    # Normalise rating; an empty keyword list has no achievable score, so
    # rate it zero rather than dividing by zero
    max_score = sum(range(1, total_keywords + 1))
    rating = score / max_score if max_score else 0.0

    # An excluded keyword in the title overrides the rating
    if any(keyword in j_title for keyword in exclude_keywords):
        rating = 0

    return description, rating, keywords_present

In [8]:
def get_job_details(job, parameters):
    """Fetch a job's own page and collect its details and rating.

    Args:
        job: Search-result card element; its class 'title' child must hold
            an anchor carrying 'href' and 'title' attributes.
        parameters: Scraping parameters, forwarded to rate_job.

    Returns:
        Tuple of (job_title, job_company, job_url, job_description,
        job_rating, job_keywords_present). job_company is an empty string
        when the company element is not found on the job page.
    """
    # Look the title anchor up once; it supplies both the link and the title
    title_link = job.find(class_='title').a
    job_url = title_link['href']
    job_title = title_link['title']

    # Correct for truncated (site-relative) URLs
    if job_url.startswith("/"):
        job_url = "https://www.indeed.co.uk" + job_url

    # Use the same timeout as the listing-page request in scrape(), so a
    # single stalled job page cannot hang the whole run
    job_page = requests.get(job_url, timeout=5)
    job_soup = BeautifulSoup(job_page.content, 'html.parser')

    # Give URL after redirect (ads/analytics etc.)
    job_url = job_page.url

    # find() yields None when the element is absent; fall back to empty text
    # instead of raising AttributeError
    company_tag = job_soup.find(class_="icl-u-lg-mr--sm")
    job_company = company_tag.get_text() if company_tag is not None else ""

    # Get description, rating and present keywords
    job_description, job_rating, job_keywords_present = rate_job(
        job_title, job_soup, parameters)

    return job_title, job_company, job_url, job_description, job_rating, job_keywords_present

In [9]:
def scrape(parameters, max_pages=100):
    """Scrape Indeed result pages and return the jobs ranked by rating.

    Args:
        parameters: Scraping parameters; used to build the search URL and
            forwarded to get_job_details for rating.
        max_pages: Number of result pages to request. Defaults to 100,
            the previously hard-coded value.

    Returns:
        A pandas DataFrame with the columns 'Rating', 'Job Title',
        'Company', 'Description', 'Job URL', 'Keywords Present' and
        'Page Found', sorted by 'Rating' in descending order with a fresh
        index.
    """
    # Create base url for all further searches
    base_url = create_url(parameters)

    rows = []

    for page_index in range(max_pages):
        # The first page has no offset; later pages advance "start" by 10
        page_append = "" if page_index == 0 else "&start=" + str(page_index * 10)

        # Get page
        current_page = requests.get(base_url + page_append, timeout=5)
        page_soup = BeautifulSoup(current_page.content, "html.parser")

        for job in page_soup.select(".jobsearch-SerpJobCard"):
            (job_title, job_company, job_url, job_description,
             job_rating, job_keywords_present) = get_job_details(job, parameters)
            rows.append([
                job_rating, job_title, job_company, job_description,
                job_url, job_keywords_present, page_index + 1,
            ])

        # Carriage return keeps the progress message on a single line
        print("Page {} completed".format(page_index + 1), end="\r")

    columns = ['Rating', 'Job Title', 'Company', 'Description',
               'Job URL', 'Keywords Present', 'Page Found']
    output_frame = pd.DataFrame(rows, columns=columns)

    return output_frame.sort_values(by='Rating', ascending=False).reset_index(drop=True)
        

In [10]:
# Run the scraper with the default parameters; jobs holds the returned
# DataFrame of listings sorted by rating
jobs = scrape(default_parameters)

Page 100 completed

In [14]:
# Preview the first rows of the ranked results.
# NOTE(review): display is not imported in the visible cells — presumably the
# notebook environment's built-in; confirm before running as a plain script
display(jobs.head())

Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Page Found
0,0.733333,Graduate/ Junior C# Developer - Hedge Fund - L...,McGregor Boyall,"C#, SQL Server, Hedge Fund, Graduate, Junior, ...",https://www.indeed.co.uk/viewjob?jk=33f5c1005e...,"[Investment, Banking, Finance, Hedge, SQL]",31
1,0.711111,Junior Digital Developer -M/F-VIE-London,Société Générale,Responsibilities\nKey purposes of the role:\n\...,https://www.indeed.co.uk/viewjob?jk=d92a67aec3...,"[Investment, Banking, Finance, Python, SQL]",56
2,0.666667,Manager/Director – Legal,Ruella James,Leading recruitment consultancy with a strong ...,https://www.indeed.co.uk/viewjob?jk=3cfb44d39a...,"[Investment, Banking, Finance, Hedge]",70
3,0.666667,Trainee/Entry Level – Sales/Marketing,Ruella James,Specialist Sales and Marketing FMCG Recruitmen...,https://www.indeed.co.uk/viewjob?jk=af40e885d9...,"[Investment, Banking, Finance, Hedge]",54
4,0.666667,Senior Consultant – Accountancy/Finance,Ruella James,Leading Professional and Financial Recruitment...,https://www.indeed.co.uk/viewjob?jk=d22f214780...,"[Investment, Banking, Finance, Hedge]",71


# Output to Excel

In [13]:
# The `options=` keyword of pd.ExcelWriter was deprecated in pandas 1.3 and
# removed in 2.0; workbook options for xlsxwriter now go through
# engine_kwargs (this form requires pandas >= 1.3).
# strings_to_urls=False keeps the job URLs as plain text instead of having
# xlsxwriter convert them into hyperlinks.
with pd.ExcelWriter(
    'Excel Output.xlsx',
    engine='xlsxwriter',
    engine_kwargs={'options': {'strings_to_urls': False}},
) as writer:
    jobs.to_excel(writer, index=False)

# Example output

<img src="img/Example_Output.PNG">