# Indeed Job Scraper

### How it works:

You provide a set of standard input parameters: 
- **search query**
- **location**
- **miles**: the search radius around the location, in miles

in addition to four non-standard parameters: 
- **ordered_keywords**: Job roles are rated based on this ordered list. This is a list of keywords to search for in job descriptions provided in order of preference. 
- **exclude_keywords**: A list of keywords to search for in a job _title_; if any of them matches, the rating of that job is set to zero. E.g. if you really hate roles as a recruiter you would include: "Recruitment" or "Headhunter"
- **title_keywords**: A list of keywords to search for in a job _title_ which, when matched, increase the normalised rating. (These take precedence over "ordered_keywords".)
- **pages**: Number of Indeed pages to search. (Maximum that Indeed provides is 100)

The web scraper searches through all the Indeed job listings with those parameters and returns a dataframe containing all the listings ordered by the "rating" metric based on the ordered list of keywords.

You can then also output this dataframe as an Excel sheet for convenience. 

In [1]:
# Imports
from bs4 import BeautifulSoup
import requests, json
import pandas as pd

In [35]:
# Default scraping parameters used by the example run below.
default_parameters = {
    # Standard Indeed search inputs
    'search_query': 'Graduate Python',
    'location': 'E78DT',
    'miles': 15,
    # Description keywords, most preferred first
    'ordered_keywords': [
        'Investment',
        'Banking',
        'Finance',
        'Hedge',
        'Python',
        'Fintech',
        'SQL',
        'Analysis',
        'Modelling',
    ],
    # Title keywords that zero the rating of a job
    'exclude_keywords': [
        'Recruitment',
        'Headhunter',
        'Manager',
        'Director',
        'Senior',
    ],
    # Title keywords that raise the rating of a job
    'title_keywords': ['Graduate', 'Junior'],
    # Number of result pages to walk through
    'pages': 100,
}

In [36]:
def create_url(parameters):
    """Build the base Indeed search URL for a set of scraping parameters.

    Args:
        parameters: Mapping with the keys ``'search_query'`` (str),
            ``'location'`` (str) and ``'miles'`` (search radius).

    Returns:
        The search URL as a string, without any pagination suffix.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" (as the previous str.replace did) and
    # additionally percent-encodes reserved characters such as "&", "+" or
    # "#", which would otherwise corrupt the query string (e.g. "C++").
    what = quote_plus(parameters['search_query'])
    where = quote_plus(parameters['location'])
    miles = parameters['miles']
    return f"https://www.indeed.co.uk/jobs?q={what}&l={where}&radius={miles}"

In [37]:
def rate_job(j_title, j_soup, parameters):
    """Rate a job listing by the keywords found in its description and title.

    Each matched entry of ``ordered_keywords`` adds ``len(ordered_keywords) -
    index`` points, so earlier (more preferred) keywords weigh more. Each
    matched entry of ``title_keywords`` adds ``total_keywords - index``
    points. The sum is divided by ``1 + 2 + ... + total_keywords`` (the score
    obtained when every keyword matches). Matching is case-sensitive
    substring matching.

    Args:
        j_title: Job title string.
        j_soup: Parsed job page; must offer ``find(id=...)`` returning an
            element with ``get_text()`` (or ``None``).
        parameters: Mapping with the keys ``'ordered_keywords'``,
            ``'title_keywords'`` and ``'exclude_keywords'`` (lists of str).

    Returns:
        Tuple ``(description, rating, keywords_present,
        title_keywords_present)``. ``rating`` is 0 when the title contains
        any excluded keyword.
    """
    keywords = parameters['ordered_keywords']
    title_keywords = parameters['title_keywords']
    exclude_keywords = parameters['exclude_keywords']
    total_keywords = len(keywords) + len(title_keywords)

    # A page without a description element is rated on its title alone
    # instead of raising AttributeError on None.
    description_tag = j_soup.find(id="jobDescriptionText")
    description = description_tag.get_text() if description_tag is not None else ""

    keywords_present = []
    title_keywords_present = []
    rating = 0

    # Check for keyword, add value to rating depending on ranking
    for index, keyword in enumerate(keywords):
        if keyword in description:
            rating += len(keywords) - index
            keywords_present.append(keyword)

    # Check for title keywords (weighted above every description keyword)
    for index, keyword in enumerate(title_keywords):
        if keyword in j_title:
            rating += total_keywords - index
            title_keywords_present.append(keyword)

    # Normalise rating; with no keywords configured the maximum score is 0,
    # so report 0.0 rather than dividing by zero.
    max_rating = sum(range(1, total_keywords + 1))
    rating = rating / max_rating if max_rating else 0.0

    # Any excluded keyword in the title overrides the rating
    if any(keyword in j_title for keyword in exclude_keywords):
        rating = 0

    return description, rating, keywords_present, title_keywords_present

In [38]:
def get_job_details(job,parameters):
    """Fetch one job's page and collect its details and rating.

    Args:
        job: Search-result card element; its ``title`` element must contain
            an anchor carrying ``href`` and ``title`` attributes.
        parameters: Scraping parameters, passed through to ``rate_job``.

    Returns:
        Tuple ``(title, company, job_url, description, rating,
        keywords_present, title_keywords_present)``. ``company`` is an empty
        string when the job page has no company element.
    """
    # Look the title anchor up once; it provides both the link and the title
    title_link = job.find(class_='title').a
    job_url = title_link['href']

    # Correct for truncated (site-relative) URLs
    if job_url.startswith("/"):
        job_url = "https://www.indeed.co.uk" + job_url

    # Same timeout as the search-page request in scrape(), so one stalled
    # job page cannot hang the whole run indefinitely.
    job_page = requests.get(job_url, timeout=5)
    job_soup = BeautifulSoup(job_page.content,'html.parser')

    # Give URL after redirect (ads/analytics etc.)
    job_url = job_page.url

    # Get job title and company name; tolerate pages without a company tag
    title = title_link['title']
    company_tag = job_soup.find(class_="icl-u-lg-mr--sm")
    company = company_tag.get_text() if company_tag is not None else ""

    # Get description, rating and present keywords
    description, rating, keywords_present, title_keywords_present = rate_job(title,job_soup,parameters)

    return title, company, job_url, description, rating, keywords_present, title_keywords_present

In [39]:
def scrape(parameters):
    """Scrape the configured number of result pages into a rated DataFrame.

    For every page the search URL is requested (pages after the first add a
    ``&start=`` offset of ten results per page), each job card on it is
    resolved via ``get_job_details`` and one row per job is collected.

    Args:
        parameters: Scraping parameters; ``'pages'`` is read here, the rest
            is forwarded to ``create_url`` and ``get_job_details``.

    Returns:
        DataFrame sorted by ``'Rating'`` in descending order with a fresh
        index.
    """
    search_url = create_url(parameters)
    rows = []

    for page_index in range(parameters['pages']):
        # The first page carries no offset suffix
        offset_suffix = "" if page_index == 0 else "&start=" + str(page_index*10)

        response = requests.get(search_url + offset_suffix, timeout=5)
        listing_soup = BeautifulSoup(response.content, "html.parser")

        for card in listing_soup.select(".jobsearch-SerpJobCard"):
            (
                title,
                company,
                url,
                description,
                rating,
                keywords_present,
                title_keywords_present,
            ) = get_job_details(card, parameters)
            rows.append([
                rating,
                title,
                company,
                description,
                url,
                keywords_present,
                title_keywords_present,
                page_index + 1,
            ])

        # Carriage return keeps the progress message on a single line
        print(f"Page {page_index+1} completed", end="\r")

    column_names = [
        'Rating',
        'Job Title',
        'Company',
        'Description',
        'Job URL',
        'Keywords Present',
        'Title Keywords',
        'Page Found',
    ]
    frame = pd.DataFrame(rows, columns=column_names)
    frame = frame.sort_values(by='Rating', ascending=False)
    return frame.reset_index(drop=True)
        

In [40]:
# Run the scraper with the default parameters (issues live HTTP requests).
jobs = scrape(default_parameters)

Page 100 completed

In [41]:
# Preview the top rows of the rating-sorted result.
# NOTE(review): `display` is not imported here; presumably the IPython/Jupyter builtin.
display(jobs.head())

Unnamed: 0,Rating,Job Title,Company,Description,Job URL,Keywords Present,Title Keywords,Page Found
0,0.636364,Junior Digital Developer -M/F-VIE-London,Société Générale,Responsibilities\nKey purposes of the role:\n\...,https://www.indeed.co.uk/viewjob?jk=d92a67aec3...,"[Investment, Banking, Finance, Python, SQL]",[Junior],19
1,0.515152,"Graduate Treasury Desk Support, Trade Finance,...",NDK Consulting,"Graduate Treasury Desk Support, Trade Finance,...",https://www.indeed.co.uk/viewjob?cmp=NDK-Consu...,"[Banking, Finance, Python, SQL]",[Graduate],17
2,0.5,Junior Data Scientist - Investment Banking,MThree Alumni,Job Description\nLove using technology to solv...,https://www.indeed.co.uk/viewjob?jk=72a723be53...,"[Investment, Banking, Python, Modelling]",[Junior],2
3,0.5,Junior Data Scientist,MThree Consulting,Love using technology to solve complex problem...,https://www.indeed.co.uk/viewjob?cmp=MThree-Co...,"[Investment, Banking, Python, Modelling]",[Junior],5
4,0.424242,Graduate Production Data Analyst Permanent,Working Smart Limited,A leading fully integrated exploration and pro...,https://www.indeed.co.uk/viewjob?jk=f880b16778...,"[Finance, Python, SQL, Analysis]",[Graduate],8


# Output to Excel

In [43]:
# `strings_to_urls` is an XlsxWriter workbook option, so the engine is named
# explicitly and the option is passed through `engine_kwargs`; the former
# `options=` keyword of pd.ExcelWriter was deprecated and later removed.
with pd.ExcelWriter(
    'Excel Output.xlsx',
    engine='xlsxwriter',
    engine_kwargs={'options': {'strings_to_urls': False}},
) as writer:
    jobs.to_excel(writer, index=False)

# Example output

<img src="img/Example_Output.PNG">