# Inside Higher Ed Job Scraper

Collecting all tenured and tenure-track job advertisements from North American four-year institutions.

- **[Query](https://careers.insidehighered.com/jobs/tenured-and-tenure-track/four-year-institution/north-america/)**


Every time you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common web-scraping libraries
from bs4 import BeautifulSoup as bs
import requests

In [23]:
from time import time
def attribute_finder(tag, kw, v=False):
    """
        Reports whether the keyword :kw: shows up in the tag's 'class' or 'id' attribute.

        For each of those attributes present in tag.attrs, the keyword matches when it is
        contained in the attribute value itself or in any element obtained by iterating over
        that value. :v: is accepted but not used.

        Returns True on the first match and False otherwise.
    """
    for attr_name in ('class', 'id'):
        if attr_name not in tag.attrs:
            continue
        attr_value = tag.attrs[attr_name]
        # Direct containment first, then containment in each element of the value
        if kw in attr_value:
            return True
        if any(kw in piece for piece in attr_value):
            return True
    return False

def clean_list_page_item(list_item):
    '''
        Takes in a single <li> tag, scraped from the list page that lists out all the job ads and returns
        || job id || job title || recruiter || location || details page URL

        :list_item: tag for one job listing; it must contain an <h3> whose <a> carries the
                    relative link to the job's details page.

        Returns the 5-tuple (job_id, job_title, recruiter, location, detail_page_url).
        recruiter and location are None when the matching metadata entry cannot be found.

        Raises AssertionError when the item has no <h3> header.
    '''
    job_header = list_item.find("h3")
    # Raised explicitly rather than via `assert`, which is stripped when Python runs with -O
    if job_header is None:
        raise AssertionError(f"Can't find job header for {list_item}")
    # The text in that <h3> tag is the job title
    job_title = job_header.text

    # Job ID and URL
    # Urls are in the format /job/[JOB_ID]/blah/blah/blah so split on '/' and then take the
    # third piece (index 2) to get the job id. Strip first so stray whitespace around the
    # href cannot shift the pieces.
    relative_url = job_header.find("a")['href'].strip()
    job_id = relative_url.split('/')[2]
    # Add the base url to the job page
    detail_page_url = f"https://careers.insidehighered.com{relative_url}"

    # Recruiter and location come from the "lister__meta" list. Look inside the header first,
    # then fall back to the whole item so the list is still found when it is not nested in
    # the <h3>.
    meta_query = {'class': 'lister__meta'}
    meta_items = job_header.find("ul", attrs=meta_query)
    if meta_items is None:
        meta_items = list_item.find("ul", attrs=meta_query)

    # Missing metadata is expected for some ads, so it yields None instead of an error
    recruiter = None
    location = None
    if meta_items is not None:
        recruiter_tag = meta_items.find(lambda tag: attribute_finder(tag, 'recruiter'))
        if recruiter_tag is not None:
            recruiter = recruiter_tag.text

        location_tag = meta_items.find(lambda tag: attribute_finder(tag, 'location'))
        if location_tag is not None:
            location = location_tag.text

    return job_id, job_title, recruiter, location, detail_page_url
    

def scrape_list_page(url):
    '''
        Takes in a page that lists out the job advertisements and returns all the basic info+info page urls

        :url: address of the listing page to download.

        Returns a list of (job_id, job_title, recruiter, location, detail_page_url) tuples, one
        for every <li> under the "listing" <ul> that clean_list_page_item could parse. Items
        that fail to parse are printed and skipped.
    '''
    # Imported locally because this file binds the name `time` to the time.time function
    # (`from time import time`), so `time.sleep` would raise AttributeError.
    from time import sleep

    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Name the parser explicitly so the result does not depend on which optional parsers
    # happen to be installed
    listing = bs(r.text, "html.parser").find("ul", attrs={'id': 'listing'})
    list_page = []
    for list_item in listing.find_all('li'):
        sleep(1)
        try:
            list_page.append(clean_list_page_item(list_item))
        except Exception:
            # Best effort: show the item that could not be parsed and carry on.
            # Catching Exception rather than everything lets KeyboardInterrupt stop the scrape.
            print(list_item)
    return list_page

def scrape_details(url):
    '''
        Placeholder for scraping one job's details page
        (i.e. https://careers.insidehighered.com/job/2164790/extension-4-h-program-coordinator/).

        Not implemented yet: :url: is ignored and None is returned. The return format is
        still to be defined.
    '''
    return None

In [24]:
# Listing page for tenured and tenure-track jobs at four-year institutions in North America
base_url = "https://careers.insidehighered.com/jobs/tenured-and-tenure-track/four-year-institution/north-america/"

# Scrape that listing page; the returned list is not assigned to a name here
scrape_list_page(base_url)

KeyboardInterrupt: 