# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Webscraping libaries and tools
import requests
from bs4 import BeautifulSoup as bs
import re
import time
from tqdm.notebook import tqdm
tqdm.pandas()

# reading path to data files
from glob import glob

  from pandas import Panel


In [2]:
def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = "https://jobs.chronicle.com{}".format(job_url_suffix)
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [3]:
# Get Job ID for most recent date posted which already exists

list_of_csv_files = glob("../data/*")
most_recent_csv = sorted(list_of_csv_files, reverse=True)[0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])
ls_df

Unnamed: 0,Job ID,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,Position Type 0.2,Position Type 0.3,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0
0,422756,MENTOR PROFESSORSHIPS,https://jobs.chronicle.com/job/422756/mentor-p...,True,Florida International University,"Florida, United States",Salary Not specified,,MENTOR PROFESSORSHIPS\n \nFlorida Internationa...,Faculty Positions,Science,Technology & Mathematics,Other Science & Technology,Social & Behavioral Sciences,Other Social & Behavioral Sciences,,,,,
1,420608,Teaching and Clinical Coordination Faculty (NTT),https://jobs.chronicle.com/job/420608/teaching...,True,Western Oregon University,"Oregon, United States","$45,000.00 - $45,000.00",,Recruitment\n#: \nF-2106\nRevi...,Faculty Positions,Social & Behavioral Sciences,Other Social & Behavioral Sciences,,,,,,,,
2,423318,Assistant Professor of Criminal Justice 3461,https://jobs.chronicle.com/job/423318/assistan...,False,UNC Pembroke,"North Carolina, United States",Salary Commensurate with experience,,Established in 1887 as a school for the educat...,Faculty Positions,Social & Behavioral Sciences,Criminal Justice & Criminology,,,,,,,,
3,423320,TENURE-TRACK FACULTY POSITION IN BIOMEDICAL DA...,https://jobs.chronicle.com/job/423320/tenure-t...,False,GEISEL SCHOOL OF MEDICINE AT DARTMOUTH,"New Hampshire, United States",Competitive Salary,,The Department of Biomedical Data Science and\...,Faculty Positions,Science,Technology & Mathematics,Biology & Life Sciences,Statistics,,,,,,
4,423303,Assistant/Associate Professor of Accounting,https://jobs.chronicle.com/job/423303/assistan...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,,This position will support the North Carolina ...,Faculty Positions,Business & Management,Accounting & Finance,Economics,Other Business & Management,,,,,,
5,423302,Assistant/Associate Professor of Accounting,https://jobs.chronicle.com/job/423302/assistan...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,,This position will support the North Carolina ...,Faculty Positions,Business & Management,Accounting & Finance,Economics,Other Business & Management,,,,,,
6,423300,Lecturer in Organic Chemistry,https://jobs.chronicle.com/job/423300/lecturer...,False,University of North Carolina at Asheville,"North Carolina, United States",Salary Not specified,,The University of North Carolina Asheville Dep...,Faculty Positions,Arts,Art History,Other Arts,Humanities,Classics,Religion,Science,Technology & Mathematics,Chemistry & Biochemistry,
7,423299,Mathematics & Statistics: University Fellow fo...,https://jobs.chronicle.com/job/423299/mathemat...,False,University of North Carolina at Asheville,"North Carolina, United States",Salary Not specified,,The University of North Carolina Asheville Dep...,Faculty Positions,Arts,Other Arts,Humanities,Ethnic & Multicultural Studies,Science,Technology & Mathematics,Mathematics,Statistics,Social & Behavioral Sciences,Criminal Justice & Criminology
8,423293,Coordinator of Work-Based Learning Experiences...,https://jobs.chronicle.com/job/423293/coordina...,False,Community College of Philadelphia,"Pennsylvania, United States",Salary Not specified,,\nCommunity College of Philadelphia\n\n\nGener...,Faculty Positions,Professional Fields,Other Professional Fields,,,,,,,,
9,423212,Tenure Track Assistant Professor of Criminal J...,https://jobs.chronicle.com/job/423212/tenure-t...,False,f,"Tennessee, United States",Competitive Salary,,POSITION SUMMARY\n \nFisk university seeks a t...,Faculty Positions,Social & Behavioral Sciences,Criminal Justice & Criminology,,,,,,,,


In [4]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 {423299, 423300, 423302, 423303, 423212, 423318, 423286, 423320, 423291, 423293, 423287}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
425303,Assistant Vice President for Auxiliary Services,https://jobs.chronicle.com/job/425303/assistan...,True
423762,President,https://jobs.chronicle.com/job/423762/presiden...,True
424653,Assistant Professor of English (Disability Stu...,https://jobs.chronicle.com/job/424653/assistan...,True
425998,Assistant Professor Chemical & Biomedical Engi...,https://jobs.chronicle.com/job/425998/assistan...,False
425997,Assistant Professor in Psychology - Clinical o...,https://jobs.chronicle.com/job/425997/assistan...,False
...,...,...,...
423345,PROFESSOR IN COUNSELLING PSYCHOLOGY,https://jobs.chronicle.com/job/423345/professo...,False
423333,"Assistant Professor of Law, Justice, and Society",https://jobs.chronicle.com/job/423333/assistan...,False
423332,ASSISTANT PROFESSOR OF ENVIRONMENTAL SCIENCE,https://jobs.chronicle.com/job/423332/assistan...,False
423352,Open Rank Climate and Health Professor,https://jobs.chronicle.com/job/423352/open-ran...,False


In [5]:
def get_description_of_page(soup_page):
    """
        Parses the beautiful-soup object of the page response for the job description.
        
        :param soup_page: The beautiful soup object that contains the desired page.
        :returns: The text of the job description.
    """
    description = soup_page.find("div",attrs={"class":"mds-edited-text mds-font-body-copy-bulk"}).get_text()
    return description
    

In [6]:
def get_details_block_of_page(soup_page):
    """
        Every page has a set of details that contains information like who the employer for a job is, location, etc.
        Parses the beautiful-soup object of the page for the summary of the details of the job.
        
        :param soup_page: The beautiful soup object that contains the desired page.
        :returns: The beautiful soup tag for the details. Gets parsed for the important details later.
    """
    details_block = soup_page.find_all("dl",attrs={"class":"mds-list mds-list--definition mds-list--border mds-margin-bottom-b0"})
    return details_block
    

In [7]:
def link_keys_and_values(list_of_keys_and_values):
    """
        Takes a list of alternating elements with key and value class elements and pairs them up. The current version
        of the website has a lot of information stored in a weird format where one element has a class called 'mds-list__key'
        and the element below it contains a class called 'mds-list__value'. this function matches those two together.
        
        :param list_of_keys_and_values: List of soup elements that have alternating key and value class attributes.
        :returns: A dictionary where the key and value correspond to the keys and value in the html. The keys and values are 
        just the text from the element.
    """
    dictionary_form = {}
    key = None
    value = None
    for element in list_of_keys_and_values:
        if "mds-list__key" in element.get("class"):
            key = element
        if "mds-list__value" in element.get("class"):
            value = element
            if key != None and value != None:
                dictionary_form[key.get_text().strip()] = value.get_text().strip()
            key = None
            value = None  

    return dictionary_form

In [8]:
def aggregate_children_of_elements(list_of_elements):
    """
        Takes a list of elements with children and gathers them together.
        
        :param list_of_elements: List of beautiful soup elements.
        :returns: A list of all the children of the elements in the input list.
    """
    children = []
    for element in list_of_elements:
        for child in element.findChildren(recursive=False):
            children.append(child)
    
    return children

In [9]:
def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    
    description = get_description_of_page(details_page)
    
    details_block = get_details_block_of_page(details_page)
    list_of_keys_and_values = aggregate_children_of_elements(details_block)
    details_dict = link_keys_and_values(list_of_keys_and_values)
    
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer = None if "Employer" not in details_dict else details_dict["Employer"]
    location = None if "Location" not in details_dict else details_dict["Location"]    
    salary = None if "Salary" not in details_dict else details_dict["Salary"]
    posted_date = None if "Start date" not in details_dict else details_dict["Start date"] # not sure if the "start date" is the posted date
    
    try:
        position_type = None if "Position Type" not in details_dict else details_dict["Position Type"]
        position_type = [text.strip() for text in position_type.split(",")]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [10]:
listing_df

Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
425303,Assistant Vice President for Auxiliary Services,https://jobs.chronicle.com/job/425303/assistan...,True
423762,President,https://jobs.chronicle.com/job/423762/presiden...,True
424653,Assistant Professor of English (Disability Stu...,https://jobs.chronicle.com/job/424653/assistan...,True
425998,Assistant Professor Chemical & Biomedical Engi...,https://jobs.chronicle.com/job/425998/assistan...,False
425997,Assistant Professor in Psychology - Clinical o...,https://jobs.chronicle.com/job/425997/assistan...,False
...,...,...,...
423345,PROFESSOR IN COUNSELLING PSYCHOLOGY,https://jobs.chronicle.com/job/423345/professo...,False
423333,"Assistant Professor of Law, Justice, and Society",https://jobs.chronicle.com/job/423333/assistan...,False
423332,ASSISTANT PROFESSOR OF ENVIRONMENTAL SCIENCE,https://jobs.chronicle.com/job/423332/assistan...,False
423352,Open Rank Climate and Health Professor,https://jobs.chronicle.com/job/423352/open-ran...,False


In [11]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

HBox(children=(FloatProgress(value=0.0, max=312.0), HTML(value='')))




In [12]:
listing_df = listing_df[listing_df['position_type'].notna()]

In [13]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))

merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

312x9
314x23
314x22


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0,Position Type 1.1,Position Type 1.2,Position Type 1.3
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
423326,"Assistant Director, Masters in Tall Buildings ...",https://jobs.chronicle.com/job/423326/assistan...,False,Council on Tall Buildings and Urban Habitat,"Illinois, United States",Salary Commensurate with experience,NaT,The College of Architecture at the Illinois In...,Faculty Positions,Professional Fields,...,,,,,,,,,,
423332,ASSISTANT PROFESSOR OF ENVIRONMENTAL SCIENCE,https://jobs.chronicle.com/job/423332/assistan...,False,Providence University College,Canada,Salary Not specified,NaT,POSITION OVERVIEW\nProvidence University Colle...,Faculty Positions,Science,...,Geology & Earth Sciences,,,,,,,,,
423333,"Assistant Professor of Law, Justice, and Society",https://jobs.chronicle.com/job/423333/assistan...,False,Washington and Lee University,"Virginia, United States",Salary Not specified,NaT,"The Law, Justice, and Society (“LJS”) Interdis...",Faculty Positions,Humanities,...,Law & Legal Studies,,,,,,,,,
423345,PROFESSOR IN COUNSELLING PSYCHOLOGY,https://jobs.chronicle.com/job/423345/professo...,False,Providence University College and Theological ...,Canada,Salary Not specified,NaT,POSITION OVERVIEW\nWe invite applications for ...,Faculty Positions,Social & Behavioral Sciences,...,,,,,,,,,,
423346,Instruction and Outreach Librarian/Assistant P...,https://jobs.chronicle.com/job/423346/instruct...,False,Eckerd College,"Florida, United States",Salary Not specified,NaT,Position Summary:\nEckerd College Library seek...,Faculty Positions,Professional Fields,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425989,Tenure-Track Assistant Professor,https://jobs.chronicle.com/job/425989/tenure-t...,False,Marquette University,"Wisconsin, United States","$93,000.00 - $95,000.00",NaT,\nThe Department of Electrical and Computer E...,Faculty Positions,Science,...,Engineering,,,,,,,,,
425995,"Ubiquitous Data, Assistant Professor",https://jobs.chronicle.com/job/425995/ubiquito...,False,"University of Nevada, Las Vegas","Nevada, United States",Salary Not specified,NaT,"The University of Nevada, Las Vegas (UNLV) inv...",Faculty Positions,Science,...,Engineering,,,,,,,,,
425996,"Sustainability in Arid Land, Assistant/Associa...",https://jobs.chronicle.com/job/425996/sustaina...,False,"University of Nevada, Las Vegas","Nevada, United States",Salary Not specified,NaT,"The University of Nevada, Las Vegas (UNLV) inv...",Faculty Positions,Science,...,,,,,,,,,,
425997,Assistant Professor in Psychology - Clinical o...,https://jobs.chronicle.com/job/425997/assistan...,False,Cleveland State University,"Ohio, United States",Salary Not specified,NaT,Position Description: The Department of\nPsych...,Faculty Positions,Social & Behavioral Sciences,...,,,,,,,,,,


In [14]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")