# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [4]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Webscraping libaries and tools
import requests
from bs4 import BeautifulSoup as bs
import re
import time
from tqdm.notebook import tqdm
tqdm.pandas()

# reading path to data files
from glob import glob

  from pandas import Panel


In [5]:
def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = "https://jobs.chronicle.com{}".format(job_url_suffix)
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [6]:
# Get Job ID for most recent date posted which already exists

list_of_csv_files = glob("../data/*")
most_recent_csv = sorted(list_of_csv_files, reverse=True)[0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])
ls_df

Unnamed: 0,Job ID,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,...,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0,Position Type 1.1,Position Type 1.2,Position Type 1.3
0,375792,Assistant Professor of Counseling Psychology,https://jobs.chronicle.com/job/375792/assistan...,False,"University of Tennessee, Knoxville","Tennessee, United States",Competitive Salary,2021-08-31,\nThe Department of Psychology at the Universi...,Faculty Positions,...,,,,,,,,,,
9,375684,Assistant Professor of Film Studies and Produc...,https://jobs.chronicle.com/job/375684/assistan...,False,Oakland University,"Michigan, United States",Salary Not specified,2021-08-31,\r\n\t\t\t\t\tThe Department of English at Oak...,Faculty Positions,...,,,,,,,,,,
16,375790,FAMU-FSU College of Engineering Tenure-Track F...,https://jobs.chronicle.com/job/375790/famu-fsu...,False,Florida A&M University - Florida State Univers...,"Florida, United States",Competitive Salary,2021-08-31,\nThe Department of Chemical & Biomedical Engi...,Faculty Positions,...,,,,,,,,,,
15,375788,Tenure Track Assistant Professor of Biochemistry,https://jobs.chronicle.com/job/375788/tenure-t...,False,Kalamazoo College,"Michigan, United States",Competitive Salary,2021-08-31,\nThe Department of Chemistry & Biochemistry a...,Faculty Positions,...,,,,,,,,,,
1,375762,Assistant Professor of Communication - Health ...,https://jobs.chronicle.com/job/375762/assistan...,False,The Ohio State University School of Communication,"Ohio, United States",Salary Commensurate with experience,2021-08-31,\nDepartment: School of Communication\nPosit...,Faculty Positions,...,Other Social & Behavioral Sciences,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,374875,Visiting Assistant Professor of Accounting,https://jobs.chronicle.com/job/374875/visiting...,False,Schreiner University,"Texas, United States",Salary Not specified,2021-08-27,\nSchreiner University seeks applications for ...,Faculty Positions,...,,,,,,,,,,
85,374879,"Assistant Professor, Department of Health Scie...",https://jobs.chronicle.com/job/374879/assistan...,False,University of Central Florida,"Florida, United States",Competitive Salary,2021-08-27,\n Job Description:\nThe Department of Health ...,Faculty Positions,...,Pharmacology,,,,,,,,,
84,374880,Assistant Professor of Chemistry,https://jobs.chronicle.com/job/374880/assistan...,False,Schreiner University,"Texas, United States",Salary Not specified,2021-08-27,\nSchreiner University\nSchreiner University s...,Faculty Positions,...,,,,,,,,,,
83,374814,Pre Tenure Associate Professor 9 Mo for Comput...,https://jobs.chronicle.com/job/374814/pre-tenu...,False,Arkansas State University,"Arkansas, United States",Salary Commensurate with experience,2021-08-27,\nPosition Summary:\nThe College of Engineerin...,Faculty Positions,...,,,,,,,,,,


In [7]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 {375778, 375788, 375790, 375792, 375773, 375774}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
388170,Tenure-Track Assistant Professor Position in A...,https://jobs.chronicle.com/job/388170/tenure-t...,True
387931,Romanoff Assistant Professor in Contemporary R...,https://jobs.chronicle.com/job/387931/romanoff...,True
388518,Assistant Professor in Economics and Education,https://jobs.chronicle.com/job/388518/assistan...,True
388852,Assistant Professor - African American History,https://jobs.chronicle.com/job/388852/assistan...,False
388926,Open Rank/Tenure-Track Faculty Positions in Co...,https://jobs.chronicle.com/job/388926/open-ran...,False
...,...,...,...
375825,Junior tenure-track faculty in Economics,https://jobs.chronicle.com/job/375825/junior-t...,False
375800,"Assistant/Associate Dean of Diversity, Inclusi...",https://jobs.chronicle.com/job/375800/assistan...,False
375795,FAMU-FSU College of Engineering Faculty Positi...,https://jobs.chronicle.com/job/375795/famu-fsu...,False
375794,"Assistant Professor of English, Creative Writi...",https://jobs.chronicle.com/job/375794/assistan...,False


In [13]:
def get_description_of_page(soup_page):
    """
        Parses the beautiful-soup object of the page response for the job description.
        
        :param soup_page: The beautiful soup object that contains the desired page.
        :returns: The text of the job description.
    """
    description = soup_page.find("div",attrs={"class":"mds-edited-text mds-font-body-copy-bulk"}).get_text()
    return description
    

In [14]:
def get_details_block_of_page(soup_page):
    """
        Every page has a set of details that contains information like who the employer for a job is, location, etc.
        Parses the beautiful-soup object of the page for the summary of the details of the job.
        
        :param soup_page: The beautiful soup object that contains the desired page.
        :returns: The beautiful soup tag for the details. Gets parsed for the important details later.
    """
    details_block = soup_page.find_all("dl",attrs={"class":"mds-list mds-list--definition mds-list--border mds-margin-bottom-b0"})
    return details_block
    

In [95]:
def link_keys_and_values(list_of_keys_and_values):
    """
        Takes a list of alternating elements with key and value class elements and pairs them up. The current version
        of the website has a lot of information stored in a weird format where one element has a class called 'mds-list__key'
        and the element below it contains a class called 'mds-list__value'. this function matches those two together.
        
        :param list_of_keys_and_values: List of soup elements that have alternating key and value class attributes.
        :returns: A dictionary where the key and value correspond to the keys and value in the html. The keys and values are 
        just the text from the element.
    """
    dictionary_form = {}
    key = None
    value = None
    for element in list_of_keys_and_values:
        if "mds-list__key" in element.get("class"):
            key = element
        if "mds-list__value" in element.get("class"):
            value = element
            if key != None and value != None:
                dictionary_form[key.get_text().strip()] = value.get_text().strip()
            key = None
            value = None  

    return dictionary_form

In [98]:
def aggregate_children_of_elements(list_of_elements):
    """
        Takes a list of elements with children and gathers them together.
        
        :param list_of_elements: List of beautiful soup elements.
        :returns: A list of all the children of the elements in the input list.
    """
    children = []
    for element in list_of_elements:
        for child in element.findChildren(recursive=False):
            children.append(child)
    
    return children

In [118]:
def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    
    description = get_description_of_page(details_page)
    
    details_block = get_details_block_of_page(details_page)
    list_of_keys_and_values = aggregate_children_of_elements(details_block)
    details_dict = link_keys_and_values(list_of_keys_and_values)
    
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer = None if "Employer" not in details_dict else details_dict["Employer"]
    location = None if "Location" not in details_dict else details_dict["Location"]    
    salary = None if "Salary" not in details_dict else details_dict["Salary"]
    posted_date = None if "Start date" not in details_dict else details_dict["Start date"] # not sure if the "start date" is the posted date
    
    try:
        position_type = None if "Position Type" not in details_dict else details_dict["Position Type"]
        position_type = [text.strip() for text in position_type.split(",")]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [None]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

In [164]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

1917x9
1923x46
1923x45


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 2.7,Position Type 2.8,Position Type 2.9,Position Type 3.0,Position Type 3.1,Position Type 3.2,Position Type 3.3,Position Type 3.4,Position Type 3.5,Position Type 3.6
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
388852,Assistant Professor - African American History,https://jobs.chronicle.com/job/388852/assistan...,False,University of Washington Department of History,"Washington, United States",Salary Not specified,2021-10-02,Description\nThe Department of History at the ...,Faculty Positions,Humanities,...,,,,,,,,,,
388926,Open Rank/Tenure-Track Faculty Positions in Co...,https://jobs.chronicle.com/job/388926/open-ran...,False,Rose-Hulman Institute of Technology,"Indiana, United States",Competitive Salary,2021-10-01,THE PLACE: Rose-Hulman Institute of\nTechnolog...,Faculty Positions,Science,...,,,,,,,,,,
388477,Tenure-Track Assistant Professor of English (A...,https://jobs.chronicle.com/job/388477/tenure-t...,False,Texas A&M International University,"Texas, United States",Salary Not specified,2021-10-01,Tenure-Track Assistant Professor of English (A...,Faculty Positions,Humanities,...,,,,,,,,,,
388396,Tenure-track Professor in History of Architect...,https://jobs.chronicle.com/job/388396/tenure-t...,False,Harvard University,"Massachusetts, United States",Competitive Salary,2021-10-01,Position Description: The\nDepartment of Hist...,Faculty Positions,Arts,...,,,,,,,,,,
388409,Assistant Professor of Statistics and Data Sci...,https://jobs.chronicle.com/job/388409/assistan...,False,"University of Pennsylvania, Wharton Department...","Pennsylvania, United States",Competitive Salary,2021-10-01,The Department of Statistics and Data Science ...,Faculty Positions,Science,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375992,Engineering FT Instructor,https://jobs.chronicle.com/job/375992/engineer...,False,Antelope Valley College,"California, United States","$57,235.00 - $84,543.00",2021-08-31,Antelope Valley College\n \nEngineering FT Ins...,Faculty Positions,Science,...,,,,,,,,,,
375995,"Assistant Professor, Asian Languages & Literat...",https://jobs.chronicle.com/job/375995/assistan...,False,"Asian Languages and Literature, University of ...","Washington, United States",Salary Not specified,2021-08-31,The Department of Asian Languages and Literatu...,Faculty Positions,Humanities,...,,,,,,,,,,
376026,Faculty-Tenure Track-Sales,https://jobs.chronicle.com/job/376026/faculty-...,False,Utah Valley University,"Utah, United States",Salary Not specified,2021-08-31,\n\nFaculty-Tenure Track-Sales\n\nPosition Cat...,Faculty Positions,Business & Management,...,,,,,,,,,,
375794,"Assistant Professor of English, Creative Writi...",https://jobs.chronicle.com/job/375794/assistan...,False,Kalamazoo College,"Michigan, United States",Competitive Salary,2021-08-31,Kalamazoo College invites poets to apply for a...,Faculty Positions,Humanities,...,,,,,,,,,,


In [166]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")