# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
import requests
tqdm.pandas()

In [2]:
import re
import time

def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = f"https://jobs.chronicle.com{job_url_suffix}"
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [3]:
# Get Job ID for most recent date posted which already exists
from glob import glob

most_recent_csv = sorted(glob("../data/*"))[::-1][0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])

In [4]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

1 2 3 4 5 6 7 8 9 {367153, 367155, 367156}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
369155,"Professor & Director, School of Communication ...",https://jobs.chronicle.com/job/369155/professo...,True
363071,Lecturer or Clinical Assistant Professor of Nu...,https://jobs.chronicle.com/job/363071/lecturer...,True
367855,"Chair - Georgia Tech/Emory Univ, Wallace H. Co...",https://jobs.chronicle.com/job/367855/chair-ge...,True
370229,Professor of Linguistics--Editing and Publishing,https://jobs.chronicle.com/job/370229/professo...,False
370219,Marketing Faculty: Assistant Professor,https://jobs.chronicle.com/job/370219/marketin...,False
...,...,...,...
367374,Assistant Professor of Theatre,https://jobs.chronicle.com/job/367374/assistan...,False
367371,Assistant Professor of Biological Sciences,https://jobs.chronicle.com/job/367371/assistan...,False
367358,Assistant Professor,https://jobs.chronicle.com/job/367358/assistan...,False
367352,Tenure-Track Assistant Professor of Biology,https://jobs.chronicle.com/job/367352/tenure-t...,False


In [5]:

def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    # Get the job description
    description = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")}).get_text()
    
    details_block = details_page.find("dl",attrs={"class":"grid"})
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
    employer = None if employer_html is None else employer_html.find("dd").text.strip()
    
    location_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__location")})
    location = None if location_html is None else location_html.find("dd").text.strip()
    
    salary_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__salary")})
    salary = None if salary_html is None else salary_html.find("dd").text.strip()
    
    posted_date_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__posted-date")})
    posted_date = None if posted_date_html is None else posted_date_html.find("dd").text.strip()
    
    position_type_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__category-PositionType")})
    try:
        position_type = None if position_type_html is None else position_type_html.find("dd").findAll("a")
        position_type = [pt.text for pt in position_type]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [6]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

  0%|          | 0/180 [00:00<?, ?it/s]

In [7]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

180x9
184x49
184x48


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 3.0,Position Type 3.1,Position Type 3.2,Position Type 3.3,Position Type 3.4,Position Type 3.5,Position Type 3.6,Position Type 3.7,Position Type 3.8,Position Type 3.9
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
370229,Professor of Linguistics--Editing and Publishing,https://jobs.chronicle.com/job/370229/professo...,False,Brigham Young University,"Utah, United States",Salary Not specified,2021-08-13,\nTo Apply: Click on the following link to acc...,Faculty Positions,Communications,...,,,,,,,,,,
370099,Assistant Professor of Sociology and Criminal ...,https://jobs.chronicle.com/job/370099/assistan...,False,Pacific Lutheran University,"Washington, United States","$55,000.00 - $57,000.00",2021-08-13,\nGeneral Description: The Department\nof Soci...,Faculty Positions,Social & Behavioral Sciences,...,,,,,,,,,,
369787,"Professor, Associate Professor or Professor in...",https://jobs.chronicle.com/job/369787/professo...,False,Harvard University Graduate School of Design,"Massachusetts, United States",Salary Not specified,2021-08-13,\nThe Department of Urban Planning and Design ...,Faculty Positions,Professional Fields,...,,,,,,,,,,
369788,Assistant/Associate Professor of Environmental...,https://jobs.chronicle.com/job/369788/assistan...,False,Harvard University Graduate School of Design,"Massachusetts, United States",Salary Not specified,2021-08-13,\nThe Harvard University Graduate School of De...,Faculty Positions,Professional Fields,...,,,,,,,,,,
369789,Associate Professor of Landscape Architecture ...,https://jobs.chronicle.com/job/369789/associat...,False,Harvard University Graduate School of Design,"Massachusetts, United States",Salary Not specified,2021-08-13,\nHarvard’s Department of Landscape Architectu...,Faculty Positions,Professional Fields,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367587,School of Public Policy - 2 Assistant Professo...,https://jobs.chronicle.com/job/367587/school-o...,False,University of Massachusetts Amherst,"Massachusetts, United States",Salary Not specified,2021-08-06,\nJob Description\nThe UMass Amherst School of...,Faculty Positions,Professional Fields,...,,,,,,,,,,
367588,Doctoral Faculty Position for CRNA Program,https://jobs.chronicle.com/job/367588/doctoral...,False,Providence Sacred Heart Medical Center/Gonzaga...,"Washington, United States",Salary Commensurate with experience,2021-08-06,\nProvidence Sacred Heart Medical Center is se...,Faculty Positions,Health & Medicine,...,,,,,,,,,,
367593,Medical-Surgical Nursing Assistant Professor,https://jobs.chronicle.com/job/367593/medical-...,False,Contra Costa Community College District,"California, United States",Salary Not specified,2021-08-06,\n\nMedical-Surgical Nursing Assistant Profess...,Faculty Positions,Health & Medicine,...,,,,,,,,,,
367599,"Full-Time Tenure-Track Faculty, School of Law",https://jobs.chronicle.com/job/367599/full-tim...,False,University of St. Thomas,"Minnesota, United States",Salary Commensurate with experience,2021-08-06,\nThe University of St. Thomas School of Law (...,Faculty Positions,Professional Fields,...,,,,,,,,,,


In [8]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")