# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
import requests
tqdm.pandas()

In [2]:
import re
import time

def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = f"https://jobs.chronicle.com{job_url_suffix}"
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [4]:
# Get Job ID for most recent date posted which already exists
from glob import glob

most_recent_csv = sorted(glob("../data/*"))[::-1][0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])

In [5]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

1 2 3 4 5 6 {374592, 374624, 374625, 374626, 374597, 374598, 374599, 374601, 374605, 374606, 374581}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
375109,Tenure-Track Positions in the College of Educa...,https://jobs.chronicle.com/job/375109/tenure-t...,True
373977,President,https://jobs.chronicle.com/job/373977/presiden...,True
375458,TENURE-TRACK FACULTY POSITION IN THE DEPARTMEN...,https://jobs.chronicle.com/job/375458/tenure-t...,True
375792,Assistant Professor of Counseling Psychology,https://jobs.chronicle.com/job/375792/assistan...,False
375790,FAMU-FSU College of Engineering Tenure-Track F...,https://jobs.chronicle.com/job/375790/famu-fsu...,False
...,...,...,...
374727,Assistant Professor (English - Multi-Ethnic Li...,https://jobs.chronicle.com/job/374727/assistan...,False
374718,Assistant Professor (English - Journalism),https://jobs.chronicle.com/job/374718/assistan...,False
374717,Assistant Professor in Medieval Islamic History,https://jobs.chronicle.com/job/374717/assistan...,False
374692,Faculty Positions in Translational Immunology,https://jobs.chronicle.com/job/374692/faculty-...,False


In [6]:

def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    # Get the job description
    description = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")}).get_text()
    
    details_block = details_page.find("dl",attrs={"class":"grid"})
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
    employer = None if employer_html is None else employer_html.find("dd").text.strip()
    
    location_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__location")})
    location = None if location_html is None else location_html.find("dd").text.strip()
    
    salary_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__salary")})
    salary = None if salary_html is None else salary_html.find("dd").text.strip()
    
    posted_date_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__posted-date")})
    posted_date = None if posted_date_html is None else posted_date_html.find("dd").text.strip()
    
    position_type_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__category-PositionType")})
    try:
        position_type = None if position_type_html is None else position_type_html.find("dd").findAll("a")
        position_type = [pt.text for pt in position_type]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [7]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

  0%|          | 0/112 [00:00<?, ?it/s]

In [8]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

112x9
116x23
116x22


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0,Position Type 1.1,Position Type 1.2,Position Type 1.3
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
375792,Assistant Professor of Counseling Psychology,https://jobs.chronicle.com/job/375792/assistan...,False,"University of Tennessee, Knoxville","Tennessee, United States",Competitive Salary,2021-08-31,\nThe Department of Psychology at the Universi...,Faculty Positions,Social & Behavioral Sciences,...,,,,,,,,,,
375762,Assistant Professor of Communication - Health ...,https://jobs.chronicle.com/job/375762/assistan...,False,The Ohio State University School of Communication,"Ohio, United States",Salary Commensurate with experience,2021-08-31,\nDepartment: School of Communication\nPosit...,Faculty Positions,Communications,...,Other Social & Behavioral Sciences,,,,,,,,,
375551,Founding Dean - School of Counseling,https://jobs.chronicle.com/job/375551/founding...,False,Asbury Theological Seminary,"Kentucky, United States",Salary Commensurate with experience,2021-08-31,\nFounding Dean – School of Counseling\nAsbury...,Faculty Positions,Education,...,,,,,,,,,,
375664,"Assistant, Associate or Full Professor - Stati...",https://jobs.chronicle.com/job/375664/assistan...,False,Baruch College,"New York, United States",Salary Not specified,2021-08-31,\nJob Description:\nThe largest accredited col...,Faculty Positions,"Science, Technology & Mathematics",...,,,,,,,,,,
375665,"Support Coach, Center for Male Engagement (3 P...",https://jobs.chronicle.com/job/375665/support-...,False,Community College of Philadelphia,"Pennsylvania, United States",Salary Not specified,2021-08-31,\n\nCommunity College of Philadelphia\n\n\nGen...,Faculty Positions,Professional Fields,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374960,Assistant/Associate Teaching Professor of Comm...,https://jobs.chronicle.com/job/374960/assistan...,False,Pennsylvania State University,"Pennsylvania, United States",Salary Not specified,2021-08-27,\n\nAssistant/Associate Teaching Professor of ...,Faculty Positions,Health & Medicine,...,,,,,,,,,,
375026,Assistant Professor in Adult Neurogenic Commun...,https://jobs.chronicle.com/job/375026/assistan...,False,University of Illinois,"Illinois, United States",Salary Commensurate with experience,2021-08-27,\nAssistant Professor in Adult\nNeurogenic Com...,Faculty Positions,Health & Medicine,...,,,,,,,,,,
375049,Education for Equity and Justice: Assistant/As...,https://jobs.chronicle.com/job/375049/educatio...,False,University of Wisconsin-Eau Claire,"Wisconsin, United States",Salary Commensurate with experience,2021-08-27,\nThe Department of Education for Equity and J...,Faculty Positions,Education,...,,,,,,,,,,
375060,Communication Sciences and Disorders: Assistan...,https://jobs.chronicle.com/job/375060/communic...,False,University of Wisconsin-Eau Claire,"Wisconsin, United States",Salary Commensurate with experience,2021-08-27,\nPOSITION: A\nprobationary tenure-track facul...,Faculty Positions,Health & Medicine,...,,,,,,,,,,


In [9]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")