# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
import requests
tqdm.pandas()

In [2]:
import re
import time

def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = f"https://jobs.chronicle.com{job_url_suffix}"
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [3]:
# Get Job ID for most recent date posted which already exists
from glob import glob

most_recent_csv = sorted(glob("../data/*"))[::-1][0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])

In [4]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 {371426, 371651, 371528, 371658, 371403, 371404, 371405, 371600, 371537, 371509, 371543, 371672, 371612}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
373488,45 Tenure-Track Positions at Cal Poly Pomona,https://jobs.chronicle.com/job/373488/45-tenur...,True
372598,Rutgers School of Communication & Information ...,https://jobs.chronicle.com/job/372598/rutgers-...,True
374384,Faculty Position (Open Rank),https://jobs.chronicle.com/job/374384/faculty-...,True
374626,"Director, Institutional Research",https://jobs.chronicle.com/job/374626/director...,False
374625,"Manager, Student Outreach & Recruitment",https://jobs.chronicle.com/job/374625/manager-...,False
...,...,...,...
371756,Associate Professor/Professor of Chemistry,https://jobs.chronicle.com/job/371756/associat...,False
371739,"Assistant, Professor, History of Race",https://jobs.chronicle.com/job/371739/assistan...,False
371733,Provost's Postdoctoral Fellow/Instuctor,https://jobs.chronicle.com/job/371733/provost-...,False
371721,"TENURED PROFESSORSHIP, Questrom School of Busi...",https://jobs.chronicle.com/job/371721/tenured-...,False


In [5]:

def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    # Get the job description
    description = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")}).get_text()
    
    details_block = details_page.find("dl",attrs={"class":"grid"})
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
    employer = None if employer_html is None else employer_html.find("dd").text.strip()
    
    location_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__location")})
    location = None if location_html is None else location_html.find("dd").text.strip()
    
    salary_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__salary")})
    salary = None if salary_html is None else salary_html.find("dd").text.strip()
    
    posted_date_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__posted-date")})
    posted_date = None if posted_date_html is None else posted_date_html.find("dd").text.strip()
    
    position_type_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__category-PositionType")})
    try:
        position_type = None if position_type_html is None else position_type_html.find("dd").findAll("a")
        position_type = [pt.text for pt in position_type]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [6]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

  0%|          | 0/310 [00:00<?, ?it/s]

In [7]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

310x9
316x40
316x39


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 2.1,Position Type 2.2,Position Type 2.3,Position Type 2.4,Position Type 2.5,Position Type 2.6,Position Type 2.7,Position Type 2.8,Position Type 2.9,Position Type 3.0
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
374626,"Director, Institutional Research",https://jobs.chronicle.com/job/374626/director...,False,Community College of Philadelphia,"Pennsylvania, United States",Salary Not specified,2021-08-27,\n\nCommunity College of Philadelphia\n\n\nGen...,Faculty Positions,Professional Fields,...,,,,,,,,,,
374625,"Manager, Student Outreach & Recruitment",https://jobs.chronicle.com/job/374625/manager-...,False,Community College of Philadelphia,"Pennsylvania, United States",Salary Not specified,2021-08-27,\n\nCommunity College of Philadelphia\n\n\nGen...,Faculty Positions,Professional Fields,...,,,,,,,,,,
374624,"Academic Coordinator, Gateway to College",https://jobs.chronicle.com/job/374624/academic...,False,Community College of Philadelphia,"Pennsylvania, United States",Salary Not specified,2021-08-27,\n\nCommunity College of Philadelphia\n\n\nGen...,Faculty Positions,Professional Fields,...,,,,,,,,,,
374606,Native Pathways Program: 1 2 regular faculty p...,https://jobs.chronicle.com/job/374606/native-p...,False,The Evergreen State College,"Washington, United States",Salary Not Specified,2021-08-27,"\nThe Evergreen State College, a\npublic, prog...",Faculty Positions,Professional Fields,...,,,,,,,,,,
374605,2021-22 Tenure Track Faculty Openings,https://jobs.chronicle.com/job/374605/2021-22-...,False,The University of Tennessee-Knoxville (UT Knox...,"Tennessee, United States",Salary Not Specified,2021-08-27,\n2021-22\nTenure Track Faculty\nOpenings\nThe...,Faculty Positions,Business & Management,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372004,"Finance Faculty - Assistant Professor, Tenure ...",https://jobs.chronicle.com/job/372004/finance-...,False,University of St. Thomas,"Minnesota, United States",Salary Commensurate with experience,2021-08-18,\nOVERVIEW\nThe Department of Finance in the O...,Faculty Positions,Business & Management,...,,,,,,,,,,
371994,Assistant Professor of Chemistry,https://jobs.chronicle.com/job/371994/assistan...,False,University of Oregon,"Oregon, United States",Salary Not specified,2021-08-18,\n\n\nAssistant Professor of Chemistry & Bioch...,Faculty Positions,"Science, Technology & Mathematics",...,,,,,,,,,,
371986,Assistant Clinical Professor/John and Patty Wa...,https://jobs.chronicle.com/job/371986/assistan...,False,Pennsylvania State University,"Pennsylvania, United States",Salary Not specified,2021-08-18,\n\nAssistant Clinical Professor/John and Patt...,Faculty Positions,Business & Management,...,,,,,,,,,,
371984,Assistant Professor or Associate Professor of ...,https://jobs.chronicle.com/job/371984/assistan...,False,Bryn Mawr College,"Pennsylvania, United States",Salary Not specified,2021-08-18,\nThe Graduate School of Social Work and Socia...,Faculty Positions,Professional Fields,...,,,,,,,,,,


In [9]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")