# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [37]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
import requests
tqdm.pandas()

In [38]:
import re
import time

def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = f"https://jobs.chronicle.com{job_url_suffix}"
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [int(job_id),job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [39]:
# Get Job ID for most recent date posted which already exists
from glob import glob

most_recent_csv = sorted(glob("../data/*"))[::-1][0]
ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted",ascending=False)
already_scraped = set(ls_df['Job ID'])

In [None]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page,end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    new_jobs = not (bool(job_ids.intersection(already_scraped)) or (prev_job_ids == job_ids))
    if bool(job_ids.intersection(already_scraped)): print(job_ids.intersection(already_scraped))
    frames.append(frame)
    page +=1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
listing_df

In [47]:

def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    # Get the job description
    description = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")}).get_text()
    
    details_block = details_page.find("dl",attrs={"class":"grid"})
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
    employer = None if employer_html is None else employer_html.find("dd").text.strip()
    
    location_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__location")})
    location = None if location_html is None else location_html.find("dd").text.strip()
    
    salary_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__salary")})
    salary = None if salary_html is None else salary_html.find("dd").text.strip()
    
    posted_date_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__posted-date")})
    posted_date = None if posted_date_html is None else posted_date_html.find("dd").text.strip()
    
    position_type_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__category-PositionType")})
    try:
        position_type = None if position_type_html is None else position_type_html.find("dd").findAll("a")
        position_type = [pt.text for pt in position_type]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [48]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

  0%|          | 0/358 [00:00<?, ?it/s]

In [49]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

358x9
362x20
362x19


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,Position Type 0.2,Position Type 0.3,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
361338,"Department Chair, Assistant, Associate or Prof...",https://jobs.chronicle.com/job/361338/departme...,False,University of Kentucky,"Kentucky, United States",Salary Not specified,2021-07-22,"\nDepartment Chair, Assistant, Associate or Pr...",Faculty Positions,Health & Medicine,Other Health & Medicine,,,,,,,,
361155,Assistant/Associate Professor of Finance,https://jobs.chronicle.com/job/361155/assistan...,False,Washington and Lee University,"Virginia, United States",Salary Not specified,2021-07-21,\r\n\t\t\t\t\tDescription\nThe Department of A...,Faculty Positions,Business & Management,Accounting & Finance,,,,,,,,
360861,Assistant/Associate Professor of Accounting,https://jobs.chronicle.com/job/360861/assistan...,False,Washington and Lee University,"Virginia, United States",Salary Not specified,2021-07-21,\r\n\t\t\t\t\tDescription\nThe Washington and ...,Faculty Positions,Business & Management,Accounting & Finance,,,,,,,,
360978,Sociology and Social Work - Assistant Professo...,https://jobs.chronicle.com/job/360978/sociolog...,False,Hope College,"Michigan, United States",Salary Not specified,2021-07-21,\nHope College invites applicants for a tenure...,Faculty Positions,Professional Fields,Social Work & Human Services,,,,,,,,
360989,Assistant Professor of Mathematics,https://jobs.chronicle.com/job/360989/assistan...,False,Carleton College,"Minnesota, United States",Competitive Salary,2021-07-21,\n\nThe Carleton College Department of Mathema...,Faculty Positions,"Science, Technology & Mathematics",Mathematics,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354826,Director of Literacy Center,https://jobs.chronicle.com/job/354826/director...,False,Augusta University,"Georgia, United States",Salary Not specified,2021-06-29,"\nAugusta University, Augusta,\nGeorgia\nDirec...",Administrative,Academic Affairs,Academic Administration,Faculty Positions,Education,Other Education,Teacher Education,,,,
354813,Addiction Research - Tenure Track,https://jobs.chronicle.com/job/354813/addictio...,False,Uconn Health,"Connecticut, United States",Salary Commensurate with experience,2021-06-29,\nADDICTION RESEARCH - TENURE TRACK \nThe Depa...,Faculty Positions,Health & Medicine,Other Health & Medicine,"Science, Technology & Mathematics",Other Science & Technology,,,,,,
354773,Assistant Professor of Nuclear Technology,https://jobs.chronicle.com/job/354773/assistan...,False,Columbia Basin College,"Washington, United States",Salary Not specified,2021-06-29,\n \nInterested in a career at Columbia Basin ...,Faculty Positions,"Science, Technology & Mathematics",Other Science & Technology,,,,,,,,
354685,Documentary and Social Change-Open Rank Search...,https://jobs.chronicle.com/job/354685/document...,False,"University of Michigan Department of Film, Tel...","Michigan, United States",Competitive Salary,2021-06-29,\r\n\t\t\t\t\tDOCUMENTARY AND SOCIAL CHANGE-OP...,Faculty Positions,Communications,Film & Video,,,,,,,,


In [50]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")