# Chronicle of Higher Education job scraper

Collects all tenured and tenure-track faculty job advertisements for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Every time you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common web-scraping libraries
from bs4 import BeautifulSoup as bs
from tqdm.notebook import tqdm
import requests
tqdm.pandas()

In [2]:
import re
import time

def parse_list_page_item(list_item):
    """
        Pull the summary fields out of a single job entry of the listing page.

        list_item: the parsed HTML element of one job advertisement.

        Returns [job id (int), job title, absolute job URL, diversity job flag]

    """
    link = list_item.find("h3").find("a")
    href = link['href'].strip()
    # The href has the form /job/<id>/<slug>/, so the id is the third "/"-separated piece
    posting_id = int(href.split("/")[2])
    # The flag is set when the entry contains a <p class="ribbon"> element
    has_ribbon = list_item.find("p", attrs={"class": "ribbon"}) is not None
    return [posting_id, link.text, f"https://jobs.chronicle.com{href}", has_ribbon]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 

        url: address of one page of the job listing.

        Returns a DataFrame indexed by "Job ID" with the columns
        "Job Title", "Job URL" and "Diversity Job".

        Raises requests.HTTPError when the server answers with an error
        status, and ValueError when the page has no <ul id="listing">.
    
    """
    # Pause before every request so consecutive calls are spaced out
    time.sleep(1)
    # A timeout keeps a stalled connection from hanging the whole scrape
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    # Fail loudly on an error response instead of parsing an error page
    r.raise_for_status()
    # The part of the webpage with the id tag "listing" contains all the job postings.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    listing_page = bs(r.text, "html.parser").find("ul", attrs={"id": 'listing'})
    if listing_page is None:
        raise ValueError(f"No job listing found on {url}")
    # Parse out the ads
    list_items = listing_page.find_all("li", attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    columns = ["Job ID", "Job Title", "Job URL", "Diversity Job"]
    return pd.DataFrame(parsed_list_page, columns=columns).set_index("Job ID")



In [3]:
# Get Job ID for most recent date posted which already exists
from glob import glob

# The files this notebook writes are named <YYYY-MM-DD>-chronicles_of_higher_ed.csv,
# so the greatest file name is taken as the most recent scrape.
previous_csvs = glob("../data/*")
most_recent_csv = max(previous_csvs) if previous_csvs else None
if most_recent_csv is None:
    # First run: there is no earlier scrape, so no job counts as already scraped
    ls_df = pd.DataFrame(columns=["Job ID", "Date Posted"])
else:
    ls_df = pd.read_csv(most_recent_csv).sort_values("Date Posted", ascending=False)
# Job IDs of the previous scrape; used to detect where the new postings end
already_scraped = set(ls_df['Job ID'])

In [4]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"

# Walk the listing page by page until a page contains a job from the previous
# scrape, or until a page repeats the job IDs of the page before it.
frames = []
job_ids = set()
page = 1
new_jobs = True
while new_jobs:
    print(page, end=" ")
    frame = parse_list_page(url.format(page))
    prev_job_ids = job_ids
    job_ids = set(frame.index)
    overlap = job_ids.intersection(already_scraped)
    if overlap:
        print(overlap)
    new_jobs = not overlap and job_ids != prev_job_ids
    frames.append(frame)
    page += 1

listing_df = pd.concat(frames)
listing_df = listing_df[~listing_df.index.isin(already_scraped)]
# De-duplicate: the same Job ID can be collected more than once (the loop
# appends a page even when it repeats the previous one), and duplicated index
# values would multiply rows when frames are later merged on the index.
# .copy() makes the result an independent frame, so columns can be assigned
# to it afterwards without writing into a slice.
listing_df = listing_df[~listing_df.index.duplicated(keep="first")].copy()
listing_df

1 2 3 4 5 6 7 8 9 10 11 {361224, 360978, 361235, 361236, 361238, 361239, 361242, 361243, 360989, 360991, 360995, 361253, 361016, 361020, 361155, 361156, 361160, 361082, 361085, 361338}


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
360079,Fung Global Fellows Program - Visiting Researc...,https://jobs.chronicle.com/job/360079/fung-glo...,True
361352,"Digital Initiatives Librarian, Librarian Assis...",https://jobs.chronicle.com/job/361352/digital-...,True
363111,NYUAD Institute / Humanities Research Fellowsh...,https://jobs.chronicle.com/job/363111/nyuad-in...,True
364792,Assistant Professor in Accounting,https://jobs.chronicle.com/job/364792/assistan...,False
364760,Research Professor and Roux Institute Member (...,https://jobs.chronicle.com/job/364760/research...,False
...,...,...,...
361476,Nutrition Science Tenure/Tenure-Track: Full Pr...,https://jobs.chronicle.com/job/361476/nutritio...,False
361474,Assistant/Associate Professor - Nutrition Scie...,https://jobs.chronicle.com/job/361474/assistan...,False
361443,Tenure track assistant professorship in Comput...,https://jobs.chronicle.com/job/361443/tenure-t...,False
361373,Assistant or Associate Professor: Behavioral S...,https://jobs.chronicle.com/job/361373/assistan...,False


In [5]:

def _find_detail_row(details_block, field):
    """Return the <div> of the details block whose class matches description__<field>, or None."""
    return details_block.find("div", attrs={"class": re.compile("[a-zA-Z0-9]*description__" + field)})


def _detail_text(details_block, field):
    """Return the stripped <dd> text of the description__<field> row, or None when it is absent."""
    row = _find_detail_row(details_block, field)
    dd = None if row is None else row.find("dd")
    return None if dd is None else dd.text.strip()


def parse_details_page(url):
    """
        Parses the details page of a job advertisement
        
        || employer || location || salary || date posted || position_type (list) || description

        url: address of the job's details page.

        Returns a 6-tuple in the order shown above. A text field that is not
        on the page is None; position_type is always a list and is empty
        when the page lists no position types.

        Raises requests.HTTPError when the server answers with an error status.
    
    """
    # Pause before every request so consecutive calls are spaced out
    time.sleep(0.25)
    # A timeout keeps a stalled connection from hanging the whole scrape
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    r.raise_for_status()
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    details_page = bs(r.text, "html.parser")
    # Get the job description
    description_html = details_page.find("div", attrs={"class": re.compile("[a-zA-Z0-9]*job-description*")})
    description = None if description_html is None else description_html.get_text()

    details_block = details_page.find("dl", attrs={"class": "grid"})
    if details_block is None:
        # No details block at all: every structured field is missing
        return None, None, None, None, [], description

    employer = _detail_text(details_block, "recruiter")
    location = _detail_text(details_block, "location")
    salary = _detail_text(details_block, "salary")
    posted_date = _detail_text(details_block, "posted-date")

    # Each position type is a link inside the <dd> of the category row
    position_type_html = _find_detail_row(details_block, "category-PositionType")
    position_dd = None if position_type_html is None else position_type_html.find("dd")
    position_type = [] if position_dd is None else [pt.text for pt in position_dd.find_all("a")]

    return employer, location, salary, posted_date, position_type, description
    


In [6]:
# Fetch the details page of every new posting (one HTTP request per row) and
# spread the returned 6-tuple over six new columns.
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.progress_apply(lambda row: parse_details_page(row['Job URL']),
                                                        axis=1,
                                                        result_type='expand')


# infer_datetime_format is deprecated since pandas 2.0 (format inference is the
# default there), so it is no longer passed.
listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"])

  0%|          | 0/203 [00:00<?, ?it/s]

In [7]:
# One row per job, one column per slot of that job's list of position types
type_lists = listing_df['position_type'].values.tolist()
position_type = pd.DataFrame(type_lists, index=listing_df.index)
position_type = position_type.fillna(np.nan)
# Column k is renamed to k/10 and then prefixed: 0 -> "Position Type 0.0", 1 -> "Position Type 0.1", ...
position_type = position_type.rename(columns=lambda slot: slot / 10)
position_type = position_type.add_prefix('Position Type ')

n_rows, n_cols = listing_df.shape
print("{}x{}".format(n_rows, n_cols))

# Attach the position type columns to the listing by Job ID (the index of both frames)
merged_df = listing_df.merge(position_type, how="left", left_index=True, right_index=True)
n_rows, n_cols = merged_df.shape
print("{}x{}".format(n_rows, n_cols))

# The list-valued column is redundant once it has been spread into columns
merged_df = merged_df.drop(columns="position_type")
n_rows, n_cols = merged_df.shape
print("{}x{}".format(n_rows, n_cols))

# Newest postings first
merged_df = merged_df.sort_values(by="Date Posted", ascending=False)
merged_df

203x9
203x21
203x20


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,Position Type 0.2,Position Type 0.3,Position Type 0.4,Position Type 0.5,Position Type 0.6,Position Type 0.7,Position Type 0.8,Position Type 0.9,Position Type 1.0,Position Type 1.1
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
364792,Assistant Professor in Accounting,https://jobs.chronicle.com/job/364792/assistan...,False,"California State University, Fullerton","California, United States",Salary Not specified,2021-07-30,\n\n\nAssistant Professor in Accounting\nDepar...,Faculty Positions,Business & Management,Accounting & Finance,,,,,,,,,
364220,Tenure Track Assistant Professor Programs in N...,https://jobs.chronicle.com/job/364220/tenure-t...,False,Boston University,"Massachusetts, United States",Salary Commensurate with experience,2021-07-29,\nBoston University’s College of Health and Re...,Faculty Positions,Health & Medicine,Nutrition,"Science, Technology & Mathematics",Biology & Life Sciences,,,,,,,
364487,Assistant Professor of Political Science (Worl...,https://jobs.chronicle.com/job/364487/assistan...,False,University of Texas Rio Grande Valley,"Texas, United States",Salary Not specified,2021-07-29,\n\n\nAssistant Professor of Political Science...,Faculty Positions,Social & Behavioral Sciences,Political Science & International Relations,,,,,,,,,
364595,Assistant/Associate/Full Teaching Professor,https://jobs.chronicle.com/job/364595/assistan...,False,Northeastern University,"Massachusetts, United States",Salary Not specified,2021-07-29,\n\n\nAssistant/Associate/Full Teaching Profes...,Faculty Positions,Professional Fields,Law & Legal Studies,,,,,,,,,
364588,Assistant/Associate/Full Teaching Professor M...,https://jobs.chronicle.com/job/364588/assistan...,False,Northeastern University,"Massachusetts, United States",Salary Not specified,2021-07-29,\n\n\nAssistant/Associate/Full Teaching Profes...,Faculty Positions,"Science, Technology & Mathematics",Computer Sciences & Technology,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361610,Assistant Professor Public Relations/Strategic...,https://jobs.chronicle.com/job/361610/assistan...,False,Salisbury University,"Maryland, United States",Competitive Salary,2021-07-22,\nAssistant Professor Public Relations/Strateg...,Faculty Positions,Communications,Public Relations & Advertising,,,,,,,,,
361611,Assistant Professor of Biology,https://jobs.chronicle.com/job/361611/assistan...,False,Salisbury University,"Maryland, United States",Competitive Salary,2021-07-22,\nAssistant Professor of Biology\nThe Departme...,Faculty Positions,"Science, Technology & Mathematics",Biology & Life Sciences,,,,,,,,,
361352,"Digital Initiatives Librarian, Librarian Assis...",https://jobs.chronicle.com/job/361352/digital-...,True,Kennesaw State University,"Georgia, United States",Salary Not specified,2021-07-22,\nKennesaw State University is now accepting a...,Faculty Positions,Professional Fields,Library & Information Sciences,,,,,,,,,
362928,Assistant Professor of Learning Sciences or As...,https://jobs.chronicle.com/job/362928/assistan...,False,University of Wisconsin - Madison,"Wisconsin, United States",Salary Commensurate with experience,2021-07-21,\r\n\t\t\t\t\tThe UW-Madison Learning Sciences...,Faculty Positions,Education,Other Education,Social & Behavioral Sciences,Other Social & Behavioral Sciences,,,,,,,


In [9]:
from datetime import datetime
from pathlib import Path

# The file name starts with today's date (YYYY-MM-DD).
# NOTE: running this cell twice on the same day overwrites that day's file.
timestamp = datetime.now().strftime("%Y-%m-%d")
# Create the output directory if needed so the write cannot fail on a missing folder
Path("../data").mkdir(parents=True, exist_ok=True)
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv")