# Chronicles of higher education job scraper

Collecting all job advertisements for tenure-track for North American four-year institutions.

- **[Query](https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/)**


Everytime you scrape:

1. Load in previous job advertisements
2. Scrape all the *new job advertisements*
3. De-duplicate if necessary
4. Output to DB/CSV


In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np
# Common webscraping libaries
from bs4 import BeautifulSoup as bs
import requests

In [2]:
import re
import time

def parse_list_page_item(list_item):
    """
        Takes the list item HTML and parses out the four fields below into a list
    
    """
    title_tag = list_item.find("h3").find("a")
    job_title = title_tag.text
    job_url_suffix = title_tag['href'].strip()
    job_id = job_url_suffix.split("/")[2]
    job_url = f"https://jobs.chronicle.com{job_url_suffix}"
    diversity_job = False if list_item.find("p",attrs={"class":"ribbon"}) is None else True
    return [job_id,job_title,job_url,diversity_job]

def parse_list_page(url):
    """
        Returns the basic info from the jobs listing page
        
        || job id || job title || url || diversity job? 
    
    """
    time.sleep(1)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    # The part of the webpage with the id tag "listing" contains all the job postings
    listing_page = bs(r.text).find("ul",attrs={"id":'listing'})
    # Parse out the ads
    list_items = listing_page.findAll("li",attrs={"id": re.compile("item-[0-9]+")})
    parsed_list_page = [parse_list_page_item(li) for li in list_items]
    return pd.DataFrame(parsed_list_page,columns=["Job ID","Job Title","Job URL","Diversity Job"]).set_index("Job ID")



In [3]:
url = "https://jobs.chronicle.com/jobs/faculty-positions/north-america/tenured-tenured-track/{}"
# TODO: build a function that scrapes all the details pages until we hit one that we've seen prior OR
# We hit the end (in which case the url just returns the final page)
frames = [parse_list_page(url.format(i)) for i in range(1,45)] 
listing_df = pd.concat(frames)
listing_df

Unnamed: 0_level_0,Job Title,Job URL,Diversity Job
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
352438,Professor of Environmental Security,https://jobs.chronicle.com/job/352438/professo...,True
347283,"Assistant Professor, Ceramics",https://jobs.chronicle.com/job/347283/assistan...,True
347305,"Accounting-Assistant, Associate or Professor",https://jobs.chronicle.com/job/347305/accounti...,True
354432,Multiple Faculty Positions,https://jobs.chronicle.com/job/354432/multiple...,False
354411,Assistant or Associate Professor Tenure Track ...,https://jobs.chronicle.com/job/354411/assistan...,False
...,...,...,...
298881,Assistant or Associate Professor of Statistics,https://jobs.chronicle.com/job/298881/assistan...,False
297349,Research Assistant Professor - Crop Science,https://jobs.chronicle.com/job/297349/research...,False
287884,Assistant/Associate Professor of Health Psycho...,https://jobs.chronicle.com/job/287884/assistan...,False
282211,Associate Dean for Graduate Studies and Facult...,https://jobs.chronicle.com/job/282211/associat...,False


In [4]:

def parse_details_page(url):
    """
        Parses the details page of a university
        
        || employer || location || salary || date posted || position_type (list) || description
    
    """
    time.sleep(0.25)
    r = requests.get(url,headers = {'User-Agent': 'Mozilla/5.0'})
    details_page = bs(r.text)
    # Get the job description
    description = details_page.find("div",attrs={"class":re.compile("[a-zA-Z0-9]*job-description*")}).get_text()
    
    details_block = details_page.find("dl",attrs={"class":"grid"})
    employer,location,salary,posted_date,position_type = None,None,None,None,None
    
    employer_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__recruiter")})
    employer = None if employer_html is None else employer_html.find("dd").text.strip()
    
    location_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__location")})
    location = None if location_html is None else location_html.find("dd").text.strip()
    
    salary_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__salary")})
    salary = None if salary_html is None else salary_html.find("dd").text.strip()
    
    posted_date_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__posted-date")})
    posted_date = None if posted_date_html is None else posted_date_html.find("dd").text.strip()
    
    position_type_html = details_block.find("div",attrs={"class": re.compile("[a-zA-Z0-9]*description__category-PositionType")})
    try:
        position_type = None if position_type_html is None else position_type_html.find("dd").findAll("a")
        position_type = [pt.text for pt in position_type]
    except:
        pass
    
    return employer,location,salary,posted_date,position_type,description
    


In [5]:
listing_df[['Employer',
            'Location',
            'Salary',
            'Date Posted',
            'position_type',
            'Description']] = listing_df.apply(lambda row: parse_details_page(row['Job URL']),
                                               axis=1, 
                                               result_type='expand')


listing_df["Date Posted"] = pd.to_datetime(listing_df["Date Posted"],infer_datetime_format=True)

In [9]:
position_type = pd.DataFrame(listing_df['position_type'].values.tolist(),
                             index=listing_df.index).fillna(np.nan)
position_type = position_type.rename(columns = lambda x: (x/10)).add_prefix('Position Type ')
print("{}x{}".format(*listing_df.shape))
merged_df = pd.merge(listing_df,
                     position_type,
                     how="left",
                     left_index=True,
                     right_index=True)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.drop("position_type",axis=1)
print("{}x{}".format(*merged_df.shape))
merged_df = merged_df.sort_values("Date Posted",ascending=False)
merged_df

864x9
868x29
868x28


Unnamed: 0_level_0,Job Title,Job URL,Diversity Job,Employer,Location,Salary,Date Posted,Description,Position Type 0.0,Position Type 0.1,...,Position Type 1.0,Position Type 1.1,Position Type 1.2,Position Type 1.3,Position Type 1.4,Position Type 1.5,Position Type 1.6,Position Type 1.7,Position Type 1.8,Position Type 1.9
Job ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
354411,Assistant or Associate Professor Tenure Track ...,https://jobs.chronicle.com/job/354411/assistan...,False,University of Montevallo,"Alabama, United States",Salary Commensurate with experience,2021-06-28,\r\n\t\t\t\t\tThe University of Montevallo inv...,Faculty Positions,Business & Management,...,,,,,,,,,,
354287,"Faculty, Foreign Languages",https://jobs.chronicle.com/job/354287/faculty-...,False,Hagerstown Community College,"Maryland, United States",Salary Commensurate with experience,2021-06-28,\nHagerstown Community College is pleased to a...,Faculty Positions,Humanities,...,,,,,,,,,,
354283,Visiting Assistant Professor of Chemistry,https://jobs.chronicle.com/job/354283/visiting...,False,Lewis & Clark College,"Oregon, United States",Salary Commensurate with experience,2021-06-27,\r\n\t\t\t\t\tLewis & Clark College: College o...,Faculty Positions,"Science, Technology & Mathematics",...,,,,,,,,,,
354276,Cybersecurity & Network Administration Profess...,https://jobs.chronicle.com/job/354276/cybersec...,False,South Puget Sound Community College,"Washington, United States","$64,091.00 - $64,091.00",2021-06-27,\n\nApplications must be submitted online here...,Faculty Positions,Education,...,,,,,,,,,,
354275,Cybersecurity & Network Administration Profess...,https://jobs.chronicle.com/job/354275/cybersec...,False,South Puget Sound Community College,"Washington, United States","$64,091.00 - $64,091.00",2021-06-27,\n\nApplications must be submitted online here...,Faculty Positions,Education,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297349,Research Assistant Professor - Crop Science,https://jobs.chronicle.com/job/297349/research...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,2020-10-31,\r\n\t\t\t\t\tThe Research Assistant Professor...,Faculty Positions,Communications,...,,,,,,,,,,
341787,"Professor/Chair, Computer Systems Technology",https://jobs.chronicle.com/job/341787/professo...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,2020-10-14,\r\n\t\t\t\t\tThe College of Science and Techn...,Faculty Positions,Education,...,,,,,,,,,,
287884,Assistant/Associate Professor of Health Psycho...,https://jobs.chronicle.com/job/287884/assistan...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,2020-09-19,\r\n\t\t\t\t\tThe successful candidate will te...,Faculty Positions,Social & Behavioral Sciences,...,,,,,,,,,,
282211,Associate Dean for Graduate Studies and Facult...,https://jobs.chronicle.com/job/282211/associat...,False,North Carolina Agricultural and Technical Stat...,"North Carolina, United States",Salary Not specified,2020-08-15,\r\n\t\t\t\t\tThis Associate Dean for Graduate...,Administrative,Academic Affairs,...,,,,,,,,,,


In [10]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d")
merged_df.to_csv(f"../data/{timestamp}-chronicles_of_higher_ed.csv", 
                 quotechar="'")