# Academic Job Search Automation

In [35]:
# Importing necessary libraries
import requests
import json
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
from IPython.core.display import Pretty
from datetime import datetime

# Setting display options for pandas DataFrames
# max_colwidth=None: never truncate cell text, so long job links are shown in full
pd.set_option('display.max_colwidth', None)
# max_rows=200: render up to 200 rows before pandas abbreviates the table
pd.set_option('display.max_rows', 200)

### 1. H-Net Job Guide

In [None]:
# Ask the user which month's postings to collect. The text is matched against
# the month heading (<p> element) on the H-Net browse page, so it has to be
# spelled the same way (e.g. 'July 2023'); stray whitespace is removed first.
custom_month_year = input("Please enter the month and year (e.g., 'July 2023'): ").strip()

hnet_URL = "https://www.h-net.org/jobs/job_browse.php"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error instead of parsing an error page
# and reporting "No postings found".
hnet_page = requests.get(hnet_URL, timeout=30)
hnet_page.raise_for_status()
hnet_soup = BeautifulSoup(hnet_page.text, "html.parser")

# Column accumulators for the result table (one entry per job link)
institution = []
position = []
link = []
post_date = []

# Look the month heading up once; the postings are taken from the element
# two siblings after it.
month_heading = hnet_soup.find(name='p', string=custom_month_year)
if month_heading is not None:
    month_post = month_heading.next_sibling.next_sibling
else:
    print(f"No postings found for {custom_month_year}.")
    month_post = None

if month_post:
    for a in month_post.find_all(name='a'):
        # The first node inside the link's parent holds the institution name
        # followed by ", "
        institution.append(a.parent.next_element.replace(", ", ""))
        position.append(a.text)
        link.append('https://www.h-net.org/jobs/' + a['href'])

        # The posting timestamp is in the `title` attribute of the next
        # <span>; only the calendar date is kept.
        date_str = a.find_next('span')['title']
        posted_on = datetime.strptime(date_str, '%A, %d %B %Y, %X %p').date()
        post_date.append(posted_on)

    print("See Below.")
else:
    print("No data to display.")

# Build the table unconditionally so that df_hnet always exists (an empty
# frame with the right columns when nothing was found) and the display line
# below cannot fail with a NameError.
d = {'Institution': institution, 'Position': position, 'Link': link, 'Posting Date': post_date}
df_hnet = pd.DataFrame(d)
df_hnet = df_hnet.drop_duplicates(subset=['Link'])
df_hnet = df_hnet.reset_index(drop=True)

df_hnet.head(100)

In [None]:
# Prompt the user for the rows of the H-Net table to keep (positional
# indices, comma-separated)
input_indices = input("Please enter the index numbers separated by commas (e.g., '1,5,8'): ")
select_list = [int(index.strip()) for index in input_indices.split(",")]

# Filter df_hnet using the select_list.
# NOTE: df_hnet itself is narrowed here, so re-running this cell selects from
# the already-narrowed frame; re-run the scraping cell first to start over.
df_hnet = df_hnet.iloc[select_list]

deadline = []
discipline = []

# Visit every selected posting and read its closing date and primary category
for post_url in df_hnet['Link']:
    # A timeout keeps one stalled request from hanging the cell indefinitely
    post = requests.get(post_url, timeout=30)
    post_soup = BeautifulSoup(post.text, "html.parser")

    # The value is read from the element that follows the label cell. A
    # posting without the label, or with a date in another format, gets NaT
    # instead of aborting the whole loop.
    td1 = post_soup.find('td', string='Closing Date')
    closing_value = td1.find_next() if td1 is not None else None
    closing_date = pd.NaT
    if closing_value is not None:
        try:
            closing_date = datetime.strptime(closing_value.text.strip(), '%m/%d/%Y')
        except ValueError:
            closing_date = pd.NaT
    deadline.append(closing_date)

    # Same pattern for the category; None marks a posting that has no such row
    td3 = post_soup.find('td', string='Primary Category:')
    category_value = td3.find_next() if td3 is not None else None
    discipline.append(category_value.text if category_value is not None else None)

d = {'Deadline': deadline, 'Discipline': discipline}
df_hnet_add = pd.DataFrame(d)

# Both frames need the same 0..n-1 index for the column-wise concat to line up
df_hnet = df_hnet.reset_index(drop=True)
df_select_hnet = pd.concat([df_hnet, df_hnet_add], axis=1)
df_select_hnet

### 2. Higher Ed Jobs

In [None]:
# Preset search phrases; words are joined with '+' because the phrase is
# placed directly into the search URL's query string.
keyword_list = ['chinese+or+asian', 'film+or+media', 'art+or+art+history', 'comparative+literature', 'humanities+or+liberal+arts']

# Show the presets as a 1-based menu, with '+' rendered as spaces
print("Available keywords:")
for idx, option in enumerate(keyword_list, 1):
    print(f"{idx}. {option.replace('+', ' ')}")

# Prompt user for keyword selection or custom input; surrounding whitespace
# is dropped so that e.g. ' 2' still selects the second preset
choice = input("Select a number from the list above or type a custom keyword (use '+' to connect words): ").strip()

# Determine the keyword based on user input.
# isdecimal() is used rather than isdigit() because isdigit() also accepts
# characters such as superscript digits that int() cannot parse.
if choice.isdecimal() and 1 <= int(choice) <= len(keyword_list):
    keyword = keyword_list[int(choice) - 1]
else:
    # Custom keyword: join words with '+' so it has the same shape as the
    # presets even when the user typed spaces
    keyword = choice.replace(' ', '+')

# The server checks request headers, so a browser-like User-Agent is sent to
# get past the blockage.
def get_page_source(n=None):
    """Fetch the Higher Ed Jobs advanced-search results page as HTML text.

    The search phrase is read from the module-level ``keyword`` variable.

    Args:
        n: Unused. Kept (and made optional) so that existing
            ``get_page_source(n)`` calls keep working.

    Returns:
        The body of the HTTP response as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error page is not silently parsed as "no results".
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    url = f'https://www.higheredjobs.com/search/advanced_action.cfm?Keyword={keyword}&JobCat=152&JobCat=131&JobCat=82&JobCat=76&JobCat=157&JobCat=204&JobCat=97&PosType=1&InstType=1&Remote=1&Region=&Submit=Search+Jobs&SortBy=1&NumJobs=100&CatType='
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.text

# Download the results page and parse it for the extraction cell below
n = 1006233
text = get_page_source(n)
ied_soup = BeautifulSoup(text, "html.parser")

In [None]:
# Fresh column accumulators for the Higher Ed Jobs table
institution, position, link = [], [], []
discipline, deadline, post_date = [], [], []

# Walk every result row and pull its fields out of the parsed page
for record in ied_soup.find_all(name='div', attrs={'class': 'row record'}):
    title_anchor = record.find_next('a')
    meta_column = record.find_next(name='div', attrs={'class': 'col-sm-5 text-sm-right'})

    institution.append(record.find_next('br').next_element.strip())
    position.append(title_anchor.text.strip())
    link.append('https://www.higheredjobs.com/search/' + title_anchor['href'])
    discipline.append(meta_column.next_element.strip())

    # The text after the column's <br> reads "Posted MM/DD/YY"
    posted_text = meta_column.find_next('br').next_element.strip()
    posted_text = posted_text.replace('Posted ', '').strip()
    post_date.append(datetime.strptime(posted_text, '%m/%d/%y'))

# Assemble the table and drop rows that point at the same posting
df_ied = pd.DataFrame({
    'Institution': institution,
    'Position': position,
    'Posting Date': post_date,
    'Discipline': discipline,
    'Link': link,
}).drop_duplicates(subset=['Link'])

# Cut-off date supplied by the user; older postings are discarded
user_date_input = input("Enter a posting date (in the format MM/DD/YY) to filter out jobs posted before this date: ")
user_date = datetime.strptime(user_date_input, '%m/%d/%y')

# Keep postings dated on or after the cut-off, then renumber rows from 0
is_recent = df_ied['Posting Date'] >= user_date
df_ied = df_ied[is_recent].reset_index(drop=True)
df_ied

In [51]:
# Prompt the user for the rows of the Higher Ed Jobs table to keep
# (positional indices, comma-separated)
input_indices = input("Please enter the index numbers separated by commas (e.g., '2,4'): ")
select_list = [int(index.strip()) for index in input_indices.split(",")]

# Filter df_ied using the select_list
selected_df = df_ied.iloc[select_list]

# Extract the first word from the previously selected keyword
first_keyword = keyword.split('+')[0]

# Publish the selection under a per-keyword name, "df_select_ied_<word>", so
# that selections made for different keywords do not overwrite each other.
# globals() is written to explicitly: the dictionary returned by locals() is
# documented as not to be modified, and the save cell collects its inputs
# from globals() by the "df_select" prefix.
df_name = f"df_select_ied_{first_keyword}"
globals()[df_name] = selected_df

selected_df

Please enter the index numbers separated by commas (e.g., '2,4'): 2,4


### 3. The Chronicle of Higher Education

In [None]:
# Display pre-saved keywords
keyword_list = ['chinese+or+asian', 'film+or+media', 'art+or+art+history', 'comparative+literature', 'humanities+or+liberal+arts']
print("Current keywords:", [saved.replace('+', ' ') for saved in keyword_list])

# Ask user if they want to replace keywords
new_keywords = input("Do you want to replace the keywords? If yes, input the new keywords separated by commas. If no, press Enter: ")

# If the user provides new keywords, update the keyword_list. Blank entries
# (e.g. from a trailing comma) are skipped so they do not become an
# unfiltered search, and an answer with no usable entry keeps the presets.
custom_keywords = [entry.strip().replace(' ', '+') for entry in new_keywords.split(',') if entry.strip()]
if custom_keywords:
    keyword_list = custom_keywords

institution = []
position = []
link = []

# POSITION SEARCH
# The loop variable is deliberately not called `keyword`: that name holds the
# Higher Ed Jobs search term, which the Higher Ed selection cell reads when
# it names its result.
for search_term in keyword_list:
    che_URL = f"https://jobs.chronicle.com/searchjobs/?Keywords={search_term}&radialtown=&LocationId=&RadialLocation=20&CountryCode=&PositionType=53&PositionType=56&EmploymentLevel=170&EmploymentLevel=173&EmploymentLevel=175&EmploymentLevel=177&EmploymentType=189&sort=Date"
    # A timeout keeps one stalled request from hanging the cell indefinitely
    che_page = requests.get(che_URL, timeout=30)
    che_soup = BeautifulSoup(che_page.text, "html.parser")

    # NOTE(review): this matches the full class string exactly, so listing
    # items rendered with a different class set are presumably skipped; confirm
    # against the live page.
    for li in che_soup.find_all(name='li', attrs={'class':'lister__item cf lister__item--display-logo-on-listing lister__item--display-logo-on-listing'}):
        # The first link after the item supplies both the title and the URL
        title_link = li.find_next('a')
        position.append(title_link.text)
        institution.append(li.find_next(name='li', attrs={'class':'lister__meta-item lister__meta-item--recruiter'}).text)
        link.append('https://jobs.chronicle.com' + title_link['href'].strip())

# The same posting can match several keywords; keep one row per link
d = {'Institution': institution, 'Position': position, 'Link': link}
df_che = pd.DataFrame(d)
df_che = df_che.drop_duplicates(subset=['Link'])
df_che = df_che.reset_index(drop=True)
df_che

In [None]:
# Collect the row positions to keep from the Chronicle table shown above
select_input = input("Please input the index numbers you want to select from the list above, separated by commas (e.g., 4,17): ")

# Split on commas, trim each piece, and turn it into an integer position
select_list = [int(piece) for piece in map(str.strip, select_input.split(','))]

# Pick those rows out by position
df_select_che = df_che.iloc[select_list]

# Show the selection
df_select_che

### 4. Save to Spreadsheet



In [56]:
import shutil
from datetime import date

from google.colab import drive
import pandas as pd

today = str(date.today())

# use the current date in the filename; built once and reused for both the
# local write and the copy to Drive
filename = f"search_result_{today}.csv"

# automatically gather all dataframes whose variable name starts with
# "df_select"; anything under that prefix that is not a DataFrame is ignored
df_list = [
    value
    for name, value in globals().items()
    if name.startswith("df_select") and isinstance(value, pd.DataFrame)
]
if not df_list:
    # pd.concat would raise a ValueError here as well; this message says what to do
    raise ValueError("No 'df_select*' DataFrames found; run at least one selection cell before saving.")

# using concat to merge the dataframes
df_final = pd.concat(df_list, ignore_index=True)
df_final.to_csv(filename, index=False)

# mount google drive under ./drive
drive.mount('drive')

# copy the file to the specified location (plain Python instead of the `!cp`
# shell magic; like `cp`, shutil.copy copies into the destination when it is
# an existing directory)
shutil.copy(filename, 'drive/My Drive/job_search')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


### 5. Other Sources

* Academic Jobs Wiki (no longer actively maintained)
* Inside Higher Ed Careers (not allowed)
* HERC (server is not correctly configured)
* Indeed (not exclusively for academic jobs)
