# Academic Job Search Automation

In [None]:
import requests
import json
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
from IPython.core.display import Pretty
from datetime import datetime

In [37]:
# Show full cell contents (long titles and links) instead of truncating them.
# `None` is the supported "no limit" value; the legacy `-1` is deprecated.
pd.set_option('display.max_colwidth', None)
# Let up to 200 rows be displayed before pandas elides the table.
pd.set_option('display.max_rows', 200)

  """Entry point for launching an IPython kernel.


### 1. H-Net Job Guide 

In [None]:
# Scrape the H-Net job-browse page and collect every posting listed under one
# month heading into `df_hnet` (Institution, Position, Link, Posting Date).
hnet_URL = "https://www.h-net.org/jobs/job_browse.php"
# CHANGE THE JOB RELEASE MONTH HERE (must match the <p> heading text exactly)
hnet_month = 'September 2021'

# A timeout keeps the cell from hanging forever on an unresponsive server.
hnet_page = requests.get(hnet_URL, timeout=30)
hnet_soup = BeautifulSoup(hnet_page.text, "html.parser")

institution = []
position = []
link = []
post_date = []

month_heading = hnet_soup.find(name='p', string=hnet_month)
if month_heading is None:
    # Without this guard the loop below would hit an undefined (or stale,
    # from an earlier run) `month_post`.
    print(f'No "{hnet_month}" heading found on {hnet_URL}; nothing collected.')
    month_links = []
else:
    # The postings live two siblings after the heading (presumably a
    # whitespace text node sits in between -- confirm against the page).
    month_post = month_heading.next_sibling.next_sibling
    month_links = month_post.find_all(name='a')

for a in month_links:
    # The first node inside the anchor's parent is read as the institution
    # text; the ", " separator is stripped from it.
    institution.append(a.parent.next_element.replace(", ", ""))
    position.append(a.text)
    link.append('https://www.h-net.org/jobs/' + a['href'])

    # The posting timestamp is read from the `title` of the next <span>;
    # only the calendar date is kept.
    date_str = a.find_next('span')['title']
    posted_on = datetime.strptime(date_str, '%A, %d %B %Y, %X %p').date()
    post_date.append(posted_on)

d = {'Institution':institution, 'Position': position, 'Link': link, 'Posting Date': post_date}
df_hnet = pd.DataFrame(d)
# The same posting can be linked more than once; keep one row per link.
df_hnet = df_hnet.drop_duplicates(subset=['Link'])

df_hnet = df_hnet.reset_index(drop=True)
df_hnet

In [None]:
#SELECT THE INDEX NUMBER FROM THE LIST ABOVE
# (row positions of the postings to keep, as shown in the df_hnet table)
select_list = []
df_hnet = df_hnet.iloc[select_list]

deadline = []
discipline = []

# Visit each selected posting and read its closing date and primary category.
for posting_url in df_hnet['Link']:
    post = requests.get(posting_url, timeout=30)
    post_soup = BeautifulSoup(post.text, "html.parser")

    # The value is taken from the element that follows the label cell.
    td1 = post_soup.find('td', string='Closing Date')
    closing_cell = td1.find_next() if td1 is not None else None
    if closing_cell is None:
        # A posting without the label must not abort the whole loop; record
        # a missing value so the rows stay aligned with df_hnet.
        deadline.append(pd.NaT)
    else:
        deadline.append(datetime.strptime(closing_cell.text, '%m/%d/%Y'))

    td3 = post_soup.find('td', string='Primary Category:')
    category_cell = td3.find_next() if td3 is not None else None
    discipline.append(category_cell.text if category_cell is not None else None)

d = {'Deadline':deadline, 'Discipline': discipline}
df_hnet_add = pd.DataFrame(d)

# Reset the index so the positional concat below lines the new columns up
# with the selected rows.
df_hnet=df_hnet.reset_index(drop=True)
df_hnet=pd.concat([df_hnet, df_hnet_add], axis=1)
df_hnet

### 1.1 H-Net Search by Category

In [None]:
#SELECT YOUR CATEGORY NUMBER IN "my_cat_list="
my_cat_list = ['10','11','12','104','29','32','38']
#CHANGE THE JOB RELEASE MONTH HERE (must match the <p> heading text exactly)
hnet_month = 'September 2021'

institution = []
position = []
post_date = []
link = []

# Scrape the browse page of every category and pool the month's postings.
for cat_id in my_cat_list:
    hnet_URL = f"https://www.h-net.org/jobs/job_browse.php?category_id={cat_id}"
    hnet_page = requests.get(hnet_URL, timeout=30)
    hnet_soup = BeautifulSoup(hnet_page.text, "html.parser")

    month_heading = hnet_soup.find(name='p', string=hnet_month)
    if month_heading is None:
        # No heading for this month in this category. Skip it explicitly:
        # falling through would re-read the previous category's block (or
        # fail with a NameError on the first category).
        continue
    month_post = month_heading.next_sibling.next_sibling

    for a in month_post.find_all(name='a'):
        institution.append(a.parent.next_element.replace(", ", ""))
        position.append(a.text)
        link.append('https://www.h-net.org/jobs/' + a['href'])

        # The posting timestamp is read from the `title` of the next <span>;
        # only the calendar date is kept.
        date_str = a.find_next('span')['title']
        posted_on = datetime.strptime(date_str, '%A, %d %B %Y, %X %p').date()
        post_date.append(posted_on)

# 'Posting Date' is included so the collected dates are not thrown away and
# the frame has the same columns as the un-filtered H-Net search.
d = {'Institution':institution, 'Position': position, 'Link': link, 'Posting Date': post_date}
df_hnet = pd.DataFrame(d)
# A posting filed under several categories appears once per category.
df_hnet = df_hnet.drop_duplicates(subset=['Link'])

df_hnet = df_hnet.reset_index(drop=True)
df_hnet

### 2. Higher Ed Jobs

In [None]:
# The server inspects request headers, so the fetch function below sends a
# browser-like User-Agent to avoid being blocked.
keyword_list = [
    'chinese+or+asian',
    'film+or+media',
    'art+or+art+history',
    'humanities+postdoc',
]

# CHANGE KEY WORDS MANUALLY: pick the search term by its index in keyword_list
keyword = keyword_list[0]

def get_page_source(n):
    """Fetch the HigherEdJobs advanced-search results page as HTML text.

    The search term is read from the module-level ``keyword`` variable, so
    that variable must be set before calling.

    Args:
        n: Not used by the request; kept so existing calls keep working.

    Returns:
        The body of the HTTP response as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    url = f'https://www.higheredjobs.com/search/advanced_action.cfm?Keyword={keyword}&JobCat=152&JobCat=131&JobCat=82&JobCat=76&JobCat=157&JobCat=204&JobCat=97&PosType=1&InstType=1&Remote=1&Region=&Submit=Search+Jobs&SortBy=1&NumJobs=100&CatType='
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    return response.text

# Download the results page for the currently selected keyword and parse it.
# `n` is simply handed to get_page_source.
n = 1006233
text = get_page_source(n)
ied_soup = BeautifulSoup(markup=text, features="html.parser")

In [None]:
# Turn every result row of the parsed Higher Ed Jobs page into one table row.
institution = []
position = []
link = []
discipline = []
deadline = []
post_date = []

right_col_attrs = {'class':'col-sm-5 text-sm-right'}

# To cap the number of rows processed, pass limit=<N> to find_all below.
for record in ied_soup.find_all(name='div', attrs={'class':'row record'}):
    # Institution: the text node right after the first <br> following the row.
    institution.append(record.find_next('br').next_element.strip())

    # The first anchor after the row supplies both the title and the link.
    title_anchor = record.find_next('a')
    position.append(title_anchor.text.strip())
    link.append('https://www.higheredjobs.com/search/' + title_anchor['href'])

    # Right-hand column: its first node is the discipline, and the text after
    # its <br> is the "Posted mm/dd/yy" stamp.
    right_col = record.find_next(name='div', attrs=right_col_attrs)
    discipline.append(right_col.next_element.strip())

    posted = right_col.find_next('br').next_element.strip()
    posted = posted.replace('Posted ','').strip()
    post_date.append(datetime.strptime(posted,'%m/%d/%y'))

d = {'Institution':institution, 'Position': position, 'Posting Date':post_date, 'Link': link, 'Discipline': discipline}
# One row per unique link, renumbered from zero.
df_ied = (
    pd.DataFrame(d)
    .drop_duplicates(subset=['Link'])
    .reset_index(drop=True)
)
df_ied

In [None]:
#SELECT THE INDEX NUMBER FROM THE LIST ABOVE
# (integer row positions of the df_ied rows to keep; empty keeps nothing)
select_list = []

#CHANGE THE NUMBERING FOR EACH KEYWORD SEARCH
# Store each keyword's selection under its own name (df_ied_1, df_ied_2, ...)
# so a later run does not overwrite it; the save step reads these names.
df_ied_4 = df_ied.iloc[select_list]

### 3. The Chronicle of Higher Education

In [None]:
#CHANGE THE KEYWORDS IN 'keyword_list='
keyword_list = ['chinese+or+asian','film+or+media','media+art']

institution = []
position = []
link = []
deadline=[]
discipline = []
post_date = []

# POSITION SEARCH: one results page per keyword
che_urls = [
    f"https://jobs.chronicle.com/searchjobs/?Keywords={keyword}&radialtown=&LocationId=&RadialLocation=20&CountryCode=&PositionType=53&PositionType=56&EmploymentLevel=170&EmploymentLevel=173&EmploymentLevel=175&EmploymentLevel=177&EmploymentType=189"
    for keyword in keyword_list
]
# POSTDOC SEARCH: a fixed "humanities" query, scraped the same way
che_urls.append('https://jobs.chronicle.com/searchjobs/?Keywords=humanities&radialtown=&LocationId=&RadialLocation=20&CountryCode=&PositionType=53&PositionType=56&EmploymentLevel=175&EmploymentType=189')

# Exact class strings of the listing item and of its recruiter line.
listing_attrs = {'class':'lister__item cf lister__item--display-logo-on-listing lister__item--display-logo-on-listing'}
recruiter_attrs = {'class':'lister__meta-item lister__meta-item--recruiter'}

# A single loop handles every results page, so the position and postdoc
# searches can no longer drift apart.
for che_URL in che_urls:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    che_page = requests.get(che_URL, timeout=30)
    che_soup = BeautifulSoup(che_page.text,"html.parser")

    for li in che_soup.find_all(name='li', attrs=listing_attrs):
        # The first anchor after the item supplies both the title and the link.
        title_anchor = li.find_next('a')
        position.append(title_anchor.text)
        institution.append(li.find_next(name='li',attrs=recruiter_attrs).text)
        link.append('https://jobs.chronicle.com'+title_anchor['href'].strip())

d = {'Institution':institution, 'Position': position, 'Link': link}
df_che = pd.DataFrame(d)
# The same job can match several searches; keep one row per link.
df_che = df_che.drop_duplicates(subset=['Link'])
df_che = df_che.reset_index(drop=True)
df_che

In [None]:
#SELECT THE INDEX NUMBER FROM THE LIST ABOVE
# (integer row positions of the df_che rows to keep; empty keeps nothing)
select_list = []
# Overwrites df_che with only the selected rows.
df_che = df_che.iloc[select_list]

### 3.5 Inside Higher Education

In [None]:
#CHANGE THE KEYWORDS IN 'my_keyword='
my_keyword = ['chinese or asian','film or media','art or art history','humanities postdoc']

result = pd.DataFrame()

institution = []
position = []
link = []
deadline=[]
discipline = []
post_date = []

#CHANGE THE URL SEARCH CODE IN "ihe_URL=" IF NECCESSARY
# Run one search per keyword and pool the listings of all result pages.
for keyword in my_keyword:
  ihe_URL = f"https://careers.insidehighered.com/searchjobs/?Keywords={keyword}&radialtown=&LocationId=&RadialLocation=20&NearFacetsShown=true&CountryCode=&FacultyJobs=1&SpecialFilters=513023&SpecialFilters=125&SpecialFilters=127&EmploymentType=129&sort=Date"
  ihe_page = requests.get(ihe_URL)
  ihe_soup = BeautifulSoup(ihe_page.text, "html.parser")

  # The first three matches are skipped (presumably promoted listings --
  # confirm against the page); adjust the [3:] slices to change that.
  recruiter_items = ihe_soup.find_all(name='li', attrs={'class':'lister__meta-item lister__meta-item--recruiter'})[3:]
  institution.extend(item.text for item in recruiter_items)
  # Tag every listing with the keyword that found it.
  discipline.extend([keyword] * len(recruiter_items))

  header_items = ihe_soup.find_all(name='h3', attrs={'class':'lister__header'})[3:]
  for header in header_items:
    position.append(header.text)
    # The node right after the header tag carries the job's relative href.
    link.append('https://careers.insidehighered.com' + header.next['href'].strip())

d = {'Institution':institution, 'Position': position,'Link': link, 'Discipline':discipline}
# One row per unique link, renumbered from zero.
df_ihe = (
    pd.DataFrame(d)
    .drop_duplicates(subset=['Link'])
    .reset_index(drop=True)
)

df_ihe

### 4. Save to Spreadsheet



In [None]:
from datetime import date

# ISO date (YYYY-MM-DD) used to stamp the output file name.
today = str(date.today())

# Frames to merge into the final sheet.
# NOTE(review): df_ied_4 and df_ihe are built above but not listed here --
# confirm whether that is intentional.
df_list = [df_hnet,df_ied_1, df_ied_2, df_ied_3,df_che]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and likewise keeps each frame's own index.
df_final = pd.concat(df_list)

# The file-name spelling ('serach') is kept because the Drive copy command
# refers to it.
df_final.to_csv(f'serach_result_{today}.csv',index=False)

In [None]:
# Google Colab only: mount Google Drive under the local folder 'drive'.
from google.colab import drive
drive.mount('drive')

#CHANGE THE FILE NAME
# The name must match the CSV written by the save step
# (serach_result_<YYYY-MM-DD>.csv); the file is copied into the
# 'job_search' folder of My Drive.
!cp serach_result_2021-09-17.csv 'drive/My Drive/job_search'

Mounted at drive


### 5. Other Sources

* Academic Jobs Wiki (no longer actively maintained)
* HERC (server is not correctly configured)
* Indeed (not exclusively for academic jobs)
