The purpose of this script is to extract faculty information from the Top 5 QS-Ranked CS schools.

The extracted faculty names will then be matched against dblp data, so that we can query and retrieve each faculty member's publication records for our future analysis.

From the following article: https://www.topuniversities.com/university-rankings-articles/university-subject-rankings/top-computer-science-schools-2021
We note the top 5 universities as follows:
1. MIT: 
- https://www.eecs.mit.edu/people/faculty-advisors/34 (CS, AI)
- https://www.eecs.mit.edu/people/faculty-advisors/32 (CS, Systems)
- https://www.eecs.mit.edu/people/faculty-advisors/35 (CS, Theory)
- https://www.eecs.mit.edu/people/lecturer (Lecturer)
2. Stanford: 
- https://cs.stanford.edu/directory/faculty
3. CMU: 
- https://csd.cmu.edu/directory/faculty
4. NUS: 
- https://www.comp.nus.edu.sg/about/depts/cs/faculty/
5. UCB: 
- https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html?_ga=2.57244906.1713537701.1616564430-341988066.1616564430 (CS Faculty)
- https://www2.eecs.berkeley.edu/Faculty/Lists/teaching.html?_ga=2.57244906.1713537701.1616564430-341988066.1616564430 (Teaching Faculty)

We shall start with the special snowflake, MIT, first.

# NEW ADDITION
Might need to account for Lecturers in:
- (MIT; another link)
- (Stanford; 2nd table instance, done)
- (no need for CMU, all consolidated in one list!)
- (NUS; bloody special snowflake with 4 separate tables)
- UCB

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import pickle
# Lift pandas' display limits on rows and columns (None = no limit).
pd.set_option("display.max_rows", None, "display.max_columns", None)

# 1. MIT Faculty Name List Extraction

In [2]:
def _scrape_mit_names(url):
    """Fetch one MIT EECS people page and return the names it lists.

    Args:
        url: Address of an MIT EECS people page.

    Returns:
        A list with the text of every ``span`` carrying the classes
        ``field-content card-title`` inside the ``ul.faculty-list`` element.

    Raises:
        AttributeError: If the page has no ``ul.faculty-list`` element
            (``soup.find`` then returns None).
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find("ul", {"class": "faculty-list"})
    r_list = results.find_all('span', class_='field-content card-title')
    return [each.getText() for each in r_list]


def retrieve_mit_list():
    """Scrape the four MIT EECS listings used for the CS faculty roster.

    The pages are fetched in the order AI, Systems, Theory, Lecturer.

    Returns:
        A 4-tuple of name lists:
        ``(mit_cs_ai_list, mit_cs_systems_list, mit_cs_theory_list,
        mit_cs_lecturer_list)``.
    """
    # Single source of truth for the pages to scrape; every request below is
    # driven from this list so the URLs cannot drift apart from the code.
    url_list = [
        'https://www.eecs.mit.edu/people/faculty-advisors/34',  # CS, AI
        'https://www.eecs.mit.edu/people/faculty-advisors/32',  # CS, Systems
        'https://www.eecs.mit.edu/people/faculty-advisors/35',  # CS, Theory
        'https://www.eecs.mit.edu/people/lecturer',             # Lecturers
    ]

    (
        mit_cs_ai_list,
        mit_cs_systems_list,
        mit_cs_theory_list,
        mit_cs_lecturer_list,
    ) = [_scrape_mit_names(url) for url in url_list]

    return mit_cs_ai_list, mit_cs_systems_list, mit_cs_theory_list, mit_cs_lecturer_list

# 2. Stanford Faculty Name List Extraction

In [3]:
# need to include lecturers
def retrieve_stanford_list():
    """Scrape the Stanford CS directory for regular and lecturer faculty names.

    The directory page holds several tables; this function uses:
        results[0] - Regular Faculty Members
        results[1] - Lecturer Faculty Members

    Returns:
        A 2-tuple ``(stanford_regular_list, stanford_lecturer_list)`` of
        whitespace-stripped names.

    Raises:
        IndexError: If the page has fewer than two tables, or a row after the
            first has no ``td`` cell.
    """

    # BS4 steps to get Stanford CS faculty soup and details
    page = requests.get('https://cs.stanford.edu/directory/faculty')
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all("table")

    def names_in(table):
        # The first row is skipped (presumably the header); the name is the
        # text of the first cell of every remaining row.
        return [
            row.find_all("td")[0].getText().strip()
            for row in table.find_all("tr")[1:]
        ]

    # List to store Stanford CS Regular faculty names
    stanford_regular_list = names_in(results[0])

    # List to store Stanford CS Lecturer faculty names.
    # Lecturers are in the second table; reading results[0] here again would
    # just return a copy of the regular faculty list.
    stanford_lecturer_list = names_in(results[1])

    return stanford_regular_list, stanford_lecturer_list

# 3. CMU Faculty Name List Extraction

In [4]:
def retrieve_cmu_list():
    """Scrape the CMU CSD faculty directory and return the listed names.

    Returns:
        A list of whitespace-stripped names, one per row of the first
        ``tbody`` on the page, excluding that body's first row.
    """
    # Download and parse the directory page.
    response = requests.get('https://csd.cmu.edu/directory/faculty')
    parsed = BeautifulSoup(response.content, 'html.parser')

    # Only the first <tbody> is consulted; its first row is dropped.
    table_body = parsed.find("tbody")
    data_rows = table_body.find_all("tr")[1:]

    # The name is the text of the first cell of each remaining row.
    return [tr.find_all("td")[0].getText().strip() for tr in data_rows]

# 4. NUS Faculty Name List Extraction

In [5]:
def retrieve_nus_list():
    """Scrape the NUS CS faculty page and return names from its four tables.

    The page's ``tbody`` elements are used as follows (index 0 is not used):
        results[1] - Regular Faculty Members
        results[2] - Joint Faculty Members
        results[3] - Teaching Faculty Members
        results[4] - Research Faculty Members

    Returns:
        A 4-tuple ``(nus_regular_list, nus_joint_list, nus_teaching_list,
        nus_research_list)`` of whitespace-stripped names.

    Raises:
        IndexError: If the page has fewer than five ``tbody`` elements, or a
            row after the first has no ``td`` cell.
    """

    # BS4 steps to get NUS CS faculty soup and details
    page = requests.get('https://www.comp.nus.edu.sg/about/depts/cs/faculty/')
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all("tbody")

    def names_in(table_body):
        # The first row of each table body is skipped; the name is the text
        # of the first cell of every remaining row.
        return [
            row.find_all("td")[0].getText().strip()
            for row in table_body.find_all("tr")[1:]
        ]

    # List to store NUS CS Regular faculty names
    nus_regular_list = names_in(results[1])

    # List to store NUS CS Joint faculty names
    nus_joint_list = names_in(results[2])

    # List to store NUS CS Teaching faculty names
    nus_teaching_list = names_in(results[3])

    # List to store NUS CS Research faculty names.
    # Research faculty live in results[4]; reading results[3] again would
    # return the teaching faculty a second time.
    nus_research_list = names_in(results[4])

    return nus_regular_list, nus_joint_list, nus_teaching_list, nus_research_list

# 5. UCB Faculty Name List Extraction

In [6]:
def retrieve_ucb_list():
    """Scrape the UC Berkeley EECS listings for CS and teaching faculty names.

    Returns:
        A 2-tuple ``(ucb_faculty_list, ucb_teaching_list)``. Each list holds
        the text of the first link inside every ``h3.media-heading`` element
        of the corresponding page.
    """

    def linked_names(address):
        # Download the page, then read the anchor text under each heading.
        response = requests.get(address)
        parsed = BeautifulSoup(response.content, 'html.parser')
        headings = parsed.find_all("h3", {"class": "media-heading"})
        return [heading.find('a').getText() for heading in headings]

    # CS faculty listing
    ucb_faculty_list = linked_names(
        'https://www2.eecs.berkeley.edu/Faculty/Lists/CS/faculty.html?_ga=2.57244906.1713537701.1616564430-341988066.1616564430'
    )

    # Teaching faculty listing
    ucb_teaching_list = linked_names(
        'https://www2.eecs.berkeley.edu/Faculty/Lists/teaching.html?_ga=2.57244906.1713537701.1616564430-341988066.1616564430'
    )

    return ucb_faculty_list, ucb_teaching_list

In [7]:
# Run every scraper in ranking order (MIT, Stanford, CMU, NUS, UCB).
# Each scraper's return value is wrapped in its own one-element list, so
# top_5_list[i][0] is what the i-th scraper returned.
top_5_list = [
    [retrieve()]
    for retrieve in (
        retrieve_mit_list,
        retrieve_stanford_list,
        retrieve_cmu_list,
        retrieve_nus_list,
        retrieve_ucb_list,
    )
]

In [8]:
# Cache the scraped names on disk so later cells can reload them without
# hitting the university websites again.
with open('top_5_list.pkl', 'wb') as f:
    pickle.dump(top_5_list, f)

In [9]:
# Retrieve top_5_list with pickle.
# NOTE: pickle.load can execute arbitrary code; only load a file that this
# notebook wrote itself.
with open('top_5_list.pkl', 'rb') as f:
    top_5_list = pickle.load(f)   

In [11]:
# Display the first entry (the wrapped result of retrieve_mit_list()).
top_5_list[0]

[(['Hal Abelson',
   'Regina Barzilay',
   'Robert Berwick',
   'Tamara Broderick',
   'Rodney Brooks',
   'Randall Davis',
   'Fredo Durand',
   'William Freeman',
   'David Gifford',
   'Polina Golland',
   'W. Eric L. Grimson',
   'John Guttag',
   'Berthold Horn',
   'Tommi Jaakkola',
   'Stefanie Jegelka',
   'Leslie Kaelbling',
   'David Karger',
   'Manolis Kellis',
   'Tomás Lozano-Pérez',
   'Wojciech Matusik',
   'Rob Miller',
   'Joel Moses',
   'Daniela Rus',
   'Devavrat Shah',
   'Justin Solomon',
   'David Sontag',
   'Gerald Sussman',
   'Vivienne Sze',
   'Peter Szolovits',
   'Russell Tedrake',
   'Bruce Tidor',
   'Antonio Torralba',
   'Alan Willsky',
   'Victor Zue'],
  ['Fadel Adib',
   'Mohammad Alizadeh',
   'Saman Amarasinghe',
   ' Arvind',
   'Hari Balakrishnan',
   'Adam Belay',
   'Michael Carbin',
   'Adam Chlipala',
   'Henry Corrigan-Gibbs',
   'Jack Dennis',
   'Srini Devadas',
   'Joel Emer',
   'Manya Ghobadi',
   'Daniel Jackson',
   'M. Frans Kaasho