# Faculty-Wage Scraper

## Imports

In [55]:
import csv
import json
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Optional
import re

## Scraping Logic

In [56]:
def _split_name(full_name: str) -> tuple[str, str]:
    """
    Split a full name into first name and last name.
    
    :param full_name: The full name string
    :return: A tuple containing (first_name, last_name)
    """
    
    # Remove suffixes and additional info
    clean_name = re.sub(r",?\s*(Ph\.D\.|M\.D\.|J\.D\.|MBA|DDS|MS|BA)\.?$", "", full_name.strip())
    clean_name = re.sub(r"\s*\(.*?\)\s*$", "", clean_name)  # Remove any text in parentheses
    
    # Handle special cases where names might not have a last name
    if '-' in clean_name:  # Handle names with
        clean_name = clean_name.split('-')[0].strip()
    if '—' in clean_name:  # Handle names with
        clean_name = clean_name.split('—')[0].strip()
    
    parts = clean_name.split()
    
    if len(parts) == 0:
        return "", ""
    if len(parts) == 1:
        return parts[0], ""
    else:
        return ' '.join(parts[:-1]).strip(), parts[-1].strip()


In [57]:
def get_faculty_information_from_page(base_url: str,
                                      faculty_selector: str,
                                      name_selector: str,
                                      rank_selector: str,
                                      aggregate_field: str,
                                      department: str,
                                      timeout: float = 30.0) -> List[Dict[str, str]]:
    """
    Scrape faculty directory data from a single page.

    This function fetches exactly one URL; any pagination is driven by the
    caller, which passes one page URL per call.

    :param base_url: URL of the faculty directory page to fetch
    :param faculty_selector: CSS selector for each faculty member's container
    :param name_selector: CSS selector for the faculty member's name within their container
    :param rank_selector: CSS selector for the faculty member's rank within their container
    :param aggregate_field: Value copied verbatim into every record's 'aggregateField'
    :param department: Value copied verbatim into every record's 'departmentInfo'
    :param timeout: Seconds to wait for the server before giving up (default: 30)
    :return: List of dictionaries with the keys 'firstName', 'lastName',
        'rank', 'aggregateField' and 'departmentInfo'
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not respond within ``timeout``
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(base_url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    faculty_members = soup.select(faculty_selector)
    print(f"Found {len(faculty_members)} members")

    faculty_data = []
    for member in faculty_members:
        # A container missing the name or rank element still produces a
        # record, with empty strings for the missing fields.
        name_element = member.select_one(name_selector)
        full_name = name_element.text.strip() if name_element else ''
        first_name, last_name = _split_name(full_name)

        rank_element = member.select_one(rank_selector)
        rank = rank_element.text.strip() if rank_element else ''

        faculty_data.append({
            'firstName': first_name,
            'lastName': last_name,
            'rank': rank,
            'aggregateField': aggregate_field,
            'departmentInfo': department
        })

    return faculty_data

In [58]:
def write_to_csv(data: List[Dict[str, str]], filename: str) -> None:
    """
    Append the faculty data to a CSV file under ``output/``.

    The file is opened in append mode so that several calls (one per scraped
    page) accumulate into a single file. The header row is written only when
    the file is new or empty, so repeated calls do not scatter duplicate
    header lines through the data. The output directory is created if it does
    not exist yet.

    :param data: Records to write; keys missing from a record are written as
        empty cells
    :param filename: Name of the CSV file, relative to the ``output`` directory
    """
    import os  # local import: this module's top-level imports do not include os

    path = f"output/{filename}"
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # Decide before opening: append mode creates the file if it is missing.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    fieldnames = ['firstName', 'lastName', 'rank', 'aggregateField', 'departmentInfo']
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(data)

In [62]:
# Name of the scraper config to run (without the ".json" extension).
FILE_NAME = "LISC_cbh"

# JSON text is UTF-8; read it explicitly as such rather than relying on the
# platform's default encoding.
with open(f"scraper-configs/{FILE_NAME}.json", 'r', encoding='utf-8') as f:
    CONFIG = json.load(f)
print(CONFIG)

{'base_url': 'https://www.public-health.uiowa.edu/cbh-faculty-list/', 'page_count': 1, 'aggregate_field': 'Life Sciences', 'department_info': 'Community and Behavioral Health', 'faculty_selector': 'table.directory tbody tr', 'name_selector': 'td:nth-child(1) a', 'rank_selector': 'td:nth-child(2)', 'outfileName': 'LISC_cbh.csv'}


In [63]:
# Scrape every configured page and append each page's rows to the output CSV.
for page in range(CONFIG["page_count"]):
    # The configured base_url may already end in "/"; strip it so the request
    # URL does not contain "//" before the query string.
    url = f"{CONFIG['base_url'].rstrip('/')}/?page={page}"
    faculty_data = get_faculty_information_from_page(url, CONFIG["faculty_selector"],
                                                     CONFIG["name_selector"], CONFIG["rank_selector"],
                                                     CONFIG["aggregate_field"], CONFIG["department_info"])
    write_to_csv(faculty_data, CONFIG["outfileName"])
    print(f"Scraping finished for page {page}")

Found 12 members
[{'firstName': 'Oluwafemi', 'lastName': 'Adeagbo', 'rank': 'Assistant Professor', 'aggregateField': 'Life Sciences', 'departmentInfo': 'Community and Behavioral Health'}, {'firstName': 'Sato', 'lastName': 'Ashida', 'rank': 'Professor', 'aggregateField': 'Life Sciences', 'departmentInfo': 'Community and Behavioral Health'}, {'firstName': 'Natoshia', 'lastName': 'Askelson', 'rank': 'Associate Professor', 'aggregateField': 'Life Sciences', 'departmentInfo': 'Community and Behavioral Health'}, {'firstName': 'Paul', 'lastName': 'Gilbert', 'rank': 'Associate ProfessorDirector of Graduate Studies', 'aggregateField': 'Life Sciences', 'departmentInfo': 'Community and Behavioral Health'}, {'firstName': 'Ebonee', 'lastName': 'Johnson', 'rank': 'Assistant Professor', 'aggregateField': 'Life Sciences', 'departmentInfo': 'Community and Behavioral Health'}, {'firstName': 'Tricia', 'lastName': 'Kitzmann', 'rank': 'Assistant Professor of Instruction', 'aggregateField': 'Life Sciences',

## One-off patterns

In [34]:
def get_biostats_faculty_info(url, timeout=30.0):
    """
    Scrape the Biostatistics faculty table at ``url``.

    Every ``<tr>`` on the page after the first is read: the first cell is
    taken as the person's name and the second cell as their title. Rows with
    fewer than two ``<td>`` cells are skipped instead of raising.

    :param url: URL of the faculty list page
    :param timeout: Seconds to wait for the server before giving up (default: 30)
    :return: List of dictionaries with the keys 'firstName', 'lastName',
        'aggregateField', 'departmentInfo' and 'rank'
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not respond within ``timeout``
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    # The first row is skipped (presumably the table's header row).
    for row in soup.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            # Not a name/title data row (e.g. a <th>-only or spacer row).
            continue

        first_name, last_name = _split_name(cells[0].get_text(strip=True))
        # Join nested text fragments with a space so multi-line titles do not
        # run together.
        title = cells[1].get_text(" ", strip=True)

        data.append({
            'firstName': first_name,
            'lastName': last_name,
            'aggregateField': "Life Sciences",
            'departmentInfo': "Biostatistics",
            'rank': title})
    return data

# Scrape the Biostatistics faculty list and store it in its own CSV file.
URL = "https://www.public-health.uiowa.edu/biostatistics-faculty-list/"
data = get_biostats_faculty_info(url=URL)
write_to_csv(data=data, filename='LISC_biostatistics.csv')

In [54]:
def get_DGS_faculty_info(url, timeout=30.0):
    """
    Scrape the directors listing at ``url``.

    Every ``<tr>`` on the page after the first two is read: the first cell is
    taken as the department, and each ``<li>`` in the second cell as one
    person's name. Every person is recorded with the rank "Director". Rows
    with fewer than two ``<td>`` cells are skipped instead of raising.

    :param url: URL of the directors page
    :param timeout: Seconds to wait for the server before giving up (default: 30)
    :return: List of dictionaries with the keys 'firstName', 'lastName',
        'aggregateField' (always empty), 'departmentInfo' and 'rank'
    :raises requests.HTTPError: if the server answers with an error status
    :raises requests.Timeout: if the server does not respond within ``timeout``
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    data = []
    # The first two rows are skipped (presumably header rows).
    for row in soup.find_all('tr')[2:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            # Not a department/names data row.
            continue

        department = cells[0].get_text(strip=True)
        for item in cells[1].find_all('li'):
            first_name, last_name = _split_name(item.get_text(" ", strip=True))
            data.append({
                'firstName': first_name,
                'lastName': last_name,
                # This page carries no aggregate field; the key is present so
                # the record has the same shape as the other scrapers' output
                # (the CSV cell is empty either way).
                'aggregateField': "",
                'departmentInfo': department,
                'rank': "Director",
            })
    return data

# Scrape the directors listing and store it in its own CSV file.
URL = "https://grad.uiowa.edu/faculty-staff/dgs-graduate-faculty/current-directors"
data = get_DGS_faculty_info(url=URL)
write_to_csv(data=data, filename='mixed_dgs.csv')