In [101]:
import os
import re

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# One shared Chrome session for the whole notebook; the scraping helpers below
# receive or read this `driver` object.
options = Options()
# Pass `--headless` as a plain command-line argument rather than through the
# `Options.headless` property: newer Selenium 4 releases deprecated and later
# removed that property, while the argument form works on old and new releases.
options.add_argument("--headless")
# NOTE(review): the positional chromedriver path is only accepted by older
# Selenium releases; newer ones expect a Service object instead -- confirm
# against the Selenium version this notebook is pinned to.
driver = webdriver.Chrome("./chromedriver", options=options)

# Loads the page through the webdriver so that content produced by JavaScript
# is part of the HTML that gets parsed.
def get_js_soup(url, driver):
    """Fetch `url` with `driver` and return the parsed, cleaned page body.

    Args:
        url: Address of the page to load.
        driver: Webdriver object used to load the page and run JavaScript.

    Returns:
        The result of `clean_soup` applied to a BeautifulSoup tree built from
        the document body's inner HTML.
    """
    driver.get(url)
    # Ask the browser for the body markup as it currently stands in the DOM.
    rendered_body = driver.execute_script("return document.body.innerHTML")
    return clean_soup(BeautifulSoup(rendered_body, "html.parser"))

def clean_soup(soup):
    """Strip every `script` and `img` tag out of `soup` and return it.

    The tree is modified in place; the returned object is the same `soup`
    that was passed in.
    """
    unwanted_tags = soup.find_all(['script', 'img'])
    for tag in unwanted_tags:
        tag.decompose()
    return soup

def get_tags_with_matching_classes(key_classes_set, soup):
    """Return the innermost tags in `soup` whose CSS class mentions a key token.

    A class value is lower-cased and split on whitespace and hyphens
    (e.g. "sdap-profile-field" -> sdap, profile, field); the tag matches when
    at least one of those tokens is in `key_classes_set`. Because the tokens
    are lower-cased, the keys are expected to be lower-case as well.

    Args:
        key_classes_set: Collection of class-name tokens to look for.
        soup: Parsed tree (or tag) to search with `find_all`.

    Returns:
        A list of matching tags in the order `find_all` produced them. A
        matching tag is dropped from the list as soon as another matching tag
        nested inside it is found, so only the innermost matches remain.
    """
    def is_any_class_match(css_class):
        # Tags without a class attribute can never match.
        if css_class is None:
            return False

        class_tokens = set(re.split(r'\s|-', css_class.lower()))  # sdap, profile, field, links, research, etc
        return not class_tokens.isdisjoint(key_classes_set)

    matches = []
    for match in soup.find_all(class_=is_any_class_match):
        # Walk up from the new match instead of scanning every descendant of
        # every kept match, and compare by identity: `in` on bs4 tags uses a
        # deep structural equality test, which is slow and is not what
        # "is nested inside" means.
        ancestor_ids = {id(ancestor) for ancestor in match.parents}
        matches = [m for m in matches if id(m) not in ancestor_ids]
        matches.append(match)

    return matches

def get_faculty_html_tags_from_url(url, key_classes):
    """Scrape one faculty page and return the tags whose classes match.

    The page is loaded through the module-level `driver`.

    Args:
        url: Address of the faculty page to load.
        key_classes: Class-name tokens forwarded to
            `get_tags_with_matching_classes`.

    Returns:
        The list produced by `get_tags_with_matching_classes` for that page.
    """
    # Load the page named by the argument. The previous code read a separate
    # global (`curr_faculty_url`), so `url` was ignored and a fresh kernel
    # failed with NameError.
    soup = get_js_soup(url, driver)
    return get_tags_with_matching_classes(key_classes, soup)

def save_faculty_html_tags(filename, tags):
    """Write the string form of every tag in `tags` to `filename`.

    The tags are written back to back with no separator, and any existing
    file is overwritten.

    Args:
        filename: Path of the file to create or overwrite.
        tags: Iterable of objects whose `str()` is the markup to store.
    """
    # Scraped pages routinely contain non-ASCII characters; pin UTF-8 so the
    # write does not depend on (or fail under) the platform default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(str(t) for t in tags)

def get_faculty_text_from_tags(tags):
    """Join the text of all `tags` into one whitespace-normalized string.

    Args:
        tags: Iterable of objects exposing `get_text()`.

    Returns:
        The texts joined with single spaces, with every run of whitespace
        collapsed to one space and leading/trailing whitespace removed. An
        empty `tags` gives an empty string.
    """
    faculty_page_text = ' '.join(res.get_text() for res in tags)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    cleaned_faculty_page_text = re.sub(r"\s+", " ", faculty_page_text).strip()
    return cleaned_faculty_page_text

def save_faculty_text(filename, text):
    """Write `text` to `filename`, overwriting any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        text: String to store.
    """
    # Scraped text routinely contains non-ASCII characters; pin UTF-8 so the
    # write does not depend on (or fail under) the platform default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)

# TODO: test gensim's LDA on the ChBE professor bio content
# TODO: construct extraction rules for each engineering department, using one prototype faculty member page's HTML per department

In [102]:
# Class-name tokens that mark the parts of a ChBE faculty page worth keeping.
chbe_key_classes = {
    "profile",
    "biography",
    "research",
    "education",
    "email",
    "phone",
    "title",
}

save_folder = "data/chbe/"
chbe_urls = "chbe_faculty_page_urls.txt"

# The output folder may not exist yet (e.g. on a fresh checkout); without it
# every write in the loop below would fail.
os.makedirs(save_folder, exist_ok=True)

with open(chbe_urls) as f:
    # Skip blank lines so they are never treated as URLs.
    chbe_faculty_urls = [s.strip() for s in f if s.strip()]

for url in chbe_faculty_urls:
    tags = get_faculty_html_tags_from_url(url, chbe_key_classes)
    # The faculty id is the last path segment of the URL; a trailing slash is
    # dropped first so it cannot produce an empty id (and colliding filenames).
    faculty_id = url.rstrip('/').rsplit('/', 1)[-1]
    text = get_faculty_text_from_tags(tags)

    # Each faculty member gets the matched markup (.html) and its plain text (.txt).
    save_faculty_html_tags(save_folder + "chbe_" + faculty_id + ".html", tags)
    save_faculty_text(save_folder + "chbe_" + faculty_id + ".txt", text)

In [103]:
# dept_htmls = soup.find_all("h3", "list-expand-header")
# dept_names = [dh.get_text() for dh in dept_htmls]
# pprint(dept_names)

# engr_dept_faculty = {
#     "Agricultural and Biological Engineering": [],
#     "Aerospace Engineering": [],
#     "Bioengineering": [],
#     "Civil and Environmental Engineering": "",
#     "Chemical & Biomolecular Engineering": "",
#     "Computer Science",
#     "Electrical and Computer Engineering",
#     "Industrial and Enterprise Systems Engineering",
#     "Materials Science and Engineering",
#     "Mechanical Science and Engineering",
#     "Nuclear, Plasma and Radiological Engineering",
#     "Physics"
# }
