In [466]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [467]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
driver = webdriver.Chrome('/Users/sunnie/Desktop/School/UIUC/CS410/MP2.1/chromedriver',options=options)

In [468]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

#tidies extracted text 
def process_bio(bio):
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    bio = re.sub('\s+',' ',bio)       #repalces repeated whitespace characters with single space
    return bio

''' More tidying
Sometimes the text extracted HTML webpage may contain javascript code and some style elements. 
This function removes script and style tags from HTML so that extracted text does not contain them.
'''
def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup


#Checks if bio_url is a valid faculty homepage
def is_valid_homepage(bio_url,dir_url):
    if bio_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        #sometimes the homepage url points to the same page as the faculty profile page
        #which should be treated differently from an actual homepage
        ret_url = urllib.request.urlopen(bio_url).geturl() 
    except:
        return False       #unable to access bio_url
    urls = [re.sub('((https?://)|(www.))','',url) for url in [ret_url,dir_url]] #removes url scheme (https,http or www) 
    return not(urls[0]== urls[1])

In [469]:
#extracts all Faculty Profile page urls from the Directory Listing Page
def scrape_dir_page(dir_url,driver):
    print ('-'*20,'Scraping directory page','-'*20)
    faculty_links = []
    faculty_base_url = 'https://www.sjsu.edu'
    #execute js on webpage to load faculty listings on webpage and get ready to parse the loaded HTML 
    soup = get_js_soup(dir_url,driver)    
    for link in soup.find_all('a', href=True):
        if "people" in link.get("href") or "edu/~" in link.get("href"):
            faculty_links.append(str(link.get('href')))
    print ('-'*20,'Found {} faculty profile urls'.format(len(faculty_links)),'-'*20)
    return faculty_links

In [470]:
dir_url = 'https://www.sjsu.edu/cs/faculty/faculty.php' #url of directory listings of CS faculty
faculty_links = scrape_dir_page(dir_url,driver)

-------------------- Scraping directory page --------------------
-------------------- Found 18 faculty profile urls --------------------


In [471]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)



def scrape_faculty_page(fac_url,driver):
    soup = get_js_soup(fac_url,driver)
    homepage_found = False
    bio_url = ''
    bio = ''
    profile_sec = soup.find('div', class_= 'content_wrapper')
#     if profile_sec is not None:
#         all_headers = profile_sec.find_all('titile')
#         faculty_last_name = all_headers[0].get_text().lower().split()[-1] #find faculty last name
#         faculty_first_name = all_headers[0].get_text().lower().split()[0]
#         homepage_txts = ['University',faculty_last_name,faculty_first_name]
#         #exceptions = ['course ','research','group','cs','mirror','google scholar']
#         #find the homepage url and extract all text from it
#         for hdr in all_headers:  #first find the required header
#             if hdr.text.lower() == 'for more information':
#                 next_tag = hdr.find_next('li')
#                 #find <li> which has homepage url
#                 while next_tag is not None: 
#                     cand = next_tag.find('a')
#                     next_tag = next_tag.next_sibling  #sibling means element present at the same level
#                     try:
#                         cand['href']
#                     except:
#                         continue
#                     cand_text = cand.string

#                     if cand_text is not None and (any(hp_txt in cand_text.lower() for hp_txt in homepage_txts) and 
#                         not any(e in cand_text.lower() for e in exceptions)): #compare text to predefined patterns
#                         bio_url = cand['href'] 
#                         homepage_found = True
#                         #check if homepage url is valid
#                         if not(is_valid_homepage(bio_url,fac_url)):
#                             homepage_found = False
#                         else:
#                             try:
#                                 bio_soup = remove_script(get_js_soup(bio_url,driver)) 
#                             except:
#                                 print ('Could not access {}'.format(bio_url))
#                                 homepage_found = False
#                         break 
#                 if homepage_found:
#                     #get all the text from homepage(bio)
#                     bio = process_bio(bio_soup.get_text(separator=' ')) 


#         if not homepage_found:
    bio_url = fac_url #treat faculty profile page as homepage
    
    if(soup.find('div', class_='content_wrapper')):
        profile_sec = soup.find('div', class_= 'content_wrapper')
        bio = process_bio(profile_sec.get_text(separator=' '))
    elif(soup.find('div', id_='content')):
        profile_sec = soup.find('div', id_= 'content')
        bio = process_bio(profile_sec.get_text(separator=' '))
    elif(soup.find('div', clss_='content_item')):
        profile_sec = soup.find('div', class_= 'content_item')
        bio = process_bio(profile_sec.get_text(separator=' '))    
    elif(soup.find('div', title ='Position and address')):
        profile_sec = soup.find('div', title ='Position and address')
        bio = process_bio(profile_sec.get_text(separator=' '))   
    elif(soup.find_all("p")):
        for para in soup.find_all("p"):
            bio = para.get_text(separator=' ').strip()
#     else:
#         html = urllib.request.urlopen(bio_url).read()
#         print(html)
#         bio = text_from_html(html)
#         print(bio)
    return bio_url,bio

In [472]:
#Scrape homepages of all urls
bio_urls, bios = [],[]
tot_urls = len(faculty_links)
for i,link in enumerate(faculty_links):
    print ('-'*20,'Scraping faculty url {}/{}'.format(i+1,tot_urls),'-'*20)
    bio_url,bio = scrape_faculty_page(link,driver)
    print("test", bio_url, bio)
    if bio.strip()!= '' and bio_url.strip()!='':
        bio_urls.append(bio_url.strip())
        bios.append(bio)
driver.close()

-------------------- Scraping faculty url 1/18 --------------------
test https://www.sjsu.edu/people/william.andreopoulos/index.html  Assistant Professor,Department of Computer Science Email Preferred: william.andreopoulos@sjsu.edu Telephone Preferred: 408 924-5085 Office Hours Friday 15:00-17:00pm online Teaching -Fall 2020 CS149 - Operating Systems CS147 - Computer Architecture BIOL221T - Advanced Bioinformatics for Biotechnology Teaching -Summer 2020 CS149 - Operating Systems - Section 80 Teaching -Spring 2020 CS146- Data Structures andAlgorithms - Sections2 & 3 CS149 - Operating Systems - Section 2 Teaching - Fall 2019 CS146- Data Structures andAlgorithms - Sections 6 & 8 BIOL221T - Advanced Bioinformatics for Biotechnology Research Interests Machine learning, software engineering, bioinformatics Education Ph.D. Department of Computer Science and Engineering, York University, Toronto, Canada (2006) M.Sc. Department of Computer Science, University of Toronto, Toronto, Canada (2001) 

test http://www.cs.sjsu.edu/~stamp/ 
-------------------- Scraping faculty url 15/18 --------------------
test http://www.cs.sjsu.edu/~taylor/ Fall 2021


 office hours:
-------------------- Scraping faculty url 16/18 --------------------
test http://stage.sjsu.edu/people/chris.tseng/index.html  Professor, Computer Science Dept. Email Preferred: chris.tseng@sjsu.edu Education PhD, Electrical and Computer Engineering,University of Illinois at Urbana-Champaign, 1988 MS, Mathematics,University of Illinois at Urbana-Champaign, 1985 BS, Electrical Engineering, National Taiwan University, 1982 Links http://www.sjsu.edu/people/chris.tseng/assessment/ 
-------------------- Scraping faculty url 17/18 --------------------
test http://www.cs.sjsu.edu/~wesley/ Ohhh ... if I had only known then as much as I have forgotten by now ...
-------------------- Scraping faculty url 18/18 --------------------
test https://www.sjsu.edu/people/ching-seh.wu/index.html  Email Preferred: ching-seh.wu@sjsu.edu Go

In [473]:
def write_lst(lst,file_):
    with open(file_,'w') as f:
        for l in lst:
            f.write(l)
            f.write('\n')

In [474]:
bio_urls_file = 'bio_urls.txt'
bios_file = 'bios.txt'
write_lst(bio_urls,bio_urls_file)
write_lst(bios,bios_file)