In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver

In [2]:
# Program preview page of the academic calendar; every course link scraped below is taken from this page.
url = "https://academiccalendars.romcmaster.ca/preview_program.php?catoid=41&poid=21907"

In [3]:
# Browser options for the scraping session: tolerate certificate errors and
# use an incognito window so no profile state carries over.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
#chrome_options.add_argument('--headless')

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the original local path is
# kept as the default so existing setups keep working unchanged.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\jerry\\Downloads\\chromedriver",
)

driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)

In [4]:
# Open the program page in the Selenium-controlled browser.
driver.get(url)

# 1. Collect course link texts for driver to click open

In [5]:
# Parse the HTML currently held by the browser with BeautifulSoup (lxml parser).
page_soup = soup(driver.page_source, 'lxml')

In [6]:
# The course links sit inside the first <ul> styled as a disc list;
# grab that list first, then every anchor it contains.
course_list = page_soup.find("ul", {"style": "list-style-type:disc;"})
containers = course_list.findAll("a")
len(containers)

46

In [7]:
# Keep only the visible text of each anchor; Selenium later locates the links by this text.
# Note that the output below contains repeated and subject-less entries (e.g. '4T03').
link_texts = [container.text for container in containers]
link_texts

['CHEMENG 4T03',
 'ENGINEER 4T04',
 'MATLS 4T03',
 'MECHENG 4B03',
 '4BB3',
 '4CC3',
 '4E03',
 '4H03',
 '4I03',
 '4K03',
 '4L03',
 '4N03',
 '4T03',
 '4Y03',
 '4Z03',
 'CHEMENG 4X03',
 '4T04',
 'MATLS 4T03',
 'MECHENG 4B03',
 '4C03',
 '4D03',
 '4E03',
 '4H03',
 '4K03',
 '4N03',
 '4T03',
 '4Y03',
 '4Z03',
 'MECHENG 4S03',
 'CHEMENG 4X03',
 'ENGPHYS 3D03',
 '4D03',
 '4NE3',
 '4P03',
 'MECHENG 4AA3',
 '4I03',
 '4J03',
 '4N03',
 '4O04',
 '4T03',
 '4U03',
 '4W03',
 '4Y03',
 'CIVENG 3K03',
 'COMMERCE 4QA3',
 'ENGINEER 3N03']

# 2. Test run - try to scrape the first course

In [13]:
# Click the first course link; the following cells read the course details from the resulting page.
driver.find_element_by_link_text(link_texts[0]).click()

In [14]:
# Re-parse after the click so the soup reflects the browser's current HTML.
page_soup = soup(driver.page_source, 'lxml')

In [16]:
# Course details live in the first <div> without a class attribute
# inside the first table carrying the "nopom" class.
detail_table = page_soup.find("table", {"class": "nopom"})
container = detail_table.find("div", {"class": None})
container

<div><h3>CHEMENG 4T03 - Applications of Chemical Engineering in Medicine </h3>    3 unit(s) <br/><hr/>Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.<br/>
Three lectures; second term<br/>
<strong>Prerequisite(s):</strong> Registration in Level III or above in any engineering program or registration in Level IV or above in the Integrated Biomedical Engineering &amp; Health Sciences (IBEHS) Program; or permission of the Department<br/></div>

In [17]:
# The <h3> heading reads "<course code> - <course name>"; strip the surrounding whitespace.
course_title = container.h3.text.strip()
course_title

'CHEMENG 4T03 - Applications of Chemical Engineering in Medicine'

In [18]:
# The description is the text that follows "unit(s) " up to the first line break.
text_after_units = container.text.split("unit(s) ")[1]
course_desc = text_after_units.partition("\n")[0]
course_desc

'Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.'

In [19]:
# Click the first course link a second time (presumably this collapses the expanded details - TODO confirm).
driver.find_element_by_link_text(link_texts[0]).click()

In [20]:
# Click the second course link in the list.
driver.find_element_by_link_text(link_texts[1]).click()

### Automating this will be quite difficult: many of the link texts are identical, and identical texts sometimes point to different courses and sometimes to the same course. An automation script will have to handle many edge cases.

In [29]:
# List every element on the page whose link text is "4L03" to see how many there are.
driver.find_elements_by_link_text("4L03")

[<selenium.webdriver.remote.webelement.WebElement (session="32155268da36d6877fa49bfc34e0dad2", element="b5d84518-10f8-4653-be49-af08e9cffa8e")>]

In [30]:
# Click the first (here: only) "4L03" link.
driver.find_elements_by_link_text("4L03")[0].click()

In [31]:
# Re-parse the page and read the heading of the first course-details block found.
page_soup = soup(driver.page_source, 'lxml')
detail_table = page_soup.find("table", {"class": "nopom"})
container = detail_table.find("div", {"class": None})
heading = container.h3
course_title = heading.text.strip()
course_title

'CHEMENG 4K03 - Reactor Design for Heterogeneous Systems'

# 3. Test run revealed some challenges. Try an automation script to scrape as many courses as possible

In [33]:
# Scrape every course reachable from the collected link texts.
#
# For each link text, every link on the page carrying that text is clicked in
# turn (the page is reloaded before each click so the element lookup is fresh),
# and the expanded course block is parsed into (code, name, description).
#
# Duplicate entries in `link_texts` are intentionally NOT removed: the subject
# heuristic below compares against the most recently scraped course, so the
# same bare code (e.g. "4D03") can be rejected on one pass and accepted on a
# later one.

# Fixed pauses, in seconds, that give the browser time to catch up.
PAGE_WAIT_SECONDS = 2    # after loading the page or clicking a link
LOOKUP_WAIT_SECONDS = 1  # after locating link elements

counter = 0
course_codes = []
course_names = []
course_descs = []

for link_text in link_texts:

    # First visit: only count how many links share this text.
    driver.get(url)
    time.sleep(PAGE_WAIT_SECONDS)
    match_count = len(driver.find_elements_by_link_text(link_text))
    time.sleep(LOOKUP_WAIT_SECONDS)

    # Iterate through each link and decide whether to scrape its contents.
    for i in range(match_count):

        driver.get(url)
        time.sleep(PAGE_WAIT_SECONDS)
        link = driver.find_elements_by_link_text(link_text)[i]
        time.sleep(LOOKUP_WAIT_SECONDS)
        link.click()
        time.sleep(PAGE_WAIT_SECONDS)

        page_soup = soup(driver.page_source, 'lxml')
        detail_table = page_soup.find("table", {"class": "nopom"})
        container = detail_table.find("div", {"class": None}) if detail_table is not None else None

        # If the course block did not appear, report it and move on instead of
        # aborting the whole run with an AttributeError.
        if container is None or container.h3 is None:
            print(link_text, " could not be parsed")
            continue

        # The heading reads "<code> - <name>"; the code is the course's true identifier.
        heading = container.h3.text.strip()
        course_code = heading.split(" - ")[0]

        # Don't scrape if the course was already scraped.
        if course_code in course_codes:
            print(course_code, " duplicate")
            continue

        # Special case: there might be a mistake on the website, MECH 4K03 is
        # linked to CHEM 4K03. It has to be scraped manually afterwards.
        if link_text == "4K03":
            continue

        # A link text without a subject (just "4D03") is only trusted when the
        # course it opens has the same subject as the last scraped course. With
        # nothing scraped yet there is no subject to compare against, so the
        # link is skipped rather than indexing into an empty list.
        if len(link_text.split()) == 1 and (
            not course_codes
            or course_code.split()[0] != course_codes[-1].split()[0]
        ):
            print(course_code, " wrong")
            continue

        # Parse the remaining fields BEFORE appending anything, so the three
        # lists can never end up with different lengths.
        heading_parts = heading.split(" - ", 1)  # maxsplit keeps names that contain " - " intact
        body_parts = container.text.split("unit(s) ")
        if len(heading_parts) < 2 or len(body_parts) < 2:
            print(course_code, " could not be parsed")
            continue

        course_codes.append(course_code)
        course_names.append(heading_parts[1])
        course_descs.append(body_parts[1].split("\n")[0])

        print("Scraped ", course_codes[-1])
        counter += 1

print("scraped {} courses".format(counter))

Scraped  CHEMENG 4T03
Scraped  ENGINEER 4T04
Scraped  MATLS 4T03
MATLS 4T03  duplicate
Scraped  MECHENG 4B03
MECHENG 4B03  duplicate
Scraped  MECHENG 4BB3
Scraped  MECHENG 4CC3
Scraped  MECHENG 4E03
MECHENG 4E03  duplicate
Scraped  MECHENG 4H03
MECHENG 4H03  duplicate
Scraped  MECHENG 4I03
MECHENG 4I03  duplicate
Scraped  MECHENG 4L03
Scraped  MECHENG 4N03
MECHENG 4N03  duplicate
MECHENG 4N03  duplicate
Scraped  MECHENG 4T03
MECHENG 4T03  duplicate
MECHENG 4T03  duplicate
Scraped  MECHENG 4Y03
MECHENG 4Y03  duplicate
MECHENG 4Y03  duplicate
Scraped  MECHENG 4Z03
MECHENG 4Z03  duplicate
Scraped  CHEMENG 4X03
CHEMENG 4X03  duplicate
ENGINEER 4T04  duplicate
MATLS 4T03  duplicate
MATLS 4T03  duplicate
MECHENG 4B03  duplicate
MECHENG 4B03  duplicate
MECHENG 4C03  wrong
MECHENG 4D03  wrong
ENGPHYS 4D03  wrong
MECHENG 4E03  duplicate
MECHENG 4E03  duplicate
MECHENG 4H03  duplicate
MECHENG 4H03  duplicate
MECHENG 4N03  duplicate
MECHENG 4N03  duplicate
MECHENG 4N03  duplicate
MECHENG 4T03  du

### Manually scrape MECHENG 4K03, 4C03, and 4D03, which are edge cases that the script above skipped.

In [46]:
# Reload the program page and pause 2 seconds before interacting with it.
driver.get(url)
time.sleep(2)

In [47]:
# Click the first link whose text is "4C03".
driver.find_elements_by_link_text("4C03")[0].click()

In [48]:
# Parse the expanded course block and append its fields to the result lists.
# NOTE: this cell appends in place, so running it twice adds the course twice.
page_soup = soup(driver.page_source, 'lxml')
container = page_soup.find("table", {"class": "nopom"}).find("div", {"class": None})

# Heading reads "<code> - <name>".
heading_parts = container.h3.text.strip().split(" - ")
course_codes.append(heading_parts[0])
course_names.append(heading_parts[1])

# Description: text after "unit(s) " up to the first line break.
text_after_units = container.text.split("unit(s) ")[1]
course_descs.append(text_after_units.split("\n")[0])

In [50]:
# Reload the program page and pause 2 seconds before interacting with it.
driver.get(url)
time.sleep(2)

In [51]:
# Click the second "4K03" link (index 1); presumably this is the one that opens MECHENG 4K03 - verify against the title scraped below.
driver.find_elements_by_link_text("4K03")[1].click()

In [52]:
# Parse the expanded course block and append its fields to the result lists.
# NOTE: this cell appends in place, so running it twice adds the course twice.
page_soup = soup(driver.page_source, 'lxml')
container = page_soup.find("table", {"class": "nopom"}).find("div", {"class": None})

# Heading reads "<code> - <name>".
heading_parts = container.h3.text.strip().split(" - ")
course_codes.append(heading_parts[0])
course_names.append(heading_parts[1])

# Description: text after "unit(s) " up to the first line break.
text_after_units = container.text.split("unit(s) ")[1]
course_descs.append(text_after_units.split("\n")[0])

In [62]:
# Reload the program page and pause 2 seconds before interacting with it.
driver.get(url)
time.sleep(2)

In [63]:
# Click the first link whose text is "4D03".
driver.find_elements_by_link_text("4D03")[0].click()

In [64]:
# Parse the expanded course block and append its fields to the result lists.
# NOTE: this cell appends in place, so running it twice adds the course twice.
page_soup = soup(driver.page_source, 'lxml')
container = page_soup.find("table", {"class": "nopom"}).find("div", {"class": None})

# Heading reads "<code> - <name>".
heading_parts = container.h3.text.strip().split(" - ")
course_codes.append(heading_parts[0])
course_names.append(heading_parts[1])

# Description: text after "unit(s) " up to the first line break.
text_after_units = container.text.split("unit(s) ")[1]
course_descs.append(text_after_units.split("\n")[0])

# 4. Inspect, clean, and write to CSV

In [65]:
# Show all collected course codes (automated run plus the three manual additions) for a visual check.
course_codes

['CHEMENG 4T03',
 'ENGINEER 4T04',
 'MATLS 4T03',
 'MECHENG 4B03',
 'MECHENG 4BB3',
 'MECHENG 4CC3',
 'MECHENG 4E03',
 'MECHENG 4H03',
 'MECHENG 4I03',
 'MECHENG 4L03',
 'MECHENG 4N03',
 'MECHENG 4T03',
 'MECHENG 4Y03',
 'MECHENG 4Z03',
 'CHEMENG 4X03',
 'MECHENG 4S03',
 'ENGPHYS 3D03',
 'ENGPHYS 4D03',
 'ENGPHYS 4NE3',
 'ENGPHYS 4P03',
 'MECHENG 4AA3',
 'MECHENG 4J03',
 'MECHENG 4O04',
 'MECHENG 4U03',
 'MECHENG 4W03',
 'CIVENG 3K03',
 'COMMERCE 4QA3',
 'ENGINEER 3N03',
 'MECHENG 4C03',
 'MECHENG 4K03',
 'MECHENG 4D03']

In [66]:
# Show the collected course names; they are positionally aligned with course_codes.
course_names

['Applications of Chemical Engineering in Medicine',
 'Materials Selection in Design and Manufacturing',
 'Properties and Processing of Composites',
 'Topics in Product Development',
 'Biomechanics',
 'Experimental and Computational Biomechanics',
 'Microelectromechanical Systems (MEMS)',
 'Mechatronics',
 'Noise Analysis and Control',
 'Industrial Design',
 'Nanobio Engineering',
 'Finite Element Applications',
 'Internal Combustion Engines',
 'CAD/CAM/CAE',
 'Polymer Processing',
 'Incompressible Flow',
 'Principles of Nuclear Engineering',
 'Nuclear Reactor Physics',
 'Advanced Nuclear Engineering',
 'Nuclear Power Plant Systems and Operation',
 'Aerodynamics',
 'Introduction to Computational Fluid Mechanics and Heat Transfer',
 'Sustainable Energy Systems',
 'Compressible Flow and Turbomachinery',
 'Air Conditioning and Refrigeration Systems',
 'Introduction to Transportation Engineering',
 'Operations Modelling and Analysis',
 'Electronics and Instrumentation',
 'Production System

In [67]:
# Show the raw descriptions before the clean-up cells below.
course_descs

['Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.',
 'Materials indices, materials selection charts, materials selection and design with mechanical and thermo-mechanical constraints, design of hybrid materials, sustainable materials selection and design.',
 'Intrinsic properties of matrix materials and fibres; mechanics and thermodynamics of interfaces; mechanical properties and fabrication of engineering composites.',
 'Case studies using modern product development methods, value engineering, product specification, rapid product development, lean design and continuous improvement. Product liability and robust design.',
 'Application of mechanical engineering principles to biomechanics problems including cellular biomechanics, hemodynamics, circulatory system, respiratory system, muscles and movement and ske

In [68]:
# Drop everything from the first "Three lectures" onward (scheduling text, not description).
# Descriptions without that marker are left unchanged, so re-running this cell is harmless.
course_descs = [raw_desc.partition("Three lectures")[0] for raw_desc in course_descs]
course_descs

['Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.',
 'Materials indices, materials selection charts, materials selection and design with mechanical and thermo-mechanical constraints, design of hybrid materials, sustainable materials selection and design.',
 'Intrinsic properties of matrix materials and fibres; mechanics and thermodynamics of interfaces; mechanical properties and fabrication of engineering composites.',
 'Case studies using modern product development methods, value engineering, product specification, rapid product development, lean design and continuous improvement. Product liability and robust design.',
 'Application of mechanical engineering principles to biomechanics problems including cellular biomechanics, hemodynamics, circulatory system, respiratory system, muscles and movement and ske

In [69]:
# Drop everything from the first "Four lectures" onward (scheduling text, not description).
# Descriptions without that marker are left unchanged, so re-running this cell is harmless.
course_descs = [raw_desc.partition("Four lectures")[0] for raw_desc in course_descs]
course_descs

['Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.',
 'Materials indices, materials selection charts, materials selection and design with mechanical and thermo-mechanical constraints, design of hybrid materials, sustainable materials selection and design.',
 'Intrinsic properties of matrix materials and fibres; mechanics and thermodynamics of interfaces; mechanical properties and fabrication of engineering composites.',
 'Case studies using modern product development methods, value engineering, product specification, rapid product development, lean design and continuous improvement. Product liability and robust design.',
 'Application of mechanical engineering principles to biomechanics problems including cellular biomechanics, hemodynamics, circulatory system, respiratory system, muscles and movement and ske

In [70]:
# Drop everything from the first "Lectures (three hours)" onward (scheduling text, not description).
# Descriptions without that marker are left unchanged, so re-running this cell is harmless.
course_descs = [raw_desc.partition("Lectures (three hours)")[0] for raw_desc in course_descs]
course_descs

['Applications of chemical engineering principles to biological systems and medical problems including examples from hemodynamics, blood oxygenation, artificial kidney systems, controlled drug release, biosensors and biomaterials.',
 'Materials indices, materials selection charts, materials selection and design with mechanical and thermo-mechanical constraints, design of hybrid materials, sustainable materials selection and design.',
 'Intrinsic properties of matrix materials and fibres; mechanics and thermodynamics of interfaces; mechanical properties and fabrication of engineering composites.',
 'Case studies using modern product development methods, value engineering, product specification, rapid product development, lean design and continuous improvement. Product liability and robust design.',
 'Application of mechanical engineering principles to biomechanics problems including cellular biomechanics, hemodynamics, circulatory system, respiratory system, muscles and movement and ske

In [71]:
# Number of courses collected so far (before de-duplication).
len(course_codes)

31

In [72]:
# Build de-duplicated copies of the three parallel lists, keeping the first
# occurrence of every course code. This guards against the manual cells
# above having been run more than once.
course_codes1, course_names1, course_descs1 = [], [], []

for position, code in enumerate(course_codes):
    if code in course_codes1:
        continue
    course_codes1.append(code)
    course_names1.append(course_names[position])
    course_descs1.append(course_descs[position])

In [73]:
# Sanity check: the de-duplicated count should equal len(course_codes) when no course was added twice.
len(course_codes1) #should be the same

31

In [74]:
# Assemble the de-duplicated, positionally aligned lists into one table.
# pandas is imported in the notebook's top import cell together with the
# other dependencies, so the notebook's requirements are visible in one place.
df = pd.DataFrame({
    "Course Number": course_codes1,
    "Course Name": course_names1,
    "Course Description": course_descs1
})

df

Unnamed: 0,Course Number,Course Name,Course Description
0,CHEMENG 4T03,Applications of Chemical Engineering in Medicine,Applications of chemical engineering principle...
1,ENGINEER 4T04,Materials Selection in Design and Manufacturing,"Materials indices, materials selection charts,..."
2,MATLS 4T03,Properties and Processing of Composites,Intrinsic properties of matrix materials and f...
3,MECHENG 4B03,Topics in Product Development,Case studies using modern product development ...
4,MECHENG 4BB3,Biomechanics,Application of mechanical engineering principl...
5,MECHENG 4CC3,Experimental and Computational Biomechanics,Introduction to experimental and computational...
6,MECHENG 4E03,Microelectromechanical Systems (MEMS),"Introduction, microfabrication and micromachin..."
7,MECHENG 4H03,Mechatronics,Integration of mechanical engineering with ele...
8,MECHENG 4I03,Noise Analysis and Control,Acoustic quantities; noise measurements and an...
9,MECHENG 4L03,Industrial Design,Introduction for engineering students to the t...


In [75]:
# Write the table to a CSV file in the current working directory, without the numeric index column.
df.to_csv('NEW_McMaster_MechEng_Technical_Electives_Courses.csv', index = False)

In [76]:
# End the Selenium session and close the browser now that scraping is finished.
driver.quit()