In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver

In [2]:
# Program preview page whose course list is scraped by the cells below.
url = (
    "https://academiccalendars.romcmaster.ca/preview_program.php?catoid=41&poid=22151"
)

In [3]:
# Options for the Chrome session that renders the calendar page.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
# Uncomment to pass Chrome the --headless flag as well.
#chrome_options.add_argument('--headless')

# The chromedriver location is machine-specific, so let it be overridden via
# the CHROMEDRIVER_PATH environment variable. When the variable is unset (or
# empty) the original hard-coded path is used, so existing setups keep working.
chromedriver_path = (
    os.environ.get("CHROMEDRIVER_PATH")
    or "C:\\Users\\jerry\\Downloads\\chromedriver"
)
driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [4]:
# Open the program page in the browser; later cells read its HTML from
# driver.page_source.
driver.get(url)

# 1. Collect course link texts for driver to click on

In [5]:
# Parse the HTML currently held by the browser, using the lxml parser.
page_soup = soup(markup=driver.page_source, features='lxml')

In [6]:
# Collect every <li> tagged with the "acalog-course" class (one per course
# entry), then display how many were found.
containers = page_soup.find_all("li", class_="acalog-course")
len(containers)

8

In [9]:
# The first <span> inside each course <li> holds the "<code> - <name>" text;
# these strings are later used to locate the matching links in the browser.
link_texts = [
    course_li.find("span").get_text().strip() for course_li in containers
]
link_texts

['CHEM 1E03 - General Chemistry for Engineering I',
 'ENGINEER 1P13 A/B - Integrated Cornerstone Design Projects in Engineering',
 'MATH 1ZA3 - Engineering Mathematics I',
 'MATH 1ZB3 - Engineering Mathematics II-A',
 'MATH 1ZC3 - Engineering Mathematics II-B',
 'PHYSICS 1D03 - Introductory Mechanics',
 'PHYSICS 1E03 - Waves, Electricity and Magnetic Fields',
 'WHMIS 1A00 - Introduction to Health and Safety']

# 2. Quick test and then script to click open all course info boxes

In [10]:
# Quick manual check: locate the first course link by its text and click it.
first_course_link = driver.find_element_by_link_text(link_texts[0])
first_course_link.click()

In [11]:
# Quick manual check: locate the second course link by its text and click it.
second_course_link = driver.find_element_by_link_text(link_texts[1])
second_course_link.click()

In [12]:
from selenium.webdriver.common.keys import Keys

# Reload the program page and pause before interacting with it
# (presumably to start from a page with no course boxes opened -- confirm).
driver.get(url)
time.sleep(5)


def _open_course_box(text):
    """Click the link whose visible text is ``text`` and print a confirmation.

    Waits 2 seconds between locating and clicking the link, and 3 seconds
    after the click.
    NOTE(review): the fixed sleeps look like a stand-in for explicit waits on
    the page content -- consider WebDriverWait if timing proves flaky.
    """
    anchor = driver.find_element_by_link_text(text)
    time.sleep(2)
    anchor.click()
    time.sleep(3)
    print("clicked {}".format(text))


# Click every collected course link, in page order.
for link_text in link_texts:
    _open_course_box(link_text)

clicked CHEM 1E03 - General Chemistry for Engineering I
clicked ENGINEER 1P13 A/B - Integrated Cornerstone Design Projects in Engineering
clicked MATH 1ZA3 - Engineering Mathematics I
clicked MATH 1ZB3 - Engineering Mathematics II-A
clicked MATH 1ZC3 - Engineering Mathematics II-B
clicked PHYSICS 1D03 - Introductory Mechanics
clicked PHYSICS 1E03 - Waves, Electricity and Magnetic Fields
clicked WHMIS 1A00 - Introduction to Health and Safety


# 3. Obtain updated page's html and scrape course codes, names, and descriptions

In [13]:
# Re-parse the browser's HTML now that the course links have been clicked.
page_soup = soup(markup=driver.page_source, features='lxml')

In [14]:
# Select the course <li> elements that carry BOTH the "acalog-course" and
# "acalog-course-open" classes. A CSS selector is used instead of matching the
# exact attribute string "acalog-course acalog-course-open", because an exact
# string match stops working if the classes appear in a different order or an
# additional class is present on the element.
containers = page_soup.select("li.acalog-course.acalog-course-open")
len(containers)

8

In [15]:
# For each opened course, take the text of the first <div> that has no class
# attribute; it holds the full course blurb (title, units, description, ...).
course_infos = [
    open_li.find("div", {"class": None}).get_text() for open_li in containers
]
course_infos

['CHEM 1E03 - General Chemistry for Engineering I     3 unit(s) An introduction to chemical principles for Engineering students, including reactivity, bonding, structure, energetics and electrochemistry.\nThree lectures, one lab (two and one half hours) every other week; one term\nPrerequisite(s): Registration in a program in Engineering\nAntirequisite(s): CHEM 1A03\xa0\nNot open to students with credit or registration in ISCI 1A24 A/B\xa0.CloseClose',
 'ENGINEER 1P13 A/B - Integrated Cornerstone Design Projects in Engineering     13 unit(s) Project-based integrated learning course that introduces a range of foundational (i.e., cornerstone) topics in engineering, including engineering design and communication, computation, graphic design, materials and the engineering profession. These topics are applied through a series of integrated team-based design projects in simulated workplace environments.\nThree lectures, one tutorial (two hours), two labs (three hours each); both terms\nPrere

In [16]:
# Each info string reads "<code> - <name>     <n> unit(s) <description>\n<details>".
# Keep everything after the FIRST "unit(s) " marker (maxsplit=1) so that a
# second "unit(s) " occurring inside the description cannot cut it short, then
# keep only the first line, which is the description itself.
course_descs = [
    course_info.split("unit(s) ", 1)[1].split("\n", 1)[0]
    for course_info in course_infos
]
course_descs

['An introduction to chemical principles for Engineering students, including reactivity, bonding, structure, energetics and electrochemistry.',
 'Project-based integrated learning course that introduces a range of foundational (i.e., cornerstone) topics in engineering, including engineering design and communication, computation, graphic design, materials and the engineering profession. These topics are applied through a series of integrated team-based design projects in simulated workplace environments.',
 'Functions: limits, continuity, derivatives, optimization, curve sketching. Antiderivative, definite integral, techniques of integration, with applications.',
 'Techniques of integration, applications of definite integrals, differential equations, polar coordinates, parametrized curves. Sequences, infinite series, power series. Partial derivatives.',
 'Vector spaces given by solutions to linear systems. Linear independence, dimension. Determinants. Eigenvalues, eigenvectors and diago

In [17]:
# The course code is whatever precedes the first " - " in each link text.
course_codes = [title.partition(" - ")[0] for title in link_texts]
course_codes

['CHEM 1E03',
 'ENGINEER 1P13 A/B',
 'MATH 1ZA3',
 'MATH 1ZB3',
 'MATH 1ZC3',
 'PHYSICS 1D03',
 'PHYSICS 1E03',
 'WHMIS 1A00']

In [18]:
# The course name is everything after the FIRST " - " separator. Splitting at
# most once (maxsplit=1) keeps names that themselves contain " - " intact
# instead of truncating them at the second separator.
course_names = [link_text.split(" - ", 1)[1] for link_text in link_texts]
course_names

['General Chemistry for Engineering I',
 'Integrated Cornerstone Design Projects in Engineering',
 'Engineering Mathematics I',
 'Engineering Mathematics II-A',
 'Engineering Mathematics II-B',
 'Introductory Mechanics',
 'Waves, Electricity and Magnetic Fields',
 'Introduction to Health and Safety']

# 4. Write to CSV

In [19]:
import pandas as pd

# Assemble the three parallel lists into one table, one row per course.
course_columns = {
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
}
df = pd.DataFrame(course_columns)

df

Unnamed: 0,Course Number,Course Name,Course Description
0,CHEM 1E03,General Chemistry for Engineering I,An introduction to chemical principles for Eng...
1,ENGINEER 1P13 A/B,Integrated Cornerstone Design Projects in Engi...,Project-based integrated learning course that ...
2,MATH 1ZA3,Engineering Mathematics I,"Functions: limits, continuity, derivatives, op..."
3,MATH 1ZB3,Engineering Mathematics II-A,"Techniques of integration, applications of def..."
4,MATH 1ZC3,Engineering Mathematics II-B,Vector spaces given by solutions to linear sys...
5,PHYSICS 1D03,Introductory Mechanics,A course for engineering students. Principles ...
6,PHYSICS 1E03,"Waves, Electricity and Magnetic Fields",A course for engineering students. Oscillation...
7,WHMIS 1A00,Introduction to Health and Safety,Introduction to safety guidelines at McMaster ...


In [20]:
# Save the course table to CSV, leaving out the DataFrame's row index.
df.to_csv(
    'McMaster_EngineeringI_(Year1)_Courses.csv',
    index=False,
)

In [21]:
# Shut down the browser session now that scraping is finished.
driver.quit()