In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

import pandas as pd
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# Catalog page for the Mechanical Engineering BSc. The fragment names the
# "preliminaryengineeringprogramtext" section whose course table is scraped below.
url = (
    "https://catalog.umanitoba.ca/undergraduate-studies/engineering/"
    "mechanical-engineering/mechanical-engineering-bsc/index.html"
    "#preliminaryengineeringprogramtext"
)

In [4]:
# Browser configuration for the scraping session.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
# Uncomment to run Chrome with the --headless switch:
#chrome_options.add_argument('--headless')

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at your own copy; otherwise the original local path is used.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\jerry\\Downloads\\chromedriver",
)

# NOTE(review): passing the driver path positionally matches the older Selenium
# API this notebook was written against; newer releases may expect a Service
# object instead -- confirm against the installed Selenium version.
driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [10]:
# Navigate the Selenium-controlled browser to the catalog page.
driver.get(url)

# 1. Collect course link texts for driver to click on

In [11]:
# Parse the HTML currently held by the browser, using the lxml parser.
page_soup = soup(markup=driver.page_source, features='lxml')

In [14]:
# Drill down from the preliminary-program container to its course-list table
# and collect every <a> element found in the table body.
containers = (
    page_soup
    .find("div", id="preliminaryengineeringprogramtextcontainer")
    .find("table", class_="sc_courselist")
    .find("tbody")
    .find_all("a")
)
len(containers)

12

In [15]:
# Text of each course anchor, exactly as it appears in the parsed page.
link_texts = [anchor.get_text() for anchor in containers]
link_texts

['CHEM\xa01100',
 'CHEM\xa01122',
 'COMP\xa01012',
 'ENG\xa01430',
 'ENG\xa01440',
 'ENG\xa01450',
 'ENG\xa01460',
 'MATH\xa01210',
 'MATH\xa01510',
 'MATH\xa01710',
 'PHIL\xa01290',
 'PHYS\xa01050']

In [17]:
# Probe: check that Selenium can locate a course link when its label is written
# with an ordinary space. find_element(By.LINK_TEXT, ...) is used because the
# find_element_by_* helpers are not available in newer Selenium releases.
driver.find_element(By.LINK_TEXT, "CHEM 1100")

<selenium.webdriver.remote.webelement.WebElement (session="c8b850ce1c11259ded7fd9b9ccebd5c7", element="a645ba9e-b517-459f-ab27-2bbe03de9e3f")>

In [18]:
# Replace non-breaking spaces ("\xa0") with ordinary spaces so each label can be
# passed to Selenium's link-text lookup.
link_texts = [label.replace("\xa0", " ") for label in link_texts]
link_texts

['CHEM 1100',
 'CHEM 1122',
 'COMP 1012',
 'ENG 1430',
 'ENG 1440',
 'ENG 1450',
 'ENG 1460',
 'MATH 1210',
 'MATH 1510',
 'MATH 1710',
 'PHIL 1290',
 'PHYS 1050']

# 2. Test run - try scraping the first course

In [19]:
# Click the first course link; the cells below read the course details out of
# the resulting page source. find_element(By.LINK_TEXT, ...) replaces the
# find_element_by_* helper, which newer Selenium releases no longer provide.
driver.find_element(By.LINK_TEXT, "CHEM 1100").click()

In [20]:
# Re-parse the page source after the click so the newly shown course details are included.
page_soup = soup(markup=driver.page_source, features='lxml')

In [21]:
# Locate the "courseblock" div nested inside the "lfjsbubble" div; it holds the
# code, title, credit hours and description of the clicked course.
course_block = (
    page_soup
    .find("div", class_="lfjsbubble")
    .find("div", class_="courseblock")
)
course_block

<div class="courseblock"><div class="cols noindent"><span class="text col-3 detail-code margin--tiny text--semibold text--big"><strong>CHEM 1100</strong></span>  <span class="text col-7 detail-title margin--tiny text--semibold text--big"><strong>Introductory Chemistry 1: Atomic and Molecular Structure and Energetics</strong></span>  <span class="text detail-hours_html margin--tiny text--semibold text--big"><strong>3 cr</strong></span>  </div><div class="noindent"><p class="courseblockextra noindent">This course provides a basic understanding of the fundamentals of chemistry. By the end of this course, students will understand the periodic table, energy in chemistry, atomic and molecular structures, and the concept of chemical reactivity. May not be held with the former CHEM 1300 or <a class="bubblelink code" href="/search/?P=CHEM%201301" onclick="return showCourse(this, 'CHEM 1301');" title="CHEM 1301">CHEM 1301</a>.</p></div><div class="noindent"><p class="courseblockextra noindent"><

In [23]:
# The first <span> in the header row carries the course code.
course_code = (
    course_block
    .find("div", class_="cols noindent")
    .find_all("span")[0]
    .get_text()
)
course_code

'CHEM 1100'

In [24]:
# The second <span> in the header row carries the course title.
course_name = (
    course_block
    .find("div", class_="cols noindent")
    .find_all("span")[1]
    .get_text()
)
course_name

'Introductory Chemistry 1: Atomic and Molecular Structure and Energetics'

In [31]:
# The first <p> of the block is the course description; non-breaking spaces
# are converted to ordinary spaces.
first_paragraph = course_block.find("p")
course_desc = first_paragraph.get_text().replace("\xa0", " ")
course_desc

'This course provides a basic understanding of the fundamentals of chemistry. By the end of this course, students will understand the periodic table, energy in chemistry, atomic and molecular structures, and the concept of chemical reactivity. May not be held with the former CHEM 1300 or CHEM 1301.'

In [26]:
# Replace non-breaking spaces in the description with ordinary spaces.
# NOTE: the preceding cell already applies this same replacement when it builds
# course_desc, so running this cell afterwards leaves the value unchanged.
course_desc = course_desc.replace("\xa0", " ")
course_desc

'This course provides a basic understanding of the fundamentals of chemistry. By the end of this course, students will understand the periodic table, energy in chemistry, atomic and molecular structures, and the concept of chemical reactivity. May not be held with the former CHEM 1300 or CHEM 1301.'

In [28]:
# Send the Escape key to the course link (intended to close the pop-up opened by
# the earlier click). Keys and By are imported in the notebook's top import cell
# rather than mid-notebook, and find_element(By.LINK_TEXT, ...) replaces the
# find_element_by_* helper, which newer Selenium releases no longer provide.
driver.find_element(By.LINK_TEXT, "CHEM 1100").send_keys(Keys.ESCAPE)

In [30]:
# Click a second course link to check that another course can be opened after
# the Escape key was sent. Uses find_element(By.LINK_TEXT, ...) because the
# find_element_by_* helpers are not available in newer Selenium releases.
driver.find_element(By.LINK_TEXT, "ENG 1460").click()

# 3. Test run successful. Implement automation script to scrape all courses

In [32]:
# Scrape code, title and description for every label in link_texts by clicking
# each course link in turn and parsing the course block out of the page source.
# The three result lists are parallel: index i in each refers to the same course.
course_names = []
course_codes = []
course_descs = []

# Reload the page so the loop starts from a known state.
driver.get(url)
time.sleep(5)  # fixed pause after the page load

for link_text in link_texts:

    # find_element(By.LINK_TEXT, ...) is used because the find_element_by_*
    # helpers are not available in newer Selenium releases.
    link = driver.find_element(By.LINK_TEXT, link_text)
    time.sleep(2)
    link.click()
    time.sleep(3)  # fixed pause before reading the page source

    page_soup = soup(driver.page_source, 'lxml')
    bubble = page_soup.find("div", {"class": "lfjsbubble"})
    course_block = bubble.find("div", {"class": "courseblock"}) if bubble is not None else None
    if course_block is None:
        # Fail with a message naming the course instead of an opaque
        # AttributeError on None when the expected markup is absent.
        raise RuntimeError("Course block for {} not found in page source".format(link_text))

    # Look the header spans up once: span 0 is the code, span 1 is the title.
    header_spans = course_block.find("div", {"class": "cols noindent"}).findAll("span")
    course_codes.append(header_spans[0].text.strip())
    course_names.append(header_spans[1].text.strip())
    course_descs.append(course_block.find("p").text.strip().replace("\xa0", " "))

    print("Scraped ", course_codes[-1])

    # Send Escape to the link before moving on to the next course.
    link.send_keys(Keys.ESCAPE)
    time.sleep(2)


# One entry is appended per scraped course, so the list length is the count.
counter = len(course_codes)
print("Successfully scraped {} courses".format(counter))
    

Scraped  CHEM 1100
Scraped  CHEM 1122
Scraped  COMP 1012
Scraped  ENG 1430
Scraped  ENG 1440
Scraped  ENG 1450
Scraped  ENG 1460
Scraped  MATH 1210
Scraped  MATH 1510
Scraped  MATH 1710
Scraped  PHIL 1290
Scraped  PHYS 1050
Successfully scraped 12 courses


# 4. Inspect and write to CSV

In [33]:
# Inspect the scraped course codes.
course_codes

['CHEM 1100',
 'CHEM 1122',
 'COMP 1012',
 'ENG 1430',
 'ENG 1440',
 'ENG 1450',
 'ENG 1460',
 'MATH 1210',
 'MATH 1510',
 'MATH 1710',
 'PHIL 1290',
 'PHYS 1050']

In [34]:
# Inspect the scraped course titles.
course_names

['Introductory Chemistry 1: Atomic and Molecular Structure and Energetics',
 'Introduction to Chemical Techniques for Engineering 1',
 'Computer Programming for Scientists and Engineers',
 'Design in Engineering',
 'Introduction to Statics',
 'Introduction to Electrical and Computer Engineering',
 'Introduction to Thermal Sciences',
 'Techniques of Classical and Linear Algebra',
 'Applied Calculus 1',
 'Applied Calculus 2',
 'Critical Thinking',
 'Physics 1: Mechanics']

In [35]:
# Inspect the scraped course descriptions.
course_descs

['This course provides a basic understanding of the fundamentals of chemistry. By the end of this course, students will understand the periodic table, energy in chemistry, atomic and molecular structures, and the concept of chemical reactivity. May not be held with the former CHEM 1300 or CHEM 1301.',
 'For Price Faculty of Engineering students only. This course builds understanding in chemistry through active learning in the lab. By performing lab experiments, students will gain skills in making observations, safe handling of chemicals, handling laboratory equipment, quantitative analysis, data processing, and scientific communication. These skills are fundamental for student success in chemistry. In addition, students will be given a broader appreciation of chemistry in the world by introducing them to chemical sustainability, chemical responsibility and chemical applications. May not be held with CHEM 1120, the former CHEM 1310 or CHEM 1311.',
 '(Lab Required) An introduction to com

In [36]:
# Combine the three parallel lists into a single table, one row per course.
# pandas is imported in the notebook's top import cell rather than here.
df = pd.DataFrame({
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
})

df

Unnamed: 0,Course Number,Course Name,Course Description
0,CHEM 1100,Introductory Chemistry 1: Atomic and Molecular...,This course provides a basic understanding of ...
1,CHEM 1122,Introduction to Chemical Techniques for Engine...,For Price Faculty of Engineering students only...
2,COMP 1012,Computer Programming for Scientists and Engineers,(Lab Required) An introduction to computer pro...
3,ENG 1430,Design in Engineering,The creative process; the design process; work...
4,ENG 1440,Introduction to Statics,(Lab required) Statics of particles; rigid bod...
5,ENG 1450,Introduction to Electrical and Computer Engine...,"(Lab required) Part I: Current, voltage, energ..."
6,ENG 1460,Introduction to Thermal Sciences,(Lab required) Properties of pure substances; ...
7,MATH 1210,Techniques of Classical and Linear Algebra,(Lab Required) To introduce a variety of pract...
8,MATH 1510,Applied Calculus 1,(Lab Required) Functions and graphs; limits an...
9,MATH 1710,Applied Calculus 2,(Lab Required) Applications of integration to ...


In [37]:
# Write the table to a CSV file in the working directory; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv(path_or_buf='UManitoba_Preliminary_Engineering_Program_Courses.csv', index=False)

In [38]:
# Shut down the Selenium browser session now that scraping is finished.
driver.quit()