In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
# Program page that lists the first-year engineering courses; this is the page the driver loads.
url = 'https://programs.usask.ca/engineering/first-year/index.php#Year14144creditunits'

In [3]:
# Browser flags: ignore certificate errors, use an incognito profile, and run
# without opening a visible window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the fallback is the path
# that was previously hard-coded here.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\Users\\jerry\\Downloads\\chromedriver"
)

# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; newer Selenium releases expect a Service object instead -- confirm
# which Selenium version this notebook is meant to run against.
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)

In [4]:
# Open the program page in the headless browser.
driver.get(url)
# Fixed 3-second pause before the page source is read; presumably this gives the
# page time to finish rendering -- TODO confirm the delay is sufficient.
time.sleep(3)

# 1. Collect course link texts for webdriver to click on

In [5]:
# Snapshot of the current page's HTML as a single string, as reported by the browser.
page_html = driver.page_source

In [6]:
# Course labels look like "GE 102.2": upper-case subject letters, a space, a
# three-digit number, then "." and one digit.
# The pattern is a raw string so that "\." reaches the regex engine as an escaped
# dot; in a normal string literal "\." is an invalid escape sequence and Python
# emits a warning for it.
# The search runs over the whole HTML source, so a match is not guaranteed to be
# the text of a clickable link.
link_texts = re.findall(r"[A-Z]+ [0-9]{3}\.[0-9]", page_html)

# dict.fromkeys drops repeated labels while keeping first-seen order.
link_texts = list(dict.fromkeys(link_texts))

link_texts

['GE 102.2',
 'GE 112.1',
 'GE 122.2',
 'GE 132.1',
 'GE 142.2',
 'GE 152.1',
 'CMPT 142.3',
 'MATH 133.4',
 'PHYS 152.1',
 'CHEM 142.1',
 'GEOL 102.1',
 'BIOL 102.1',
 'GE 103.1',
 'GE 123.3',
 'GE 133.2',
 'GE 143.2',
 'GE 153.2',
 'GE 163.2',
 'CHEM 146.3',
 'MATH 134.3',
 'PHYS 156.3',
 'CMPT 146.3',
 'ME 113.3',
 'CHE 113.3',
 'CE 271.2']

In [7]:
# Number of distinct course labels found on the program page.
len(link_texts)

25

# 2. Test run - try to scrape the first course

In [8]:
# Open the first course by clicking the link whose visible text equals its label.
# find_element(By.LINK_TEXT, ...) is used instead of find_element_by_link_text:
# the By-based call exists in both Selenium 3 and 4, whereas the find_element_by_*
# helpers were removed in Selenium 4.3.
link = driver.find_element(By.LINK_TEXT, link_texts[0])
link.click()
# Fixed pause so the course page can load before its source is read.
time.sleep(2)
driver.page_source

'<html><head>\n    <meta charset="UTF-8">\n    <title>\n       Introduction to Engineering I (GE 102) - Course Catalogue | University of\n      Saskatchewan\n    </title>\n    <meta content="This course includes two concurrent modules.  Module 1 introduces students to the profession of engineering and life as an engineering student.  The course will allow students to learn, apply and reflect upon strategies for success in engineering in areas including: well-being, group dynamics, conflict resolution, time management, goal setting, planning, studying, problem solving and academic honesty.  Module 2 introduces students to important aspects of the culture and worldviews of Indigenous Peoples and contextualizes the engineering profession within those worldviews.  The course will introduce students to the engineer’s legal and moral duty to consult with affected communities and examples of historical and contemporary influences of Indigenous worldviews on technology and engineering design. 

In [9]:
# Parse the HTML of the page the driver is currently on, using the lxml parser.
page_soup = soup(driver.page_source, 'lxml')

In [14]:
# Course code: text of the page-title <h1> with its last two characters removed.
# Presumably those two characters are the ".N" suffix seen in labels such as
# "GE 102.2" -- TODO confirm this holds for every course page.
page_soup.find("h1", {"class": "uofs-page-title"}).text.strip()[:-2]

'GE 102'

In [13]:
# Course name: text of the first <p class="lead"> on the page.
page_soup.find("p", {"class": "lead"}).text.strip()

'Introduction to Engineering I'

In [21]:
# Course description: first <p> inside the second "uofs-subsection" <div>.
# NOTE(review): relies on the description always being in the second subsection.
page_soup.findAll("div", {"class": "uofs-subsection"})[1].find("p").text.strip()

'This course includes two concurrent modules.  Module 1 introduces students to the profession of engineering and life as an engineering student.  The course will allow students to learn, apply and reflect upon strategies for success in engineering in areas including: well-being, group dynamics, conflict resolution, time management, goal setting, planning, studying, problem solving and academic honesty.  Module 2 introduces students to important aspects of the culture and worldviews of Indigenous Peoples and contextualizes the engineering profession within those worldviews.  The course will introduce students to the engineer’s legal and moral duty to consult with affected communities and examples of historical and contemporary influences of Indigenous worldviews on technology and engineering design.  There is also discussion about the importance of inclusion of, and respect for, all people.'

In [22]:
# Step back in the browser history, leaving the course page that was opened for the test run.
driver.back()

# 3. Test run successful. Implement automation script to scrape all courses

In [23]:
# Scrape every course found in step 1 into three parallel lists:
# course_codes[i], course_names[i] and course_descs[i] describe the same course.
course_codes = []
course_names = []
course_descs = []
counter = 0

for link_text in link_texts:

    # Go to the course page. The labels came from a regex over the raw HTML, so a
    # label may have no matching link; those are reported and skipped.
    # find_element(By.PARTIAL_LINK_TEXT, ...) replaces
    # find_element_by_partial_link_text, which was removed in Selenium 4.3; the
    # By-based call is available in Selenium 3 as well.
    try:
        link = driver.find_element(By.PARTIAL_LINK_TEXT, link_text)
    except NoSuchElementException:
        print("no link for {}".format(link_text))
        continue

    time.sleep(2)
    link.click()
    time.sleep(2)
    page_soup = soup(driver.page_source, 'lxml')

    # Extract all three fields before storing any of them, so that a page with an
    # unexpected layout cannot leave the parallel lists with different lengths.
    # [:-2] drops the last two characters of the title (presumably the ".N"
    # suffix seen in labels such as "GE 102.2" -- TODO confirm).
    course_code = page_soup.find("h1", {"class": "uofs-page-title"}).text.strip()[:-2]
    course_name = page_soup.find("p", {"class": "lead"}).text.strip()
    course_desc = page_soup.findAll("div", {"class": "uofs-subsection"})[1].find("p").text.strip()

    course_codes.append(course_code)
    course_names.append(course_name)
    course_descs.append(course_desc)

    print("Scraped ", course_code)
    counter += 1

    # Return to the program page so the next link can be located.
    driver.back()
    time.sleep(2)

print("Finished scraping {} courses".format(counter))

Scraped  GE 102
Scraped  GE 112
Scraped  GE 122
Scraped  GE 132
Scraped  GE 142
Scraped  GE 152
Scraped  CMPT 142
Scraped  MATH 133
Scraped  PHYS 152
Scraped  CHEM 142
Scraped  GEOL 102
Scraped  BIOL 102
Scraped  GE 103
Scraped  GE 123
Scraped  GE 133
Scraped  GE 143
Scraped  GE 153
Scraped  GE 163
Scraped  CHEM 146
Scraped  MATH 134
Scraped  PHYS 156
Scraped  CMPT 146
Scraped  ME 113
Scraped  CHE 113
Scraped  CE 271
Finished scraping 25 courses


# 4. Inspect, clean, and write to CSV

In [24]:
# Inspect the scraped course codes.
course_codes

['GE 102',
 'GE 112',
 'GE 122',
 'GE 132',
 'GE 142',
 'GE 152',
 'CMPT 142',
 'MATH 133',
 'PHYS 152',
 'CHEM 142',
 'GEOL 102',
 'BIOL 102',
 'GE 103',
 'GE 123',
 'GE 133',
 'GE 143',
 'GE 153',
 'GE 163',
 'CHEM 146',
 'MATH 134',
 'PHYS 156',
 'CMPT 146',
 'ME 113',
 'CHE 113',
 'CE 271']

In [25]:
# Inspect the scraped course names.
course_names

['Introduction to Engineering I',
 'Engineering Discipline Experience',
 'Engineering Mechanics I',
 'Engineering Communications I',
 'Design I',
 'Electrical Circuits I',
 'Introduction to Computer Science for Engineers',
 'Engineering Mathematics I',
 'Introduction to Atoms and Nuclei for Engineering',
 'Global Impact of Chemistry for Engineering',
 'Introduction to Geology for Engineering',
 'Nature for Engineering',
 'Introduction to Engineering II',
 'Engineering Mechanics II',
 'Engineering Communication II',
 'Design II',
 'Electrical Circuits II',
 'Process Engineering',
 'General Chemistry for Engineering',
 'Engineering Mathematics II',
 'Electromagnetism and Waves for Engineering',
 'Principles of Computer Science for Engineers',
 'Engineering Analysis I',
 'Unit Operations in Chemical Process Engineering',
 'Spring Surveying Camp']

In [26]:
# Inspect the scraped course descriptions.
course_descs

['This course includes two concurrent modules.  Module 1 introduces students to the profession of engineering and life as an engineering student.  The course will allow students to learn, apply and reflect upon strategies for success in engineering in areas including: well-being, group dynamics, conflict resolution, time management, goal setting, planning, studying, problem solving and academic honesty.  Module 2 introduces students to important aspects of the culture and worldviews of Indigenous Peoples and contextualizes the engineering profession within those worldviews.  The course will introduce students to the engineer’s legal and moral duty to consult with affected communities and examples of historical and contemporary influences of Indigenous worldviews on technology and engineering design.  There is also discussion about the importance of inclusion of, and respect for, all people.',
 'This course will provide students with an opportunity to have a meaningful experience for en

In [27]:
# Courses that mech eng students do not take: the fourth-last scraped course
# (CMPT 146) and the last two (CHE 113 and CE 271).
irrelevant_codes = ["CMPT 146", "CHE 113", "CE 271"]

mech_codes = []
mech_names = []
mech_descs = []

# The three course_* lists are parallel, so one position selects a full record.
for position, code in enumerate(course_codes):
    if code in irrelevant_codes:
        continue
    mech_codes.append(code)
    mech_names.append(course_names[position])
    mech_descs.append(course_descs[position])

mech_codes

['GE 102',
 'GE 112',
 'GE 122',
 'GE 132',
 'GE 142',
 'GE 152',
 'CMPT 142',
 'MATH 133',
 'PHYS 152',
 'CHEM 142',
 'GEOL 102',
 'BIOL 102',
 'GE 103',
 'GE 123',
 'GE 133',
 'GE 143',
 'GE 153',
 'GE 163',
 'CHEM 146',
 'MATH 134',
 'PHYS 156',
 'ME 113']

In [29]:
# Inspect the course names kept after filtering.
mech_names

['Introduction to Engineering I',
 'Engineering Discipline Experience',
 'Engineering Mechanics I',
 'Engineering Communications I',
 'Design I',
 'Electrical Circuits I',
 'Introduction to Computer Science for Engineers',
 'Engineering Mathematics I',
 'Introduction to Atoms and Nuclei for Engineering',
 'Global Impact of Chemistry for Engineering',
 'Introduction to Geology for Engineering',
 'Nature for Engineering',
 'Introduction to Engineering II',
 'Engineering Mechanics II',
 'Engineering Communication II',
 'Design II',
 'Electrical Circuits II',
 'Process Engineering',
 'General Chemistry for Engineering',
 'Engineering Mathematics II',
 'Electromagnetism and Waves for Engineering',
 'Engineering Analysis I']

In [31]:
# Inspect the course descriptions kept after filtering.
mech_descs

['This course includes two concurrent modules.  Module 1 introduces students to the profession of engineering and life as an engineering student.  The course will allow students to learn, apply and reflect upon strategies for success in engineering in areas including: well-being, group dynamics, conflict resolution, time management, goal setting, planning, studying, problem solving and academic honesty.  Module 2 introduces students to important aspects of the culture and worldviews of Indigenous Peoples and contextualizes the engineering profession within those worldviews.  The course will introduce students to the engineer’s legal and moral duty to consult with affected communities and examples of historical and contemporary influences of Indigenous worldviews on technology and engineering design.  There is also discussion about the importance of inclusion of, and respect for, all people.',
 'This course will provide students with an opportunity to have a meaningful experience for en

In [32]:
import pandas as pd

# One CSV column per scraped field; the three mech_* lists are parallel.
csv_columns = {
    "Course Number": mech_codes,
    "Course Name": mech_names,
    "Course Description": mech_descs,
}
df = pd.DataFrame(csv_columns)

# index=False keeps the DataFrame's row index out of the file.
# NOTE(review): the file name spells "Engineeering" with three e's; it is kept
# unchanged here in case something else reads this exact path.
df.to_csv('USaskatchewan_Engineeering_Common_First_Year_Courses.csv', index=False)


In [34]:
# Sanity check: number of courses kept after filtering.
len(mech_codes)

22

In [None]:
# End the WebDriver session started earlier in the notebook.
driver.quit()