In [26]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver

In [27]:
# University of Saskatchewan B.E. Mechanical Engineering program page that the driver loads and parses.
url = 'https://programs.usask.ca/engineering/mechanical-engineering/be-mechanical-engineering.php#Year14144creditunits'

In [28]:
# Configure a headless, incognito Chrome session that ignores certificate errors.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run this notebook on another machine; the fallback is the
# original hard-coded local path, so existing setups keep working unchanged.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\Users\\jerry\\Downloads\\chromedriver"
)

driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [29]:
# Navigate to the program page, then pause for a fixed 3 seconds
# (presumably to let the page finish loading before its source is read).
driver.get(url)
time.sleep(3)

# 1. Collect course link texts for driver to click on

In [30]:
# Parse the HTML currently loaded in the driver using the lxml parser.
page_soup = soup(driver.page_source, 'lxml')

In [31]:
#get a list of all the courses that should be scraped

# Course link text looks like "ME 214.3": subject letters, a space, a
# three-digit number, a dot and one digit.
# Raw string on purpose: "\." inside a plain string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
course_code_pattern = re.compile(r"[A-Z]+ [0-9]{3}\.[0-9]")

# (tag name, element id) of every page container to search, in page order.
course_containers = [
    #core year 2-4 courses for mech eng
    ("section", "Year236creditunits"),
    ("section", "Year339creditunits"),
    ("section", "Year436creditunits"),
]
#technical and design electives for mech eng
course_containers += [
    ("div", "Electives-subsection-{}".format(i)) for i in range(2, 8)
]

relevant_courses = []
for tag_name, element_id in course_containers:
    container = page_soup.find(tag_name, {"id": element_id})
    # find() returns None for a missing container; str(None) == "None" holds
    # no match for the pattern, so such a container simply contributes nothing.
    relevant_courses += course_code_pattern.findall(str(container))

relevant_courses

['CMPT 141.3',
 'EE 204.3',
 'GE 213.3',
 'MATH 223.3',
 'ME 214.3',
 'ME 227.3',
 'MATH 224.3',
 'ME 215.3',
 'ME 226.3',
 'ME 229.3',
 'ME 251.3',
 'RCM 200.3',
 'ME 313.3',
 'ME 321.3',
 'ME 324.3',
 'ME 327.3',
 'ME 330.3',
 'ME 314.3',
 'ME 323.3',
 'ME 328.3',
 'ME 329.3',
 'ME 335.3',
 'ME 352.3',
 'GE 348.3',
 'ME 417.3',
 'ME 418.3',
 'ME 431.3',
 'GE 449.3',
 'GE 495.6',
 'ME 495.6',
 'GEOE 377.3',
 'GEOE 466.3',
 'BLE 313.3',
 'CHE 464.3',
 'EE 471.3',
 'GEOE 380.3',
 'ME 460.3',
 'ME 461.3',
 'ME 463.3',
 'ME 472.3',
 'ME 475.3',
 'ME 477.3',
 'ME 478.3',
 'CHE 453.3',
 'ME 450.3',
 'ME 462.3',
 'ME 471.3',
 'ME 473.3',
 'ME 476.3',
 'ME 496.3',
 'ME 490.3',
 'ME 492.3',
 'GE 496.3',
 'ME 494.3',
 'ME 491.3',
 'ME 493.3',
 'ME 497.3']

In [32]:
# Number of course link texts collected from the program page.
len(relevant_courses)

57

# 2. Automation script to scrape all courses

In [33]:
from selenium.common.exceptions import NoSuchElementException

# Fixed pause (seconds) applied before and after each navigation step.
PAGE_PAUSE_SECONDS = 2

# Parallel result lists: index i of each list describes the same course.
course_codes = []
course_names = []
course_descs = []
counter = 0

for course in relevant_courses:

    #go to course page
    try:
        link = driver.find_element_by_link_text(course)
    except NoSuchElementException:
        print("no link for {}".format(course))
        continue

    time.sleep(PAGE_PAUSE_SECONDS)
    link.click()
    time.sleep(PAGE_PAUSE_SECONDS)
    # Use a separate name so the parsed program page in `page_soup` is not
    # clobbered; the course-collection cell can then be re-run safely.
    course_soup = soup(driver.page_source, 'lxml')

    #scrape data
    # Locate every element first and append only when all three are present,
    # so the result lists can never end up with different lengths.
    title_tag = course_soup.find("h1", {"class": "uofs-page-title"})
    name_tag = course_soup.find("p", {"class": "lead"})
    subsections = course_soup.find_all("div", {"class": "uofs-subsection"})
    # The description is read from the first <p> of the second subsection.
    desc_tag = subsections[1].find("p") if len(subsections) > 1 else None

    if title_tag is None or name_tag is None or desc_tag is None:
        # Unexpected page layout: report and skip instead of aborting the run.
        print("could not parse page for {}".format(course))
    else:
        # Drop the last two characters of the title (e.g. the ".3" suffix).
        course_code = title_tag.text.strip()[:-2]
        course_codes.append(course_code)
        course_names.append(name_tag.text.strip())
        course_descs.append(desc_tag.text.strip())

        print("Scraped ", course_code)
        counter += 1

    # Always return to the program page so the next link can be located.
    driver.back()
    time.sleep(PAGE_PAUSE_SECONDS)

print("Finished scraping {} courses".format(counter))

Scraped  CMPT 141
Scraped  EE 204
Scraped  GE 213
Scraped  MATH 223
Scraped  ME 214
Scraped  ME 227
Scraped  MATH 224
Scraped  ME 215
Scraped  ME 226
Scraped  ME 229
Scraped  ME 251
Scraped  RCM 200
Scraped  ME 313
Scraped  ME 321
Scraped  ME 324
Scraped  ME 327
Scraped  ME 330
Scraped  ME 314
Scraped  ME 323
Scraped  ME 328
Scraped  ME 329
Scraped  ME 335
Scraped  ME 352
Scraped  GE 348
Scraped  ME 417
Scraped  ME 418
Scraped  ME 431
Scraped  GE 449
Scraped  GE 495
Scraped  ME 495
Scraped  GEOE 377
Scraped  GEOE 466
Scraped  BLE 313
Scraped  CHE 464
Scraped  EE 471
Scraped  GEOE 380
Scraped  ME 460
no link for ME 461.3
no link for ME 463.3
no link for ME 472.3
Scraped  ME 475
Scraped  ME 477
Scraped  ME 478
Scraped  CHE 453
Scraped  ME 450
Scraped  ME 462
Scraped  ME 471
Scraped  ME 473
Scraped  ME 476
Scraped  ME 496
Scraped  ME 490
Scraped  ME 492
Scraped  GE 496
Scraped  ME 494
Scraped  ME 491
Scraped  ME 493
Scraped  ME 497
Finished scraping 54 courses


# 3. Inspect and write to CSV

In [None]:
# Inspect the scraped course codes.
course_codes

In [22]:
# Inspect the scraped course names.
course_names

['Introduction to Computer Science',
 'Basic Electronics and Electrical Power',
 'Mechanics of Materials',
 'Calculus III for Engineers',
 'Introduction to Materials and Manufacturing',
 'Thermodynamics I',
 'Calculus IV for Engineers',
 'Fluid Mechanics I',
 'Mechanics III',
 'Introduction to Mechanical Engineering Design',
 'Engineering Analysis I',
 'Engineering Professional Communication',
 'Mechanics of Materials I',
 'Engineering Analysis II',
 'Engineering Materials',
 'Heat Transfer',
 'Manufacturing Processes',
 'Machine Design I',
 'Mechanics of Materials II',
 'Mechanical Engineering Laboratory I',
 'Collaborative Design and Manufacturing',
 'Fluid Mechanics II',
 'Engineering Analysis III',
 'Engineering Economics',
 'Thermodynamics II',
 'Mechanical Engineering Laboratory II',
 'Control Systems',
 'Engineering in Society',
 'Technological Innovation Capstone Design Project',
 'Industrial Design Project',
 'Fundamentals of Mining and Mineral Processing',
 'Petroleum Geomech

In [23]:
# Inspect the scraped course descriptions.
course_descs

['An introduction to computer science and problem solving using procedural programming. This course introduces the basic computer science and computer programming principles of algorithms, abstraction, encapsulation, variables, conditional branching, repetition, functions, recursion, and elementary data structures. These concepts are applied to problem solving applications such as data analysis and visualization, simulation, text processing, and image processing. The programming skills acquired in this course are applicable in all fields of study, the work-place, and personal projects.',
 "This is a basic course on electrical topics for non-electrical engineering disciplines. It explores basic electrical and electronic devices as well as AC power and energy.  Topics include force on a wire carrying a current, Faraday's and Lenz's Laws, electromagnetic induction, inductors, self and mutual inductance, DC inductive transient circuits, basic generator and motor principles, basic transform

In [24]:
import pandas as pd

# Combine the three scraped lists into one table, one column per list.
scraped_columns = {
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
}
df = pd.DataFrame(scraped_columns)

df


Unnamed: 0,Course Number,Course Name,Course Description
0,CMPT 141,Introduction to Computer Science,An introduction to computer science and proble...
1,EE 204,Basic Electronics and Electrical Power,This is a basic course on electrical topics fo...
2,GE 213,Mechanics of Materials,Building upon the concepts introduced in the c...
3,MATH 223,Calculus III for Engineers,Vectors and coordinate geometry in 3- space; v...
4,ME 214,Introduction to Materials and Manufacturing,Provides an introduction to the relations betw...
5,ME 227,Thermodynamics I,The fundamental mass and energy conservation l...
6,MATH 224,Calculus IV for Engineers,Vector fields; vector calculus; ordinary diffe...
7,ME 215,Fluid Mechanics I,The basic principles of fluid mechanics are in...
8,ME 226,Mechanics III,Studies the mechanics (kinematics and kinetics...
9,ME 229,Introduction to Mechanical Engineering Design,This group/project class guides the engineerin...


In [25]:
# Write the table to a CSV file, leaving out the DataFrame row index.
output_csv = 'USaskatchewan_MechEng_Core_and_Elective_(Year2-4)_Courses.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Quit the WebDriver session created earlier in the notebook.
driver.quit()