In [2]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [10]:
# Program page to scrape. The fragment names the page section that the
# link-collection step looks up by id ("Requirements18creditunits").
url = (
    "https://programs.usask.ca/engineering/mechanical-engineering/be-mining-option.php"
    "#Requirements18creditunits"
)

In [4]:
# Configure Chrome to run without a visible window, in an incognito session,
# and to ignore TLS certificate errors.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')

# The chromedriver location is machine-specific. Set the CHROMEDRIVER_PATH
# environment variable to point at your own copy; when it is unset, the
# original hard-coded location is used so existing behaviour is unchanged.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\jerry\\Downloads\\chromedriver",
)

driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [11]:
# Navigate the browser session to the program page.
driver.get(url)

# 1. Collect course links for webdriver to click on

In [12]:
# Parse the HTML currently loaded in the browser, using the lxml parser.
page_soup = soup(driver.page_source, features='lxml')

In [14]:
# Gather every anchor inside the requirements section, then keep only each
# anchor's visible text (whitespace-trimmed). These texts are used later to
# locate the links to click.
containers = (
    page_soup
    .find("section", id="Requirements18creditunits")
    .find_all("a")
)
link_texts = [anchor.get_text().strip() for anchor in containers]
link_texts

['EPIP 401.0',
 'EPIP 402.0',
 'EPIP 403.0',
 'GEOE 377.3',
 'GEOL 121.3',
 'ME 490.3',
 'GEOL 224.3',
 'GEOL 245.3',
 'GEOL 258.3',
 'CHE 453.3',
 'GEOE 315.3',
 'GEOE 380.3',
 'ME 491.3',
 'ME 492.3',
 'ME 493.3',
 'ME 497.3']

In [15]:
# Number of link texts collected from the requirements section.
len(link_texts)

16

# 2. Automation script to scrape all courses

In [16]:
# Visit each course link on the program page and collect its code, name and
# description. The three lists are kept index-aligned: entry i of each list
# describes the same course.
course_codes = []
course_names = []
course_descs = []
counter = 0

for course in link_texts:

    # Go to the course page. find_element(By.LINK_TEXT, ...) is used because
    # the find_element_by_* helpers were removed in Selenium 4; this form
    # works on Selenium 3 as well.
    try:
        link = driver.find_element(By.LINK_TEXT, course)
    except NoSuchElementException:
        print("no link for {}".format(course))
        continue

    time.sleep(2)
    link.click()
    time.sleep(2)

    # Use a dedicated name so the program page's `page_soup` is not
    # overwritten by the last course page visited.
    course_soup = soup(driver.page_source, 'lxml')

    # Extract every field BEFORE storing any of them. If the page does not
    # have the expected layout, nothing is appended, so the three lists can
    # never end up with different lengths.
    try:
        # [:-2] drops the last two characters of the title, e.g. the ".0"
        # in "EPIP 401.0" -> "EPIP 401".
        code = course_soup.find("h1", {"class": "uofs-page-title"}).text.strip()[:-2]
        name = course_soup.find("p", {"class": "lead"}).text.strip()
        desc = course_soup.findAll("div", {"class": "uofs-subsection"})[1].find("p").text.strip()
    except (AttributeError, IndexError):
        # find() returned None or there was no second subsection.
        print("could not parse course page for {}".format(course))
    else:
        course_codes.append(code)
        course_names.append(name)
        course_descs.append(desc)

        print("Scraped ", code)
        counter += 1

    # Always return to the program page so the next link can be located,
    # even when this course page could not be parsed.
    driver.back()
    time.sleep(2)

print("Finished scraping {} courses".format(counter))

Scraped  EPIP 401
Scraped  EPIP 402
Scraped  EPIP 403
Scraped  GEOE 377
Scraped  GEOL 121
Scraped  ME 490
Scraped  GEOL 224
Scraped  GEOL 245
Scraped  GEOL 258
Scraped  CHE 453
Scraped  GEOE 315
Scraped  GEOE 380
Scraped  ME 491
Scraped  ME 492
Scraped  ME 493
Scraped  ME 497
Finished scraping 16 courses


# 3. Inspect and write to CSV

In [None]:
# Inspect the scraped course codes.
course_codes

In [18]:
# Inspect the scraped course names.
course_names

['Internship Placement I',
 'Internship Placement II',
 'Internship Placement III',
 'Fundamentals of Mining and Mineral Processing',
 'Earth Processes',
 'Design of Fluid Power Circuits',
 'Mineralogy',
 'Introduction to Sedimentary Rocks',
 'Structural Geology',
 'Corrosion Engineering',
 'Rock Mechanics',
 'Mine Ventilation',
 'Thermal Systems Design',
 'Materials in Engineering Design',
 'Advanced Mechanical Design',
 'Acoustics and Vibrations in Design']

In [19]:
# Inspect the scraped course descriptions.
course_descs

['The Engineering Student Centre, College of Engineering will register Internship students in this 0- credit unit course for the first 4-month installment of the 8 to 16 month internship placement. This course is graded on a Pass/Fail basis.',
 'The Engineering Student Centre, College of Engineering will register Internship students in this 0- credit unit course for the second 4-month installment of the 8 to 16 month internship placement. This course is graded on a Pass/Fail basis.',
 'The Engineering Student Centre, College of Engineering will register Internship students in this 0- credit unit course for the third 4-month installment of the 12 to 16 month internship placement. This course is graded on a Pass/Fail basis.',
 'Provides the student with a basic understanding of mining engineering and the mining industry. The mining component of the course will introduce the drill and blast cycle, mining methods, and the economic evaluation of mineral properties. The mineral process-engin

In [20]:
import pandas as pd

# Assemble the three parallel lists into one table, one row per course.
# Column order follows the key order of the dict.
df = pd.DataFrame(
    data={
        "Course Number": course_codes,
        "Course Name": course_names,
        "Course Description": course_descs,
    }
)

df

Unnamed: 0,Course Number,Course Name,Course Description
0,EPIP 401,Internship Placement I,"The Engineering Student Centre, College of Eng..."
1,EPIP 402,Internship Placement II,"The Engineering Student Centre, College of Eng..."
2,EPIP 403,Internship Placement III,"The Engineering Student Centre, College of Eng..."
3,GEOE 377,Fundamentals of Mining and Mineral Processing,Provides the student with a basic understandin...
4,GEOL 121,Earth Processes,Follows the same lectures as GEOL 108. The lab...
5,ME 490,Design of Fluid Power Circuits,An introduction to the design of industrial an...
6,GEOL 224,Mineralogy,Crystalline materials and their properties; cr...
7,GEOL 245,Introduction to Sedimentary Rocks,Provides a general introduction to sedimentary...
8,GEOL 258,Structural Geology,An introduction to the structural features of ...
9,CHE 453,Corrosion Engineering,Intended for engineers and others who wish to ...


In [21]:
# Write the table to a CSV file in the working directory; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv('USaskatchewan_MechEng_Mining_Option_Courses.csv', index = False)

In [22]:
# End the WebDriver session now that scraping and export are done.
driver.quit()