In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

In [2]:
# York University academic-calendar page (2021-2022) for the LE mechanical-engineering
# program; every navigation step below starts from this page.
url = "https://calendars.students.yorku.ca/2021-2022/programs/LE/mechanical-engineering"

In [3]:
# Browser flags: skip certificate errors and use an incognito window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
# Uncomment to run Chrome without a visible window.
#chrome_options.add_argument('--headless')

# The chromedriver location is machine-specific: allow overriding it through the
# CHROMEDRIVER_PATH environment variable and fall back to the original path.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\Users\\jerry\\Downloads\\chromedriver"
)
driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

# 1. Locate tab where the courses are listed and collect the course links

In [4]:
# Open the program page in the Selenium-controlled browser.
driver.get(url)

In [5]:
# find_element(By.…) replaces find_element_by_link_text, which newer Selenium
# releases no longer provide; this form works on both Selenium 3 and 4.
driver.find_element(By.LINK_TEXT, "Degree Requirements").click()

In [6]:
# find_element(By.…) replaces find_element_by_link_text, which newer Selenium
# releases no longer provide; this form works on both Selenium 3 and 4.
driver.find_element(By.LINK_TEXT, "Faculty-Wide Degree Requirements").click()

In [7]:
# find_element(By.ID, …) replaces find_element_by_id, which newer Selenium
# releases no longer provide; this form works on both Selenium 3 and 4.
driver.find_element(By.ID, "ui-id-7").click()

In [8]:
# NOTE(review): the same element is clicked a second time on purpose (the
# scraping loop does the same); presumably the first click alone does not
# reveal the course list — confirm against the live page.
driver.find_element(By.ID, "ui-id-7").click()

In [9]:
# Parse the browser's current DOM (after the clicks above) with the lxml parser.
page_soup = soup(markup=driver.page_source, features='lxml')

In [10]:
# Collect every <a> inside the first <ul> of the Specialized Honours BEng section.
section_id = "specialized-honours-bachelor-of-engineering-specialized-honours-beng"
containers = page_soup.find("div", id=section_id).find("ul").find_all("a")
len(containers)

20

In [11]:
# Courses in the listed section that do not apply to mecheng.
excluded_courses = {"LE/EECS 1028 3.00"}

# Strip each link's text *before* filtering so the comparison is made on the
# same value that is stored; otherwise stray whitespace around an excluded
# entry would let it slip through.
stripped_texts = (container.text.strip() for container in containers)
link_texts = [text for text in stripped_texts if text not in excluded_courses]
link_texts

['SC/CHEM 1100 4.00',
 'LE/EECS 1011 3.00',
 'LE/EECS 1021 3.00',
 'LE/ENG 1101 4.00',
 'LE/ENG 1102 4.00',
 'LE/ENG 2001 3.00',
 'LE/ENG 2003 3.00',
 'LE/ENG 3000 3.00',
 'LE/ENG 4000 6.00',
 'EU/ENVS 2150 3.00',
 'LE/ESSE 2210 3.00',
 'SC/MATH 1013 3.00',
 'SC/MATH 1014 3.00',
 'SC/MATH 1025 3.00',
 'SC/MATH 2015 3.00',
 'SC/MATH 2930 3.00',
 'SC/PHYS 1800 3.00',
 'SC/PHYS 1801 3.00',
 'LE/ESSE 1012 3.00']

In [12]:
# Sanity check: number of course links kept after filtering.
len(link_texts)

19

# 2. Test run - scrape the first course

In [13]:
# Open the first course page. find_element(By.LINK_TEXT, …) replaces
# find_element_by_link_text, which newer Selenium releases no longer provide.
driver.find_element(By.LINK_TEXT, link_texts[0]).click()

In [14]:
# Re-parse the DOM now that the browser is showing the course page.
page_soup = soup(markup=driver.page_source, features='lxml')

In [15]:
# The first <h1> holds "<code>   <credits>   <name>" on a single line.
course_title = page_soup.h1.get_text().strip()
course_title

'SC/CHEM 1100   4.00 \xa0\xa0Chemistry and Materials Science for Engineers'

In [16]:
# Split the heading on the credit-weight token; the part before it is the course code.
# Raw string so "\s" and "\." reach the regex engine instead of being parsed as
# (invalid) string escape sequences.
course_code = re.split(r"\s+ [0-9\.]+ \s+", course_title)[0]
course_code

'SC/CHEM 1100'

In [17]:
# Split the heading on the credit-weight token; the part after it is the course name.
# Raw string so "\s" and "\." reach the regex engine instead of being parsed as
# (invalid) string escape sequences.
course_name = re.split(r"\s+ [0-9\.]+ \s+", course_title)[1]
course_name

'Chemistry and Materials Science for Engineers'

In [18]:
# The description is the element that follows the "Course Description:" label paragraph.
desc_label = page_soup.find("p", text="Course Description:")
course_desc = desc_label.find_next_sibling().text.strip()
course_desc

'The course is designed for Engineering students interested in refreshing and expending their general chemistry knowledge while exploring the relationship between structure of matter, properties and processing. This course will focus mainly at covering important introductory concept to understand solution chemistry including reactivity, thermochemistry, structure and properties of materials. The course is divided in six sections. The first section covers an introduction to the topic of Materials Science and its impact on our daily lives as well as future trends and review key chemistry concepts required for this course. The second section will present the states of matter (gas, liquid and solid), their physical characteristics and the forces holding materials together (bonding and intermolecular forces). The third section will expend on the liquid phase and properties of solutions including equilibrium, solubility, pH and pKa. The fourth section will deal with thermochemistry and its f

In [19]:
# Step back in the browser history, leaving the course page opened for the test run.
driver.back()

# 3. Test run successful. Implement automation script to scrape all courses

In [24]:
# Separator between course code and course name in the <h1> heading: three
# spaces, the credit weight (e.g. "4.00"), then trailing whitespace.
# Raw string so "\." and "\s" reach the regex engine instead of being parsed as
# (invalid) string escape sequences.
TITLE_SEPARATOR = re.compile(r"   [0-9\.]+\s+")


def _open_course_list():
    """Reload the program page and click through to the course list.

    Uses the notebook-level ``driver`` and ``url``. Reloading at the start of
    every iteration gives each course the same starting state, whichever way
    the previous iteration ended (scraped, link missing, or page incomplete).
    """
    driver.get(url)
    time.sleep(3)
    driver.find_element(By.LINK_TEXT, "Degree Requirements").click()
    time.sleep(1)
    driver.find_element(By.LINK_TEXT, "Faculty-Wide Degree Requirements").click()
    time.sleep(1)
    # NOTE(review): this element is clicked twice on purpose, mirroring the
    # manual walkthrough earlier in the notebook — confirm why both are needed.
    driver.find_element(By.ID, "ui-id-7").click()
    time.sleep(1)
    driver.find_element(By.ID, "ui-id-7").click()
    time.sleep(1)


def _parse_course_page(html):
    """Extract ``(code, name, description)`` from a course page.

    Returns ``None`` when the page has no <h1>, the heading does not contain
    the credit-weight separator, or the "Course Description:" label (or the
    element after it) is missing, so the caller can skip the course instead
    of crashing midway through the run.
    """
    page = soup(html, 'lxml')

    heading = page.find("h1")
    if heading is None:
        return None
    # maxsplit=1: everything after the credit weight belongs to the name.
    parts = TITLE_SEPARATOR.split(heading.text.strip(), maxsplit=1)
    if len(parts) != 2:
        return None

    label = page.find("p", string="Course Description:")
    body = label.find_next_sibling() if label is not None else None
    if body is None:
        return None

    return parts[0].strip(), parts[1].strip(), body.text.strip()


course_codes = []
course_names = []
course_descs = []
counter = 0

for link_text in link_texts:

    _open_course_list()
    try:
        link = driver.find_element(By.LINK_TEXT, link_text)
    except NoSuchElementException:
        print("no link for {}".format(link_text))
        continue
    time.sleep(1)
    link.click()
    time.sleep(3)

    #scrape course info
    course = _parse_course_page(driver.page_source)
    if course is None:
        # Name the course so a skipped entry can be traced in the output.
        print("Course info unavailable for {}".format(link_text))
        print()
        continue
    course_code, course_name, course_desc = course

    course_codes.append(course_code)
    course_names.append(course_name)
    course_descs.append(course_desc)

    counter += 1
    print("Scraped ", course_codes[-1], " Courses Scraped: ", counter)

print("Finished scraping {} courses".format(counter))


Scraped  SC/CHEM 1100  Courses Scraped:  1
Scraped  LE/EECS 1011  Courses Scraped:  2
Scraped  LE/EECS 1021  Courses Scraped:  3
Scraped  LE/ENG  1101  Courses Scraped:  4
Scraped  LE/ENG  1102  Courses Scraped:  5
Scraped  LE/ENG  2001  Courses Scraped:  6
Scraped  LE/ENG  2003  Courses Scraped:  7
Scraped  LE/ENG  3000  Courses Scraped:  8
Scraped  LE/ENG  4000  Courses Scraped:  9
Course info unavailable

Scraped  LE/ESSE 2210  Courses Scraped:  10
Scraped  SC/MATH 1013  Courses Scraped:  11
Scraped  SC/MATH 1014  Courses Scraped:  12
Scraped  SC/MATH 1025  Courses Scraped:  13
Scraped  SC/MATH 2015  Courses Scraped:  14
Scraped  SC/MATH 2930  Courses Scraped:  15
Scraped  SC/PHYS 1800  Courses Scraped:  16
Scraped  SC/PHYS 1801  Courses Scraped:  17
Scraped  LE/ESSE 1012  Courses Scraped:  18
Finished scraping 18 courses


# 4. Inspect, clean, and write to CSV

In [25]:
# Number of courses the loop actually scraped.
len(course_codes)

18

In [26]:
# Duplicate check: the count of distinct codes should equal the count scraped.
len(set(course_codes))

18

In [27]:
# Inspect the scraped course codes.
course_codes

['SC/CHEM 1100',
 'LE/EECS 1011',
 'LE/EECS 1021',
 'LE/ENG  1101',
 'LE/ENG  1102',
 'LE/ENG  2001',
 'LE/ENG  2003',
 'LE/ENG  3000',
 'LE/ENG  4000',
 'LE/ESSE 2210',
 'SC/MATH 1013',
 'SC/MATH 1014',
 'SC/MATH 1025',
 'SC/MATH 2015',
 'SC/MATH 2930',
 'SC/PHYS 1800',
 'SC/PHYS 1801',
 'LE/ESSE 1012']

In [28]:
# Inspect the scraped course names.
course_names

['Chemistry and Materials Science for Engineers',
 'Computational Thinking through Mechatronics',
 'Object Oriented Programming from Sensors to Actuators',
 'Renaissance Engineer 1: Ethics, Communication & Problem Solving',
 'Renaissance Engineer 2: Engineering Design Principles',
 'Engineering Projects: Management, Economics and Safety',
 'Effective Engineering Communication',
 'Professional Engineering Practice',
 'Engineering Project (Capstone)',
 'Engineering and the Environment',
 'Applied Calculus I',
 'Applied Calculus II',
 'Applied Linear Algebra',
 'Applied Multivariate and Vector Calculus',
 'Introductory Probability and Statistics',
 'Engineering Mechanics',
 'Electricity, Magnetism and Optics for Engineers',
 'The Earth Environment']

In [29]:
# Inspect the scraped descriptions before cleaning.
course_descs

['The course is designed for Engineering students interested in refreshing and expending their general chemistry knowledge while exploring the relationship between structure of matter, properties and processing. This course will focus mainly at covering important introductory concept to understand solution chemistry including reactivity, thermochemistry, structure and properties of materials. The course is divided in six sections. The first section covers an introduction to the topic of Materials Science and its impact on our daily lives as well as future trends and review key chemistry concepts required for this course. The second section will present the states of matter (gas, liquid and solid), their physical characteristics and the forces holding materials together (bonding and intermolecular forces). The third section will expend on the liquid phase and properties of solutions including equilibrium, solubility, pH and pKa. The fourth section will deal with thermochemistry and its 

In [30]:
# Flatten each description onto one line by turning every newline into a space.
course_descs = [" ".join(desc.split("\n")) for desc in course_descs]

In [31]:
# Inspect the descriptions again after newline replacement.
course_descs

['The course is designed for Engineering students interested in refreshing and expending their general chemistry knowledge while exploring the relationship between structure of matter, properties and processing. This course will focus mainly at covering important introductory concept to understand solution chemistry including reactivity, thermochemistry, structure and properties of materials. The course is divided in six sections. The first section covers an introduction to the topic of Materials Science and its impact on our daily lives as well as future trends and review key chemistry concepts required for this course. The second section will present the states of matter (gas, liquid and solid), their physical characteristics and the forces holding materials together (bonding and intermolecular forces). The third section will expend on the liquid phase and properties of solutions including equilibrium, solubility, pH and pKa. The fourth section will deal with thermochemistry and its 

In [32]:
import pandas as pd

# One column per scraped list; the three lists are index-aligned by the loop.
columns = {}
columns["Course Number"] = course_codes
columns["Course Name"] = course_names
columns["Course Description"] = course_descs

df = pd.DataFrame(data=columns)

df

Unnamed: 0,Course Number,Course Name,Course Description
0,SC/CHEM 1100,Chemistry and Materials Science for Engineers,The course is designed for Engineering student...
1,LE/EECS 1011,Computational Thinking through Mechatronics,The Objectives of 1011 are threefold: providin...
2,LE/EECS 1021,Object Oriented Programming from Sensors to Ac...,"""Introduces student to computational thinking ..."
3,LE/ENG 1101,"Renaissance Engineer 1: Ethics, Communication ...",Who is an engineer and what are his/her ethica...
4,LE/ENG 1102,Renaissance Engineer 2: Engineering Design Pri...,This course will cover: engineering design met...
5,LE/ENG 2001,"Engineering Projects: Management, Economics an...","Introduction to the management, economics and ..."
6,LE/ENG 2003,Effective Engineering Communication,Students learn to effectively employ communica...
7,LE/ENG 3000,Professional Engineering Practice,An introduction to the legal and ethical frame...
8,LE/ENG 4000,Engineering Project (Capstone),The project will include significant elements ...
9,LE/ESSE 2210,Engineering and the Environment,This course surveys a variety of Canadian case...


In [33]:
# Write the table to CSV without the DataFrame's integer index.
df.to_csv(path_or_buf='York_Engineering_Common_Core_for_MechEng_Courses.csv', index=False)

In [34]:
# Shut down the WebDriver session now that scraping is done.
driver.quit()