In [1]:
import os
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

In [2]:
# Course listing for McGill Engineering (2020-2021), filtered to MECH courses
# at the 400 and 500 levels. The URL is built from adjacent string literals:
# the path first, then one query parameter per line.
url = (
    "https://www.mcgill.ca/study/2020-2021/faculties/engineering/undergraduate"
    "/courses/engineering/undergraduate/engineering/undergraduate"
    "?sort_by=field_subject_code"
    "&f%5B0%5D=field_subject_code%3AMECH"
    "&f%5B1%5D=course_level%3A400"
    "&f%5B2%5D=course_level%3A500"
)

In [3]:
# Configure and launch the Chrome session that the rest of the notebook drives.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
# Left disabled by the author; enabling it would add Chrome's --headless flag.
#chrome_options.add_argument('--headless')

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run the notebook on another machine; when it is unset, the
# original hard-coded location is used, so existing behaviour is unchanged.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\Users\\jerry\\Downloads\\chromedriver"
)

driver = webdriver.Chrome(chromedriver_path, options=chrome_options)

In [4]:
# Load the filtered course listing in the Selenium-controlled browser.
driver.get(url)

In [35]:
# Parse the HTML currently loaded in the browser, using the lxml parser.
page_soup = soup(driver.page_source, 'lxml')

# 1. Collect course link texts on the current page

In [36]:
# Find the "view-content" wrapper first, then gather every anchor inside it.
listing = page_soup.find("div", {"class": "view-content"})
containers = listing.find_all("a")
len(containers)

20

In [37]:
# Text of each course link on the current listing page. These strings are later
# passed to find_element_by_link_text, so they are normalised exactly as the
# pagination loop normalises the links of later pages: surrounding whitespace
# stripped and embedded newlines removed.
link_texts = [container.text.strip().replace("\n", "") for container in containers]
link_texts

['MECH 403D1 Thesis (Honours) (3 credits)',
 'MECH 403D2 Thesis (Honours) (3 credits)',
 'MECH 403N1 Thesis (Honours) (3 credits)',
 'MECH 403N2 Thesis (Honours) (3 credits)',
 'MECH 404 Honours Thesis 2 (3 credits)',
 'MECH 412 System Dynamics and Control (3 credits)',
 'MECH 419 Advanced Mechanics of Systems (4 credits)',
 'MECH 430 Fluid Mechanics 2 (3 credits)',
 'MECH 447 Combustion (3 credits)',
 'MECH 463D1 Design 3: Mechanical Engineering Project (3 credits)',
 'MECH 463D2 Design 3: Mechanical Engineering Project (3 credits)',
 'MECH 494 Honours Design Project (3 credits)',
 'MECH 497 Value Engineering (3 credits)',
 'MECH 498 Interdisciplinary Design Project 1 (3 credits)',
 'MECH 499 Interdisciplinary Design Project 2 (3 credits)',
 'MECH 500 Selected Topics in Mechanical Engineering (3 credits)',
 'MECH 501 Special Topics: Mechanical Engineering (3 credits)',
 'MECH 502 Topics in Mechanical Engineering (3 credits)',
 'MECH 510 Engineering Acoustics (3 credits)',
 'MECH 513 C

# 2. Test run - scrape the first course

In [8]:
# Open the first course by clicking the link whose text equals link_texts[0].
driver.find_element_by_link_text(link_texts[0]).click()

In [9]:
# Re-parse the browser's current HTML (the page reached by the click above).
page_soup = soup(driver.page_source, 'lxml')

In [12]:
# Read the page heading and keep only the part before the " (N credit[s])" suffix.
# The pattern is a raw string so that \( and \) reach the regex engine as escaped
# parentheses instead of being treated as (invalid) Python string escapes.
course_title = re.split(r" \([0-9]+ credit[s]*\)", page_soup.find("h1", {"id": "page-title"}).text.strip())[0]

In [13]:
# Display the parsed title to confirm the credits suffix was removed.
course_title

'MECH 403D1 Thesis (Honours)'

In [14]:
# The course code is the first two whitespace-separated tokens of the title.
title_tokens = course_title.split()
course_code = " ".join(title_tokens[:2])
course_code

'MECH 403D1'

In [15]:
# Everything after the first two tokens of the title is the course name.
title_tokens = course_title.split()
course_name = " ".join(title_tokens[2:])
course_name

'Thesis (Honours)'

In [17]:
# The description is the text of the element that directly follows the
# "Overview" <h3> heading.
overview_heading = page_soup.find("h3", text = "Overview")
overview_body = overview_heading.find_next_sibling()
course_desc = overview_body.text.strip()
course_desc

'Mechanical Engineering : This course, together with MECH 404, involves a research project containing both engineering theory and design components, and requiring a theoretical and/or experimental investigation. Students are supervised by the course instructor and mentored by one or more staff members. The work culminates with the submission of a thesis.'

In [18]:
# Go back one step in the browser history (presumably returning to the course
# listing after the test scrape above).
driver.back()

# 3. Test clicking the next button

There are 5 pages of courses in total. Clicking the next button (❯) 4 times reaches the last page; clicking it once more fails with `NoSuchElementException` because the last page has no next button.

In [22]:
# Click the pager's "next" link, identified by its "❯" link text.
# find_element_by_link_text raises NoSuchElementException when the page has no
# link with that text, which is what the recorded output of this cell shows.
driver.find_element_by_link_text("❯").click()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":"❯"}
  (Session info: chrome=91.0.4472.114)


In [34]:
# Load the filtered listing URL again (presumably to return to the first page
# of results before the automated run below).
driver.get(url)

# 4. Ready for automation script to scrape all courses in all 5 pages!

In [38]:
# Scrape every course on every listing page.
#
# Starts from the listing page currently shown in the browser, whose link texts
# were collected into `link_texts`. For each course link the loop opens the
# course page, parses code / name / description, and navigates back. Once all
# links of a page have been handled it follows the "❯" pager link and repeats
# with that page's links, stopping when no "❯" link exists.
#
# Results are accumulated in course_codes, course_names and course_descs
# (parallel lists); `counter` holds the number of courses scraped.

# Raw string so \( and \) are regex escapes rather than invalid string escapes.
credits_suffix = r" \([0-9]+ credit[s]*\)"

course_names = []
course_codes = []
course_descs = []
counter = 0

# Work on a per-page copy instead of appending to `link_texts` while iterating
# over it; this keeps `link_texts` intact so the cell can be re-run.
page_link_texts = list(link_texts)

while page_link_texts:
    for link_text in page_link_texts:
        try:
            link = driver.find_element_by_link_text(link_text)
        except NoSuchElementException:
            # Skip only this course. The pager step after the for-loop still
            # runs, so a missing last link no longer ends the scrape early.
            print("no link for {}".format(link_text))
            continue

        time.sleep(1)
        link.click()
        time.sleep(3)  # fixed pause, presumably to let the course page load
        page_soup = soup(driver.page_source, 'lxml')

        # Parse all three fields before appending anything, so the three lists
        # cannot end up with different lengths if one lookup raises.
        course_title = re.split(credits_suffix, page_soup.find("h1", {"id": "page-title"}).text.strip())[0]
        title_tokens = course_title.split()
        overview_text = page_soup.find("h3", text = "Overview").findNextSibling().text.strip()

        course_codes.append(" ".join(title_tokens[:2]))
        course_names.append(" ".join(title_tokens[2:]))
        course_descs.append(overview_text)

        print("Scraped ", course_codes[-1])
        counter += 1

        driver.back()
        time.sleep(3)

    # All links of the current page are done: move on to the next page.
    try:
        driver.find_element_by_link_text("❯").click()
    except NoSuchElementException:
        print("end has been reached")
        break

    time.sleep(3)
    page_soup = soup(driver.page_source, 'lxml')

    containers = page_soup.find("div", {"class": "view-content"}).findAll("a")
    page_link_texts = [container.text.strip().replace("\n", "") for container in containers]


print("Finished scraping {} courses".format(counter))

Scraped  MECH 403D1
Scraped  MECH 403D2
Scraped  MECH 403N1
Scraped  MECH 403N2
Scraped  MECH 404
Scraped  MECH 412
Scraped  MECH 419
Scraped  MECH 430
Scraped  MECH 447
Scraped  MECH 463D1
Scraped  MECH 463D2
Scraped  MECH 494
Scraped  MECH 497
Scraped  MECH 498
Scraped  MECH 499
Scraped  MECH 500
Scraped  MECH 501
Scraped  MECH 502
Scraped  MECH 510
Scraped  MECH 513
Scraped  MECH 515
Scraped  MECH 516
Scraped  MECH 524
Scraped  MECH 526
Scraped  MECH 528
Scraped  MECH 529
Scraped  MECH 530
Scraped  MECH 532
Scraped  MECH 533
Scraped  MECH 534
Scraped  MECH 535
Scraped  MECH 536
Scraped  MECH 537
Scraped  MECH 538
Scraped  MECH 539
Scraped  MECH 541
Scraped  MECH 542
Scraped  MECH 543
Scraped  MECH 544
Scraped  MECH 546
Scraped  MECH 547
Scraped  MECH 548
Scraped  MECH 550
Scraped  MECH 551
Scraped  MECH 553
Scraped  MECH 554
Scraped  MECH 556
Scraped  MECH 557
Scraped  MECH 559
Scraped  MECH 560
Scraped  MECH 561
Scraped  MECH 562
Scraped  MECH 563
Scraped  MECH 565
Scraped  MECH 56

# 5. Inspect collected data and write to CSV

In [39]:
# Display the course codes collected by the scraping loop.
course_codes

['MECH 403D1',
 'MECH 403D2',
 'MECH 403N1',
 'MECH 403N2',
 'MECH 404',
 'MECH 412',
 'MECH 419',
 'MECH 430',
 'MECH 447',
 'MECH 463D1',
 'MECH 463D2',
 'MECH 494',
 'MECH 497',
 'MECH 498',
 'MECH 499',
 'MECH 500',
 'MECH 501',
 'MECH 502',
 'MECH 510',
 'MECH 513',
 'MECH 515',
 'MECH 516',
 'MECH 524',
 'MECH 526',
 'MECH 528',
 'MECH 529',
 'MECH 530',
 'MECH 532',
 'MECH 533',
 'MECH 534',
 'MECH 535',
 'MECH 536',
 'MECH 537',
 'MECH 538',
 'MECH 539',
 'MECH 541',
 'MECH 542',
 'MECH 543',
 'MECH 544',
 'MECH 546',
 'MECH 547',
 'MECH 548',
 'MECH 550',
 'MECH 551',
 'MECH 553',
 'MECH 554',
 'MECH 556',
 'MECH 557',
 'MECH 559',
 'MECH 560',
 'MECH 561',
 'MECH 562',
 'MECH 563',
 'MECH 565',
 'MECH 566',
 'MECH 567',
 'MECH 572',
 'MECH 573',
 'MECH 577',
 'MECH 578',
 'MECH 579',
 'MECH 531']

In [40]:
# Display the course names collected by the scraping loop.
course_names

['Thesis (Honours)',
 'Thesis (Honours)',
 'Thesis (Honours)',
 'Thesis (Honours)',
 'Honours Thesis 2',
 'System Dynamics and Control',
 'Advanced Mechanics of Systems',
 'Fluid Mechanics 2',
 'Combustion',
 'Design 3: Mechanical Engineering Project',
 'Design 3: Mechanical Engineering Project',
 'Honours Design Project',
 'Value Engineering',
 'Interdisciplinary Design Project 1',
 'Interdisciplinary Design Project 2',
 'Selected Topics in Mechanical Engineering',
 'Special Topics: Mechanical Engineering',
 'Topics in Mechanical Engineering',
 'Engineering Acoustics',
 'Control Systems',
 'Unsteady Gasdynamics',
 'Computational Gasdynamics',
 'Computer Integrated Manufacturing',
 'Manufacturing and the Environment',
 'Product Design',
 'Discrete Manufacturing Systems',
 'Mechanics of Composite Materials',
 'Aircraft Performance, Stability and Control',
 'Subsonic Aerodynamics',
 'Air Pollution Engineering',
 'Turbomachinery and Propulsion',
 'Aerospace Structures',
 'High-Speed Aerod

In [41]:
# Display the course descriptions collected by the scraping loop.
course_descs

['Mechanical Engineering : This course, together with MECH 404, involves a research project containing both engineering theory and design components, and requiring a theoretical and/or experimental investigation. Students are supervised by the course instructor and mentored by one or more staff members. The work culminates with the submission of a thesis.',
 'Mechanical Engineering : See MECH 403D1 for course description.',
 'Mechanical Engineering : This course, together with MECH 404, involves a research project containing both engineering theory and design components, and requiring a theoretical and/or experimental investigation. Students are supervised by the course instructor and mentored by one or more staff members. The work culminates with the submission of a thesis.',
 'Mechanical Engineering : See MECH 403N1 for course description.',
 'Mechanical Engineering : This course is part of the same thesis project as course MECH 403.',
 'Mechanical Engineering : Modelling of physical

In [42]:
import pandas as pd

# Map each output column header to the parallel list that fills it.
columns_by_header = {
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
}
df = pd.DataFrame(columns_by_header)

df

Unnamed: 0,Course Number,Course Name,Course Description
0,MECH 403D1,Thesis (Honours),"Mechanical Engineering : This course, together..."
1,MECH 403D2,Thesis (Honours),Mechanical Engineering : See MECH 403D1 for co...
2,MECH 403N1,Thesis (Honours),"Mechanical Engineering : This course, together..."
3,MECH 403N2,Thesis (Honours),Mechanical Engineering : See MECH 403N1 for co...
4,MECH 404,Honours Thesis 2,Mechanical Engineering : This course is part o...
...,...,...,...
57,MECH 573,Mechanics of Robotic Systems,Mechanical Engineering : Manipulator performan...
58,MECH 577,Optimum Design,Mechanical Engineering : The role of optimizat...
59,MECH 578,Advanced Thermodynamics,Mechanical Engineering : Review of classical m...
60,MECH 579,Multidisciplinary Design Optimization,Mechanical Engineering : A comprehensive intro...


In [43]:
# Write the table to a CSV file at this relative path; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv('McGill_Possible_MECH_electives_(400s_and_500s)_Courses.csv', index = False)

In [44]:
# End the Selenium browser session now that scraping and export are done.
driver.quit()