In [1]:
import re
import time
from urllib.request import urlopen as ureq

import pandas as pd
from bs4 import BeautifulSoup as soup

In [2]:
# UBC Vancouver calendar page that is downloaded and parsed for course codes.
url = (
    "http://www.calendar.ubc.ca/vancouver/index.cfm"
    "?tree=12,195,272,33"
)

# 1. Collect course codes to be used to search for course names and descriptions

In [3]:
# Download the calendar page. The context manager closes the connection
# even if read() raises, which the manual close() call did not guarantee.
with ureq(url) as uClient:
    page_html = uClient.read()

# Parse the raw bytes with the pure-Python parser bundled with the stdlib.
page_soup = soup(page_html, "html.parser")

In [4]:
# find() yields the first <table> in the document; every <td> inside it is a
# candidate cell for the course-code filter that follows.
table = page_soup.find("table")
containers = table.find_all("td")
len(containers)

32

In [5]:
# Keep only the cells whose stripped text starts with a code such as "APSC 100"
# (four capital letters, a space, three digits). Anything after the code is
# kept as-is at this stage.
course_codes = [
    cell_text
    for cell_text in (cell.text.strip() for cell in containers)
    if re.match("[A-Z]{4} [0-9]{3}", cell_text)
]
course_codes

['APSC 100',
 'APSC 1011',
 'APSC 160',
 'CHEM 1541',
 'WRDS 1502',
 'MATH 100',
 'MATH 101',
 'MATH 152',
 'PHYS 157',
 'PHYS 158',
 'PHYS 159',
 'PHYS 170']

In [8]:
# Sanity check: how many course codes are currently held in the list.
len(course_codes)

12

In [7]:
def _strip_footnote_marker(course_code):
    """Return the leading "AAAA 999" part of ``course_code``.

    Some scraped entries carry extra trailing characters after the code
    (e.g. 'APSC 1011' -> 'APSC 101'; presumably footnote markers from the
    calendar table -- confirm against the page). Extracting the matched
    prefix handles suffixes of any length, whereas dropping exactly one
    character only worked for a single trailing digit.

    Entries that do not start with a course code are returned unchanged.
    """
    match = re.match("[A-Z]{4} [0-9]{3}", course_code)
    return match.group(0) if match else course_code


# Re-running this cell is harmless: already-clean codes map to themselves.
course_codes = [_strip_footnote_marker(course_code) for course_code in course_codes]
course_codes

['APSC 100',
 'APSC 101',
 'APSC 160',
 'CHEM 154',
 'WRDS 150',
 'MATH 100',
 'MATH 101',
 'MATH 152',
 'PHYS 157',
 'PHYS 158',
 'PHYS 159',
 'PHYS 170']

# 2. Use the course codes to find and scrape corresponding course names and descriptions

In [9]:
# Course-listing page for each subject code. The pages differ only in the
# institution id and the subject code carried in the query string.
_SUBJECT_PAGE = "http://www.calendar.ubc.ca/vancouver/courses.cfm?page=code&institution={}&code={}"
_INSTITUTION_BY_SUBJECT = {
    "MECH": 2,
    "MATH": 12,
    "PHYS": 12,
    "APSC": 2,
    "CHEM": 12,
    "WRDS": 3,
}

subject_links = {
    subject: _SUBJECT_PAGE.format(institution, subject)
    for subject, institution in _INSTITUTION_BY_SUBJECT.items()
}

In [10]:
# Scrape the name and description of every course code.
#
# course_names and course_descs are kept index-aligned with course_codes:
# a course that cannot be located gets a None placeholder in both lists, so
# the three lists always have the same length. `counter` counts only the
# courses that were actually scraped.
course_names = []
course_descs = []
counter = 0
prev_link = ""

for course_code in course_codes:

    # The subject (text before the space) selects the listing page.
    subject_link = subject_links[course_code.split()[0]]

    # Consecutive courses of the same subject share a page, so only download
    # when the link differs from the one fetched last.
    if subject_link != prev_link:
        # Pause before each new request so the site is not hit back-to-back;
        # sleeping before opening avoids holding a connection open meanwhile.
        time.sleep(2)
        # The context manager closes the response even if read() raises.
        with ureq(subject_link) as uClient:
            page_html = uClient.read()
        page_soup = soup(page_html, "html.parser")
        prev_link = subject_link

    # Locate the text node that starts with the course code. The code is
    # escaped so it is always matched literally inside the pattern.
    course_title_container = page_soup.find(text = re.compile("^" + re.escape(course_code)))
    if course_title_container is None:
        print("Cannot find ", course_code)
        course_names.append(None)
        course_descs.append(None)
        continue

    # The element following the code holds the name, and the element after
    # that one holds the description.
    name_element = course_title_container.find_next()
    course_names.append(name_element.text)
    course_descs.append(name_element.find_next().text.strip())

    counter += 1
    print("scraped ", course_code)

print(counter)

scraped  APSC 100
scraped  APSC 101
scraped  APSC 160
scraped  CHEM 154
scraped  WRDS 150
scraped  MATH 100
scraped  MATH 101
scraped  MATH 152
scraped  PHYS 157
scraped  PHYS 158
scraped  PHYS 159
scraped  PHYS 170
12


# 3. Inspect collected data and write to CSV

In [11]:
# Display the course names collected by the scraping loop, in append order.
course_names

['Introduction to Engineering I',
 'Introduction to Engineering II',
 'Introduction to Computation in Engineering Design',
 'Chemistry for Engineering',
 'Writing and Research in the Disciplines',
 'Differential Calculus with Applications to Physical Sciences and Engineering',
 'Integral Calculus with Applications to Physical Sciences and Engineering',
 'Linear Systems',
 'Introductory Physics for Engineers I',
 'Introductory Physics for Engineers II',
 'Introductory Physics Laboratory for Engineers',
 'Mechanics I']

In [12]:
# Display the course descriptions collected by the scraping loop, in append order.
course_descs

['An introduction to the engineering profession including: roles and responsibilities of the engineer, the engineering disciplines, sustainability, an introduction to the engineering design process, introduction and application of the relevant foundational scientific principles, prototyping, engineering graphics, technical communication, and engineering ethics. This course is not eligible for Credit/D/Fail grading. [2-2-0]',
 'An introduction to the engineering profession including: the engineering design process, sustainability, prototype testing, introduction and application of the relevant foundational scientific principles, team functioning, engineering graphics, and technical communication. This course is not eligible for Credit/D/Fail grading. [2-2-0] Prerequisite: APSC 100.',
 'Analysis and simulation, laboratory data acquisition and processing, measurement interfaces, engineering tools, computer systems organization, programming languages. Credit will only be given for one of: 

In [13]:
# Assemble the three parallel lists into one table, one row per course.
# pandas raises ValueError here if the lists differ in length.
df = pd.DataFrame({
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
})

df

Unnamed: 0,Course Number,Course Name,Course Description
0,APSC 100,Introduction to Engineering I,An introduction to the engineering profession ...
1,APSC 101,Introduction to Engineering II,An introduction to the engineering profession ...
2,APSC 160,Introduction to Computation in Engineering Design,"Analysis and simulation, laboratory data acqui..."
3,CHEM 154,Chemistry for Engineering,"Chemical bonding, properties of matter. Chemic..."
4,WRDS 150,Writing and Research in the Disciplines,Writing and reading in disciplines across the ...
5,MATH 100,Differential Calculus with Applications to Phy...,Derivatives of elementary functions. Applicati...
6,MATH 101,Integral Calculus with Applications to Physica...,"The definite integral, integration techniques,..."
7,MATH 152,Linear Systems,"2D and 3D geometry, vectors and matrices, eige..."
8,PHYS 157,Introductory Physics for Engineers I,"Heat, thermodynamics, oscillations, waves, and..."
9,PHYS 158,Introductory Physics for Engineers II,"Electricity and magnetism, DC and AC circuits,..."


In [14]:
# Persist the table next to the notebook; index=False leaves the row index
# out of the file.
df.to_csv(path_or_buf='UBC_Engineering_First_Year_Courses.csv', index=False)