In [1]:
import re
import time
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup as soup
import pandas as pd

In [2]:
# Program page in the UBC Okanagan academic calendar; its table cells are scanned below for course codes
url = "http://www.calendar.ubc.ca/okanagan/index.cfm?tree=18,317,989,1379"

# 1. Collect course codes to be used to search for course names and descriptions

In [3]:
# Download the program page and parse it. The `with` block closes the HTTP
# response even when read() raises, which a trailing manual close() does not.
with ureq(url) as uClient:
    page_html = uClient.read()
page_soup = soup(page_html, "html.parser")

In [4]:
# Gather every <td> cell from every <table> on the page into one flat list,
# preserving document order (table by table, then cell by cell).
tables = page_soup.findAll("table")
containers = []
for table in tables:
    containers.extend(table.findAll("td"))
len(containers)

78

In [5]:
# Keep only the cells whose stripped text begins with a course code:
# four capital letters, a space, then three digits.
code_pattern = re.compile("[A-Z]{4} [0-9]{3}")
cell_texts = (container.text.strip() for container in containers)
course_codes = [cell_text for cell_text in cell_texts if code_pattern.match(cell_text)]
course_codes

['APSC 169',
 'APSC 171',
 'APSC 172',
 'APSC 173',
 'APSC 176',
 'APSC 177',
 'APSC 178',
 'APSC 179',
 'APSC 180',
 'APSC 181',
 'APSC 182',
 'APSC 183',
 'APSC 201',
 'APSC 246',
 'APSC 248',
 'APSC 252',
 'APSC 254',
 'APSC 256',
 'APSC 258',
 'APSC 259',
 'APSC 260']

In [6]:
# Sanity check: how many course codes the pattern matched
len(course_codes)

21

In [7]:
# Two extra courses specific to mechanical engineering are mentioned only in a
# footnote on the program page, so they are added by hand. The membership
# check keeps this cell idempotent: re-running it does not append duplicates.
for extra_code in ("APSC 253", "APSC 255"):
    if extra_code not in course_codes:
        course_codes.append(extra_code)
len(course_codes)

23

# 2. Use the course codes to find and scrape corresponding course names and descriptions

In [8]:
# Download and parse the APSC course listing, which is searched below for each
# course's title and description. The `with` block closes the HTTP response
# even when read() raises.
# Note: this rebinds page_soup, so the previously parsed program page is no
# longer reachable through that name.
with ureq("http://www.calendar.ubc.ca/okanagan/courses.cfm?go=code&institution=2&code=APSC") as uClient:
    page_html = uClient.read()
page_soup = soup(page_html, "html.parser")

In [9]:
# Look up each course code in the parsed course listing and collect its title
# and description. course_names and course_descs stay index-aligned with
# course_codes: a course that cannot be found gets a None placeholder in both
# lists instead of being skipped, so the three lists can still be combined
# row by row afterwards.
course_names = []
course_descs = []
counter = 0  # number of courses successfully scraped

for course_code in course_codes:

    # re.escape keeps the code literal inside the pattern; "^" anchors the
    # match to the start of the text node.
    course_title_container = page_soup.find(text = re.compile("^" + re.escape(course_code)))
    if course_title_container is None:
        print("Cannot find ", course_code)
        course_names.append(None)
        course_descs.append(None)
        continue

    # The element following the code text supplies the course name; the
    # element after that supplies the description.
    name_element = course_title_container.findNext()
    course_names.append(name_element.text)
    course_descs.append(name_element.findNext().text.strip())

    counter += 1
    print("scraped ", course_code)

print(counter)

scraped  APSC 169
scraped  APSC 171
scraped  APSC 172
scraped  APSC 173
scraped  APSC 176
scraped  APSC 177
scraped  APSC 178
scraped  APSC 179
scraped  APSC 180
scraped  APSC 181
scraped  APSC 182
scraped  APSC 183
scraped  APSC 201
scraped  APSC 246
scraped  APSC 248
scraped  APSC 252
scraped  APSC 254
scraped  APSC 256
scraped  APSC 258
scraped  APSC 259
scraped  APSC 260
scraped  APSC 253
scraped  APSC 255
23


# 3. Inspect collected data and write to CSV

In [10]:
# Inspect the scraped course titles, in the order they were collected
course_names

['Fundamentals of Sustainable Engineering Design',
 'Engineering Drawing and CAD/CAM',
 'Engineering Analysis I',
 'Engineering Analysis II',
 'Engineering Communication',
 'Engineering Computation and Instrumentation',
 'Electricity, Magnetism, and Waves',
 'Linear Algebra for Engineers',
 'Statics',
 'Dynamics',
 'Matter and Energy I',
 'Matter and Energy II',
 'Technical Communication',
 'System Dynamics',
 'Engineering Analysis III',
 'Thermodynamics',
 'Instrumentation and Data Analysis',
 'Numerical Methods for Analysis',
 'Applications of Engineering Design',
 'Materials Science I',
 'Mechanics of Materials I',
 'Fluid Mechanics I',
 'Electric Circuits and Power']

In [11]:
# Inspect the scraped course descriptions, in the order they were collected
course_descs

['Theory and practice of sustainable engineering. Awareness and risk analysis of potential impacts on society and the environment over the lifecycle of engineering projects. Engineering design process, project lifecycle, and professional responsibility. Team-based design project. [3-2*-0*]',
 'Orthographic projections, axonometric and perspective projections, dimensioning and tolerances, computer-aided design and modelling, introduction to rapid prototyping, team-based design project. [3-0-2]',
 'Functions, limits, differentiation, applications of derivatives, integration, applications of definite integrals. [3-0-1]',
 'Integrals and transcendental functions, techniques of integration, applications of integration, polar coordinates, infinite sequences and series, vectors and the geometry of space, and partial derivatives. [3-0-1] Prerequisite: APSC 172.',
 'Written and oral presentations, formal and informal. Purpose, audience, content, format, and tone are studied, as are team-based r

In [12]:
# Combine the three parallel lists into one table, one row per course.
# pandas is imported with the other dependencies in the notebook's first cell.
df = pd.DataFrame({
    "Course Number": course_codes,
    "Course Name": course_names,
    "Course Description": course_descs,
})

df

Unnamed: 0,Course Number,Course Name,Course Description
0,APSC 169,Fundamentals of Sustainable Engineering Design,Theory and practice of sustainable engineering...
1,APSC 171,Engineering Drawing and CAD/CAM,"Orthographic projections, axonometric and pers..."
2,APSC 172,Engineering Analysis I,"Functions, limits, differentiation, applicatio..."
3,APSC 173,Engineering Analysis II,"Integrals and transcendental functions, techni..."
4,APSC 176,Engineering Communication,"Written and oral presentations, formal and inf..."
5,APSC 177,Engineering Computation and Instrumentation,"Computer systems, software development, operat..."
6,APSC 178,"Electricity, Magnetism, and Waves","Coulomb's law, electric field, Gauss' law, ele..."
7,APSC 179,Linear Algebra for Engineers,"Systems of linear equations, Gaussian eliminat..."
8,APSC 180,Statics,"Force vectors, Cartesian coordinate system, fr..."
9,APSC 181,Dynamics,"Kinematics of particles, curvilinear motion, n..."


In [13]:
# Write the table to a CSV file in the working directory; index = False
# leaves the DataFrame's row index out of the file.
output_path = 'UBC_Okanagan_MechEng_Core_(Years1-2)_Courses.csv'
df.to_csv(output_path, index = False)