In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
import time

In [2]:
# Load the input course list; the scraping function defined below reads
# its 'Course_Code' column (values look like 'ATSC_V 303').
tools = pd.read_csv('../csv/inputs/tools_electives.csv')
# Preview the first three rows as the cell's displayed output.
tools.head(3)

Unnamed: 0,Course_Code
0,ATSC_V 303
1,CHEM_V 211
2,CHEM_V 311


In [3]:
# Placeholder values written when a page or one of its fields cannot be scraped.
# Downstream cells compare against these exact strings, so they must not change.
_DESCRIPTION_NOT_AVAILABLE = 'Description Not Available'
_TITLE_NOT_AVAILABLE = 'Title Not Available'


def _format_course_code(course_code):
    """Convert a course code such as 'HGSE_V 310' into the URL slug 'hgsev-310'."""
    # Ignore stray surrounding whitespace so the trailing-letter rule still applies.
    code = course_code.strip()
    # Remove a single trailing capital letter (like the 'E' in 'SOCI_V 495E').
    code = re.sub(r'[A-Z]$', '', code)
    # Lowercase and turn the '_V ' campus marker into 'v-'.
    return code.lower().replace('_v ', 'v-')


def _extract_course_fields(html):
    """Parse a course page and return ``(description, title)``, using placeholders for missing parts."""
    soup = BeautifulSoup(html, 'html.parser')
    course_content = soup.find(class_='node node--type-course node--promoted node--view-mode-full')
    if not course_content:
        return _DESCRIPTION_NOT_AVAILABLE, _TITLE_NOT_AVAILABLE

    desc_element = course_content.find(class_='mt-0')
    if not desc_element:
        # Without a description the page is treated as unusable as a whole.
        return _DESCRIPTION_NOT_AVAILABLE, _TITLE_NOT_AVAILABLE

    title_element = course_content.find('strong')
    # A missing <strong> must not discard a description that was found.
    title = title_element.text.strip() if title_element else _TITLE_NOT_AVAILABLE
    return desc_element.text.strip(), title


def add_course_descriptions(df, base_url="https://vancouver.calendar.ubc.ca/course-descriptions/courses/",
                            timeout=10, delay=0.5):
    """
    Add course descriptions and titles to a DataFrame by scraping UBC course pages.

    One HTTP GET is issued per row. Rows whose page cannot be fetched or
    parsed receive the placeholder strings 'Description Not Available' and
    'Title Not Available'; the error is printed and processing continues.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Course_Code' column (e.g. 'ATSC_V 303')
    base_url (str): Base URL for UBC course pages; the formatted code is appended to it
    timeout (float): Seconds to wait for each HTTP response before giving up on that row
    delay (float): Seconds to sleep after each row to avoid hammering the server

    Returns:
    pd.DataFrame: Copy of ``df`` with added 'Description' and 'Course Title' columns

    Raises:
    KeyError: If ``df`` has no 'Course_Code' column
    """
    # Fail fast with a clear message instead of erroring inside the per-row handler.
    if 'Course_Code' not in df.columns:
        raise KeyError("DataFrame must contain a 'Course_Code' column")

    # Work on a copy so the caller's frame is left untouched.
    df_copy = df.copy()
    # Create both output columns up front so the schema is the same even for empty input.
    df_copy['Description'] = ''
    df_copy['Course Title'] = ''

    for index, course_code in df_copy['Course_Code'].items():
        # Start from the placeholders; they are overwritten only on a successful scrape.
        description, title = _DESCRIPTION_NOT_AVAILABLE, _TITLE_NOT_AVAILABLE
        try:
            url = base_url + _format_course_code(course_code)
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, timeout=timeout)
            # Surface HTTP errors (404, 5xx) instead of silently parsing an error page.
            response.raise_for_status()
            description, title = _extract_course_fields(response.content)
        except Exception as e:
            # Best-effort scraping: report the failure and keep going with placeholders.
            print(f"Error processing {course_code}: {e}")

        df_copy.at[index, 'Description'] = description
        df_copy.at[index, 'Course Title'] = title

        if delay > 0:
            time.sleep(delay)

    return df_copy

In [4]:
# Scrape every course in `tools`. This issues one HTTP request per row and
# pauses between rows, so the cell's run time grows with the number of courses.
tools_updated = add_course_descriptions(tools)
# Preview the first three enriched rows.
tools_updated.head(3)

Unnamed: 0,Course_Code,Description,Course Title
0,ATSC_V 303,An introduction to instrumentation used in mon...,Methods in Atmospheric Science
1,CHEM_V 211,Quantitative chemical analysis; chemical and p...,Introduction to Chemical Analysis
2,CHEM_V 311,"Theory, design, and application of instrumenta...",Instrumental Analytical Chemistry


In [5]:
# Spot-check five randomly chosen rows; no random_state is passed, so the
# rows shown differ from run to run.
tools_updated.sample(5)

Unnamed: 0,Course_Code,Description,Course Title
9,NRES_V 340,"Introduction to principles, practice and conte...",Introduction to Geographic Information Systems...
2,CHEM_V 311,"Theory, design, and application of instrumenta...",Instrumental Analytical Chemistry
7,GEOS_V 373,Aerial photography; measurement from aerial ph...,Remote Sensing: Monitoring Earth’s Surface fro...
0,ATSC_V 303,An introduction to instrumentation used in mon...,Methods in Atmospheric Science
5,GEOS_V 309,"Field sampling, instrumentation, surveying, ma...",Geographical Sciences Field Course


In [6]:
# Shape of the subset whose description is still the scrape-failure placeholder;
# the first element of the tuple is the number of courses that could not be scraped.
tools_updated.loc[
    tools_updated['Description'].eq('Description Not Available')
].shape

(0, 3)

In [7]:
# Write the enriched table to disk as a JSON array with one object per row
# (orient='records'), pretty-printed with a two-space indent.
tools_updated.to_json('../json/tools_electives.json', orient='records', indent=2)