In [15]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import re
import time

In [16]:
# Read the complementary-studies course list from CSV, then preview its first three rows.
comp_studies = pd.read_csv(filepath_or_buffer='../csv/inputs/complementary_studies.csv')
comp_studies.head(n=3)

Unnamed: 0,Course Code,Subject,Course Title,Complementary Study Area
0,HGSE_V 310,Haida Gwaii Semesters,First Nations and Canada: (Re)writing History,Indigenous Knowledge & Perspectives
1,HGSE_V 311,Haida Gwaii Semesters,Law and Governance: Indigenous and European Tr...,Indigenous Knowledge & Perspectives
2,HGSE_V 312,Haida Gwaii Semesters,Perspectives on Reconciliation,Indigenous Knowledge & Perspectives


In [17]:
def add_course_descriptions(df, base_url="https://vancouver.calendar.ubc.ca/course-descriptions/courses/",
                            timeout=10, delay=0.5):
    """
    Add course descriptions to a DataFrame by scraping UBC course pages.

    Each value in the 'Course Code' column is converted to a URL slug, the
    corresponding page is fetched, and the description text is extracted.
    Any row whose description cannot be obtained (missing/blank code, network
    or HTTP error, unexpected page layout, empty description) receives the
    placeholder 'Description Not Available'. Errors are printed, not raised,
    so one bad page never aborts the whole scrape.

    Parameters:
    df (pd.DataFrame): DataFrame with a 'Course Code' column
    base_url (str): Base URL for UBC course pages
    timeout (float): Seconds to wait for each HTTP response before giving up
    delay (float): Seconds to pause after each request, to be polite to the server

    Returns:
    pd.DataFrame: Copy of the DataFrame with an added 'Description' column
                  (the input DataFrame is not modified)

    Raises:
    KeyError: If df has no 'Course Code' column
    """
    not_available = 'Description Not Available'

    def format_course_code(course_code):
        """Convert course code from 'HGSE_V 310' format to 'hgsev-310' format"""
        # Strip whitespace first; otherwise the '$'-anchored regex below cannot
        # see a trailing letter that is followed by stray spaces.
        course_code = course_code.strip()
        # Remove a single trailing letter (like 'E' in SOCI_V 495E)
        course_code = re.sub(r'[A-Z]$', '', course_code)
        # Convert to lowercase and replace '_v ' with 'v-'
        return course_code.lower().replace('_v ', 'v-')

    def fetch_description(url):
        """Fetch one course page and return its description text (or the placeholder)."""
        # Without a timeout a single stalled connection would hang the whole run.
        response = requests.get(url, timeout=timeout)
        # Surface 4xx/5xx responses as errors instead of parsing an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        # NOTE(review): a multi-class string makes BeautifulSoup match the exact
        # class attribute value (order-sensitive); pages whose node carries a
        # different class list will not match -- confirm against the live markup.
        course_content = soup.find(class_='node node--type-course node--promoted node--view-mode-full')
        if course_content is None:
            return not_available

        desc_element = course_content.find(class_='mt-0')
        if desc_element is None:
            return not_available

        # An element that exists but is empty is treated as "no description".
        return desc_element.text.strip() or not_available

    # Work on a copy so the caller's DataFrame is left untouched.
    df_copy = df.copy()

    # Collect results positionally; assigning by index label would overwrite
    # every row sharing a label when the index contains duplicates.
    descriptions = []
    for course_code in df_copy['Course Code']:
        # Missing (NaN/None) or blank codes cannot form a URL: skip the request
        # (and the delay) entirely.
        if not isinstance(course_code, str) or not course_code.strip():
            descriptions.append(not_available)
            continue

        try:
            descriptions.append(fetch_description(base_url + format_course_code(course_code)))
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(f"Error processing {course_code}: {e}")
            descriptions.append(not_available)

        time.sleep(delay)

    df_copy['Description'] = descriptions
    return df_copy

In [18]:
# Build the enriched table by scraping a description for each course, then preview three rows.
comp_studies_updated = add_course_descriptions(df=comp_studies)
comp_studies_updated.head(n=3)

Unnamed: 0,Course Code,Subject,Course Title,Complementary Study Area,Description
0,HGSE_V 310,Haida Gwaii Semesters,First Nations and Canada: (Re)writing History,Indigenous Knowledge & Perspectives,A survey of the history of Indigenous peoples ...
1,HGSE_V 311,Haida Gwaii Semesters,Law and Governance: Indigenous and European Tr...,Indigenous Knowledge & Perspectives,Background on sources of Aboriginal and Canadi...
2,HGSE_V 312,Haida Gwaii Semesters,Perspectives on Reconciliation,Indigenous Knowledge & Perspectives,Notions of reconciliation and restitution that...


In [19]:
# Spot-check five random rows; a fixed seed keeps the displayed sample
# identical across Restart & Run All.
comp_studies_updated.sample(5, random_state=42)

Unnamed: 0,Course Code,Subject,Course Title,Complementary Study Area,Description
20,HGSE_V 360,Haida Gwaii Semesters,Ocean People Culture and Tradition,Indigenous Knowledge & Perspectives,The relationship between the culture of the Ha...
21,HGSE_V 361,Haida Gwaii Semesters,Fisheries Co-Management of the North Pacific C...,Indigenous Knowledge & Perspectives,Concepts of fisheries co-management with a foc...
23,HGSE_V 363,Haida Gwaii Semesters,Applications in Marine Conservation,Indigenous Knowledge & Perspectives,Exposure to field settings and research techni...
10,HGSE_V 350,Haida Gwaii Semesters,Case Studies in Haida Gwaii,Indigenous Knowledge & Perspectives,"Integration of concepts of history, politics, ..."
1,HGSE_V 311,Haida Gwaii Semesters,Law and Governance: Indigenous and European Tr...,Indigenous Knowledge & Perspectives,Background on sources of Aboriginal and Canadi...


In [21]:
# Shape (rows, columns) of the subset whose description is still the placeholder.
comp_studies_updated.loc[
    comp_studies_updated['Description'].eq('Description Not Available')
].shape

(7, 5)

In [22]:
# Write the enriched table to disk as a pretty-printed JSON array, one object per course.
comp_studies_updated.to_json(
    path_or_buf='../json/comp_study_courses.json',
    orient='records',
    indent=2,
)