In [1]:
# This notebook is an alternative to download_learn.ipynb
# Instead of downloading content directly from Blackboard Learn via the API
# This code parses an exported .zip file for a course
# The .env file is not used and instead a path to the zip must be provided in the constants below

In [2]:
# Configuration
package_info_filename = ".bb-package-info"
manifest_filename = "imsmanifest.xml"

from dotenv import dotenv_values

config = dotenv_values(".env")
courses_dirname = config["COURSE_DATA_DIR"]
export_zip_path = config["LEARN_EXPORT_FILE"]

import os

os.makedirs(config["COURSE_DATA_DIR"], exist_ok=True)

In [3]:
# Get the course ID and extract the export zip file into a correspondingly named subdirectory under courses/
import os
import zipfile
from dotenv import dotenv_values

with zipfile.ZipFile(export_zip_path, "r") as zip:
    if package_info_filename not in map(lambda zinfo: zinfo.filename, zip.infolist()):
        print(
            f"Could not find {package_info_filename} in the provided zip file! Is the file corrupted?"
        )
    else:
        zip.extract(package_info_filename)
        package_info = dotenv_values(package_info_filename)
        course_id = package_info.get("cx.config.course.id")

        if not course_id or not course_id.startswith("INFR"):
            print(
                f"The course export file seems to be for a course that is not from informatics! {course_id=}"
            )
        else:
            os.remove(package_info_filename)
            course_dir = os.path.join(courses_dirname, course_id)
            # Do not re-extract course files if already present
            # When running multiple times for the same course, either delete the course subdirectory
            # Or ignore the error and proceed to the next cells anyway -- it will work fine with the old files
            os.makedirs(course_dir, exist_ok=False)
            zip.extractall(course_dir)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: './courses/INFR111992022-3SV1SEM2'

In [4]:
# Parse the course manifest file
import xml.etree.ElementTree as ET


# Borrowed from https://stackoverflow.com/a/10151590
def parse_and_get_ns(file):
    events = "start", "start-ns"
    root = None
    ns = {}
    for event, elem in ET.iterparse(file, events):
        if event == "start-ns":
            if elem[0] in ns and ns[elem[0]] != elem[1]:
                # NOTE: It is perfectly valid to have the same prefix refer
                #     to different URI namespaces in different parts of the
                #     document. This exception serves as a reminder that this
                #     solution is not robust.    Use at your own peril.
                raise KeyError("Duplicate prefix with different URI found.")
            ns[elem[0]] = "{%s}" % elem[1]
        elif event == "start":
            if root is None:
                root = elem
    return ET.ElementTree(root), ns


manifest, ns = parse_and_get_ns(os.path.join(course_dir, manifest_filename))

course_details = manifest.find(
    "./resources/resource[@type='course/x-bb-coursesetting']"
)
course_name = course_details.get(f"{ns['bb']}title")

In [9]:
def visit_bb_content():
    pass


def visit_toc_subitem(toc_subitem, manifest, ns):
    # Subitems may be documents, quizzes, folders, or staffinfo items
    # We ignore quizzes, get the body for document, and recursively process folders
    idref = toc_subitem.get("identifierref")
    content_resource = manifest.find(f"./resources/resource[@identifier='{idref}']")
    content_filename = content_resource.get(f"{ns['bb']}file")
    content = ET.parse(os.path.join(course_dir, content_filename))

    description_tag = content.find("DESCRIPTION")
    body_tag = content.find("BODY")
    result = {
        "title": toc_subitem.findtext("title"),
        "available": content.find("FLAGS/ISAVAILABLE").get("value") == "true",
        "description": description_tag.get("value")
        if description_tag is not None
        else None,
        "body": body_tag.findtext("TEXT") if body_tag is not None else None,
        "children": [],
    }

    is_folder = content.find("FLAGS/ISFOLDER").get("value") == "true"
    if is_folder:
        for child in toc_subitem.iterfind("./item"):
            result["children"].append(visit_toc_subitem(child, manifest, ns))
    return result


def visit_toc_item(toc_item, manifest, ns):
    idref = toc_item.get("identifierref")
    is_enabled = False

    if idref is not None:
        content_resource = manifest.find(f"./resources/resource[@identifier='{idref}']")
        content_filename = content_resource.get(f"{ns['bb']}file")
        content = ET.parse(os.path.join(course_dir, content_filename))
        is_enabled = content.find("FLAGS/ISENABLED").get("value") == "true"

    result = {
        "title": toc_item.findtext("title"),
        "enabled": is_enabled,
        "children": [],
    }
    # Assumption: each TOC item is either an external URL
    #  or has 1 item called --TOP-- and all subitems are inside this --TOP-- item
    for subitem in toc_item.iterfind("./item"):
        if subitem.findtext("title") == "--TOP--":
            for containeditem in subitem.iterfind("./item"):
                result["children"].append(
                    visit_toc_subitem(containeditem, manifest, ns)
                )
        else:
            result["children"].append(visit_toc_subitem(subitem, manifest, ns))
    return result


def build_toc(manifest, ns):
    toc = []
    # Assumption: there is only 1 organization in organizations, and only one TOC item in the organization
    for item in manifest.iterfind("./organizations/organization/item"):
        # It would be more convenient to set up a dictionary mapping each item title to its properties
        # But titles are not guaranteed to be unique... so a list is safer
        toc.append(visit_toc_item(item, manifest, ns))
    return toc


toc = build_toc(manifest, ns)

In [10]:
import json

with open(
    file=os.path.join(courses_dirname, f"{course_id}.json"), mode="w", encoding="utf-8"
) as f:
    json.dump({"course_name": course_name, "toc": toc}, f)