In [1]:
from langchain_together import ChatTogether
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from course_model import HIGH_LEVEL_FIELDS
import json
from process_pdf import process_pdf
from langchain_core.rate_limiters import InMemoryRateLimiter

In [2]:
rate_limiter = InMemoryRateLimiter(
    requests_per_second=0.1,  # request once every 10 seconds (limit for togetherai free llama model)
    check_every_n_seconds=0.1,  # Wake up every 100 ms to check whether allowed to make a request,
    max_bucket_size=10,  # Controls the maximum burst size.
)

llm = ChatTogether(
    # model="meta-llama/Llama-3-70b-chat-hf",
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    temperature=0,
    max_tokens=1000,
    timeout=None,
    max_retries=2,
    rate_limiter=rate_limiter,
)

prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "human",
            "You are an expert course-data scraping algorithm. "
            "Only extract relevant information from the text. "
            "Only return information present in the text. "
            "If you do not know or are unsure of the value of an attribute asked to extract, "
            "return null for the attribute's value."
            "Here is the course syllabus in markdown format: "
            "{course_syllabus}"
            "Here is some miscellaneous information about the course website: "
            "{misc_info}"
            "Here is the contents of the course website in markdown format: "
            "{course_website}",
        )
    ]
)

In [3]:
with open("debug_files/info.json") as f:
    misc_info = str(json.load(f))
with open("debug_files/wiki_page.md") as f:
    course_website = f.read()
course_syllabus = process_pdf("debug_files/20251_CSC301H5S_LEC0101_syllabus.pdf")

prompt = prompt_template.invoke({"misc_info": misc_info, "course_website": course_website, "course_syllabus": course_syllabus})

In [4]:
general_course_info = {}

for field in HIGH_LEVEL_FIELDS:
    structured_llm = llm.with_structured_output(schema=field)
    info = structured_llm.invoke(prompt)
    general_course_info[field.__name__] = info
    print(field, info)

<class 'course_model.CourseInformation'> id='CSC301H5S' title='Introduction to Software Engineering' course_url='https://q.utoronto.ca/courses/373305' term='Winter 2025' communication=Communication(platforms=[Platform(name='Piazza', url=AnyUrl('https://piazza.com/class/m5gs2ezw9d15rp/')), Platform(name='MarkUs', url=AnyUrl('https://markus0.teach.cs.toronto.edu/utm-2025-01'))]) policies=Policy(academic_integrity='The University of Toronto’s Code of Behaviour on Academic Matters outlines behaviours that constitute academic dishonesty and the process for addressing academic offences.', late_submission='All deadlines are strict. Exceptions may be made at the discretion of the instructor for documented reasons.', ai_usage='Students may use artificial intelligence tools, including generative AI, in this course as a learning aid or to help produce assignments and project deliverables except for the following two components: presentation component, term test component.', remark_requests='Remar

In [5]:
general_course_info['Labs']

Labs(start_week=2, submission=Submission(platform='MarkUs', policy='No late submissions'), schedule=[LabScheduleItem(week=2, date='Jan 13', topic='Gitflow', notes=None, resources=[Resource(name='CSC301Lab1 (2).pdf', link=AnyUrl('https://q.utoronto.ca/courses/373305/files/35742820?wrap=1'))]), LabScheduleItem(week=3, date='Jan 20', topic='REST', notes=None, resources=[Resource(name='l2Materials-1.zip', link=AnyUrl('https://q.utoronto.ca/courses/373305/files/35826569?wrap=1')), Resource(name='Lab 2 (1)-1.pdf', link=AnyUrl('https://q.utoronto.ca/courses/373305/files/35826570?wrap=1'))]), LabScheduleItem(week=4, date='Jan 27', topic='Neo4J', notes='A1', resources=[Resource(name='L3-1.zip', link=AnyUrl('https://q.utoronto.ca/courses/373305/files/36236769?wrap=1'))]), LabScheduleItem(week=5, date='Feb 3', topic='SPRINT 1 DEMO', notes=None, resources=None), LabScheduleItem(week=6, date='Feb 10', topic='Docker', notes=None, resources=[Resource(name='tut6-3.zip', link=AnyUrl('https://q.utoronto

In [7]:
prompt = prompt_template.invoke({"misc_info": misc_info, "course_website": course_website, "course_syllabus": course_syllabus})
structured_llm = llm.with_structured_output(schema=HIGH_LEVEL_FIELDS[6], include_raw=True)
info = structured_llm.invoke(prompt)
# general_course_info[field.__name__] = info
info

{'raw': AIMessage(content='<function=Tests>[{"name": "Midterm 1", "test_date": "2025-01-30", "time": "14:00-16:00", "location": "MN 1170", "duration": "2 hours", "format": "Closed book", "coverage": "Lectures 1-5", "weight": 15, "resources": [{"name": "Practice test", "link": "https://q.utoronto.ca/courses/373305/files/35742820"}]}, {"name": "Final Exam", "test_date": "2025-03-25", "time": "14:00-16:00", "location": "MN 1170", "duration": "2 hours", "format": "Closed book", "coverage": "All lectures", "weight": 50, "resources": [{"name": "Practice test", "link": "https://q.utoronto.ca/courses/373305/files/35742820"}]}]</function>', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 216, 'prompt_tokens': 6529, 'total_tokens': 6745, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'meta-llama/Llama-3.3-70B-Instruct-Turbo-Free', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-60ee4dc7

In [6]:
info

In [6]:
general_course_info

{'CourseInformation': CourseInformation(id='373305', title='Introduction to Software Engineering', course_url='https://q.utoronto.ca/courses/373305', term='Winter 2025', communication=Communication(platforms=[Platform(name='Piazza', url=AnyUrl('https://piazza.com/class/m5gs2ezw9d15rp/')), Platform(name='Markus', url=AnyUrl('https://markus0.teach.cs.toronto.edu/utm-2025-01'))]), policies=Policy(academic_integrity='Academic integrity is essential to the pursuit of learning and scholarship in a university, and to ensuring that a degree from the University of Toronto Mississauga is a strong signal of each student’s individual academic achievement.', late_submission='All deadlines are strict. Exceptions may be made at the discretion of the instructor for documented reasons.', ai_usage='Students may use artificial intelligence tools, including generative AI, in this course as a learning aid or to help produce assignments and project deliverables except for the following two components: prese