In [7]:
import requests
from bs4 import BeautifulSoup
import json as JSON
from tqdm import tqdm
import polars as pl
from markdownify import markdownify
from openai import OpenAI
from getpass import getpass
from pprint import pprint
from pydantic import BaseModel, Field
import html
from collections import OrderedDict

In [10]:
# Locations on the deal.II documentation site.
BASE_URL = "https://www.dealii.org"
DOC_URL = "https://dealii.org/current/doxygen/deal.II/"
# The tutorial index lives directly under the doxygen root.
TUTORIAL_INDEX = DOC_URL + "Tutorial.html"

# Heading texts used to locate sections inside the scraped pages.
TUTO_LIST_TITLE = "Tutorial programs listed by number"
CODE_TITLE = "The plain program"

In [9]:

# Download the tutorial index. A timeout keeps the cell from hanging forever and
# raise_for_status() turns an HTTP error page into an immediate, explicit failure.
r = requests.get(TUTORIAL_INDEX, timeout=30)
r.raise_for_status()

# Parse the raw bytes rather than r.text: BeautifulSoup then picks the charset
# declared by the page itself instead of the fallback guessed by requests.
soup = BeautifulSoup(r.content, "html.parser")

# The <h3> heading that precedes the table listing the tutorials by number.
title = soup.find("h3", string=TUTO_LIST_TITLE)
if title is None:
    raise RuntimeError(f"Heading {TUTO_LIST_TITLE!r} not found in {TUTORIAL_INDEX}")

In [11]:
from dataclasses import dataclass


@dataclass
class Tutorial:
    """Record type for one tutorial entry.

    The field names mirror the keys of the dicts appended to ``tutorials`` by the
    scraping loop that follows this class; the class itself is not instantiated there.
    """

    name: str  # same key as the "name" entry built by the loop
    link: str  # same key as the "link" entry built by the loop
    description: str  # same key as the "description" entry built by the loop
    keywords: list[str]  # same key as the "keywords" entry built by the loop

# Collect one plain dict per row of the "listed by number" table.
tutorial_rows = []
for tr in title.find_next_sibling("table").find_all("tr"):
    cells = tr.find_all("td")
    if len(cells) != 2:
        # Not a (name, description) row, e.g. a header or spacer row: skip it
        # instead of crashing on the tuple unpacking below.
        continue
    name_td, descr_td = cells

    # DOC_URL already ends with "/"; strip it so the link does not contain "//".
    link = DOC_URL.rstrip("/") + "/" + name_td.find("a").get("href")
    name = name_td.get_text(strip=True)

    # The description cell optionally ends with "Keywords: a, b, c".
    descr, _, raw_keywords = descr_td.get_text(strip=True).partition("Keywords:")
    descr = descr.strip()
    # Drop empty items so an empty keyword section yields [] rather than [""].
    keywords = [k.strip() for k in raw_keywords.split(",") if k.strip()]

    tutorial_rows.append({"name": name, "link": link, "description": descr, "keywords": keywords})

# Build the frame and put the numeric step number (from "step-<n>") in front.
tutorials = (
    pl.DataFrame(tutorial_rows)
    .lazy()
    .with_columns(pl.col("name").str.split("-").list.get(1).cast(pl.UInt8).alias("num"))
    .select(pl.col("num"), pl.all().exclude("num"))
    .collect()
)
print(f"Found {len(tutorials)} deal.II tutorials.")

tutorials
    

Found 86 deal.II tutorials.


num,name,link,description,keywords
u8,str,str,str,list[str]
1,"""step-1""","""https://dealii.org/current/dox…","""Creating a grid. A simple way …","[""Triangulation"", ""GridGenerator::hyper_cube()"", … ""Triangulation::execute_coarsening_and_refinement()""]"
2,"""step-2""","""https://dealii.org/current/dox…","""Associate degrees of freedom t…","[""FE_Q"", ""DynamicSparsityPattern"", … ""SparsityPattern""]"
3,"""step-3""","""https://dealii.org/current/dox…","""Actually solve Laplace's probl…","[""FEValues"", ""VectorTools::interpolate_boundary_values()"", … ""DataOut""]"
4,"""step-4""","""https://dealii.org/current/dox…","""This example is programmed in …","[""VectorTools::point_value()"", ""VectorTools::compute_mean_value()""]"
5,"""step-5""","""https://dealii.org/current/dox…","""Computations on successively r…","[""PreconditionSSOR"", ""GridIn"", ""SphericalManifold""]"
…,…,…,…,…
85,"""step-85""","""https://dealii.org/current/dox…","""Solving the Poisson equation u…","[""FEInterfaceValues"", ""NonMatching::FEImmersedSurfaceValues""]"
86,"""step-86""","""https://dealii.org/current/dox…","""The heat equation, solved with…","[""PETScWrappers::TimeStepper""]"
87,"""step-87""","""https://dealii.org/current/dox…","""Evaluation of finite element s…","[""Utilities::MPI::RemotePointEvaluation"", ""VectorTools::point_values()""]"
89,"""step-89""","""https://dealii.org/current/dox…","""Matrix-free operator evaluatio…","[""FERemoteEvaluation""]"


In [13]:
# Step numbers that have no row between the smallest and largest published step.
# A set difference reports gaps of any length; comparing consecutive rows with
# `diff == 2` would only catch gaps of exactly one missing number.
published_nums = set(tutorials["num"].to_list())
if published_nums:
    missing_tutorials = [
        num
        for num in range(min(published_nums), max(published_nums) + 1)
        if num not in published_nums
    ]
else:
    missing_tutorials = []

# Build the list outside the f-string: reusing double quotes inside an f-string
# expression is a syntax error before Python 3.12.
missing_str = ", ".join(map(str, missing_tutorials))
print(f"The following tutorials are missing on the website : {missing_str}.")

The following tutorials are missing on the website : 73, 80, 84, 88.


It turns out that there is a [list](https://github.com/dealii/dealii/issues/11998) of unfinished tutorial programs.

In [46]:
import re

STEP_NUM = 87
STEP_URL = f"https://dealii.org/current/doxygen/deal.II/step_{STEP_NUM}.html"
r = requests.get(STEP_URL, timeout=30)
r.raise_for_status()
# Parse the raw bytes so the charset declared by the page is honoured; decoding via
# r.text is what produced the stray "Â" in front of every non-breaking space.
soup = BeautifulSoup(r.content, "html.parser")
content = soup.select_one("div.contents")
if content is None:
    raise RuntimeError(f"No div.contents element found in {STEP_URL}")

# Replace all div.fragment with pre elements
for div in content.find_all("div", class_="fragment"):
    # Drop the elements carrying the "ttc" class before extracting the code text.
    for to_remove in div.select(".ttc"):
        to_remove.decompose()
    pre = soup.new_tag("pre")
    # Non-breaking spaces (U+00A0) become regular spaces.
    pre.string = div.get_text().replace("\xa0", " ").lstrip("\n").rstrip()
    div.replace_with(pre)

text = markdownify(str(content))

# The full program is the first <pre> that follows the "The plain program" heading.
plain_heading = content.find("h1", string=CODE_TITLE)
plain_pre = plain_heading.find_next_sibling("pre") if plain_heading is not None else None
if plain_pre is None:
    raise RuntimeError(f"Section {CODE_TITLE!r} not found in {STEP_URL}")
code = plain_pre.getText()

# remove first c++ docstring
file_docstring_pattern = re.compile(r"^\s*\/\*.*?\*\/\s*", re.DOTALL)
code = file_docstring_pattern.sub("", code)
lines = code.splitlines()
print("Extracted code", "--------------\n", *lines[0:5], '...', *lines[-10:], sep="\n")

# Save the text to a file (explicit encoding: the page contains non-ASCII characters)
with open(f"step_{STEP_NUM}.md", "w", encoding="utf-8") as f:
    f.write(text)
    print(f"\nSaved step-{STEP_NUM} content to step_{STEP_NUM}.md")

Extracted code
--------------

#include <deal.II/base/conditional_ostream.h>
#include <deal.II/base/function_lib.h>
#include <deal.II/base/function_signed_distance.h>
#include <deal.II/base/mpi.h>
#include <deal.II/base/mpi.templates.h>
...
 Utilities::MPI::MPI_InitFinalize mpi(argc, argv, 1);
  std::cout.precision(5);
 
 if (Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
    Step87::example_0(); // only run on root process
 
  Step87::example_1();
  Step87::example_2();
  Step87::example_3();
}

Saved step-87 content to step_87.md


In [11]:
def get_tutorial_content(link: str) -> str:
    """Download one tutorial page and return its main content as Markdown.

    Args:
        link: URL of the tutorial page.

    Returns:
        The Markdown produced by ``markdownify`` from the page's ``div.contents``
        element, after every ``div.fragment`` has been replaced by a ``<pre>``
        holding its plain text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``div.contents`` element.
    """
    r = requests.get(link, timeout=30)
    r.raise_for_status()
    # Parse the raw bytes so the charset declared by the page is honoured; decoding
    # via r.text is what produced the stray "Â" in front of non-breaking spaces.
    soup = BeautifulSoup(r.content, "html.parser")
    content = soup.select_one("div.contents")
    if content is None:
        raise ValueError(f"No div.contents element found in {link}")

    # Replace all div.fragment with pre elements
    for div in content.find_all("div", class_="fragment"):
        # Drop the elements carrying the "ttc" class before extracting the code text.
        for to_remove in div.select(".ttc"):
            to_remove.decompose()
        pre = soup.new_tag("pre")
        # Non-breaking spaces (U+00A0) become regular spaces.
        pre.string = div.get_text().replace("\xa0", " ").lstrip("\n").rstrip()
        div.replace_with(pre)

    return markdownify(str(content))
    
# Fetch the Markdown content of every tutorial, ticking a progress bar per page,
# then persist the enriched frame to parquet.
with tqdm(total=len(tutorials)) as progress:

    def fetch_and_tick(link: str):
        """Return the Markdown of the page at `link` and advance the progress bar."""
        markdown = get_tutorial_content(link)
        progress.update(1)
        return markdown

    content_expr = (
        pl.col("link")
        .alias("content")
        .map_elements(fetch_and_tick, return_dtype=pl.String)
    )
    tutorials = tutorials.lazy().with_columns(content_expr).collect()

tutorials.write_parquet("tutorials.parquet")
tutorials

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 86/86 [01:07<00:00,  1.27it/s]


num,name,link,description,keywords,content
u8,str,str,str,list[str],str
1,"""step-1""","""https://dealii.org/current/dox…","""Creating a grid. A simple way …","[""Triangulation"", ""GridGenerator::hyper_cube()"", … ""Triangulation::execute_coarsening_and_refinement()""]","""| **Table of contents** | | | …"
2,"""step-2""","""https://dealii.org/current/dox…","""Associate degrees of freedom t…","[""FE_Q"", ""DynamicSparsityPattern"", … ""SparsityPattern""]","""This tutorial depends on [step…"
3,"""step-3""","""https://dealii.org/current/dox…","""Actually solve Laplace's probl…","[""FEValues"", ""VectorTools::interpolate_boundary_values()"", … ""DataOut""]","""This tutorial depends on [step…"
4,"""step-4""","""https://dealii.org/current/dox…","""This example is programmed in …","[""VectorTools::point_value()"", ""VectorTools::compute_mean_value()""]","""This tutorial depends on [step…"
5,"""step-5""","""https://dealii.org/current/dox…","""Computations on successively r…","[""PreconditionSSOR"", ""GridIn"", ""SphericalManifold""]","""This tutorial depends on [step…"
…,…,…,…,…,…
85,"""step-85""","""https://dealii.org/current/dox…","""Solving the Poisson equation u…","[""FEInterfaceValues"", ""NonMatching::FEImmersedSurfaceValues""]","""This tutorial depends on [step…"
86,"""step-86""","""https://dealii.org/current/dox…","""The heat equation, solved with…","[""PETScWrappers::TimeStepper""]","""This tutorial depends on [step…"
87,"""step-87""","""https://dealii.org/current/dox…","""Evaluation of finite element s…","[""Utilities::MPI::RemotePointEvaluation"", ""VectorTools::point_values()""]","""This tutorial depends on [step…"
89,"""step-89""","""https://dealii.org/current/dox…","""Matrix-free operator evaluatio…","[""FERemoteEvaluation""]","""This tutorial depends on [step…"


In [56]:
# Take the "content" value of the row at index 1 (the second row) as a sample.
tutorial_content = tutorials["content"][1]
print(tutorial_content)

This tutorial depends on [step-1](step_1.html).

| **Table of contents** | |
| --- | --- |
| 1. [Introduction](#step_2-Intro)    * [Enumerating degrees of freedom](#step_2-Enumeratingdegreesoffreedom) * [Sparsity](#step_2-Sparsity) * [How degrees of freedom are enumerated](#step_2-Howdegreesoffreedomareenumerated)- [The commented program](#step_2-CommProg)      * [Mesh generation](#step_2-Meshgeneration)* [Outputting the location of degrees of freedom](#step_2-Outputtingthelocationofdegreesoffreedom)* [Creation of a DoFHandler](#step_2-CreationofaDoFHandler)* [Renumbering of DoFs](#step_2-RenumberingofDoFs)* [The main function](#step_2-Themainfunction) | 1. [Results](#step_2-Results)    * [Possibilities for extensions](#step_2-Possibilitiesforextensions)- [The plain program](#step_2-PlainProg) |

Introduction

Note
:   The material presented here is also discussed in [video lecture 9](https://www.math.colostate.edu/~bangerth/videos.676.9.html). (All video lectures are also available [h

In [18]:
# Ask for the key interactively so it is never stored in the notebook source.
oai_key = getpass(prompt="Enter your OpenAI API key: ")

In [13]:
# OpenAI client authenticated with the key entered through getpass() above.
client = OpenAI(api_key=oai_key)

In [15]:
model = "gpt-4.1-mini"
# NOTE: this is an f-string, so despite its name it is rendered right here with the
# current value of `tutorial_content`; it is the final prompt, not a reusable template.
prompt_template = f"""You are an AI expert in C++. Your task is to analyze the following Deal.II tutorial content, 
and provide a pair prompt / completion that is ideally suited for fine tuning a code LLM. You will provide this pair in a JSON format. 
The prompt is what a scientific user would ask of the code LLM, focusing on theoritical aspect and not the technical one (so no reference to deal.II).
(example: Solve the scalar wave equation with Dirichlet boundary conditions),
and the completion is the final code.
Tutorial content :
{tutorial_content}
"""
stream = client.responses.create(model=model, input=prompt_template, stream=True, temperature=0.2)

# Echo the text deltas as they arrive and keep them to rebuild the full answer.
chunks = []
for event in stream:
    if event.type == "response.output_text.delta":
        chunks.append(event.delta)
        print(event.delta, end="", flush=True)
response = "".join(chunks)

# Extract the JSON object from the response. The model may or may not wrap it in a
# Markdown code fence (and may add a trailing newline), so take everything between
# the first "{" and the last "}" instead of dropping the first and last lines.
json_start = response.find("{")
json_end = response.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError("No JSON object found in the model response")

pair = JSON.loads(response[json_start : json_end + 1])
print(pair)

```json
{
  "prompt": "Enumerate the degrees of freedom for a finite element approximation of the Laplace equation on a 2D circular domain using lowest order (Q1) elements, visualize the sparsity pattern of the resulting system matrix, and improve the sparsity pattern by renumbering the degrees of freedom using the Cuthill-McKee algorithm.",
  "completion": "#include <deal.II/grid/tria.h>\n#include <deal.II/grid/grid_generator.h>\n#include <deal.II/grid/grid_out.h>\n\n#include <deal.II/dofs/dof_handler.h>\n\n#include <deal.II/fe/fe_q.h>\n#include <deal.II/dofs/dof_tools.h>\n#include <deal.II/fe/mapping_q1.h>\n\n#include <deal.II/lac/sparse_matrix.h>\n#include <deal.II/lac/dynamic_sparsity_pattern.h>\n\n#include <deal.II/dofs/dof_renumbering.h>\n\n#include <fstream>\n\nusing namespace dealii;\n\nvoid make_grid(Triangulation<2> &triangulation)\n{\n  const Point<2> center(1, 0);\n  const double inner_radius = 0.5, outer_radius = 1.0;\n  GridGenerator::hyper_shell(triangulation, center, in

In [16]:
# Pretty-print the (long, single-line) prompt; print the completion verbatim so the
# code keeps its line breaks.
for render, key in ((pprint, "prompt"), (print, "completion")):
    render(pair[key])

('Enumerate the degrees of freedom for a finite element approximation of the '
 'Laplace equation on a 2D circular domain using lowest order (Q1) elements, '
 'visualize the sparsity pattern of the resulting system matrix, and improve '
 'the sparsity pattern by renumbering the degrees of freedom using the '
 'Cuthill-McKee algorithm.')
#include <deal.II/grid/tria.h>
#include <deal.II/grid/grid_generator.h>
#include <deal.II/grid/grid_out.h>

#include <deal.II/dofs/dof_handler.h>

#include <deal.II/fe/fe_q.h>
#include <deal.II/dofs/dof_tools.h>
#include <deal.II/fe/mapping_q1.h>

#include <deal.II/lac/sparse_matrix.h>
#include <deal.II/lac/dynamic_sparsity_pattern.h>

#include <deal.II/dofs/dof_renumbering.h>

#include <fstream>

using namespace dealii;

void make_grid(Triangulation<2> &triangulation)
{
  const Point<2> center(1, 0);
  const double inner_radius = 0.5, outer_radius = 1.0;
  GridGenerator::hyper_shell(triangulation, center, inner_radius, outer_radius, 5);

  for (unsigne

In [17]:
# Save the completion to a file
with open("step_1_completion.cpp", "w") as f:
    f.write(pair["completion"])
# Save the prompt to a file
with open("step_1_prompt.txt", "w") as f:
    f.write(pair["prompt"])
# Save the pair to a JSON file
with open("step_1_pair.json", "w") as f:
    JSON.dump(pair, f, indent=4)

In [90]:


class PromptCompletionPair(BaseModel):
    # Structured-output model: get_prompt_and_completion() passes this class as
    # `text_format` to client.responses.parse() and returns the parsed instance.
    # NOTE(review): documented with comments rather than a docstring because a
    # docstring would presumably be copied into the generated JSON schema — confirm.
    prompt: str  # the request a scientific user would make (see the prompt template)
    completion: str  # "the final code", as the prompt template puts it
    
# Template rendered with str.format(); placeholders: {description} and {content}.
prompt_template = """You are an expert in AI finetuning and C++. Your task is to analyze the following Deal.II tutorial content,
and provide a pair prompt / completion that is ideally suited for fine tuning a code LLM. You will provide this pair in a JSON format.
The prompt is what a scientific user would ask of the code LLM, focusing on theoritical aspect and not the technical one (so no reference to deal.II or C++).
(example: Solve the scalar wave equation in 2D with Dirichlet boundary conditions),
and the completion is the final code.
Tutorial description : 
{description}
Tutorial content :
{content}
"""

def get_prompt_and_completion(content, description, client: OpenAI | None = None) -> PromptCompletionPair:
    """Ask the model for a prompt/completion pair describing one tutorial.

    Args:
        content: Markdown content of the tutorial page.
        description: Short description of the tutorial.
        client: Optional OpenAI client to reuse across calls. When omitted, a new
            client is created from the notebook-level ``oai_key``.

    Returns:
        The parsed ``PromptCompletionPair``.

    Raises:
        ValueError: if the response carries no parsed output.
    """
    if client is None:
        client = OpenAI(api_key=oai_key)
    model = "gpt-4.1-mini"

    response = client.responses.parse(
        model=model,
        input=prompt_template.format(content=content, description=description),
        text_format=PromptCompletionPair,
        temperature=0.2,
    )

    pair = response.output_parsed
    if pair is None:
        # Fail here rather than returning None under a PromptCompletionPair annotation.
        raise ValueError("The model response contained no parsed prompt/completion pair")
    return pair


# Try the helper on step-3: read its content and description back from the parquet
# cache, in the same order as the function's positional parameters.
step3_content, step3_description = (
    pl.scan_parquet("tutorials.parquet")
    .filter(pl.col("num") == 3)
    .select_seq("content", "description")
    .collect()
    .row(0)
)
get_prompt_and_completion(step3_content, step3_description)

PromptCompletionPair(prompt='Solve the Poisson equation \\(-\\Delta u = 1\\) on the square domain \\([-1,1]^2\\) with homogeneous Dirichlet boundary conditions \\(u=0\\) on \\(\\partial \\Omega\\), using the finite element method with bilinear elements. Assemble the system matrix and right hand side vector by integrating over the domain, apply the boundary conditions, solve the resulting linear system using the Conjugate Gradient method, and output the solution for visualization.', completion='#include <deal.II/grid/tria.h>\n#include <deal.II/dofs/dof_handler.h>\n#include <deal.II/grid/grid_generator.h>\n#include <deal.II/fe/fe_q.h>\n#include <deal.II/dofs/dof_tools.h>\n#include <deal.II/fe/fe_values.h>\n#include <deal.II/base/quadrature_lib.h>\n#include <deal.II/base/function.h>\n#include <deal.II/numerics/vector_tools.h>\n#include <deal.II/numerics/matrix_tools.h>\n#include <deal.II/lac/vector.h>\n#include <deal.II/lac/full_matrix.h>\n#include <deal.II/lac/sparse_matrix.h>\n#include 

In [92]:
# Show the row of the in-memory frame for step-3.
tutorials.filter(pl.col("num").eq(3))

num,name,link,description,keywords,content
u8,str,str,str,list[str],str
3,"""step-3""","""https://dealii.org/current/dox…","""Actually solve Laplace's probl…","[""FEValues"", ""VectorTools::interpolate_boundary_values()"", … ""DataOut""]","""This tutorial depends on [step…"


In [15]:
def with_prompts_and_completions(tutorials: pl.DataFrame, limit: int | None = 1) -> pl.DataFrame:
    """Add generated ``prompt`` and ``completion`` columns to the tutorials frame.

    Args:
        tutorials: Frame with at least the ``content`` and ``description`` columns.
        limit: Number of leading rows to process. Defaults to 1, which is what this
            function has always processed; pass ``None`` to process every row
            (one API call per row).

    Returns:
        The processed rows with two extra string columns, ``prompt`` and
        ``completion``, appended after the existing ones.
    """
    selected = tutorials if limit is None else tutorials.head(limit)
    pair_dtype = pl.Struct({"prompt": pl.String, "completion": pl.String})

    # Size the bar from the rows actually processed, not from the whole frame.
    with tqdm(total=len(selected)) as pbar:
        pbar.set_description("Processing tutorials to generate prompt/completion pairs")

        def make_pair(row: dict) -> dict:
            # map_elements on a struct column hands each row over as a dict.
            content = row["content"]
            # Only the first quarter of the page is sent
            # (presumably to keep the request small — TODO confirm).
            res = get_prompt_and_completion(content[: len(content) // 4], row["description"])
            pbar.update()
            return res.model_dump()

        result = (
            selected.lazy()
            .with_columns(
                # get_prompt_and_completion needs the description as well, so both
                # columns are packed into one struct for the row-wise call.
                pl.struct("content", "description")
                .map_elements(make_pair, return_dtype=pair_dtype)
                .alias("prompt/completion")
            )
            .unnest("prompt/completion")
            .collect()
        )
    return result

#res = with_prompts_and_completions(tutorials)
# res

In [25]:
from typing import Dict, Literal
import openai


class Message(BaseModel):
    """Single chat message placed in the ``messages`` list of a chat request body."""

    # One of the three accepted roles; messages are user messages unless stated otherwise.
    role: Literal["user", "system", "assistant"] = "user"
    content: str


# JSON schema of the object the model must return: exactly two string properties.
_PAIR_SCHEMA = {
    "type": "object",
    "properties": {
        "prompt": {
            "type": "string",
            "description": "The prompt that a scientific user would ask of the code LLM, focusing on the theoretical aspect and not the technical one (so no reference to deal.II or C++)",
        },
        "completion": {
            "type": "string",
            "description": "The final code completion",
        },
    },
    "additionalProperties": False,
    "required": ["prompt", "completion"],
}


class OpenAIChatBody(BaseModel):
    """Body of one chat-completions request, with a JSON-schema response format."""

    model: str = "gpt-4.1-mini"
    messages: list[Message] = Field(default_factory=list)
    temperature: float = 0.2
    response_format: Dict = {
        "type": "json_schema",
        "json_schema": {
            "name": "finetune_prompt-completion_pair",
            "strict": True,
            "description": "A pair of prompt and completion for finetuning a code LLM",
            "schema": _PAIR_SCHEMA,
        },
    }


class OpenAIRequest(BaseModel):
    # One line of the batch input file: get_batch_file_content() serialises each
    # instance with model_dump_json() and joins the results with newlines.
    custom_id: str  # get_batch_file_content() sets this to the tutorial link
    method: Literal["POST"] = "POST"
    url: str = "/v1/chat/completions"  # same value as the `endpoint` given to client.batches.create()
    body: OpenAIChatBody


def get_batch_file_content(tutorials: pl.DataFrame, limit: int | None = None) -> str:
    """Build the JSONL payload with one request per tutorial.

    Args:
        tutorials: Frame with at least the ``link``, ``description`` and
            ``content`` columns.
        limit: Maximum number of leading rows to include; ``None`` means all rows.

    Returns:
        One serialised ``OpenAIRequest`` per line, joined with newlines (no
        trailing newline). Each request uses the tutorial link as ``custom_id``.
    """
    prompt_template = """You are an expert in AI finetuning and C++. Your task is to analyze the following Deal.II tutorial content,
and provide a pair prompt / completion that is ideally suited for fine tuning a code LLM. You will provide this pair in a JSON format.
The prompt is what a scientific user would ask of the code LLM, focusing on theoritical aspect and not the technical one (so no reference to deal.II or C++).
(example: Solve the scalar wave equation in 2D with Dirichlet boundary conditions),
and the completion is the final code.
Tutorial description : 
{descr}
Tutorial content :
{content}
"""
    # Slice first so the progress total always matches the rows actually iterated,
    # including when limit is 0 or larger than the frame.
    selected = tutorials if limit is None else tutorials.head(limit)
    rows = selected.select("link", "description", "content").iter_rows()

    request_lines = []
    for link, descr, content in tqdm(rows, total=len(selected)):
        req = OpenAIRequest(
            custom_id=link,
            body=OpenAIChatBody(
                messages=[
                    Message(
                        content=prompt_template.format(descr=descr, content=content)
                    )
                ]
            ),
        )
        request_lines.append(req.model_dump_json())

    return "\n".join(request_lines)


def create_batch_file(
    tutorials: pl.DataFrame,
    limit: int | None = None,
    path: str = "batch_tutorials_prompt_completion.jsonl",
):
    """Write the batch requests for `tutorials` to a JSONL file.

    Args:
        tutorials: Frame passed on to ``get_batch_file_content``.
        limit: Maximum number of rows to include; ``None`` means all rows.
        path: Destination file. Defaults to the file name this function has
            always written to.
    """
    # Build the payload before opening the file so that a failure while generating
    # it does not leave a truncated file behind.
    payload = get_batch_file_content(tutorials, limit)
    # Explicit UTF-8: the tutorial text contains non-ASCII characters and the file
    # is later uploaded as raw bytes.
    with open(path, "w", encoding="utf-8") as f:
        f.write(payload)


# Reload the scraped tutorials from the parquet cache and build a small sample batch.
tutorials = pl.read_parquet("tutorials.parquet")
batch_file_content = get_batch_file_content(tutorials, limit=5)
# Each request embeds a whole tutorial page, so show only the beginning of the
# payload instead of dumping all of it into the cell output.
print(batch_file_content[:1000])

100%|██████████| 5/5 [00:00<00:00, 2100.30it/s]






In [27]:
client = OpenAI(api_key=oai_key)
batch_file = "batch_tutorials_prompt_completion.jsonl"

# The tutorials are submitted in windows of `step` rows; `k` selects the window.
# A fractional k shifts the window by part of a step (k = 2.5 -> rows 50..69).
step = 20
k = 2.5
start = int(k * step)
end = min(len(tutorials), int((k + 1) * step))
reached_end = end == len(tutorials)
print(f"start: {start}, end: {end}, reached end: {reached_end}")
create_batch_file(tutorials[start: end])

# Upload the file: the batch-creation cell that follows needs `batch_file_id`, so
# this step must run for the notebook to work from a fresh kernel.
with open(batch_file, "rb") as f:
    batch_file_id = client.files.create(
        file=f,
        purpose="batch"
    ).id

start: 50, end: 70, reached end: False


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:00<00:00, 1022.33it/s]


In [125]:
# Submit the uploaded request file as a batch job against the chat-completions endpoint.
batch = client.batches.create(
    endpoint="/v1/chat/completions",
    input_file_id=batch_file_id,
    completion_window="24h",
)
batch

Batch(id='batch_680bb564943c819099243916122c52bb', completion_window='24h', created_at=1745597796, endpoint='/v1/chat/completions', input_file_id='file-XpxfGR2iF7R6TYBCz61VYM', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1745684196, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [129]:
# Set RESUME_BATCH_ID to a known batch id (for example after a kernel restart) to
# poll that batch; leave it as None to poll the batch created in this session.
RESUME_BATCH_ID = None  # e.g. "batch_680ba4a753188190af3ddd234ac386a5"
batch_id = RESUME_BATCH_ID if RESUME_BATCH_ID is not None else batch.id
client.batches.retrieve(batch_id).model_dump()

{'id': 'batch_680bb564943c819099243916122c52bb',
 'completion_window': '24h',
 'created_at': 1745597796,
 'endpoint': '/v1/chat/completions',
 'input_file_id': 'file-XpxfGR2iF7R6TYBCz61VYM',
 'object': 'batch',
 'status': 'validating',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': None,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1745684196,
 'failed_at': None,
 'finalizing_at': None,
 'in_progress_at': None,
 'metadata': None,
 'output_file_id': None,
 'request_counts': {'completed': 0, 'failed': 0, 'total': 0}}

In [None]:
# NOTE(review): bare attribute access, never executed (no execution count) and not
# assigned to anything; looks like a leftover inspection cell that can be deleted.
client.chat.completions

In [29]:

# Replay the first request of the batch file directly against the API.
batch_file = "batch_tutorials_prompt_completion.jsonl"
with open(batch_file, "r", encoding="utf-8") as f:
    line = f.readline()
openai_api_url = "https://api.openai.com"
r = JSON.loads(line)
http_response = requests.post(
    openai_api_url + r["url"],
    headers={
        "Authorization": f"Bearer {oai_key}",
        "Content-Type": "application/json",
    },
    json=r["body"],
    # The request carries a whole tutorial page, so allow a generous but finite wait.
    timeout=300,
)
# Surface HTTP errors here instead of as a KeyError when the payload is read later.
http_response.raise_for_status()
response = http_response.json()

In [35]:
# Display the request that was parsed from the first line of the batch file.
r

{'custom_id': 'https://dealii.org/current/doxygen/deal.II//step_51.html',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4.1-mini',
  'messages': [{'role': 'user',
  'temperature': 0.2,
  'response_format': {'type': 'json_schema',
   'json_schema': {'name': 'finetune_prompt-completion_pair',
    'strict': True,
    'description': 'A pair of prompt and completion for finetuning a code LLM',
    'schema': {'type': 'object',
     'properties': {'prompt': {'type': 'string',
       'description': 'The prompt that a scientific user would ask of the code LLM, focusing on the theoretical aspect and not the technical one (so no reference to deal.II or C++)'},
      'completion': {'type': 'string',
       'description': 'The final code completion'}},
     'additionalProperties': False,
     'required': ['prompt', 'completion']}}}}}

In [1]:

# An API error payload has no "choices" entry; report it explicitly.
if "choices" not in response:
    raise RuntimeError(f"OpenAI request failed: {response.get('error', response)}")

# The message content is itself a JSON document with "prompt" and "completion".
pair_payload = JSON.loads(response["choices"][0]["message"]["content"])
prompt = pair_payload["prompt"]
completion = pair_payload["completion"]

with open("completion.cpp", "w", encoding="utf-8") as f:
    f.write(completion)

NameError: name 'JSON' is not defined