In [6]:
import openai
from typing import List, Dict

COMPLETION_MODEL = "gpt-3.5-turbo"

def get_response(prompt, previous_messages: List[Dict[str, str]] = None):
    if previous_messages is None:
        previous_messages = []
    new_message = [
        {
            "content": prompt,
            "role": "user",
        }
    ]
    messages = previous_messages + new_message
    return openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=messages,
    )

In [7]:
input_text = """
This function is slow, can you make it faster? 
The only available pandoc methods are: read(), write(), and iter().
Minimize prose and maximize code in your answer.
Feel free to create any functions or import extra libraries you need.

```
import pandoc
from pandoc import Header, Para, BulletList, OrderedList

def split_doc_by_header(doc, include_orphans=True):
    \"""From split_markdown.ipynb, split a pandoc document into sections by header.
    
    Args:
        doc (pandoc.Document): The document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.

    Example:
        >>> doc = pandoc.read("test.md")
        >>> split_doc_by_header(doc)
            {'Header 1': ['This is a test.'], 'Header 2': ['This is another test.']}
    \"""
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if elt == doc:
            continue
        match elt:
            case Header(_, _, title):
                title_str = pandoc.write(title).strip()
                sections[pandoc.write(title).strip()] = []
            case Para(x):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                sections[title_str].append(pandoc.write(x, options=["--wrap=none"]).strip())
            case BulletList(_) | OrderedList(_):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                # split the list on newlines
                list_str = pandoc.write(elt, options=["--wrap=none"])
                list_items = list_str.splitlines()
                sections[title_str].extend(list_items)
    return sections
```
"""

res = get_response(input_text)

KeyboardInterrupt: 

In [None]:
print(res.choices[0].message.content)



One way to optimize this function is to minimize the number of times we call `pandoc.write()`, which can be a costly operation. We can do this by storing the title string in a variable and reusing it when we append paragraphs and list items to the appropriate section. Additionally, we can use a dictionary comprehension to simplify the code and eliminate the need for the `if not title_str and include_orphans:` check.

Here's the optimized function:

```
def split_doc_by_header(doc, include_orphans=True):
    """Split a pandoc document into sections by header.

    Args:
        doc (pandoc.Document): The document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.
    """
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if isinstance(elt, Header):
            title_str = pandoc.write(elt.content).strip

In [26]:
import pandoc
from pandoc.types import Header, Para, BulletList, OrderedList

def split_doc_by_header_orig(doc, include_orphans=True):
    """From split_markdown.ipynb, split a pandoc document into sections by header.
    
    Args:
        doc (pandoc.Document): The document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.

    Example:
        >>> doc = pandoc.read("test.md")
        >>> split_doc_by_header(doc)
            {'Header 1': ['This is a test.'], 'Header 2': ['This is another test.']}
    """
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if elt == doc:
            continue
        match elt:
            case Header(_, _, title):
                title_str = pandoc.write(title).strip()
                sections[pandoc.write(title).strip()] = []
            case Para(x):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                sections[title_str].append(pandoc.write(x, options=["--wrap=none"]).strip())
            case BulletList(_) | OrderedList(_):
                if not title_str and include_orphans:
                    title_str = "[No Header]"
                if title_str not in sections and include_orphans:
                    sections[title_str] = []
                # split the list on newlines
                list_str = pandoc.write(elt, options=["--wrap=none"])
                list_items = list_str.splitlines()
                sections[title_str].extend(list_items)
    return sections


def split_doc_by_header(doc, include_orphans=True):
    """Split a pandoc document into sections by header.

    Args:
        doc (pandoc.Document): The document to split.
        include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.

    Returns:
        dict: A dictionary mapping section titles to lists of strings.
    """
    sections = {}
    title_str = ""
    for elt in pandoc.iter(doc):
        if isinstance(elt, Header):
            title_str = elt[1][0]
            sections[title_str] = []
        elif isinstance(elt, Para):
            sections.setdefault(title_str or "[No Header]", []).append(" ".join([sub[0] for sub in elt[0] if isinstance(sub, pandoc.types.Str)]))
        elif isinstance(elt, (BulletList, OrderedList)):
            list_str = pandoc.write(elt, options=["--wrap=none"])
            list_items = list_str.splitlines()
            sections.setdefault(title_str or "[No Header]", []).extend(list_items)
    if include_orphans:
        sections = {title or "[No Header]": items for title, items in sections.items()}
    return sections

doc = pandoc.read(file="tmp/dnd-notes-main/Factions/The Institute.md")
sections = split_doc_by_header(doc)

KeyboardInterrupt: 

In [23]:
# compare runtime of original and new function
import timeit
print(timeit.timeit("split_doc_by_header_orig(doc)", globals=globals(), number=100))
print(timeit.timeit("split_doc_by_header(doc)", globals=globals(), number=100))


33.25858767400496
4.103509199980181


In [25]:
# get runtime line-by-line for original and new function, using line_profiler and 100 runs
import line_profiler
lp = line_profiler.LineProfiler()
lp.add_function(split_doc_by_header_orig)
lp.add_function(split_doc_by_header)
lp.runcall(split_doc_by_header_orig, doc)
lp.runcall(split_doc_by_header, doc)
lp.print_stats()


Timer unit: 1e-09 s

Total time: 0.346911 s
File: /tmp/ipykernel_1534222/4119657474.py
Function: split_doc_by_header_orig at line 4

Line #      Hits         Time  Per Hit   % Time  Line Contents
     4                                           def split_doc_by_header_orig(doc, include_orphans=True):
     5                                               """From split_markdown.ipynb, split a pandoc document into sections by header.
     6                                               
     7                                               Args:
     8                                                   doc (pandoc.Document): The document to split.
     9                                                   include_orphans (bool, optional): Whether to include sections that don't have a header. Defaults to True.
    10                                           
    11                                               Returns:
    12                                                   dict: A dictionary

In [37]:
import re

def parse_markdown(markdown_string):
    headings_regex = r'(?P<level>#+)\s*(?P<heading>.+)'
    headings = {}
    current_heading = ''
    current_content = []
    for line in markdown_string.splitlines():
        if heading_match := re.match(headings_regex, line):
            if current_heading:
                headings[current_heading] = current_content
                current_content = []
            current_heading = heading_match['heading']
        elif line.strip():
            current_content.append(line.strip())
    if current_heading:
        headings[current_heading] = current_content
    return headings

In [40]:
# compare runtime of original and new function
import timeit
docstring = pandoc.write(doc, options=["--wrap=none"], format="commonmark")
print(timeit.timeit("parse_markdown(docstring)", globals=globals(), number=100))
print(timeit.timeit("split_doc_by_header(doc)", globals=globals(), number=100))

# compare outputs of original and new function
out1 = parse_markdown(docstring)
out2 = split_doc_by_header(doc)
print(", ".join(out1.keys()))
print(", ".join(out2.keys()))

0.0023967979941517115
4.095397616038099
The Institute, History, Motivations, Next Steps, Krav, Rungaa, Key Figures
the-institute, history, motivations, next-steps, krav, rungaa, key-figures


TypeError: sequence item 0: expected str instance, list found