In [None]:
import marimo as mo
import polars as pl

from remarx.sandbox_utils import submit_prompt, get_text_response
from remarx.notebook_utils import highlight_bracketed_text

# Test quotation detection prompts

This notebook tests the quotation detection prompt templates; it is inspired by the title mentions notebook.

## Load quotation subset data

Choose 2-3 pages to test with; at least one should have two quotes.

In [None]:
quotes_df = pl.read_csv("data/subset/direct_quotes.csv")

# Count annotated quotes per page. group_by does not guarantee the order of
# its output rows, so sort explicitly (most quotes first, ties broken by page
# index): the preview is then reproducible across runs and pages with more
# than one quote appear at the top.
(
    quotes_df.group_by("page_index")
    .agg(pl.col("UUID").count())
    .sort(["UUID", "page_index"], descending=[True, False])
    .head()
)

Page index 661 has two quotes; page index 256 has one.

In [None]:
# Keep only the quotes on the two chosen pages, ordered by where each
# quote starts in the text.
test_quotes = (
    quotes_df
    .filter(pl.col("page_index").is_in([661, 256]))
    .sort("start_index")
)

test_quotes

In [None]:
# just three rows, so get as a list of dict
rows = list(test_quotes.iter_rows(named=True))

# Later cells address the quotes by position, so fail loudly if the subset
# data ever changes: rows[0] must be the single quote on page 256 and
# rows[1], rows[2] the two quotes on page 661 (in start_index order).
assert [row["page_index"] for row in rows] == [256, 661, 661], (
    "unexpected test rows; positional indexing into `rows` would be wrong"
)

# grab the page text content
page_text_i256 = rows[0]["page_text"]

page_text_i661 = rows[1]["page_text"]

Highlight annotation for page 256.

The annotation highlights a long, multiline passage at the end of the page.

In [None]:
def highlight_text_span(text, start_index, end_index):
    """Return ``text`` with the slice ``[start_index:end_index]`` wrapped in ``<mark>`` tags.

    The text before and after the slice is left untouched; indices follow
    normal Python slicing rules.
    """
    prefix, marked, suffix = (
        text[:start_index],
        text[start_index:end_index],
        text[end_index:],
    )
    return "{}<mark>{}</mark>{}".format(prefix, marked, suffix)

In [None]:
# Annotation offsets are measured from the beginning of the file, so
# subtract the page's starting offset to get positions within the page text.
page_i256_start = rows[0]["page_start"]
_span_start = rows[0]["start_index"] - page_i256_start
_span_end = rows[0]["end_index"] - page_i256_start

mo.md(highlight_text_span(page_text_i256, _span_start, _span_end))

Highlight annotations for page 661, which has two quotes. One quote comes directly after the other.

In [None]:
page_i661_start = rows[1]["page_start"]

# This page has two highlights (rows[1] and rows[2]). Mark the later one
# first: tags inserted after the earlier span do not shift its offsets, so
# the earlier span's indices are still valid on the marked-up text.
_later_marked = highlight_text_span(
    page_text_i661,
    rows[2]["start_index"] - page_i661_start,
    rows[2]["end_index"] - page_i661_start,
)

mo.md(
    highlight_text_span(
        _later_marked,
        rows[1]["start_index"] - page_i661_start,
        rows[1]["end_index"] - page_i661_start,
    )
)

## Basic Prompt (zero-shot)

In [None]:
# Load the zero-shot prompt template and display it as markdown.
# The encoding is passed explicitly so the prompt text is decoded the same
# way on every platform instead of depending on the locale default.
with open("prompts/quotations/basic.txt", encoding="utf-8") as f0:
    basic_prompt = f0.read()
mo.md(basic_prompt)

In [None]:
# Submit the zero-shot prompt for each sample page (no model is specified,
# so the default is used) and collect the responses in page order:
# page 256 first, then page 661.
basic_responses = []
for sample_page in (page_text_i256, page_text_i661):
    basic_response = submit_prompt(
        user_prompt=sample_page,
        task_prompt=basic_prompt,
    )
    basic_responses += [basic_response]

In [None]:
# Display the raw responses returned by submit_prompt for inspection.
basic_responses

In [None]:
# Zero-shot response for page 256 (first item submitted).
# NOTE(review): presumably highlights the spans the response wraps in
# brackets; confirm against remarx.notebook_utils.
highlight_bracketed_text(get_text_response(basic_responses[0]))

The response overlaps substantially with the annotated passage, but it starts about a sentence later and does not include the citation.

In [None]:
# Zero-shot response for page 661 (second item submitted).
# NOTE(review): presumably highlights the spans the response wraps in
# brackets; confirm against remarx.notebook_utils.
highlight_bracketed_text(get_text_response(basic_responses[1]))

Matches the annotated text. Because of the way we're highlighting, it isn't clear whether the response identifies this as one long quotation or as two separate quotations (as in the annotated data, and as the quotation marks in the text indicate).

## One-shot

In [None]:
# Load the one-shot prompt template and display it as markdown.
# The encoding is passed explicitly so the prompt text is decoded the same
# way on every platform instead of depending on the locale default.
with open("prompts/quotations/one_shot.txt", encoding="utf-8") as f1:
    one_shot_prompt = f1.read()
mo.md(one_shot_prompt)

In [None]:
# Submit the one-shot prompt for each sample page and collect the responses
# in page order: page 256 first, then page 661.
one_shot_responses = []
for _sample_page in (page_text_i256, page_text_i661):
    one_shot_response = submit_prompt(
        user_prompt=_sample_page,
        task_prompt=one_shot_prompt,
    )
    one_shot_responses += [one_shot_response]

In [None]:
# One-shot response for page 256 (first item submitted).
# NOTE(review): presumably highlights the spans the response wraps in
# brackets; confirm against remarx.notebook_utils.
highlight_bracketed_text(get_text_response(one_shot_responses[0]))

In [None]:
# One-shot response for page 661 (second item submitted).
# NOTE(review): presumably highlights the spans the response wraps in
# brackets; confirm against remarx.notebook_utils.
highlight_bracketed_text(get_text_response(one_shot_responses[1]))

The results from the one-shot prompt are exactly the same as the zero-shot results.