# Identify subset of examples for direct quotes

## Load & filter annotation data

**Goal:** Find a subset of pages with direct quotes to be used for testing and experimentation.

In [None]:
import marimo as mo
import polars as pl

In [None]:
# columns from the annotation export that are used in this notebook
annotation_columns = [
    "UUID",
    "FILE",
    "QUOTE_TRANSCRIPTION",
    "ANCHOR",
    "TAGS",
    "COMMENTS",
]
# ANCHOR holds the char-offset; skip its first 12 characters and parse the
# remainder as the integer start index of the quote
quote_start = pl.col("ANCHOR").str.slice(12).cast(int)
# the quote ends at its start index plus the length of the transcription
quote_length = pl.col("QUOTE_TRANSCRIPTION").str.len_chars()

# read every annotation CSV, keep the needed columns, add numeric start/end
# indices, and drop the raw anchor once it has been parsed
df = (
    pl.read_csv("data/annotation/*.csv")
    .select(annotation_columns)
    .with_columns(
        start_index=quote_start,
        end_index=quote_start + quote_length,
    )
    .drop("ANCHOR")
)
df

Filter to the subset of annotations that have direct references to the texts that are in scope for this phase.

In [None]:
# keep only annotations whose tag is exactly one of the texts of interest;
# qualified tags such as "|Title Reference" or "|Concept Reference" do not
# match the exact tag values and are therefore excluded
direct_quote_tags = ["Kapital", "Manifest der Kommunistischen Partei"]
has_direct_quote_tag = pl.col("TAGS").is_in(direct_quote_tags)
quotes_df = df.filter(has_direct_quote_tag)
quotes_df

In [None]:
# count direct quotes per file and list which texts they were tagged with.
# group_by does not guarantee row order and unique does not guarantee value
# order, so sort both: the file with the most quotes is always the first row
# and the joined tag list reads the same on every run.
quotes_df.group_by("FILE").agg(
    pl.col("TAGS").count().alias("count"),
    pl.col("TAGS").unique().sort().str.join("|"),
).sort(["count", "FILE"], descending=[True, False])

`1896-97aCLEAN.txt` has the most direct quotes and has some from both Marx texts; let's use that one.

In [None]:
# restrict to the single file chosen above: it has the most direct quotes
# and includes quotes from both texts
selected_file = "1896-97aCLEAN.txt"
quote_subset_df = quotes_df.filter(pl.col("FILE") == selected_file)
quote_subset_df

## Load article text and chunk roughly into pages

Load the text contents of the file for the annotations we want to use and make sure we can match pages of text to the annotated text.

In [None]:
# load text file and chunk into pages

# read explicitly as UTF-8 so the non-ASCII characters in the German text are
# decoded the same way on every platform instead of using the locale default
# (assumes the file is UTF-8 encoded)
with open("data/text/1896-97aCLEAN.txt", encoding="utf-8") as inputfile:
    text = inputfile.read()

# three consecutive newlines are treated as the page separator
pages = text.split("\n\n\n")

# one row per page, with a row index to identify the page
page_df = pl.DataFrame(data={"text": pages}).with_row_index()
# add field to calculate text length (in characters, matching string slicing)
page_df = page_df.with_columns(text_length=pl.col("text").str.len_chars())
page_df

In [None]:
# work out where each page begins within the full article text: the summed
# length of all preceding pages plus the separator characters that the split
# removed between them
separator_length = 3  # number of characters in the "\n\n\n" page separator
page_start_indices = []
current_index = 0  # the first page starts at the beginning of the text
for page_length in page_df["text_length"].to_list():
    # record where this page starts
    page_start_indices.append(current_index)
    # advance past this page and the separator that followed it
    current_index += page_length + separator_length

In [None]:
# add page start index to the dataframe of page text and calculate end of page.
# with_columns returns a new dataframe and appends the column at the end;
# insert_column would modify page_df in place, so re-running this cell would
# fail on the duplicate column and depend on a hard-coded column position.
page_df_start = page_df.with_columns(
    pl.Series("page_start", page_start_indices)
).with_columns(page_end=pl.col("page_start").add(pl.col("text_length")))
page_df_start

Spot check alignment of page text and annotation text.

In [None]:
# check text index matching
# take the row from the single-file subset: its offsets refer to the article
# text loaded above, whereas quotes_df also contains annotations for other files
first_row = quote_subset_df.row(0, named=True)
first_row

In [None]:
# cut the annotated span out of the full article text using its offsets
quote_span = slice(first_row["start_index"], first_row["end_index"])
text_substring = text[quote_span]
print(text_substring)

„die Arbeiter¬
klasse nicht die fertige Staatsmaschine einfach in Besitz nehmen und für ihre
eigenen Zwecke in Bewegung setzen kann“


In [None]:
# show the annotation's own transcription of the quote for comparison
annotated_quote = first_row["QUOTE_TRANSCRIPTION"]
print(annotated_quote)

„die Arbeiter¬ klasse nicht die fertige Staatsmaschine einfach in Besitz nehmen und für ihre eigenen Zwecke in Bewegung setzen kann“


In [None]:
# compare the two after turning each line break in the article text into a space
" ".join(text_substring.split("\n")) == first_row["QUOTE_TRANSCRIPTION"]

The text in the exported annotation does not include newlines, but now that text length and start/end indices are
calculated correctly, the annotation text and the article text match (apart from the newlines).

## Combine page text with selected annotations

In [None]:
# page columns are renamed so they are clearly distinguishable from the
# annotation columns after the join
pages_for_join = page_df_start.rename(
    {"text": "page_text", "index": "page_index"}
)
# a quote is matched to the page whose start/end range contains the quote's
# start index
starts_at_or_after_page_start = pl.col("start_index").ge(pl.col("page_start"))
starts_at_or_before_page_end = pl.col("start_index").le(pl.col("page_end"))

quote_subset_pages = quote_subset_df.join_where(
    pages_for_join,
    starts_at_or_after_page_start,
    starts_at_or_before_page_end,
)
quote_subset_pages

Check alignment of text between quotes and page text.

In [None]:
# print every quote's transcription next to the matching slice of its page
# text so the two can be compared by eye

for row in quote_subset_pages.iter_rows(named=True):
    page_text = row["page_text"]
    # translate article-level offsets into offsets within this page
    offset_in_page = row["start_index"] - row["page_start"]
    end_in_page = row["end_index"] - row["page_start"]

    print(row["QUOTE_TRANSCRIPTION"])
    print(f"{row['TAGS']} (article {row['start_index']}:{row['end_index']} / page {offset_in_page}:{end_in_page}) ")
    # flag quotes whose end index is beyond the end of the page text
    if len(page_text) < end_in_page:
        print("*** quote end index is larger than page content")
    print(page_text[offset_in_page:end_in_page])
    print("\n")

Zur Lösung dieses Widerspruchs" fährt er fort, „bedarf es noch vieler Mittelglieder.“ Er versprach, diese Lösung später zu geben.
Kapital (article 2262872:2263001 / page 2802:2931) 
Zur Lösung
dieses Widerspruchs" fährt er fort, „bedarf es noch vieler Mittelglieder.“ Er
versprach, diese Lösung später zu geben.


daß dies Gesetz offenbar, aller auf den Augenschein gegründeten Erfahrung widerspricht". „
Kapital (article 2262782:2262872 / page 2712:2802) 
daß dies Gesetz offenbar,
aller auf den Augenschein gegründeten Erfahrung widerspricht". „


1 Vergl. „Kapital“, Bd. I, S. 329, Aum. 75, und Roscher, „System der Volkswirth¬ schaft, 1. Auflage (1854½), S. 359, 360, 361.
Kapital (article 1843563:1843689 / page 3696:3822) 
1 Vergl. „Kapital“, Bd. I, S. 329, Aum. 75, und Roscher, „System der Volkswirth¬
schaft, 1. Auflage (1854½), S. 359, 360, 361.


Marx hat ferner die Ausführung von Roscher total auf den Kopf gestellt¬ und endlich den Anschein erweckt, als ob sein Werthbegriff etwas wefen

Some pages have more than one quote, but in this set the quotes on the same page are from the same Marx text.

In [None]:
# which pages have more than one quote?
quote_subset_pages.group_by("page_index").agg(
    pl.col("TAGS").count().alias("count"),
    pl.col("TAGS").unique().str.join("|"),
).filter(pl.col("count").gt(1))

See the quotes in the context of the page. Use the slider to move between different quotes.

In [None]:
# slider over row positions in quote_subset_pages (zero-based, so the last
# selectable value is one less than the number of rows)
last_quote_position = quote_subset_pages.height - 1
quote_slider = mo.ui.slider(
    start=0, stop=last_quote_position, step=1, label="Quote from subset"
)
quote_slider

In [None]:
def show_page(quote):
    """Render one quote in the context of its page as markdown.

    Args:
        quote: a named row (dict) from ``quote_subset_pages``; reads the keys
            ``TAGS``, ``page_index``, ``page_text``, ``page_start``,
            ``start_index`` and ``end_index``.

    Returns:
        The result of ``mo.md`` for a short header (tag, page index, article
        and page offsets) followed by the page text with the quote in bold.
    """
    # offsets of the quote relative to the start of the page text
    page_start_index = quote["start_index"] - quote["page_start"]
    page_end_index = quote["end_index"] - quote["page_start"]
    page_text = quote["page_text"]

    def escape_asterisks(segment):
        # at least one page includes an asterisk; escape so we don't get
        # unintentional italics
        return segment.replace("*", r"\*")

    before_quote = escape_asterisks(page_text[:page_start_index])
    quote_text = escape_asterisks(page_text[page_start_index:page_end_index])
    after_quote = escape_asterisks(page_text[page_end_index:])

    # "article" reports the offsets within the whole article and "page" the
    # offsets within this page
    header = (
        f'Tag: {quote["TAGS"]}<br/>\n'
        f'Page index: {quote["page_index"]} '
        f'(article: {quote["start_index"]}:{quote["end_index"]} '
        f"page: {page_start_index}:{page_end_index})"
    )
    # assemble the markdown without source-code indentation: the page text is
    # multi-line and unindented, so an indented template could not be
    # uniformly dedented once the text is interpolated into it
    return mo.md(f"{header}\n\n{before_quote}**{quote_text}**{after_quote}")


show_page(quote_subset_pages.row(quote_slider.value, named=True))

Save the identified subset of quotes and associated page text for use in other experiments.

In [None]:
# display the joined quotes and page text one more time before saving
quote_subset_pages

In [None]:
# write the quotes with their page text to CSV, including a byte-order mark
subset_output_path = "data/subset/direct_quotes.csv"
quote_subset_pages.write_csv(subset_output_path, include_bom=True)