# Review Excerpt Overlap
By Laure Thompson and Rebecca Sutton Koeser

(c)2025 Trustees of Princeton University. Permission granted for non-commercial distribution online under the [Apache 2.0 License](https://github.com/Princeton-CDH/corppa/blob/feature/license-headers/LICENSE).

In [1]:
import pathlib

import polars as pl

from corppa.config import get_config
from corppa.poetry_detection.polars_utils import load_excerpts_df

In [2]:
# load local configuration options to get path to data
config_opts = get_config()

data_dir = pathlib.Path(config_opts["poem_dataset"]["data_dir"])
if not data_dir.exists() or not data_dir.is_dir():
    raise ValueError(f"Data directory {data_dir} not found. " + 
                     "\nCheck your configuration file, and remember to use an absolute path for the poem dataset data directory.")
else:
    print(f"Data will be loaded from {data_dir}")

# Create a dictionary of data files for lookup based on file base name without any extension
# so that excerpts data can be .csv or compressed .csv.gz 
data_paths = {data_file.stem.split('.', 1)[0]: data_file for data_file in data_dir.iterdir()}

Data will be loaded from /Users/lauret/cdh-dev/ppa/ppa-found-poems/data


In [3]:
%%time
# load the excerpts into a polars dataframe

excerpts_df = load_excerpts_df(data_paths["excerpts"]).lazy()

CPU times: user 3.04 s, sys: 225 ms, total: 3.27 s
Wall time: 3.17 s


## Gather some basic stastics

In [4]:
# Gather excerpt-level overlap sets
overlaps = (
    excerpts_df
    # Filter to excerpts in pages with mulitple excerpts
    .filter(pl.col("page_id").is_duplicated())
    .join_where(
        excerpts_df,
        # (1) Excerpts have same page ID
        pl.col("page_id") == pl.col("page_id_right"),
        # (2) Excerpts overlap
        ## Left's start index < Right's end index
        pl.col("ppa_span_start") < pl.col("ppa_span_end_right"),
        ## Right's start index < Left's end index
        pl.col("ppa_span_start_right") < pl.col("ppa_span_end"),
        ## Distinct excerpts (excludes self-matches)
        (pl.col("excerpt_id") != pl.col("excerpt_id_right")) |
         (pl.col("poem_id") != pl.col("poem_id_right")),
    )
    # Group by excerpt
    .group_by("page_id", "excerpt_id", "poem_id")
    .agg(
        # Aggregate-useful excerpt info
        pl.col("ppa_span_start").first(),
        pl.col("ppa_span_end").first(),
        # pl.col("poem_title").first(),
        # pl.col("poem_author").first(),
        # Overlapping excerpt info
        pl.len().alias("n_excerpts"),
        pl.col("poem_id_right").n_unique().alias("n_poems"),
        pl.col("excerpt_id_right"),
        pl.col("poem_id_right"),
        # pl.col("poem_title_right")
    )
    .collect()
    # Sort by page and span indices
    .sort("page_id", "ppa_span_start", "ppa_span_end")
)

overlaps

page_id,excerpt_id,poem_id,ppa_span_start,ppa_span_end,n_excerpts,n_poems,excerpt_id_right,poem_id_right
str,str,str,i64,i64,u32,u32,list[str],list[str]
"""A01224.100""","""p@173:747""","""Virgil_Aeneid""",173,747,2,2,"[""p@179:556"", ""p@179:533""]","[""Z300534278"", ""Z200263304""]"
"""A01224.100""","""p@179:533""","""Z200263304""",179,533,2,2,"[""p@173:747"", ""p@179:556""]","[""Virgil_Aeneid"", ""Z300534278""]"
"""A01224.100""","""p@179:556""","""Z300534278""",179,556,2,2,"[""p@173:747"", ""p@179:533""]","[""Virgil_Aeneid"", ""Z200263304""]"
"""A01224.101""","""p@707:1188""","""Z300486026""",707,1188,2,2,"[""p@707:1188"", ""p@707:1188""]","[""Z200438244"", ""Z400485836""]"
"""A01224.101""","""p@707:1188""","""Z200438244""",707,1188,2,2,"[""p@707:1188"", ""p@707:1188""]","[""Z400485836"", ""Z300486026""]"
…,…,…,…,…,…,…,…,…
"""yale.39002032008188.00000292""","""p@2215:2260""","""Z200409938""",2215,2260,1,1,"[""p@2151:2258""]","[""William-Shakespeare_As-You-Like-It""]"
"""yale.39002088447587.00000050""","""p@330:806""","""Z200177398""",330,806,3,3,"[""p@628:777"", ""p@665:702"", ""p@627:723""]","[""Z300380365"", ""Z200475115"", ""Z200449615""]"
"""yale.39002088447587.00000050""","""p@627:723""","""Z200449615""",627,723,3,3,"[""p@628:777"", ""p@665:702"", ""p@330:806""]","[""Z300380365"", ""Z200475115"", ""Z200177398""]"
"""yale.39002088447587.00000050""","""p@628:777""","""Z300380365""",628,777,3,3,"[""p@665:702"", ""p@627:723"", ""p@330:806""]","[""Z200475115"", ""Z200449615"", ""Z200177398""]"


In [5]:
# Total count
n_excerpts = excerpts_df.collect().height

# Excerpts that overlap with an excerpt with the same poem ID
same_poem_count = overlaps.filter(pl.col("poem_id").is_in(pl.col("poem_id_right"))).height

# Excerpts that overlap with an excerpt with a different poem ID
diff_poem_count = overlaps.filter(
    # Does not contain poem_id
    (~pl.col("poem_id").is_in(pl.col("poem_id_right"))) |
    # Or, contains more than one poem_id
    (pl.col("n_poems") > 1)
).height


print(f"Total Excerpts: {n_excerpts:,}\n")
print(f"Excerpts with Overlap: {overlaps.height:,}({overlaps.height/n_excerpts*100:.1f}%)")
print(f"............same poem: {same_poem_count:,}({same_poem_count/n_excerpts*100:.1f})%)")
print(f".......different poem: {diff_poem_count:,}({diff_poem_count/n_excerpts*100:.1f})%)")

Total Excerpts: 1,478,114

Excerpts with Overlap: 955,570(64.6%)
............same poem: 123,257(8.3)%)
.......different poem: 892,944(60.4)%)


## Filtering

In [6]:
# Filter to excerpts that have an overlap with an excerpt of the same poem ID
overlaps.filter(pl.col("poem_id").is_in(pl.col("poem_id_right")))

page_id,excerpt_id,poem_id,ppa_span_start,ppa_span_end,n_excerpts,n_poems,excerpt_id_right,poem_id_right
str,str,str,i64,i64,u32,u32,list[str],list[str]
"""A01224.12""","""p@730:885""","""Virgil_Aeneid""",730,885,1,1,"[""p@879:962""]","[""Virgil_Aeneid""]"
"""A01224.12""","""p@879:962""","""Virgil_Aeneid""",879,962,1,1,"[""p@730:885""]","[""Virgil_Aeneid""]"
"""A01224.144""","""p@904:967""","""Virgil_Aeneid""",904,967,1,1,"[""p@959:1058""]","[""Virgil_Aeneid""]"
"""A01224.144""","""p@959:1058""","""Virgil_Aeneid""",959,1058,1,1,"[""p@904:967""]","[""Virgil_Aeneid""]"
"""A01224.155""","""p@318:417""","""Virgil_Aeneid""",318,417,1,1,"[""p@415:499""]","[""Virgil_Aeneid""]"
…,…,…,…,…,…,…,…,…
"""yale.39002032008188.00000036""","""p@598:1066""","""King-James-Bible_Psalms""",598,1066,1,1,"[""p@1034:1319""]","[""King-James-Bible_Psalms""]"
"""yale.39002032008188.00000036""","""p@1034:1319""","""King-James-Bible_Psalms""",1034,1319,1,1,"[""p@598:1066""]","[""King-James-Bible_Psalms""]"
"""yale.39002032008188.00000290""","""p@28:257""","""King-James-Bible_Psalms""",28,257,1,1,"[""p@255:587""]","[""King-James-Bible_Psalms""]"
"""yale.39002032008188.00000290""","""p@255:587""","""King-James-Bible_Psalms""",255,587,2,1,"[""p@28:257"", ""p@583:906""]","[""King-James-Bible_Psalms"", ""King-James-Bible_Psalms""]"


In [7]:
# Filter to excerpts that have an overlap with an excerpt of a different poem ID
overlaps.filter((pl.col("n_poems") > 1 ) | (~pl.col("poem_id").is_in(pl.col("poem_id"))))

page_id,excerpt_id,poem_id,ppa_span_start,ppa_span_end,n_excerpts,n_poems,excerpt_id_right,poem_id_right
str,str,str,i64,i64,u32,u32,list[str],list[str]
"""A01224.100""","""p@173:747""","""Virgil_Aeneid""",173,747,2,2,"[""p@179:556"", ""p@179:533""]","[""Z300534278"", ""Z200263304""]"
"""A01224.100""","""p@179:533""","""Z200263304""",179,533,2,2,"[""p@173:747"", ""p@179:556""]","[""Virgil_Aeneid"", ""Z300534278""]"
"""A01224.100""","""p@179:556""","""Z300534278""",179,556,2,2,"[""p@173:747"", ""p@179:533""]","[""Virgil_Aeneid"", ""Z200263304""]"
"""A01224.101""","""p@707:1188""","""Z300486026""",707,1188,2,2,"[""p@707:1188"", ""p@707:1188""]","[""Z200438244"", ""Z400485836""]"
"""A01224.101""","""p@707:1188""","""Z200438244""",707,1188,2,2,"[""p@707:1188"", ""p@707:1188""]","[""Z400485836"", ""Z300486026""]"
…,…,…,…,…,…,…,…,…
"""yale.39002032008188.00000289""","""p@143:377""","""Z300526946""",143,377,3,3,"[""p@129:413"", ""p@51:620"", ""p@51:643""]","[""Z300522344"", ""Z200478688"", ""King-James-Bible_Psalms""]"
"""yale.39002088447587.00000050""","""p@330:806""","""Z200177398""",330,806,3,3,"[""p@628:777"", ""p@665:702"", ""p@627:723""]","[""Z300380365"", ""Z200475115"", ""Z200449615""]"
"""yale.39002088447587.00000050""","""p@627:723""","""Z200449615""",627,723,3,3,"[""p@628:777"", ""p@665:702"", ""p@330:806""]","[""Z300380365"", ""Z200475115"", ""Z200177398""]"
"""yale.39002088447587.00000050""","""p@628:777""","""Z300380365""",628,777,3,3,"[""p@665:702"", ""p@627:723"", ""p@330:806""]","[""Z200475115"", ""Z200449615"", ""Z200177398""]"


## Examples

In [8]:
def get_overlap_set(page_id, excerpt_id):
    """
    Builds a DataFrame of an excerpt and all of its overlaps where the first
    row correpsonds to the queried page excerpt.
    """
    filtered_df = overlaps.filter((pl.col("page_id") == page_id) & (pl.col("excerpt_id") == excerpt_id))
    if filtered_df.height == 0:
        print(f"Excerpt with page_id {page_id} and excerpt_id {excerpt_id} does not exist in overlaps")
        return
    row = filtered_df.row(0, named=True)
    overlap_struct = {
        "page_id": row["page_id"],
        "excerpt_id": [row["excerpt_id"]] + row["excerpt_id_right"],
        "poem_id": [row["poem_id"]]+ row["poem_id_right"],
    }
    return pl.DataFrame(overlap_struct)

### Excerpts that only overlap with excerpts with different poem identifications

In [9]:
page_id = "A01224.100"
excerpt_id = "p@173:747"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""A01224.100""","""p@173:747""","""Virgil_Aeneid"""
"""A01224.100""","""p@179:556""","""Z300534278"""
"""A01224.100""","""p@179:533""","""Z200263304"""


In [10]:
page_id = "A66698.156"
excerpt_id = "p@946:1486"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""A66698.156""","""p@946:1486""","""William-Shakespeare_Venus-and-…"
"""A66698.156""","""p@646:1486""","""Z300440514"""
"""A66698.156""","""p@946:1486""","""Z200482662"""


### Excerpts that only overlap with excerpts identified with the same poem

In [11]:
page_id = "A32749.162"
excerpt_id = "p@3378:4969"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""A32749.162""","""p@3378:4969""","""Geoffrey-Chaucer_The-Canterbur…"
"""A32749.162""","""p@180:3474""","""Geoffrey-Chaucer_The-Canterbur…"
"""A32749.162""","""p@4833:5867""","""Geoffrey-Chaucer_The-Canterbur…"


In [12]:
page_id = "uc1.$b31635.00000668"
excerpt_id = "a@39:772"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""uc1.$b31635.00000668""","""a@39:772""","""Z300682318"""
"""uc1.$b31635.00000668""","""p@39:282""","""Z300682318"""
"""uc1.$b31635.00000668""","""p@288:1034""","""Z300682318"""


### Excerpts that overlap with excerpts with a mix of poem identifications

In [13]:
page_id = "A12226.73"
excerpt_id = "p@1:857"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""A12226.73""","""p@1:857""","""Z200438289"""
"""A12226.73""","""p@855:987""","""Z200438289"""
"""A12226.73""","""p@1:987""","""Z200490769"""
"""A12226.73""","""p@1:987""","""Z300485933"""


In [14]:
page_id = "CW0111540239.0092"
excerpt_id = "a@379:592"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""CW0111540239.0092""","""a@379:592""","""John-Milton_Paradise-Lost"""
"""CW0111540239.0092""","""p@114:562""","""Z300444150"""
"""CW0111540239.0092""","""p@379:596""","""John-Milton_Paradise-Lost"""
"""CW0111540239.0092""","""p@379:596""","""Z200437755"""


In [15]:
page_id = "wu.89099904617.00000063"
excerpt_id = "a@323:371"
get_overlap_set(page_id, excerpt_id)

page_id,excerpt_id,poem_id
str,str,str
"""wu.89099904617.00000063""","""a@323:371""","""William-Shakespeare_The-Tempes…"
"""wu.89099904617.00000063""","""p@174:512""","""William-Shakespeare_The-Tempes…"
"""wu.89099904617.00000063""","""p@115:605""","""Z300441190"""
"""wu.89099904617.00000063""","""p@115:605""","""Z300431123"""
