# Display Passim Sentence Pairs Results

This notebook displays the results from Passim's pairwise alignment output (`align.json`) and exports the aligned passage pairs, with their metadata, to a CSV file.

## Data Structure

Each record (one JSON object per line of a part file inside the `align.json` output directory) contains:
- **Core alignment fields**: `uid`, `uid2`, `begin`, `end`, `begin2`, `end2`, `s1`, `s2`, `matches`
- **Document metadata**: `id`, `id2`, `series`, `series2` (and other original fields)
- **Alignment text**: `s1` and `s2` show the aligned texts with `-` characters indicating gaps
- **Quality metric**: `matches` indicates the number of matching characters


In [94]:
import csv
import json
from html import escape

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display

# Set up plotting style
# Keep this order: style.use("default") resets matplotlib's style settings, so it
# must run before set_palette(), which then installs the "husl" colour cycle on top.
plt.style.use("default")
sns.set_palette("husl")

In [95]:
def load_alignments(file_path, max_records=None):
    """
    Load alignment records from a JSON Lines file (one JSON object per line).

    Blank lines are skipped silently. Lines that are not valid JSON are
    reported on stdout with their 1-based line number and then skipped.

    Args:
        file_path: Path to a JSON Lines file, e.g. one part file inside
            Passim's align.json output directory
        max_records: Maximum number of records to load. None loads all
            records; 0 loads none.

    Returns:
        List of alignment records, in file order
    """
    alignments = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            # Compare against None (not truthiness) so max_records=0 means
            # "load nothing", and count loaded records rather than lines read
            # so blank or malformed lines do not use up the limit.
            if max_records is not None and len(alignments) >= max_records:
                break

            stripped = line.strip()
            if not stripped:
                # An empty line is not a record and not worth an error message.
                continue

            try:
                alignments.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_number}: {e}")

    print(f"Loaded {len(alignments)} alignment records")
    return alignments

In [96]:
# Load the alignment data
align_file = (
    "passim_output/align.json/part-00000-d544c988-e282-48a2-b625-2f36e241ff2b-c000.json"
)
alignments = load_alignments(align_file)

# Preview the first record. Guarded because indexing alignments[0] on an empty
# result would raise IndexError and abort a "Run All" with an unhelpful message.
if alignments:
    first_record = alignments[0]
    print(f"\nFirst record keys: {list(first_record.keys())}")
    print("\nSample record structure:")
    for key, value in first_record.items():
        print(f"  {key}: {value}")
else:
    print(f"\nNo alignment records found in {align_file}")

Loaded 182 alignment records

First record keys: ['uid2', 'uid', 'gid', 'gid2', 'begin2', 'end2', 'begin', 'end', 'id', 'series', 'id2', 'series2', 's1', 's2', 'matches']

Sample record structure:
  uid2: 1153728304711461001
  uid: -4805259505436882714
  gid: -7154195190137206662
  gid2: 5507437211704151162
  begin2: 1826
  end2: 3217
  begin: 853247
  end: 854537
  id: 1896-97a
  series: dnz
  id2: page_303
  series2: mega
  s1:  eine kritische Geschichte der Technologi------e -noch---- nich----------t -existir--t, -un-------d--- be-------------merkt ------------we-------i----------t---------e-----r-------------: -----"Darwin hat das Interesse auf die Ge--schichte der natürlichen Technologie gelenkt, d. h. auf die Bildung der Pflanzen- und Thierorgane als Produktions¬ instrumente für das Leben der Pflanzen und Thiere. Verdient die Bildungsgeschichte der produktiven Organe des Gesellschaftsmenschen, der materiellen Basis jeder besonderen Gesell¬ schaftsorganisation, nicht die gleiche A

In [None]:
def _ungapped_excerpt(aligned_text, max_length):
    """Remove gap markers from an aligned string, truncate it, and HTML-escape it."""
    # NOTE(review): replace("-", "") removes every hyphen, so hyphens that belong
    # to the text itself are dropped together with the alignment gaps.
    text = aligned_text.replace("-", "")
    if len(text) > max_length:
        text = text[:max_length] + "..."
    # Escape after truncating so an entity such as "&amp;" is never cut in half.
    return escape(text)


def display_alignment(record, max_length=500):
    """
    Display a single alignment record as a side-by-side HTML card.

    Gap characters are removed from both aligned strings so the passages read
    as plain text. The second document of the record (id2 / series2 / s2) is
    shown on the left and the first (id / series / s1) on the right.

    All values taken from the record are HTML-escaped before being inserted
    into the markup, so characters such as "<", ">" or "&" in the text are
    shown literally instead of being interpreted as HTML.

    Args:
        record: Alignment record dictionary
        max_length: Maximum number of characters of each passage to display;
            longer passages are cut and suffixed with "..."
    """
    # Reverse series1 and series2 to match the order: mega --> dnz
    doc1_id = escape(str(record.get("id2", "Unknown")))  # show id2 (mega) first
    doc2_id = escape(str(record.get("id", "Unknown")))  # then id (dnz)
    series1 = escape(str(record.get("series2", "Unknown")))  # show series2 (mega) first
    series2 = escape(str(record.get("series", "Unknown")))  # then series (dnz)

    # Same swap for the passages: s2 (mega) on the left, s1 (dnz) on the right.
    original_1 = _ungapped_excerpt(record.get("s2", ""), max_length)
    original_2 = _ungapped_excerpt(record.get("s1", ""), max_length)

    # Create display with explicit background color for visibility
    card_markup = f"""
    <div style="background: #222; color: #fff; border: 1px solid #444; margin: 10px 0; padding: 15px; border-radius: 5px;">
        <h4 style="color: #e3e3e3;">Alignment: {doc1_id} ({series1}) ↔ {doc2_id} ({series2})</h4>

        <div style="display: flex; gap: 20px;">
            <div style="flex: 1;">
                <h5 style="color: #b5b5b5;">Document 1 ({doc1_id}):</h5>
                <div style="font-family: monospace; background: #fff; color: #111; padding: 10px; border-radius: 3px;">
                    {original_1}
                </div>
            </div>
            <div style="flex: 1;">
                <h5 style="color: #b5b5b5;">Document 2 ({doc2_id}):</h5>
                <div style="font-family: monospace; background: #fff; color: #111; padding: 10px; border-radius: 3px;">
                    {original_2}
                </div>
            </div>
        </div>

    </div>
    """

    display(HTML(card_markup))


print("Alignment display functions defined!")

Alignment display functions defined!


In [98]:
# Display only the first 5 pairs
display_count = 5

# Clamp to the number of loaded records so the header reports how many
# alignments are actually shown when fewer than display_count exist.
shown_count = min(display_count, len(alignments))

print(
    f"=== Displaying First {shown_count} of {len(alignments)} Alignment Records ===\n"
)
for alignment_number, record in enumerate(alignments[:shown_count], start=1):
    print(f"\n--- Alignment {alignment_number} ---")
    display_alignment(record, max_length=300)

=== Displaying First 5 of 182 Alignment Records ===


--- Alignment 1 ---



--- Alignment 2 ---



--- Alignment 3 ---



--- Alignment 4 ---



--- Alignment 5 ---


In [99]:
# Export sentence pairs and metadata to CSV
export_filename = "alignment_sentence_pairs.csv"
# Column order: second document (id2 / mega) first, then the first document
# (id / dnz), matching the order used when displaying alignments.
csv_fields = [
    "alignment_index",
    "id2",
    "series2",
    "begin2",
    "end2",
    "s2_no_gaps",
    "id",
    "series",
    "begin",
    "end",
    "s1_no_gaps",
    "matches",
]

with open(export_filename, mode="w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_fields)
    writer.writeheader()
    for alignment_index, record in enumerate(alignments, start=1):
        # Build the row directly instead of through notebook-level temporaries:
        # assigning to a variable named `id` here would shadow the builtin id()
        # for every cell executed afterwards.
        writer.writerow(
            {
                "alignment_index": alignment_index,
                "id2": record.get("id2", "Unknown"),
                "series2": record.get("series2", "Unknown"),
                "begin2": record.get("begin2", 0),
                "end2": record.get("end2", 0),
                # NOTE(review): removing "-" also drops hyphens that belong to
                # the text itself, not only alignment gaps.
                "s2_no_gaps": record.get("s2", "").replace("-", ""),
                "id": record.get("id", "Unknown"),
                "series": record.get("series", "Unknown"),
                "begin": record.get("begin", 0),
                "end": record.get("end", 0),
                "s1_no_gaps": record.get("s1", "").replace("-", ""),
                "matches": record.get("matches", 0),
            }
        )

print(f"\nExported all pairs and their metadata to: {export_filename}")


Exported all pairs and their metadata to: alignment_sentence_pairs.csv
