# Passim Text Reuse Analysis Results

This notebook displays and analyzes text reuse patterns discovered by Passim between the MEGA and DNZ text corpora.


In [3]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

## 1. Load Passim Results


In [4]:
# Location of the Passim output (one JSON object per line)
passim_results_file = Path(
    "passim_output/out.json/part-00000-e37aa9d7-c47b-485e-85ff-aee83b0233a1-c000.json"
)

# Parse every non-blank line into a dict; whitespace-only lines are skipped
with passim_results_file.open("r", encoding="utf-8") as handle:
    results = [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

print(f"Total text reuse instances found: {len(results)}")

Total text reuse instances found: 226


## 2. Basic Statistics


In [8]:
# Distinct cluster ids and the character length of every fragment's text
# (a fragment without a "text" key counts as length 0)
clusters = {fragment.get("cluster") for fragment in results}
text_lengths = [len(fragment.get("text", "")) for fragment in results]

# Map each cluster id to the list of fragments that belong to it
clusters_dict = defaultdict(list)
for fragment in results:
    clusters_dict[fragment.get("cluster")].append(fragment)

banner = "=" * 60
print(banner)
print("PASSIM TEXT REUSE ANALYSIS SUMMARY")
print(banner)
print(f"Total text fragments: {len(results)}")
print(f"Total clusters: {len(clusters)}")
if not text_lengths:
    print("No text lengths available.")
else:
    mean_length = sum(text_lengths) / len(text_lengths)
    print(f"Average text length: {mean_length:.1f} characters")
    print(f"Min/Max text length: {min(text_lengths)}/{max(text_lengths)} characters")
print(banner)

PASSIM TEXT REUSE ANALYSIS SUMMARY
Total text fragments: 226
Total clusters: 112
Average text length: 445.9 characters
Min/Max text length: 67/2094 characters


In [10]:
# Create cluster summary table
def create_cluster_summary(clusters_by_id=None):
    """Create summary statistics for all clusters.

    Args:
        clusters_by_id: Mapping of cluster id -> list of fragment dicts. Only
            the "series" and "text" keys of each fragment are read. When
            omitted, the notebook-level ``clusters_dict`` is used.

    Returns:
        pandas.DataFrame with one row per cluster and the columns
        "Cluster ID", "Total Fragments", "MEGA", "DNZ", "Avg Length" (a
        string rounded to whole characters) and "Type", ordered by
        "Total Fragments" descending. The columns are present even when
        there are no clusters, so column lookups on the result never fail.
    """
    if clusters_by_id is None:
        clusters_by_id = clusters_dict

    columns = ["Cluster ID", "Total Fragments", "MEGA", "DNZ", "Avg Length", "Type"]
    cluster_stats = []

    for cluster_id, cluster_data in clusters_by_id.items():
        total_count = len(cluster_data)
        mega_count = sum(1 for r in cluster_data if r.get("series") == "mega")
        dnz_count = sum(1 for r in cluster_data if r.get("series") == "dnz")

        # A fragment with a missing/empty text counts as length 0, matching
        # how the basic-statistics cell measures text lengths.
        total_chars = sum(len(r.get("text") or "") for r in cluster_data)
        avg_length = total_chars / total_count if total_count > 0 else 0

        # Cross-corpus means at least one fragment from each series.
        cluster_type = (
            "Cross-corpus" if mega_count > 0 and dnz_count > 0 else "Single-corpus"
        )

        cluster_stats.append(
            {
                "Cluster ID": cluster_id,
                "Total Fragments": total_count,
                "MEGA": mega_count,
                "DNZ": dnz_count,
                "Avg Length": f"{avg_length:.0f}",
                "Type": cluster_type,
            }
        )

    # Sort by total fragments (descending); the sort is stable, so ties keep
    # the mapping's iteration order.
    cluster_stats.sort(key=lambda x: x["Total Fragments"], reverse=True)

    # Explicit columns keep the schema intact for an empty result.
    return pd.DataFrame(cluster_stats, columns=columns)


# Build the per-cluster table and print its first 15 rows
cluster_df = create_cluster_summary()
print("\nTop 15 Clusters by Fragment Count:")
top_clusters_text = cluster_df.head(15).to_string(index=False)
print(top_clusters_text)

# Keep only the clusters whose "Type" is "Cross-corpus"
is_cross_corpus = cluster_df["Type"] == "Cross-corpus"
cross_corpus_df = cluster_df[is_cross_corpus]
cross_corpus_count = len(cross_corpus_df)
print(f"\nCross-corpus Clusters ({cross_corpus_count} total):")
print(cross_corpus_df.head(10).to_string(index=False))


Top 15 Clusters by Fragment Count:
 Cluster ID  Total Fragments  MEGA  DNZ Avg Length         Type
          5                3     2    1        726 Cross-corpus
17179869186                3     2    1        166 Cross-corpus
          0                2     1    1        306 Cross-corpus
          1                2     1    1       2086 Cross-corpus
          2                2     1    1       1063 Cross-corpus
          3                2     1    1        269 Cross-corpus
          4                2     1    1        624 Cross-corpus
          6                2     1    1        748 Cross-corpus
          7                2     1    1        225 Cross-corpus
          8                2     1    1        634 Cross-corpus
          9                2     1    1        164 Cross-corpus
         10                2     1    1        195 Cross-corpus
         11                2     1    1        248 Cross-corpus
         13                2     1    1        226 Cross-corpus
    

## 4. Show Clusters

In [None]:
def display_cluster_comparison(cluster_id, max_chars=300):
    """Print all fragments of one cluster with per-series counts.

    Looks the cluster up in the notebook-level ``clusters_dict``. If it is
    absent, a "not found" message is printed and nothing else happens.

    Args:
        cluster_id: Key into ``clusters_dict``.
        max_chars: Texts longer than this are cut to this many characters
            and suffixed with "..." for display.

    Each fragment must provide "id", "text", "uid", "begin" and "end";
    "series" and "src" are optional. Entries of "src" must provide "uid",
    "begin" and "end".
    """
    if cluster_id not in clusters_dict:
        print(f"Cluster {cluster_id} not found")
        return

    fragments = clusters_dict[cluster_id]
    mega_total = sum(1 for frag in fragments if frag.get("series") == "mega")
    dnz_total = sum(1 for frag in fragments if frag.get("series") == "dnz")
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Header block
    print(f"\n{heavy_rule}")
    print(f"CLUSTER {cluster_id} - TEXT REUSE COMPARISON")
    print(f"MEGA texts: {mega_total} | DNZ texts: {dnz_total}")
    print(heavy_rule)

    # One numbered entry per fragment, in stored order
    for number, frag in enumerate(fragments, start=1):
        corpus = frag.get("series", "").upper()
        doc_id = frag["id"]
        body = frag["text"]
        uid = frag["uid"]
        span = f"{frag['begin']}-{frag['end']}"

        # Shorten long texts for display
        if len(body) > max_chars:
            shown = body[:max_chars] + "..."
        else:
            shown = body

        print(f"\n{number}. [{corpus}] Document: {doc_id}")
        print(f"   UID: {uid} | Position: {span}")
        print(f'   Text: "{shown}"')

        # List source references when the fragment carries a non-empty "src"
        references = frag.get("src")
        if references:
            print("   References:")
            for ref in references:
                print(f"     → UID {ref['uid']} (pos: {ref['begin']}-{ref['end']})")
        print(light_rule)


# Display the most interesting cross-corpus clusters
interesting_clusters = cross_corpus_df.head(5)["Cluster ID"].tolist()
for cluster_id in interesting_clusters:
    display_cluster_comparison(cluster_id)


CLUSTER 5 - TEXT REUSE COMPARISON
MEGA texts: 2 | DNZ texts: 1

1. [DNZ] Document: 1901-02b
   UID: -5394965545327344677 | Position: 1364475-1365402
   Text: "Negation stellt dann das individuelle Eigenthum wieder her, aber auf Grundlage der Errungenschaft der kapilalistischen Aera, nämlich der Cooperation freier Arbeiter und ihrem Gemeineigenthum an der Erde und den durch die Arbeit selbst producirten Productionsmitteln." Marx besitzt eine ausgebreitete ..."
--------------------------------------------------------------------------------

2. [MEGA] Document: page_500
   UID: 977443739562304537 | Position: 1125-2090
   Text: "so in ein Naturgesetz mystificirte Gesetz der kapitalistischen Accumu- lation drückt in der That nur aus, daß ihre Natur jede solche Abnahme im Exploitationsgrad der Arbeit oder jede solche Steigerung des Arbeitspreises ausschließt, welche die stetige Reproduktion des Kapitalverhältnisses und seine ..."
   References:
     → UID -5394965545327344677 (pos: 1364475