<a href="https://colab.research.google.com/github/Shravan-Kumar-18/legal-domain-similarity/blob/main/Paragraph_Filtering3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📦 Setup: clean environment
!pip install -U datasets huggingface_hub fsspec
!pip install -q datasets transformers scikit-learn tqdm

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.33.4-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.3/515.3 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, huggingface_hub, datasets
  Attempting uninstall: fsspec
    Found existing installat

In [None]:
# Imports
import torch
import numpy as np
import re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr, spearmanr
from collections import defaultdict, Counter

In [None]:
#  Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#  LegalLongformer (safe dtype)
tokenizer = AutoTokenizer.from_pretrained("lexlms/legal-longformer-large")
model = AutoModel.from_pretrained("lexlms/legal-longformer-large", torch_dtype=torch.float32).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/379 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of LongformerModel were not initialized from the model checkpoint at lexlms/legal-longformer-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#  Load 10 expression-related cases
dataset = load_dataset("lex_glue", "ecthr_a", split="train")
subset = dataset.filter(lambda d: "freedom of expression" in str(d["text"]).lower()).select(range(50))
texts_raw = [doc["text"] if isinstance(doc["text"], str) else " ".join(doc["text"]) for doc in subset]
titles = [f"Case {i}" for i in range(len(texts_raw))]

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/42.4M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/5.68M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [None]:
#  Extract citations
def extract_citations(text):
    return set(re.findall(r"(Article\s\d+|Section\s\d+|\(.*?\)|\[\d{4}\][^\]]+\])", text))
citation_sets = [extract_citations(text) for text in texts_raw]

In [None]:
# Paragraph filtering
def is_structural(para):
    return bool(re.search(r'^(author:|bench:|party:|court:|coram:|before:)', para.strip().lower()))

def is_substantive(para):
    para = para.strip().lower()
    return len(para.split()) >= 25 and any(kw in para for kw in [
        "facts", "issue", "judgment", "reasoning", "held", "argument", "legal", "disputed"
    ])

In [None]:
def paragraph_filter(text):
    paras = [p.strip() for p in re.split(r"\n{2,}", text) if p.strip()]
    filtered, all_cites = [], extract_citations(text)
    if paras:
        if is_substantive(paras[0]): filtered.append(paras[0])
        if len(paras) > 1 and is_substantive(paras[-1]): filtered.append(paras[-1])
    for para in paras[1:-1]:
        if is_substantive(para) and not is_structural(para):
            if set(extract_citations(para)) & all_cites:
                filtered.append(para)
    return filtered if len(filtered) >= 3 else paras

filtered_texts = [" ".join(paragraph_filter(text)) for text in texts_raw]

In [None]:
#  Section segmentation
def segment_sections(text):
    sections = {"Facts": "", "Reasoning": "", "Judgment": ""}
    for para in re.split(r"\n{2,}", text):
        para_lower = para.lower()
        if "facts" in para_lower:
            sections["Facts"] += para + "\n"
        elif "reasoning" in para_lower or "held" in para_lower:
            sections["Reasoning"] += para + "\n"
        elif "judgment" in para_lower or "conclusion" in para_lower:
            sections["Judgment"] += para + "\n"
    return sections

In [None]:
#  Embedding
def get_embedding(text):
    try:
        if isinstance(text, list): text = " ".join(text)
        inputs = tokenizer(text, padding="max_length", truncation=True, return_tensors="pt", max_length=2048)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            output = model(**inputs)
            return output.last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    except Exception as e:
        print(f" Embedding error: {e}")
        return np.zeros(model.config.hidden_size)

In [None]:
#  Section-wise embeddings
section_embeddings = []
for text in filtered_texts:
    sections = segment_sections(text)
    vecs = []
    for name in ["Facts", "Reasoning", "Judgment"]:
        content = sections[name]
        vec = get_embedding(content) if content.strip() else np.zeros(model.config.hidden_size)
        vecs.append(vec)
    section_embeddings.append(np.mean(vecs, axis=0))

In [None]:
#  Cosine similarity
sim_matrix_cosine = cosine_similarity(section_embeddings)

# Citation similarity
sim_matrix_citation = np.zeros((len(titles), len(titles)))
for i in range(len(titles)):
    for j in range(len(titles)):
        union = citation_sets[i] | citation_sets[j]
        intersection = citation_sets[i] & citation_sets[j]
        sim_matrix_citation[i][j] = len(intersection) / len(union) if union else 0.0

#  Hybrid matrix
sim_matrix_avg = (sim_matrix_cosine + sim_matrix_citation) / 2

In [None]:
#  Matrix printer
def print_matrix(matrix, title):
    print(f"\n {title} Similarity Matrix:")
    for i, row in enumerate(matrix):
        row_str = "  ".join([f"{v:.2f}" for v in row])
        print(f"{titles[i].ljust(8)}{row_str}")
print_matrix(sim_matrix_cosine, "Cosine")
print_matrix(sim_matrix_citation, "Citation")
print_matrix(sim_matrix_avg, "Hybrid")


🔷 Cosine Similarity Matrix:
Case 0  1.00  0.60  0.42  0.91  0.84  0.45  0.86  0.68  0.89  0.77  0.48  0.91  0.51  0.93  0.80  0.90  0.65  0.50  0.49  0.63  0.66  0.58  0.61  0.77  0.40  0.85  0.79  0.75  0.86  0.91  0.00  0.80  0.65  0.57  0.92  0.78  0.84  0.88  0.85  0.64  0.37  0.76  0.82  0.52  0.91  0.86  0.52  0.25  0.74  0.52
Case 1  0.60  1.00  0.71  0.62  0.56  0.80  0.76  0.83  0.60  0.71  0.78  0.57  0.84  0.62  0.78  0.66  0.71  0.82  0.70  0.70  0.83  0.80  0.76  0.70  0.63  0.67  0.73  0.74  0.64  0.62  0.00  0.73  0.73  0.60  0.57  0.85  0.73  0.63  0.70  0.76  0.62  0.68  0.68  0.66  0.51  0.64  0.87  0.22  0.84  0.66
Case 2  0.42  0.71  1.00  0.53  0.51  0.74  0.64  0.71  0.48  0.57  0.88  0.38  0.75  0.44  0.67  0.60  0.75  0.74  0.76  0.68  0.67  0.69  0.75  0.54  0.72  0.59  0.59  0.70  0.47  0.51  0.00  0.61  0.63  0.61  0.46  0.71  0.65  0.58  0.54  0.86  0.73  0.58  0.69  0.73  0.36  0.52  0.69  0.13  0.67  0.65
Case 3  0.91  0.62  0.53  1.00  0.83  0.47  0.85  

In [None]:
#  Precision@5
k = 5
global_freq = Counter()
for c in citation_sets: global_freq.update(c)
def get_strong_citations(cites): return [c for c in cites if global_freq[c] <= 3]
silver_set = defaultdict(set)
for i, c_set in enumerate(citation_sets):
    strong_i = set(get_strong_citations(c_set))
    for j, other_set in enumerate(citation_sets):
        if i != j:
            strong_j = set(get_strong_citations(other_set))
            if strong_i & strong_j: silver_set[titles[i]].add(titles[j])
print(f"\n Silver Standard Evaluation (Precision@{k}):")
for i, title in enumerate(titles):
    sims = sorted([(titles[j], sim_matrix_avg[i][j]) for j in range(len(titles)) if j != i], key=lambda x: x[1], reverse=True)[:k]
    predicted = set(d for d, _ in sims)
    actual = silver_set[title]
    match = predicted & actual
    print(f"{title}: Precision@{k} = {len(match)/k:.2f} | Matches: {list(match)}")


 Silver Standard Evaluation (Precision@5):
Case 0: Precision@5 = 0.20 | Matches: ['Case 37']
Case 1: Precision@5 = 0.00 | Matches: []
Case 2: Precision@5 = 0.00 | Matches: []
Case 3: Precision@5 = 0.40 | Matches: ['Case 42', 'Case 37']
Case 4: Precision@5 = 0.20 | Matches: ['Case 16']
Case 5: Precision@5 = 0.00 | Matches: []
Case 6: Precision@5 = 0.00 | Matches: []
Case 7: Precision@5 = 0.00 | Matches: []
Case 8: Precision@5 = 0.20 | Matches: ['Case 27']
Case 9: Precision@5 = 0.40 | Matches: ['Case 26', 'Case 32']
Case 10: Precision@5 = 0.20 | Matches: ['Case 39']
Case 11: Precision@5 = 0.00 | Matches: []
Case 12: Precision@5 = 0.00 | Matches: []
Case 13: Precision@5 = 0.20 | Matches: ['Case 14']
Case 14: Precision@5 = 0.20 | Matches: ['Case 13']
Case 15: Precision@5 = 0.00 | Matches: []
Case 16: Precision@5 = 0.20 | Matches: ['Case 4']
Case 17: Precision@5 = 0.00 | Matches: []
Case 18: Precision@5 = 0.00 | Matches: []
Case 19: Precision@5 = 0.00 | Matches: []
Case 20: Precision@5 = 0

In [None]:
#  Gold standard
gold_pairs = [(titles[0], titles[1], 0.80), (titles[2], titles[4], 0.65), (titles[5], titles[7], 0.90), (titles[10], titles[13], 0.70)]
true_scores, model_scores = [], []
for d1, d2, expert in gold_pairs:
    i, j = titles.index(d1), titles.index(d2)
    true_scores.append(expert)
    model_scores.append(sim_matrix_avg[i][j])
if true_scores:
    p, _ = pearsonr(true_scores, model_scores)
    s, _ = spearmanr(true_scores, model_scores)
    print(f"\n Gold Correlation:\n → Pearson  : {p:.3f}\n → Spearman : {s:.3f}")


 Gold Correlation:
 → Pearson  : 0.963
 → Spearman : 0.800


In [None]:
#  Top-k Retrieval (Hybrid)
print("\n Top 5 Similar Documents (Hybrid Score):")
for i in range(len(titles)):
    scores = [(titles[j], sim_matrix_avg[i][j]) for j in range(len(titles)) if j != i]
    top5 = sorted(scores, key=lambda x: x[1], reverse=True)[:5]
    print(f"\n {titles[i]}")
    for doc, score in top5:
        print(f" → {doc} | Score: {score:.2f}")


📋 Top 5 Similar Documents (Hybrid Score):

 Case 0
 → Case 37 | Score: 0.49
 → Case 6 | Score: 0.49
 → Case 15 | Score: 0.48
 → Case 34 | Score: 0.48
 → Case 44 | Score: 0.48

 Case 1
 → Case 48 | Score: 0.55
 → Case 12 | Score: 0.48
 → Case 25 | Score: 0.46
 → Case 46 | Score: 0.46
 → Case 28 | Score: 0.45

 Case 2
 → Case 39 | Score: 0.46
 → Case 10 | Score: 0.45
 → Case 18 | Score: 0.39
 → Case 12 | Score: 0.39
 → Case 22 | Score: 0.38

 Case 3
 → Case 37 | Score: 0.52
 → Case 42 | Score: 0.47
 → Case 0 | Score: 0.47
 → Case 28 | Score: 0.46
 → Case 44 | Score: 0.46

 Case 4
 → Case 13 | Score: 0.44
 → Case 37 | Score: 0.43
 → Case 16 | Score: 0.43
 → Case 34 | Score: 0.43
 → Case 0 | Score: 0.42

 Case 5
 → Case 21 | Score: 0.43
 → Case 17 | Score: 0.42
 → Case 46 | Score: 0.42
 → Case 20 | Score: 0.41
 → Case 12 | Score: 0.40

 Case 6
 → Case 0 | Score: 0.49
 → Case 15 | Score: 0.48
 → Case 35 | Score: 0.48
 → Case 14 | Score: 0.48
 → Case 37 | Score: 0.48

 Case 7
 → Case 48 | S

In [None]:
#  Search Engine (text or title index)
def search_similar_docs(query_text_or_id, k=5):
    if isinstance(query_text_or_id, int):  # ID-based query
        qvec = section_embeddings[query_text_or_id]
        print(f"\n Query Case: {titles[query_text_or_id]}")
    else:  # Free-text query
        qvec = get_embedding(query_text_or_id)
        print(f"\n Free-text Query: {query_text_or_id[:60]}...")
    sims = cosine_similarity([qvec], section_embeddings)[0]
    top = sorted(zip(titles, sims), key=lambda x: x[1], reverse=True)[:k]
    for title, score in top:
        print(f" → {title} | Cosine Score: {score:.2f}")
search_similar_docs("article 10 freedom of expression", k=5)
search_similar_docs(45, k=5)



 Free-text Query: article 10 freedom of expression...
 → Case 38 | Cosine Score: 0.48
 → Case 8 | Cosine Score: 0.47
 → Case 37 | Cosine Score: 0.47
 → Case 3 | Cosine Score: 0.47
 → Case 42 | Cosine Score: 0.47

 Query Case: Case 45
 → Case 45 | Cosine Score: 1.00
 → Case 25 | Cosine Score: 0.90
 → Case 34 | Cosine Score: 0.88
 → Case 15 | Cosine Score: 0.86
 → Case 29 | Cosine Score: 0.86


In [None]:
def explain_similarity(query_id, target_id, top_para_count=2):
    print(f"\n Comparing: {titles[query_id]} ↔ {titles[target_id]}")

    #  Score
    sim_score = sim_matrix_avg[query_id][target_id]
    print(f"\n Hybrid Similarity Score: {sim_score:.3f}")

    #  Shared Citations
    c1, c2 = citation_sets[query_id], citation_sets[target_id]
    shared_cites = sorted(c1 & c2)
    print(f"\n Shared Citations ({len(shared_cites)}): {shared_cites if shared_cites else 'None'}")

    #  Common Themes (Keyword Overlap)
    def extract_keywords(text):
        legal_kw = ["freedom", "expression", "restriction", "right", "speech", "privacy", "press", "torture", "protection", "minority"]
        return {kw for kw in legal_kw if kw in text.lower()}

    k1 = extract_keywords(filtered_texts[query_id])
    k2 = extract_keywords(filtered_texts[target_id])
    common_kws = sorted(k1 & k2)
    print(f"\n Common Themes: {common_kws if common_kws else 'None'}")

    #  Similar Paragraphs
    def get_top_paragraphs(text1, text2, n=2):
        paras1 = [p.strip() for p in re.split(r"\n{2,}", text1) if len(p.strip()) > 80]
        paras2 = [p.strip() for p in re.split(r"\n{2,}", text2) if len(p.strip()) > 80]

        embs1 = [get_embedding(p) for p in paras1]
        embs2 = [get_embedding(p) for p in paras2]

        sim_table = np.array([[cosine_similarity([e1], [e2])[0][0] for e2 in embs2] for e1 in embs1])
        top_pairs = sorted([(i, j, sim_table[i][j]) for i in range(len(paras1)) for j in range(len(paras2))], key=lambda x: x[2], reverse=True)[:n]
        return [(paras1[i], paras2[j], score) for i, j, score in top_pairs]

    print(f"\n Most Similar Paragraphs:")
    top_matches = get_top_paragraphs(filtered_texts[query_id], filtered_texts[target_id], top_para_count)
    for i, (p1, p2, score) in enumerate(top_matches):
        print(f"\n Match {i+1} (Score: {score:.2f})")
        print(f" → [{titles[query_id]}]: {p1[:300]}...")
        print(f" → [{titles[target_id]}]: {p2[:300]}...")
explain_similarity(11, 14)



 Comparing: Case 11 ↔ Case 14

 Hybrid Similarity Score: 0.397

 Shared Citations (1): ['Article 9']

 Common Themes: ['expression', 'freedom', 'press', 'protection', 'right']

 Most Similar Paragraphs:

🔸 Match 1 (Score: 0.78)
 → [Case 11]: 9.  The applicant was born in 1941. He is a retired labourer. 10.  On 12 June 1995 the applicant took part in his capacity as the leader of Tarikat Aczmendi (a community describing itself as an Islamic sect) in a television programme, Ceviz Kabuğu (“Nutshell”), broadcast live on HBB, an independent ...
 → [Case 14]: 8.  The applicant, who is the eldest daughter of Prince Rainier III of Monaco, was born in 1957. Her official residence is in Monaco but she lives in the Paris area most of the time.
As a member of Prince Rainier’s family, the applicant is the president of certain humanitarian or cultural foundation...
