# 📚 Legal RAG Reprocessing v2.0

**Purpose:** Re-chunk existing cases with granular splitting (25+ chunks/case) for better search accuracy.

**What this does:**
1. Fetches all cases from your Supabase `cases` table
2. Splits each judgment into smaller, more detailed chunks (~500 chars)
3. Generates embeddings using `bge-small-en-v1.5` (384 dimensions)
4. Clears old chunks and uploads new ones to `case_chunks`

**Estimated time:** 2-4 hours for 4,688 cases on free Colab GPU

In [None]:
#@title 1️⃣ Install Dependencies
# Install the third-party packages the later cells import: supabase (database
# client), sentence-transformers (embedding model) and tqdm (progress bars).
# -q keeps pip's output quiet.
!pip install -q supabase sentence-transformers tqdm

In [None]:
#@title 2️⃣ Configuration
import os

# @markdown Enter your Supabase credentials:
# Both values are passed to create_client() in the next cell.
SUPABASE_URL = "https://vzqlwjibtujhrhjgwhhe.supabase.co"  # @param {type:"string"}
# Left blank on purpose: fill it in before running the next cell and do not
# commit a real key with the notebook.
SUPABASE_KEY = ""  # @param {type:"string"}

# Chunking settings - OPTIMIZED for detailed search
# All sizes are in characters of the cleaned judgment text, not tokens.
CHUNK_SIZE = 500       # Characters per chunk (smaller = more granular)
# The chunker restarts each window CHUNK_OVERLAP characters before the previous
# window's end, so keep this well below CHUNK_SIZE.
CHUNK_OVERLAP = 100    # Overlap between chunks (preserves context)
MIN_CHUNK_SIZE = 100   # Minimum chunk size (skip tiny fragments)
# Only printed below in the visible notebook; the chunking code does not read it.
TARGET_CHUNKS_PER_CASE = 25  # Approximate target

# Embedding model
# Name handed to SentenceTransformer(); the loader cell prints the actual
# embedding dimension so it can be checked against the database column.
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # 384 dimensions, matches your DB

# Processing
BATCH_SIZE = 32        # Embeddings per batch
UPLOAD_BATCH = 100     # Chunks per upload batch

# Echo the key settings so the run log records what was used.
print("‚úÖ Configuration loaded")
print(f"   Chunk size: {CHUNK_SIZE} chars")
print(f"   Target: ~{TARGET_CHUNKS_PER_CASE} chunks/case")

In [None]:
#@title 3️⃣ Initialize Supabase & Model
from supabase import create_client
from sentence_transformers import SentenceTransformer
import torch

# Connect to Supabase
# `supabase` is the module-level client every later cell uses.
# NOTE(review): create_client() may only build the client object; the message
# below does not by itself prove the credentials are valid — confirm.
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
print("‚úÖ Supabase connected")

# Load the embedding model on the GPU when CUDA is available, otherwise on CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üì± Device: {device}")

# `model` is the module-level encoder used by the processing cell.
model = SentenceTransformer(EMBEDDING_MODEL, device=device)
print(f"‚úÖ Model loaded: {EMBEDDING_MODEL}")
# Printed so the dimension can be compared with the database's vector column.
print(f"   Embedding dimension: {model.get_sentence_embedding_dimension()}")

In [None]:
#@title 4️⃣ Fetch All Cases
from tqdm.auto import tqdm

def fetch_all_cases():
    """Fetch every row of the ``cases`` table, one page at a time.

    Rows are requested in ``id`` order so that consecutive ``range()`` windows
    are taken from a stable ordering and cannot overlap or skip rows. Paging
    continues until the server returns an empty page, and the offset advances
    by the number of rows actually received, so a server-side row cap smaller
    than ``page_size`` cannot end the download early.

    Returns:
        list[dict]: one dict per case with the keys ``id``, ``hklii_id``,
        ``case_name``, ``neutral_citation``, ``court``, ``decision_date`` and
        ``full_text``.
    """
    columns = "id, hklii_id, case_name, neutral_citation, court, decision_date, full_text"
    page_size = 1000
    all_cases = []
    offset = 0

    print("📥 Fetching cases from Supabase...")

    while True:
        response = (
            supabase.table("cases")
            .select(columns)
            .order("id")
            .range(offset, offset + page_size - 1)
            .execute()
        )
        rows = response.data or []
        if not rows:
            break

        all_cases.extend(rows)
        # Advance by what was returned, not by what was asked for, in case the
        # server delivers fewer rows per request than page_size.
        offset += len(rows)
        print(f"   Fetched {len(all_cases)} cases...")

    print(f"✅ Total cases fetched: {len(all_cases)}")
    return all_cases

# Download the whole table once; the chunking test and the main processing
# cell both read this module-level list.
cases = fetch_all_cases()

In [None]:
#@title 5️⃣ Improved Chunking Function
import re
from typing import List, Dict

def clean_html(text: str) -> str:
    """Strip HTML markup from *text* and normalise its whitespace.

    Processing order:
      1. every ``<...>`` tag is replaced by a space;
      2. HTML entities (``&amp;``, ``&nbsp;``, ``&#8217;`` ...) are decoded;
      3. non-whitespace ASCII control characters are deleted;
      4. each run of whitespace is collapsed to a single space;
      5. leading/trailing whitespace is trimmed.

    Args:
        text: Raw judgment text, possibly containing HTML. ``None`` and the
            empty string are accepted.

    Returns:
        The cleaned text on a single line, or ``""`` when *text* is falsy.
    """
    import html  # local import: only this function needs it

    if not text:
        return ""
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # stays as literal text instead of being stripped as markup.
    text = html.unescape(text)
    # Delete control characters that are not whitespace. This must happen
    # before collapsing, otherwise "a \x00 b" would keep two spaces. The
    # whitespace-class controls (\x0b, \x0c, \x1c-\x1f) are left for the next
    # step, which turns them into a separating space.
    text = re.sub(r'[\x00-\x08\x0e-\x1b]', '', text)
    # Collapse runs of spaces/newlines (including non-breaking spaces)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def detect_section_type(text: str, position: float) -> str:
    """Classify a chunk of a judgment by simple keyword matching.

    Only the first 500 characters of the lower-cased chunk are inspected.
    Chunks in the first 10% of the judgment are always 'background' or
    'header'; later chunks get the label of the first matching rule below,
    or 'general' when no rule matches.

    Args:
        text: The chunk text.
        position: Relative start of the chunk within the judgment (0.0-1.0).

    Returns:
        One of 'background', 'header', 'facts', 'reasoning', 'holding',
        'damages', 'legal_principle' or 'general'.
    """
    head = text.lower()[:500]

    def mentions(*phrases):
        # True when at least one phrase occurs in the inspected prefix.
        return any(phrase in head for phrase in phrases)

    # Opening part of the judgment: header/intro material only.
    if position < 0.1:
        if mentions('background', 'introduction', 'before'):
            return 'background'
        return 'header'

    # Rules are evaluated top to bottom; the first hit wins.
    ordered_rules = (
        ('facts',
         ('the facts', 'factual background', 'evidence shows', 'witness', 'testified')),
        ('reasoning',
         ('held that', 'in my judgment', 'i am satisfied', 'the court finds', 'it is clear that')),
        ('holding',
         ('order', 'accordingly', 'appeal dismissed', 'appeal allowed', 'judgment for')),
        ('damages',
         ('damages', 'compensation', 'quantum', 'award')),
        ('legal_principle',
         ('principle', 'test is', 'duty of care', 'negligence', 'breach')),
    )
    for label, phrases in ordered_rules:
        if mentions(*phrases):
            return label

    return 'general'

def chunk_judgment(case: Dict) -> List[Dict]:
    """Split one judgment into overlapping chunks with per-chunk metadata.

    The case's ``full_text`` is cleaned with ``clean_html`` and walked with a
    sliding window of about ``CHUNK_SIZE`` characters. Each window is cut at
    the last sentence terminator found between 100 characters before and 50
    characters after the nominal end, so a chunk can be slightly longer than
    ``CHUNK_SIZE``. The next window starts ``CHUNK_OVERLAP`` characters before
    the previous cut. Chunks shorter than ``MIN_CHUNK_SIZE`` after trimming
    are dropped.

    Args:
        case: A row of the ``cases`` table. ``full_text`` is required for any
            output; ``id``, ``hklii_id``, ``case_name``, ``neutral_citation``,
            ``court`` and ``decision_date`` are copied onto every chunk.

    Returns:
        A list of dicts with the keys ``case_id``, ``hklii_id``,
        ``chunk_index``, ``chunk_text``, ``section_type``, ``case_name``,
        ``neutral_citation``, ``court`` and ``decision_date``. The list is
        empty when the cleaned text is shorter than ``MIN_CHUNK_SIZE``.
    """
    text = clean_html(case.get('full_text', ''))

    if not text or len(text) < MIN_CHUNK_SIZE:
        return []

    # Sentence terminators, tried in priority order. '\u3002' is the CJK
    # ideographic full stop, written as an escape so the literal cannot be
    # damaged by a re-encoding of the file. '\n\n' is kept from the original;
    # it can only match if the cleaning step leaves newlines in the text.
    sentence_ends = ('. ', '\u3002', '\n\n')

    chunks = []
    total_length = len(text)

    # Sliding window chunking with overlap
    start = 0
    chunk_index = 0

    while start < total_length:
        # Nominal end of this window
        end = min(start + CHUNK_SIZE, total_length)

        # Try to break at a sentence boundary (not needed for the final window)
        if end < total_length:
            search_start = max(start + CHUNK_SIZE - 100, start)
            for punct in sentence_ends:
                last_punct = text.rfind(punct, search_start, end + 50)
                if last_punct > search_start:
                    end = last_punct + len(punct)
                    break

        chunk_text = text[start:end].strip()

        if len(chunk_text) >= MIN_CHUNK_SIZE:
            # Relative position of the chunk, used by the section heuristic
            position = start / total_length

            chunks.append({
                'case_id': case.get('id'),
                'hklii_id': case.get('hklii_id', ''),
                'chunk_index': chunk_index,
                'chunk_text': chunk_text,
                'section_type': detect_section_type(chunk_text, position),
                'case_name': case.get('case_name', ''),
                'neutral_citation': case.get('neutral_citation', ''),
                'court': case.get('court', ''),
                'decision_date': case.get('decision_date', ''),
            })
            chunk_index += 1

        if end >= total_length:
            break

        # Step back by the overlap, but always move forward by at least one
        # character so an overlap that is large relative to the chunk size
        # cannot make the loop spin forever.
        start = max(end - CHUNK_OVERLAP, start + 1)

    return chunks

# Smoke test: chunk only the first fetched case and report how many chunks it
# produced, plus the first 100 characters of the first chunk.
if cases:
    test_chunks = chunk_judgment(cases[0])
    print(f"‚úÖ Test chunking: {len(test_chunks)} chunks from first case")
    # chunk_judgment returns [] when the case has too little text, so guard
    # the preview.
    if test_chunks:
        print(f"   First chunk preview: {test_chunks[0]['chunk_text'][:100]}...")

In [None]:
#@title 6️⃣ Clear Old Chunks (IMPORTANT!)
#@markdown This will delete all existing chunks before re-processing.
#@markdown Make sure you want to proceed!

# Destructive switch: when True, every row of case_chunks is deleted below.
# It defaults to True, so a plain "Run all" wipes the table before re-processing.
CONFIRM_DELETE = True  # @param {type:"boolean"}

if CONFIRM_DELETE:
    print("🗑️ Deleting old chunks...")

    # Delete in batches to avoid timeout: repeatedly read up to 1000 ids and
    # delete exactly those rows until the table is empty.
    deleted_total = 0
    previous_ids = set()
    while True:
        # Get batch of IDs
        response = supabase.table("case_chunks").select("id").limit(1000).execute()
        if not response.data:
            break

        ids = [r['id'] for r in response.data]

        # A row from the batch we just "deleted" showing up again means the
        # delete removed nothing (for example, the key lacks delete
        # permission). Stop instead of looping forever on the same rows.
        if previous_ids.intersection(ids):
            raise RuntimeError(
                "Rows from the previous batch are still present after delete(); "
                "check that the configured key is allowed to delete from case_chunks."
            )

        supabase.table("case_chunks").delete().in_("id", ids).execute()
        previous_ids = set(ids)
        deleted_total += len(ids)
        print(f"   Deleted {deleted_total} chunks...")

    print(f"✅ Cleared {deleted_total} old chunks")
else:
    print("⏭️ Skipping deletion (will add to existing chunks)")

In [None]:
#@title 7️⃣ Process All Cases (Main Loop)
from tqdm.auto import tqdm
import numpy as np

def _chunk_cases(cases: List[Dict]):
    """Chunk every case; return (all_chunks, number_of_cases_that_yielded_chunks)."""
    all_chunks = []
    cases_with_text = 0
    for case in tqdm(cases, desc="Chunking"):
        chunks = chunk_judgment(case)
        if chunks:
            all_chunks.extend(chunks)
            cases_with_text += 1
    return all_chunks, cases_with_text


def _embed_texts(texts: List[str]) -> List[List[float]]:
    """Encode *texts* in BATCH_SIZE groups; return one normalised vector per text."""
    embeddings = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding"):
        batch_embeddings = model.encode(
            texts[i:i + BATCH_SIZE],
            batch_size=BATCH_SIZE,
            show_progress_bar=False,
            normalize_embeddings=True
        )
        embeddings.extend(batch_embeddings.tolist())
    return embeddings


def _upload_chunks(all_chunks: List[Dict]):
    """Insert chunks into case_chunks in UPLOAD_BATCH groups; return (uploaded, errors)."""
    uploaded = 0
    errors = 0
    max_reported = 20  # cap per-row failure messages so the log stays readable

    for i in tqdm(range(0, len(all_chunks), UPLOAD_BATCH), desc="Uploading"):
        batch = all_chunks[i:i + UPLOAD_BATCH]

        try:
            supabase.table("case_chunks").insert(batch).execute()
        except Exception as e:
            print(f"\n⚠️ Upload error at batch {i}: {str(e)[:100]}")
            # Retry row by row so one bad chunk does not sink the whole batch.
            # Catch Exception (not a bare except) so Ctrl-C still interrupts
            # the run, and report failures instead of hiding them.
            for chunk in batch:
                try:
                    supabase.table("case_chunks").insert(chunk).execute()
                except Exception as row_error:
                    errors += 1
                    if errors <= max_reported:
                        print(
                            f"   Failed chunk {chunk.get('hklii_id')}"
                            f"#{chunk.get('chunk_index')}: {str(row_error)[:100]}"
                        )
                else:
                    uploaded += 1
        else:
            uploaded += len(batch)

    return uploaded, errors


def process_all_cases(cases: List[Dict]):
    """Chunk, embed and upload every case.

    All cases are chunked first, then all chunk texts are embedded, then the
    chunks (each carrying its ``embedding``) are inserted into the
    ``case_chunks`` table. A batch that fails to insert is retried one row at
    a time; rows that still fail are counted as errors and reported.

    Args:
        cases: Rows of the ``cases`` table, as accepted by ``chunk_judgment``.

    Returns:
        int: the number of chunks successfully inserted.
    """
    print("📦 Chunking all cases...")
    all_chunks, cases_with_text = _chunk_cases(cases)

    total_chunks = len(all_chunks)
    avg_chunks = total_chunks / cases_with_text if cases_with_text > 0 else 0

    print(f"\n✅ Chunking complete:")
    print(f"   Cases with text: {cases_with_text}")
    print(f"   Total chunks: {total_chunks}")
    print(f"   Average chunks/case: {avg_chunks:.1f}")

    # Generate embeddings in batches
    print(f"\n🔢 Generating embeddings for {total_chunks} chunks...")
    embeddings = _embed_texts([c['chunk_text'] for c in all_chunks])
    print(f"✅ Generated {len(embeddings)} embeddings")

    # Attach each vector to its chunk; both lists are in the same order
    for chunk, embedding in zip(all_chunks, embeddings):
        chunk['embedding'] = embedding

    # Upload to Supabase in batches
    print(f"\n📤 Uploading {total_chunks} chunks to Supabase...")
    uploaded, errors = _upload_chunks(all_chunks)

    print("\n" + "=" * 50)
    print("✅ PROCESSING COMPLETE")
    print("=" * 50)
    print(f"   Cases processed: {cases_with_text}")
    print(f"   Chunks created: {total_chunks}")
    print(f"   Chunks uploaded: {uploaded}")
    print(f"   Errors: {errors}")
    print(f"   Avg chunks/case: {avg_chunks:.1f}")

    return uploaded

# Run processing: chunk, embed and upload every fetched case. `uploaded` is the
# value returned by process_all_cases, i.e. its count of uploaded chunks.
uploaded = process_all_cases(cases)

In [None]:
#@title 8️⃣ Verify Results
# Check new chunk count. count="exact" makes the server report the table
# total; limit(1) avoids downloading rows just to count them.
response = supabase.table("case_chunks").select("id", count="exact").limit(1).execute()
print(f"\n📊 Verification:")
print(f"   Total chunks in database: {response.count}")

# Check embedding count through the optional `count_embeddings` database
# function. It is best-effort: if the function is missing or the call fails,
# report that and carry on instead of aborting the whole verification cell.
try:
    response2 = supabase.rpc("count_embeddings", {}).execute()
except Exception as rpc_error:
    response2 = None
    print(f"   count_embeddings RPC unavailable: {str(rpc_error)[:100]}")
else:
    print(f"   count_embeddings RPC result: {response2.data}")

# Alternative check: eyeball a few uploaded rows
sample = supabase.table("case_chunks").select("id, hklii_id, chunk_index, section_type").limit(5).execute()
print(f"\n📝 Sample chunks:")
for row in sample.data:
    print(f"   {row['hklii_id']} - Chunk {row['chunk_index']} ({row['section_type']})")

# Check average chunks per case. The distinct case ids are collected page by
# page: a single unpaginated select may return only part of the table when the
# server limits rows per request, which would undercount cases and inflate
# the average.
unique_ids = set()
page_size = 1000
offset = 0
while True:
    page = (
        supabase.table("case_chunks")
        .select("hklii_id")
        .order("id")
        .range(offset, offset + page_size - 1)
        .execute()
    )
    rows = page.data or []
    if not rows:
        break
    unique_ids.update(r['hklii_id'] for r in rows)
    offset += len(rows)

print(f"\n   Unique cases: {len(unique_ids)}")
if unique_ids:
    print(f"   Avg chunks/case: {response.count / len(unique_ids):.1f}")

---
## ✅ Done!

Your cases have been re-processed with more granular chunking.

**Next steps:**
1. Go to https://cabase.vercel.app/
2. Test a search query
3. Results should now be more detailed and accurate!

**Expected improvement:**
- Before: ~6 chunks/case (just summaries)
- After: ~25 chunks/case (full judgment details)
- Search can now find specific facts, damages, legal principles