In [1]:
# Cell 1 — runtime setup
!pip install -q sentence-transformers faiss-cpu pymupdf pdfplumber transformers accelerate datasets openai tiktoken


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Cell 2 — clone repo if public, otherwise use manual upload
REPO="https://github.com/sanikasiva/LeXI-Phase-2"

# try to clone
import os, subprocess, sys
if not os.path.exists("/content/LeXI-Phase-2"):
    try:
        subprocess.check_call(["git", "clone", REPO])
        print("Cloned repo to /content/LeXI-Phase-2")
    except Exception as e:
        print("Could not clone repo (maybe private). Please upload your PDF manually using the Colab file browser.")
else:
    print("Repo already present.")


Cloned repo to /content/LeXI-Phase-2


In [3]:
# Cell 3 — locate the uploaded draft automatically if present
import glob, os
candidates = glob.glob("/content/**/LexiDesk*.*", recursive=True) + glob.glob("/content/*LexiDesk*.*")
if len(candidates)==0:
    print("No LexiDesk PDF auto-detected. If you uploaded it manually, set PDF_PATH variable to the file path.")
    PDF_PATH = None
else:
  PDF_PATH = "/content/LexiDesk__Legal_AI_WorkBench_Draft1 (2).pdf"
  print("Using PDF:", PDF_PATH)

# set PDF_PATH manually if not auto-detected:
# PDF_PATH = "/content/LexiDesk__Legal_AI_WorkBench_Draft1 (2).pdf"


Using PDF: /content/LexiDesk__Legal_AI_WorkBench_Draft1 (2).pdf


In [4]:
# NEW CELL — list files
import os
os.listdir("/content")


['.config',
 'LeXI-Phase-2',
 'LexiDesk__Legal_AI_WorkBench_Draft1 (2).pdf',
 'sample_data']

In [5]:
# Cell 4 — extract text per page, keep page numbers
import fitz  # pymupdf
import json, os

if 'PDF_PATH' not in globals() or PDF_PATH is None:
    raise SystemExit("Please set PDF_PATH to the path of your uploaded PDF (see previous cell).")

doc = fitz.open(PDF_PATH)
pages = []
for i in range(len(doc)):
    text = doc[i].get_text("text")
    pages.append({"doc_id": os.path.basename(PDF_PATH), "page": i+1, "text": text})

# save
os.makedirs("data", exist_ok=True)
with open("data/lexidesk_pages.jsonl","w",encoding="utf-8") as f:
    for p in pages:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")
print(f"Extracted {len(pages)} pages → data/lexidesk_pages.jsonl")


Extracted 8 pages → data/lexidesk_pages.jsonl


In [6]:
# Check first page content
import json
with open("data/lexidesk_pages.jsonl","r",encoding="utf-8") as f:
    first = json.loads(f.readline())
print("Page:", first["page"])
print(first["text"][:500])


Page: 1
LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by manual, fragmented processes in clause
review, contract negotiation, and lit igation research. It offers an
advanced AI-driven platform for automating and enhancing legal
document processing using sophisticated Natural La


In [7]:
# Cell 5 — corrected fallback SBD (safe for Python regex)

import re, json, sys, os

def fallback_sbd(text):
    # Common legal abbreviations to protect
    abbreviations = [
        "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Hon.", "Smt.",
        "No.", "Nos.", "Art.", "Sec.", "Secs.", "vs.", "v.",
        "Ltd.", "Inc.", "Co.", "Fig.", "fig.", "et al."
    ]

    placeholder = "<DOT>"
    protected_text = text

    # Protect abbreviation dots
    for abbr in abbreviations:
        protected_text = protected_text.replace(abbr, abbr.replace(".", placeholder))

    # Split sentences safely
    sentences = re.split(r'(?<=[.!?])\s+', protected_text)

    # Restore dots
    restored = []
    for s in sentences:
        s = s.replace(placeholder, ".").strip()
        if s:
            restored.append(s)

    return restored

# Try importing your SBD (optional)
sys.path.append("/content/LeXI-Phase-2")
sbd_fn = None

try:
    for root, _, files in os.walk("/content/LeXI-Phase-2"):
        for f in files:
            if "sbd" in f.lower():
                path = os.path.join(root, f)
                spec = __import__("importlib.util").util.spec_from_file_location("user_sbd", path)
                mod = __import__("importlib.util").util.module_from_spec(spec)
                spec.loader.exec_module(mod)
                for fn in ["segment_text", "sbd_segment", "predict_sentences"]:
                    if hasattr(mod, fn):
                        sbd_fn = getattr(mod, fn)
                        print(f"Using SBD from repo: {path} → {fn}")
                        break
        if sbd_fn:
            break
except Exception as e:
    print("Could not import project SBD:", e)

# Fallback if project SBD not found
if sbd_fn is None:
    print("Using corrected fallback SBD.")
    sbd_fn = fallback_sbd

# Test on first page
with open("data/lexidesk_pages.jsonl","r",encoding="utf-8") as f:
    example = json.loads(f.readline())

sents = sbd_fn(example["text"])
print("Sample sentences from page 1:")
for s in sents[:6]:
    print("-", s)


Could not import project SBD: 'NoneType' object has no attribute 'loader'
Using corrected fallback SBD.
Sample sentences from page 1:
- LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by manual, fragmented processes in clause
review, contract negotiation, and lit igation research.
- It offers an
advanced AI-driven platform for automating and enhancing legal
document processing using sophisticated Natural Language Pro
cessing (NLP) methods.
- A core component is robust sentence
boundary detec tion (SBD) tailored for legal texts.
- Leveraging
a hybrid deep learning architec ture combining Convolutional
Neural Networks (CNN) and Conditional Random Fields (CRF),
the model captures intricate character-level contexts around de
limiters while modeling sequential token 

In [8]:
# Cell 6 — chunking using SBD output

import json, os

# Load extracted pages
with open("data/lexidesk_pages.jsonl", "r", encoding="utf-8") as f:
    pages = [json.loads(line) for line in f]

def chunk_document(pages, sbd_fn, max_chars=1600):
    """
    Create chunks from sentence-segmented legal text.
    max_chars controls chunk size (≈ 300–500 tokens).
    """
    chunks = []

    for p in pages:
        sentences = sbd_fn(p["text"])
        current_chunk = []
        current_len = 0

        for sent in sentences:
            if current_len + len(sent) > max_chars and current_chunk:
                chunks.append({
                    "doc_id": p["doc_id"],
                    "page": p["page"],
                    "text": " ".join(current_chunk)
                })
                current_chunk = [sent]
                current_len = len(sent)
            else:
                current_chunk.append(sent)
                current_len += len(sent)

        if current_chunk:
            chunks.append({
                "doc_id": p["doc_id"],
                "page": p["page"],
                "text": " ".join(current_chunk)
            })

    return chunks

chunks = chunk_document(pages, sbd_fn)

print(f"Total chunks created: {len(chunks)}")
print("Example chunk:\n", chunks[0]["text"][:500])


Total chunks created: 30
Example chunk:
 LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by manual, fragmented processes in clause
review, contract negotiation, and lit igation research. It offers an
advanced AI-driven platform for automating and enhancing legal
document processing using sophisticated Natural La


In [9]:
# Cell 7 — save chunks

os.makedirs("data", exist_ok=True)

with open("data/chunks.jsonl", "w", encoding="utf-8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")

print("Saved chunks to data/chunks.jsonl")


Saved chunks to data/chunks.jsonl


In [10]:
# Cell 8 — embedding + FAISS index

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import pickle

# Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

texts = [c["text"] for c in chunks]
embeddings = embed_model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True
)

# Build FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings.astype("float32"))

# Save index and metadata
os.makedirs("index", exist_ok=True)
faiss.write_index(index, "index/faiss.index")

with open("index/metadata.pkl", "wb") as f:
    pickle.dump(chunks, f)

print(f"FAISS index created with {index.ntotal} vectors")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index created with 30 vectors


In [11]:
# Cell 9 — retrieval function

import pickle
import numpy as np

# Reload index + metadata
index = faiss.read_index("index/faiss.index")
with open("index/metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

def retrieve_chunks(query, k=5):
    q_emb = embed_model.encode([query]).astype("float32")
    distances, indices = index.search(q_emb, k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        if idx < len(metadata):
            item = metadata[idx].copy()
            item["distance"] = float(dist)
            item["chunk_id"] = idx
            results.append(item)
    return results

# Test retrieval
test_results = retrieve_chunks("What is LeXiDesk?", k=3)
for r in test_results:
    print(f"[Page {r['page']}] {r['text'][:200]}...\n")


[Page 7] Association
for Computational Linguistics, 2021, pp. 3995–4007. [17] I. Rehbein, J. Ruppenhofer, and T. Schmidt, “Improving sentence
boundary detection for spoken language transcripts,” in Proceedings...

[Page 7] Key contributions include the CNN-CRF hybrid architec-
ture for precise legal segmentation, multi-signal extractive
summarization outperforming single-method baselines, and
comprehensive evaluation wi...

[Page 5] CNN-CRF achieves F1-scores of 0.940 and 0.933 on the CC
test set (4% improvement over baseline CRF), and 0.885
average F1 on Indian legal data, surpassing standard toolkits. IV. RESULTS
The Phase-1 ev...



In [12]:
# Cell 10 — local LLM answer generation (Flan-T5)

from transformers import pipeline

qa_pipeline = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=512
)

def generate_answer(question, retrieved_chunks):
    context = "\n\n".join(
        [f"[Page {c['page']}] {c['text']}" for c in retrieved_chunks]
    )

    prompt = f"""
You are a legal research assistant.
Answer the question ONLY using the context below.
If the answer is not present, say:
"I cannot find the answer in the provided documents."

Context:
{context}

Question:
{question}
"""

    result = qa_pipeline(prompt, do_sample=False)
    return result[0]["generated_text"]


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [13]:
# Cell 11 — end-to-end test

while True:
    query = input("\nAsk a question (or type 'exit'): ")
    if query.lower() in ["exit", "quit"]:
        break

    retrieved = retrieve_chunks(query, k=5)
    answer = generate_answer(query, retrieved)

    print("\n=== ANSWER ===\n")
    print(answer)

    print("\n=== SOURCES ===")
    for r in retrieved:
        print(f"- Page {r['page']} | Chunk {r['chunk_id']}")



Ask a question (or type 'exit'): what is llm


Token indices sequence length is longer than the specified maximum sequence length for this model (1228 > 512). Running this sequence through the model will result in indexing errors



=== ANSWER ===

large language models

=== SOURCES ===
- Page 2 | Chunk 7
- Page 2 | Chunk 8
- Page 2 | Chunk 6
- Page 6 | Chunk 21
- Page 7 | Chunk 28

Ask a question (or type 'exit'): what is legal text

=== ANSWER ===

legal documents often contain ambiguous punctuation, non- standard delimiters, domain-specific abbreviations, and com- plex syntactic structures

=== SOURCES ===
- Page 7 | Chunk 24
- Page 8 | Chunk 29
- Page 7 | Chunk 26
- Page 1 | Chunk 1
- Page 2 | Chunk 4

Ask a question (or type 'exit'): exit


In [14]:
# STEP 1 — mount repo path and scan for SBD-related files

import os, sys

REPO_PATH = "/content/LeXI-Phase-2"
sys.path.append(REPO_PATH)

print("Scanning repo for SBD-related files...\n")

for root, _, files in os.walk(REPO_PATH):
    for f in files:
        if any(k in f.lower() for k in ["sbd", "segment", "boundary", "predict"]):
            print(os.path.join(root, f))


Scanning repo for SBD-related files...

/content/LeXI-Phase-2/LeXiDesk_SBD_Summarizer_Report.ipynb
/content/LeXI-Phase-2/predict.py
/content/LeXI-Phase-2/src/predict.py
/content/LeXI-Phase-2/results/sbd_metrics.csv
/content/LeXI-Phase-2/results/sbd_per_document_metrics.csv
/content/LeXI-Phase-2/results/plots/sbd_confusion_matrix.pdf
/content/LeXI-Phase-2/results/plots/sbd_confusion_matrix.png
/content/LeXI-Phase-2/results/plots/sbd_metrics_bar.pdf
/content/LeXI-Phase-2/results/plots/sbd_metrics_bar.png
/content/LeXI-Phase-2/results/plots/sbd_per_document_f1.png
/content/LeXI-Phase-2/results/plots/sbd_per_document_f1.pdf


In [15]:
# STEP 1 — mount repo path and scan for SBD-related files

import os, sys

REPO_PATH = "/content/LeXI-Phase-2"
sys.path.append(REPO_PATH)

print("Scanning repo for SBD-related files...\n")

for root, _, files in os.walk(REPO_PATH):
    for f in files:
        if any(k in f.lower() for k in ["sbd", "segment", "boundary", "predict"]):
            print(os.path.join(root, f))


Scanning repo for SBD-related files...

/content/LeXI-Phase-2/LeXiDesk_SBD_Summarizer_Report.ipynb
/content/LeXI-Phase-2/predict.py
/content/LeXI-Phase-2/src/predict.py
/content/LeXI-Phase-2/results/sbd_metrics.csv
/content/LeXI-Phase-2/results/sbd_per_document_metrics.csv
/content/LeXI-Phase-2/results/plots/sbd_confusion_matrix.pdf
/content/LeXI-Phase-2/results/plots/sbd_confusion_matrix.png
/content/LeXI-Phase-2/results/plots/sbd_metrics_bar.pdf
/content/LeXI-Phase-2/results/plots/sbd_metrics_bar.png
/content/LeXI-Phase-2/results/plots/sbd_per_document_f1.png
/content/LeXI-Phase-2/results/plots/sbd_per_document_f1.pdf


In [16]:
# Cell A — print predict.py (root) and src/predict.py to inspect
paths = [
    "/content/LeXI-Phase-2/predict.py",
    "/content/LeXI-Phase-2/src/predict.py"
]

for p in paths:
    print("\n" + "="*80)
    print("FILE:", p)
    print("="*80)
    try:
        with open(p, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # print up to 400 lines for safety
        for i, line in enumerate(lines[:400], start=1):
            print(f"{i:04d}: {line.rstrip()}")
        if len(lines) > 400:
            print(f"... (file longer, {len(lines)} lines total)")
    except FileNotFoundError:
        print("Not found:", p)
    except Exception as e:
        print("Error reading file:", e)



FILE: /content/LeXI-Phase-2/predict.py
0001: # In predict.py (in the root LexiDesk folder)
0002: 
0003: import argparse
0004: import sys
0005: import joblib
0006: import re
0007: import torch
0008: import pandas as pd
0009: from src.feature_extractor import token_to_features, add_neighboring_token_features
0010: from src.cnn_model import LegalSBD_CNN # Import the CNN class definition
0011: from src.crf_model import CONTEXT_WINDOW_SIZE, DELIMITERS # Import constants
0012: from src.summarizer import SentenceSummarizer
0013: 
0014: # --- 1. Define Model Paths ---
0015: BASELINE_MODEL_PATH = 'saved_models/crf_baseline_model.joblib'
0016: HYBRID_MODEL_PATH = 'saved_models/crf_hybrid_model.joblib'
0017: CNN_MODEL_PATH = 'saved_models/cnn_model.pth'
0018: 
0019: # --- 2. Load All Trained Models ---
0020: def load_models():
0021:     """Load all trained models and return them along with necessary components."""
0022:     print("Loading all trained models...")
0023:     try:
0024:         # Lo

In [18]:
# Install CRF dependency and a couple of common extras used by the repo
!pip install -q sklearn-crfsuite python-crfsuite seqeval
# optional helpful packages (if not already installed)
!pip install -q joblib pandas torch torchvision


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [19]:
import os, glob, textwrap
base = "/content/LeXI-Phase-2"
paths = [
    os.path.join(base, "saved_models"),
    os.path.join(base, "data", "processed", "train_data.csv")
]

print("Checking saved_models/ and train_data.csv ...\n")
for p in paths:
    if os.path.exists(p):
        if os.path.isdir(p):
            print(f"{p} (dir) -> contains {len(os.listdir(p))} files. Sample:")
            print(textwrap.indent("\n".join(os.listdir(p)[:20]), "  "))
        else:
            print(f"{p} (file) -> size {os.path.getsize(p):,} bytes")
    else:
        print(f"{p} -> NOT FOUND")

# quick listing of important saved model names
print("\nLooking for typical model filenames under saved_models/:")
for fname in ["crf_baseline_model.joblib", "crf_hybrid_model.joblib", "cnn_model.pth"]:
    candidate = os.path.join(base, "saved_models", fname)
    print(fname, "->", "FOUND" if os.path.exists(candidate) else "MISSING")


Checking saved_models/ and train_data.csv ...

/content/LeXI-Phase-2/saved_models (dir) -> contains 4 files. Sample:
  cnn_model.pth
  crf_hybrid_model.joblib
  crf_baseline_model.joblib
  performance_report.json
/content/LeXI-Phase-2/data/processed/train_data.csv (file) -> size 762,865 bytes

Looking for typical model filenames under saved_models/:
crf_baseline_model.joblib -> FOUND
crf_hybrid_model.joblib -> FOUND
cnn_model.pth -> FOUND


In [24]:
# Create required directories
!mkdir -p /content/LeXI-Phase-2/saved_models
!mkdir -p /content/LeXI-Phase-2/data/processed

# Move uploaded files into expected locations
!cp /content/crf_baseline_model.joblib /content/LeXI-Phase-2/saved_models/ 2>/dev/null
!cp /content/crf_hybrid_model.joblib  /content/LeXI-Phase-2/saved_models/
!cp /content/cnn_model.pth            /content/LeXI-Phase-2/saved_models/
!cp /content/train_data.csv           /content/LeXI-Phase-2/data/processed/

# Verify
!ls -lh /content/LeXI-Phase-2/saved_models
!ls -lh /content/LeXI-Phase-2/data/processed


cp: cannot stat '/content/crf_hybrid_model.joblib': No such file or directory
cp: cannot stat '/content/cnn_model.pth': No such file or directory
cp: cannot stat '/content/train_data.csv': No such file or directory
total 1.1M
-rw-r--r-- 1 root root  79K Dec 25 07:19 cnn_model.pth
-rw-r--r-- 1 root root 556K Dec 25 07:19 crf_baseline_model.joblib
-rw-r--r-- 1 root root 407K Dec 25 07:19 crf_hybrid_model.joblib
-rw-r--r-- 1 root root  406 Dec 25 07:19 performance_report.json
total 1.1M
-rw-r--r-- 1 root root 311K Dec 25 07:19 test_data.csv
-rw-r--r-- 1 root root 745K Dec 25 07:19 train_data.csv


In [25]:
# === Run this cell to wire the real models into the pipeline and rebuild index ===
import importlib.util, os, sys, json, inspect, traceback
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, pickle

REPO_ROOT = "/content/LeXI-Phase-2"
PREDICT_PATH = os.path.join(REPO_ROOT, "predict.py")
assert os.path.exists(PREDICT_PATH), f"{PREDICT_PATH} not found"

# 1) Import predict.py fresh
spec = importlib.util.spec_from_file_location("lexi_predict", PREDICT_PATH)
lexi_predict = importlib.util.module_from_spec(spec)
spec.loader.exec_module(lexi_predict)
print("Imported predict.py")

# 2) Check model objects
baseline = getattr(lexi_predict, "baseline_crf_model", None)
hybrid = getattr(lexi_predict, "hybrid_crf_model", None)
cnn = getattr(lexi_predict, "cnn_model", None)
print("Model availability -> baseline:", baseline is not None, "hybrid:", hybrid is not None, "cnn:", cnn is not None)

# 3) Build wrapper sbd_fn that always returns list[str]
def fallback_sbd_simple(text):
    import re
    placeholders = {"Mr.":"Mr<dot>","Dr.":"Dr<dot>","Mrs.":"Mrs<dot>","Sec.":"Sec<dot>","Art.":"Art<dot>"}
    t = text
    for k,v in placeholders.items():
        t = t.replace(k, v)
    parts = [s.strip().replace("<dot>", ".") for s in __import__("re").split(r'(?<=[.!?])\s+', t) if s.strip()]
    return parts

seg = getattr(lexi_predict, "segment_text", None)
if seg is None or not callable(seg):
    raise RuntimeError("segment_text function not found in predict.py")

sig = inspect.signature(seg)
selected_model = hybrid if hybrid is not None else baseline
use_hybrid_flag = hybrid is not None

def sbd_fn(text):
    try:
        kwargs = {}
        if "use_hybrid_features" in sig.parameters:
            kwargs["use_hybrid_features"] = use_hybrid_flag
        if "return_cnn_probs" in sig.parameters:
            kwargs["return_cnn_probs"] = False
        if "model" in sig.parameters:
            res = seg(text, selected_model, **kwargs)
        else:
            res = seg(text, **kwargs)
        if isinstance(res, tuple):
            res = res[0]
        if isinstance(res, str):
            return [s.strip() for s in res.splitlines() if s.strip()]
        if isinstance(res, (list, tuple)):
            return list(res)
        print("Unexpected segment_text return type:", type(res), "-> using fallback splitter")
        return fallback_sbd_simple(text)
    except Exception as e:
        print("Error while calling segment_text (falling back):", e)
        traceback.print_exc()
        return fallback_sbd_simple(text)

print("sbd_fn ready. Using hybrid:", use_hybrid_flag)

# 4) Sanity test on first page
with open("data/lexidesk_pages.jsonl","r",encoding="utf-8") as f:
    first = json.loads(f.readline())
print("\nSample text preview (first 300 chars):")
print(first["text"][:300], "\n---\n")
sents = sbd_fn(first["text"])
print("Detected sentences (sample page):", len(sents))
for i,s in enumerate(sents[:8],1):
    print(f"{i}. {s[:200]}")

# 5) Re-chunk document using sbd_fn
with open("data/lexidesk_pages.jsonl","r",encoding="utf-8") as f:
    pages = [json.loads(l) for l in f]

def chunk_document(pages, sbd_fn, max_chars=1600):
    chunks=[]
    for p in pages:
        sentences = sbd_fn(p["text"])
        cur=[]; cur_len=0
        for sent in sentences:
            if cur_len + len(sent) > max_chars and cur:
                chunks.append({"doc_id": p["doc_id"], "page": p["page"], "text": " ".join(cur)})
                cur=[sent]; cur_len=len(sent)
            else:
                cur.append(sent); cur_len += len(sent)
        if cur:
            chunks.append({"doc_id": p["doc_id"], "page": p["page"], "text": " ".join(cur)})
    return chunks

chunks = chunk_document(pages, sbd_fn)
print("\nCreated chunks:", len(chunks))

# Save chunks
os.makedirs("data", exist_ok=True)
with open("data/chunks.jsonl","w",encoding="utf-8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")
print("Saved data/chunks.jsonl")

# 6) Rebuild embeddings + FAISS
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
texts = [c["text"] for c in chunks]
embs = embed_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
dim = embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embs.astype("float32"))
os.makedirs("index", exist_ok=True)
faiss.write_index(index, "index/faiss.index")
with open("index/metadata.pkl","wb") as f:
    pickle.dump(chunks, f)
print("FAISS index rebuilt with", index.ntotal, "vectors")

# 7) Simple retrieval test
def retrieve(query, k=5):
    q_emb = embed_model.encode([query]).astype("float32")
    D,I = index.search(q_emb, k)
    results=[]
    for idx, dist in zip(I[0], D[0]):
        if idx < len(chunks):
            r = chunks[idx].copy()
            r["distance"]=float(dist); r["chunk_id"]=idx
            results.append(r)
    return results

q = "What is LeXiDesk?"
print("\nRetrieving for query:", q)
res = retrieve(q, k=5)
for r in res:
    print(f"- [Page {r['page']}] {r['text'][:200]}...\n  chunk_id={r['chunk_id']} dist={r['distance']:.4f}")


Loading all trained models...
Some features may be unavailable. Continuing with available models...
Imported predict.py
Model availability -> baseline: False hybrid: False cnn: False
sbd_fn ready. Using hybrid: False

Sample text preview (first 300 chars):
LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by manual, fragmented pr 
---

Error while calling segment_text (falling back): 'NoneType' object has no attribute 'predict'
Detected sentences (sample page): 30
1. LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstrac
2. It offers an
advanced AI-driven platform for automating and enhancing legal
document processing us

Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeXI-Phase-2/predict.py", line 141, in segment_text
    labels = model.predict([sentence_features])[0]
             ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'predict'
Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeXI-Phase-2/predict.py", line 141, in segment_text
    labels = model.predict([sentence_features])[0]
             ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'predict'
Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeX

Error while calling segment_text (falling back): 'NoneType' object has no attribute 'predict'
Error while calling segment_text (falling back): 'NoneType' object has no attribute 'predict'
Error while calling segment_text (falling back): 'NoneType' object has no attribute 'predict'
Error while calling segment_text (falling back): 'NoneType' object has no attribute 'predict'

Created chunks: 30
Saved data/chunks.jsonl


Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeXI-Phase-2/predict.py", line 141, in segment_text
    labels = model.predict([sentence_features])[0]
             ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'predict'
Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeXI-Phase-2/predict.py", line 141, in segment_text
    labels = model.predict([sentence_features])[0]
             ^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'predict'
Traceback (most recent call last):
  File "/tmp/ipython-input-598390528.py", line 48, in sbd_fn
    res = seg(text, selected_model, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/LeX

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index rebuilt with 30 vectors

Retrieving for query: What is LeXiDesk?
- [Page 7] Association
for Computational Linguistics, 2021, pp. 3995–4007. [17] I. Rehbein, J. Ruppenhofer, and T. Schmidt, “Improving sentence
boundary detection for spoken language transcripts,” in Proceedings...
  chunk_id=27 dist=1.2953
- [Page 7] Key contributions include the CNN-CRF hybrid architec-
ture for precise legal segmentation, multi-signal extractive
summarization outperforming single-method baselines, and
comprehensive evaluation wi...
  chunk_id=24 dist=1.2989
- [Page 5] CNN-CRF achieves F1-scores of 0.940 and 0.933 on the CC
test set (4% improvement over baseline CRF), and 0.885
average F1 on Indian legal data, surpassing standard toolkits. IV. RESULTS
The Phase-1 ev...
  chunk_id=17 dist=1.3492
- [Page 7] The evaluation was conducted primarily on a synthetic
corpus designed to emulate common citation and punctuation
patterns; accordingly, caution is warranted when extrapolat-
ing these resul

In [26]:
# === FIX: Set correct working directory so predict.py can find models ===
import os, sys

REPO_ROOT = "/content/LeXI-Phase-2"

os.chdir(REPO_ROOT)          # <<< THIS IS THE KEY FIX
sys.path.append(REPO_ROOT)

print("Current working directory:", os.getcwd())
print("Saved models:", os.listdir("saved_models"))


Current working directory: /content/LeXI-Phase-2
Saved models: ['cnn_model.pth', 'crf_hybrid_model.joblib', 'crf_baseline_model.joblib', 'performance_report.json']


In [28]:
# Fix missing data file, extract if necessary, and test sbd_fn
import os, glob, shutil, json

REPO_DATA_DIR = "/content/LeXI-Phase-2/data"
os.makedirs(REPO_DATA_DIR, exist_ok=True)
repo_pages_path = os.path.join(REPO_DATA_DIR, "lexidesk_pages.jsonl")

# 1) Look for an existing lexidesk_pages.jsonl anywhere under /content
candidates = glob.glob("/content/**/lexidesk_pages.jsonl", recursive=True)
candidates = [c for c in candidates if not c.startswith(REPO_DATA_DIR)] + [c for c in candidates if c.startswith(REPO_DATA_DIR)]
if candidates:
    src = candidates[0]
    if os.path.abspath(src) != os.path.abspath(repo_pages_path):
        shutil.copy(src, repo_pages_path)
        print("Copied existing pages file into repo:", src, "→", repo_pages_path)
    else:
        print("Pages file already in repo:", repo_pages_path)
else:
    # 2) Try to find the PDF and extract pages into repo
    pdf_candidates = glob.glob("/content/**/LexiDesk*.*", recursive=True) + glob.glob("/content/*LexiDesk*.*")
    pdf_candidates = [p for p in pdf_candidates if p.lower().endswith(('.pdf','.PDF'))]
    if pdf_candidates:
        pdf_path = pdf_candidates[0]
        print("Found PDF:", pdf_path, " — extracting pages to", repo_pages_path)
        try:
            import fitz  # pymupdf
            doc = fitz.open(pdf_path)
            pages = []
            for i in range(len(doc)):
                text = doc[i].get_text("text")
                pages.append({"doc_id": os.path.basename(pdf_path), "page": i+1, "text": text})
            with open(repo_pages_path, "w", encoding="utf-8") as f:
                for p in pages:
                    f.write(json.dumps(p, ensure_ascii=False) + "\n")
            print(f"Extracted {len(pages)} pages → {repo_pages_path}")
        except Exception as e:
            raise RuntimeError("Failed to extract PDF pages (install/availability issue): " + str(e))
    else:
        raise FileNotFoundError("No pages file or PDF found. Upload your PDF to Colab (Files sidebar) or re-run the extraction cell.")

# 3) Quick sanity test with sbd_fn
print("\nSanity-check: running sentence segmentation (sbd_fn) on page 1...")
with open(repo_pages_path, "r", encoding="utf-8") as f:
    first = json.loads(f.readline())

try:
    sents = sbd_fn(first["text"])
    print("Detected sentences on first page:", len(sents))
    for i,s in enumerate(sents[:8], start=1):
        print(f"{i}. {s[:250]}")
except Exception as e:
    print("Error when calling sbd_fn:", e)
    import traceback; traceback.print_exc()


Copied existing pages file into repo: /content/data/lexidesk_pages.jsonl → /content/LeXI-Phase-2/data/lexidesk_pages.jsonl

Sanity-check: running sentence segmentation (sbd_fn) on page 1...
Detected sentences on first page: 30
1. LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies 
2. It offers an
advanced AI-driven platform for automating and enhancing legal
document processing using sophisticated Natural Language Pro
cessing (NLP) methods.
3. A core component is robust sentence
boundary detec tion (SBD) tailored for legal texts. Leveraging
a hybrid deep learning architec ture combining Convolutional
Neural Networks (CNN) and Conditional Random Fields (CRF),
the model captures intricate ch
4. Legal doc-
uments present challenges such as complex structures, frequent
citations, abbreviations,

In [29]:
# === Cell: Import predict.py, create sbd_fn wrapper, and test it ===

import importlib.util, os, sys, json, inspect, traceback

PREDICT_PATH = "/content/LeXI-Phase-2/predict.py"
assert os.path.exists(PREDICT_PATH), f"{PREDICT_PATH} not found"

# load module from file (name it lexi_predict to avoid collisions)
spec = importlib.util.spec_from_file_location("lexi_predict", PREDICT_PATH)
lexi_predict = importlib.util.module_from_spec(spec)
spec.loader.exec_module(lexi_predict)
print("Imported:", PREDICT_PATH)

# Inspect available model objects
baseline = getattr(lexi_predict, "baseline_crf_model", None)
hybrid = getattr(lexi_predict, "hybrid_crf_model", None)
cnn = getattr(lexi_predict, "cnn_model", None)
print("Model availability -> baseline:", baseline is not None, "hybrid:", hybrid is not None, "cnn:", cnn is not None)

# Choose model to pass into segment_text (prefer hybrid if available)
if hybrid is not None:
    selected_model = hybrid
    use_hybrid_flag = True
elif baseline is not None:
    selected_model = baseline
    use_hybrid_flag = False
else:
    selected_model = None
    use_hybrid_flag = False
    print("Warning: No CRF models loaded. The predict.segment_text call may fail; a fallback will be used below.")

# Fallback rule-based splitter (safe)
def fallback_sbd_simple(text):
    import re
    placeholders = {"Mr.":"Mr<dot>","Dr.":"Dr<dot>","Mrs.":"Mrs<dot>","Sec.":"Sec<dot>","Art.":"Art<dot>","Fig.":"Fig<dot>"}
    t = text
    for k,v in placeholders.items():
        t = t.replace(k, v)
    parts = [s.strip().replace("<dot>", ".") for s in re.split(r'(?<=[.!?])\s+', t) if s.strip()]
    return parts

# Build wrapper: calls predict.segment_text(...) and normalizes output to list[str]
def make_sbd_fn(module, model, use_hybrid):
    seg = getattr(module, "segment_text", None)
    if seg is None or not callable(seg):
        raise RuntimeError("segment_text not found in predict.py")
    sig = inspect.signature(seg)
    # create wrapper
    def sbd_fn(text):
        try:
            # call with return_cnn_probs if available (we set False)
            kwargs = {}
            if "use_hybrid_features" in sig.parameters:
                kwargs["use_hybrid_features"] = use_hybrid
            if "return_cnn_probs" in sig.parameters:
                kwargs["return_cnn_probs"] = False
            # if model is expected (common), pass it; otherwise call without model
            if "model" in sig.parameters:
                res = seg(text, model, **kwargs)
            else:
                res = seg(text, **kwargs)
            # if function returns (sentences, probs) tuple, take first
            if isinstance(res, tuple):
                res = res[0]
            # ensure list[str]
            if isinstance(res, str):
                return [s.strip() for s in res.splitlines() if s.strip()]
            if isinstance(res, (list, tuple)):
                return list(res)
            # unexpected type -> fallback
            print("Unexpected segment_text return type:", type(res), "-> using fallback splitter")
            return fallback_sbd_simple(text)
        except Exception as e:
            print("Error while calling segment_text (falling back):", e)
            traceback.print_exc()
            return fallback_sbd_simple(text)
    return sbd_fn

# Attempt to build sbd_fn; if that fails, use fallback
try:
    if selected_model is None:
        print("No model object available in predict.py; segment_text may still work if it loads models internally.")
    sbd_fn = make_sbd_fn(lexi_predict, selected_model, use_hybrid_flag)
    print("sbd_fn wrapper created successfully (calls predict.segment_text).")
except Exception as e:
    print("Could not create sbd_fn from predict.py:", e)
    print("Using simple fallback splitter instead.")
    sbd_fn = fallback_sbd_simple

# Quick test on first page of extracted pages
with open("data/lexidesk_pages.jsonl", "r", encoding="utf-8") as f:
    sample = json.loads(f.readline())

print("\nSample text preview (first 400 chars):\n", sample["text"][:400], "\n")
sents = sbd_fn(sample["text"])
print("Detected sentences:", len(sents))
print("First 8 sentences:")
for i, s in enumerate(sents[:8], start=1):
    print(f"{i}. {s[:300]}")


Loading all trained models...
All models loaded successfully.
Imported: /content/LeXI-Phase-2/predict.py
Model availability -> baseline: True hybrid: True cnn: True
sbd_fn wrapper created successfully (calls predict.segment_text).

Sample text preview (first 400 chars):
 LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by manual, fragmented processes in clause
review, contract negotiation, and lit igation research. It offers an
advanced AI-d 

Detected sentences: 30
First 8 sentences:
1. LeXIDesk: Legal AI Workbench
Malavika Suresh1, Sanika Siva S2, Shivani Shibu3
1Department of Computer Science and Engineering,
Mar Athanasius College of Engineering (Autonomous), Kothamangalam
Abstract—The Legal AI Workbench addresses inefficiencies in
legal workflows caused by m

In [30]:
# Re-chunk using the sbd_fn you just created
import json, os

with open("data/lexidesk_pages.jsonl","r",encoding="utf-8") as f:
    pages = [json.loads(l) for l in f]

def chunk_document(pages, sbd_fn, max_chars=1600):
    chunks=[]
    for p in pages:
        sents = sbd_fn(p["text"])
        cur=[]; cur_len=0
        for s in sents:
            if cur_len + len(s) > max_chars and cur:
                chunks.append({"doc_id": p["doc_id"], "page": p["page"], "text": " ".join(cur)})
                cur=[s]; cur_len=len(s)
            else:
                cur.append(s); cur_len += len(s)
        if cur:
            chunks.append({"doc_id": p["doc_id"], "page": p["page"], "text": " ".join(cur)})
    return chunks

chunks = chunk_document(pages, sbd_fn, max_chars=1600)
print("Chunks created:", len(chunks))
os.makedirs("data", exist_ok=True)
with open("data/chunks.jsonl","w",encoding="utf-8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")
print("Saved -> data/chunks.jsonl")


Chunks created: 31
Saved -> data/chunks.jsonl


In [31]:
# Build embeddings and FAISS index
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, pickle, os, json

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
with open("data/chunks.jsonl","r",encoding="utf-8") as f:
    chunks = [json.loads(l) for l in f]

texts = [c["text"] for c in chunks]
embs = embed_model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

dim = embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embs.astype("float32"))

os.makedirs("index", exist_ok=True)
faiss.write_index(index, "index/faiss.index")
with open("index/metadata.pkl","wb") as f:
    pickle.dump(chunks, f)
print("FAISS index rebuilt with", index.ntotal, "vectors")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index rebuilt with 31 vectors


In [32]:
# Simple retriever + local generator (flan-t5 fallback)
import pickle, faiss, numpy as np
from transformers import pipeline

# load
index = faiss.read_index("index/faiss.index")
with open("index/metadata.pkl","rb") as f:
    chunks = pickle.load(f)

def retrieve(query, k=5):
    q_emb = embed_model.encode([query]).astype("float32")
    D,I = index.search(q_emb, k)
    res=[]
    for idx, dist in zip(I[0], D[0]):
        m = chunks[idx].copy()
        m["distance"]=float(dist); m["chunk_id"]=idx
        res.append(m)
    return res

# local generator (optional — you already have a generator cell)
qa = pipeline("text2text-generation", model="google/flan-t5-base", max_length=512)

def answer_with_context(q):
    ret = retrieve(q, k=5)
    context = "\n\n".join([f"[{r['doc_id']}:p{r['page']}] {r['text'][:1000]}" for r in ret])
    prompt = f"Use ONLY the CONTEXT to answer. Cite with [DOC:p#].\n\nCONTEXT:\n{context}\n\nQUESTION:\n{q}"
    out = qa(prompt, do_sample=False)[0]["generated_text"]
    return out, ret

# Example queries to paste:
queries = [
    "What is LeXiDesk?",
    "What problem does LeXiDesk aim to solve in legal NLP?",
    "What evaluation metrics are reported for the sentence boundary detection model?"
]
for q in queries:
    print("\nQUERY:", q)
    ans, sources = answer_with_context(q)
    print("ANSWER:\n", ans)
    print("SOURCES:")
    for s in sources:
        print(f" - [p{s['page']}] chunk_id={s['chunk_id']} dist={s['distance']:.4f}")


Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1492 > 512). Running this sequence through the model will result in indexing errors



QUERY: What is LeXiDesk?
ANSWER:
 LeXiDesk, a robust preprocessing pipeline for legal document analysis comprising a hybrid CNN-CRF sentence boundary detection model (F1-score: 0.8000, precision: 0.9524, recall: 0.6897) and weighted ex- tractive summarizer (ROUGE-L F1: 0.8227, BERT Score F1: 0.9719) evaluated across synt
SOURCES:
 - [p7] chunk_id=25 dist=1.2989
 - [p7] chunk_id=28 dist=1.3387
 - [p5] chunk_id=18 dist=1.3492
 - [p7] chunk_id=24 dist=1.3750
 - [p1] chunk_id=0 dist=1.4541

QUERY: What problem does LeXiDesk aim to solve in legal NLP?
ANSWER:
 inefficiencies in legal workflows caused by manual, fragmented processes in clause review, contract negotiation, and lit igation research. It offers an advanced AI-driven platform for automating and enhancing legal document processing using sophisticated Natural Language Pro cessing (NLP) methods. A core component is robust sentence boundary detec tion (SBD) tailored for legal texts. Leveraging a hybrid deep learning architec ture co

In [33]:
# Quick precision@k if you have a small gold set
gold = [
    {"q":"What is LeXiDesk?","expected_page":1},
    {"q":"What metrics are reported for SBD?","expected_page":5},
]
def precision_at_k(q, exp_page, k=5):
    res = retrieve(q,k)
    pages = [r['page'] for r in res]
    return 1.0 if exp_page in pages else 0.0

import numpy as np
scores = [precision_at_k(g['q'], g['expected_page']) for g in gold]
print("Precision@5:", np.mean(scores))


Precision@5: 1.0
