### ELASTIC_SEARCH


In [1]:
from elasticsearch import Elasticsearch

# -----------------------
# Connect to ES 8.x server
# -----------------------
es_client = Elasticsearch(
    "http://localhost:9200",
    request_timeout=30
)

In [11]:


# -----------------------
# Define mappings
# -----------------------

# For collection_new and collection_old (normal text chunks)
index_settings_chunks = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "chunk_id": {"type": "keyword"},
            "doc_id": {"type": "keyword"},
            "text": {"type": "text"},
            "metadata": {
                "properties": {
                    "chapter": {"type": "text"},
                    "section_number": {"type": "keyword"},
                    "section_title": {"type": "text"},
                    "law_name": {"type": "keyword"}
                }
            }
        }
    }
}

# For collection_map (different schema: mapping between laws)
index_settings_map = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "source_file": {"type": "keyword"},
            "chunk_id": {"type": "keyword"},
            "fields": {
                "properties": {
                    "New_Law_Section": {"type": "keyword"},
                    "Old_Law_Section": {"type": "keyword"},
                    "Subject": {"type": "text"},
                    "Summary_of_comparison": {"type": "text"}
                }
            }
        }
    }
}

# -----------------------
# Create 3 separate indices
# -----------------------
collections = {
    "collection_new": index_settings_chunks,
    "collection_old": index_settings_chunks,
    "collection_map": index_settings_map
}

for idx, settings in collections.items():
    if not es_client.indices.exists(index=idx):
        es_client.indices.create(
            index=idx,
            settings=settings["settings"],
            mappings=settings["mappings"]
        )
        print(f"Index {idx} created successfully!")
    else:
        print(f"Index {idx} already exists. Skipping creation.")


Index collection_new already exists. Skipping creation.
Index collection_old already exists. Skipping creation.
Index collection_map already exists. Skipping creation.


In [9]:
from tqdm.auto import tqdm

In [10]:
import json
import os
collection_files = {
    "collection_new": ["bns.json", "bnss.json", "bsa.json"],
    "collection_old": ["ipc.json", "crpc.json", "iea.json"],
    "collection_map": ["mapping_of_laws.json"]
}

data_dir = "../data"

# -----------------------
# Load and index documents
# -----------------------
for collection, files in collection_files.items():
    for fname in files:
        file_path = os.path.join(data_dir, fname)
        with open(file_path, "r", encoding="utf-8") as f:
            documents = json.load(f)

        for doc in tqdm(documents, desc=f"Indexing {fname} into {collection}"):
            es_client.index(index=collection, document=doc,timeout="300s")

Indexing bns.json into collection_new:   0%|          | 0/450 [00:00<?, ?it/s]

Indexing bnss.json into collection_new:   0%|          | 0/1176 [00:00<?, ?it/s]

Indexing bsa.json into collection_new:   0%|          | 0/164 [00:00<?, ?it/s]

Indexing ipc.json into collection_old:   0%|          | 0/1271 [00:00<?, ?it/s]

Indexing crpc.json into collection_old:   0%|          | 0/1946 [00:00<?, ?it/s]

Indexing iea.json into collection_old:   0%|          | 0/467 [00:00<?, ?it/s]

Indexing mapping_of_laws.json into collection_map:   0%|          | 0/1237 [00:00<?, ?it/s]

In [3]:
def elastic_search_all(query):
    """
    Runs the same search query on:
      - collection_new
      - collection_old
      - mapping_of_laws
    Returns combined results from all three.
    """

    # --- Search 1: collection_new ---
    search_new = {
        "size": 10,
        "query": {
            "bool": {
                "must": [{
                    "multi_match": {
                        "query": query,
                        "fields": [
                            "text^3",
                            "metadata.section_title",
                            "metadata.section_number",
                            "metadata.law_name"
                        ],
                        "type": "best_fields"
                    }
                }]
            }
        }
    }
    resp_new = es_client.search(index="collection_new", query=search_new["query"], size=search_new["size"])
    results_new = [hit["_source"] for hit in resp_new["hits"]["hits"]]

    # --- Search 2: collection_old ---
    search_old = {
        "size": 5,
        "query": {
            "bool": {
                "must": [{
                    "multi_match": {
                        "query": query,
                        "fields": [
                            "text^3",
                            "metadata.section_title",
                            "metadata.section_number",
                            "metadata.law_name"
                        ],
                        "type": "best_fields"
                    }
                }]
            }
        }
    }
    resp_old = es_client.search(index="collection_old", query=search_old["query"], size=search_old["size"])
    results_old = [hit["_source"] for hit in resp_old["hits"]["hits"]]

    search_map = {
        "size": 5,
        "query": {
            "bool": {
                "must": [{
                    "multi_match": {
                        "query": query,
                        "fields": [
                            "fields.New_Law_Section^3",
                            "fields.Old_Law_Section^3",
                            "fields.Subject",
                            "fields.Summary_of_comparison"
                        ],
                        "type": "best_fields"
                    }
                }]
            }
        }
    }
    resp_map = es_client.search(index="collection_map", query=search_map["query"], size=search_map["size"])
    results_map = [hit["_source"] for hit in resp_map["hits"]["hits"]]

    # --- Combined results ---
    combined_results = {
        "collection_new": results_new,
        "collection_old": results_old,
        "collection_map": results_map  # key updated here
    }

    return combined_results

### VECTOR_SEARCH

In [38]:
import os
import json
import uuid
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

# -----------------------
# CONFIGURATION
# -----------------------
data_dir = "../data"
VECTOR_SIZE = 1024  # bge-large-en-v1.5 has 1024-dimension embeddings
BATCH_SIZE = 32

In [None]:
# Initialize Qdrant client
qd_client = QdrantClient("http://localhost:6333")



In [None]:
# Initialize embedding model
model_handle = TextEmbedding(model_name="BAAI/bge-large-en-v1.5")

In [32]:
# Collections
collection_new = "collection_new"
collection_old = "collection_old"
collection_map = "collection_map"

In [None]:
# -----------------------
# Ensure collection exists
# -----------------------
def ensure_collection_exists(qd_client, collection_name, vector_size=VECTOR_SIZE):
    if not qd_client.collection_exists(collection_name):
        print(f"⚙️ Creating collection: {collection_name}")
        qd_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
    else:
        info = qd_client.get_collection(collection_name)
        print(f"✅ Collection '{collection_name}' already exists with {info.vectors_count} vectors.")

# -----------------------
# COLLECTIONS & FILES
# -----------------------
collection_files = {
    collection_new: ["bns.json", "bnss.json", "bsa.json"],
    collection_old: ["ipc.json", "crpc.json", "iea.json"],
    collection_map: ["mapping_of_laws.json"]
}

# -----------------------
# LOAD & UPSERT INTO QDRANT
# -----------------------
for collection, files in collection_files.items():
    ensure_collection_exists(qd_client, collection)

    for fname in files:
        file_path = os.path.join(data_dir, fname)
        if not os.path.exists(file_path):
            print(f"⚠️ File not found: {file_path}")
            continue

        with open(file_path, "r", encoding="utf-8") as f:
            documents = json.load(f)

        for start in range(0, len(documents), BATCH_SIZE):
            batch_docs = documents[start:start + BATCH_SIZE]
            points = []

            # Prepare texts for embedding
            texts = [
                doc.get("text", "") if collection in [collection_new, collection_old]
                else json.dumps(doc, ensure_ascii=False)
                for doc in batch_docs
            ]

            # Generate embeddings for the batch
            vectors = list(model_handle.embed(texts))
            assert len(vectors) == len(batch_docs)

            for doc, vector in zip(batch_docs, vectors):
                point_id = str(uuid.uuid4())

                payload = {
                    "chunk_id": doc.get("chunk_id"),
                    "doc_id": doc.get("doc_id"),
                    "content": doc.get("text", ""),
                }

                if collection in [collection_new, collection_old]:
                    payload["metadata"] = {
                        "section_number": doc.get("metadata", {}).get("section_number"),
                        "section_title": doc.get("metadata", {}).get("section_title"),
                        "law_name": doc.get("metadata", {}).get("law_name")
                    }
                else:
                    payload.update({
                        "source_file": doc.get("source_file"),
                        "fields": doc.get("fields", {})
                    })

                point = models.PointStruct(id=point_id, vector=vector, payload=payload)
                points.append(point)

            # Upsert to Qdrant
            qd_client.upsert(collection_name=collection, points=points)
            print(f"✅ Indexed {len(points)} documents into {collection} from {fname} (batch {start // BATCH_SIZE + 1})")

⚙️ Creating new collection: collection_new


UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `collection_new` already exists!"},"time":0.002103127}'

In [26]:
def vector_search(question):
    print("⚙️ Running vector_search...")

    # --- Search in collection_new (10 results) ---
    query_points_new = qd_client.query_points(
        collection_name=collection_new,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        limit=10,
        with_payload=True
    )

    # --- Search in collection_old (5 results) ---
    query_points_old = qd_client.query_points(
        collection_name=collection_old,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        limit=5,
        with_payload=True
    )

    # --- Search in collection_map (5 results) ---
    query_points_map = qd_client.query_points(
        collection_name=collection_map,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        limit=5,
        with_payload=True
    )

    # Collect all results
    results_new = [{"source": "new", "payload": p.payload} for p in query_points_new.points]
    results_old = [{"source": "old", "payload": p.payload} for p in query_points_old.points]
    results_map = [{"source": "map", "payload": p.payload} for p in query_points_map.points]

    # Combine all results
    combined_results = results_new + results_old + results_map

    print(f"✅ Retrieved {len(results_new)} (new) + {len(results_old)} (old) + {len(results_map)} (map) results.")
    return combined_results


#### Save questions to csv file

In [46]:
import pandas as pd
import json
import os
from tqdm import tqdm

# ----------------------------------------------------
# 1. Define the file paths
# ----------------------------------------------------
file_names = [
    "../data/questions/llm_questions_mapping.json",
    "../data/questions/llm_questions_newlaws.json",
    "../data/questions/llm_questions_oldlaws.json"
]

# ----------------------------------------------------
# 2. Helper function to infer collection from filename
# ----------------------------------------------------
def infer_collection_from_filename(file_path):
    file_name = os.path.basename(file_path).lower()
    if "mapping" in file_name:
        return "collection_map"
    elif "newlaws" in file_name:
        return "collection_new"
    elif "oldlaws" in file_name:
        return "collection_old"
    else:
        return "unknown_collection"

# ----------------------------------------------------
# 3. Function to load questions from a JSON file
# ----------------------------------------------------
def load_questions_with_chunk_id_as_q_id(file_path):
    all_records = []
    collection = infer_collection_from_filename(file_path)

    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found at: {file_path}")

    for entry in data:
        q_id = entry.get("chunk_id", "NO_CHUNK_ID")
        questions = entry.get("llm_questions", [])
        for question in questions:
            all_records.append({
                "q_id": q_id,
                "question": question,
                "collection": collection
            })
    return all_records

# ----------------------------------------------------
# 4. Load and process all files
# ----------------------------------------------------
all_questions = []
for file_name in file_names:
    try:
        all_questions.extend(load_questions_with_chunk_id_as_q_id(file_name))
        print(f"✅ Successfully loaded: {file_name}")
    except FileNotFoundError:
        print(f"❌ ERROR: File not found: {file_name}. Please check the path.")
    except Exception as e:
        print(f"❌ An error occurred processing {file_name}: {e}")

# ----------------------------------------------------
# 5. Create a Pandas DataFrame
# ----------------------------------------------------
if all_questions:
    df = pd.DataFrame(all_questions)

    print("\n" + "=" * 40)
    print("DataFrame Created Successfully:")
    print("=" * 40)
    print(df.head(5))
    print("-" * 40)
    print(f"Total rows in DataFrame: {len(df)}")
    print(f"Number of unique chunk_id/q_id values: {df['q_id'].nunique()}")
    print(f"Collections present: {df['collection'].unique()}")
else:
    print("\nDataFrame creation failed: No data was loaded.")

# ----------------------------------------------------
# 6. Optional: save combined data to CSV
# ----------------------------------------------------
output_csv_path = "../data/all-questions.csv"
df.to_csv(output_csv_path, index=False)
print(f"\n✅ Combined questions saved to {output_csv_path}")


✅ Successfully loaded: ../data/questions/llm_questions_mapping.json
✅ Successfully loaded: ../data/questions/llm_questions_newlaws.json
✅ Successfully loaded: ../data/questions/llm_questions_oldlaws.json

DataFrame Created Successfully:
               q_id                                           question  \
0  BNSS_to_CrPC_268  What does the chunk_id BNSS_to_CrPC_268 imply ...   
1  BNSS_to_CrPC_268  In the absence of textual content, what type o...   
2  BNSS_to_CrPC_268  If you were to populate this chunk with CrPC s...   
3  BNSS_to_CrPC_268  How would you annotate this chunk to indicate ...   
4  BNSS_to_CrPC_268  What metadata fields would be necessary to loc...   

       collection  
0  collection_map  
1  collection_map  
2  collection_map  
3  collection_map  
4  collection_map  
----------------------------------------
Total rows in DataFrame: 6700
Number of unique chunk_id/q_id values: 1279
Collections present: ['collection_map' 'collection_new' 'collection_old']

✅ Combin

In [47]:
import os

# Make sure the directory exists
os.makedirs("../data", exist_ok=True)

# Save the DataFrame
df.to_csv("../data/all-questions.csv", index=False)
print("Saved to ../data/all-questions.csv")



Saved to ../data/all-questions.csv


In [48]:
df_allquestions = pd.read_csv('../data/all-questions.csv')
all_questions = df_allquestions.to_dict(orient='records')
all_questions

[{'q_id': 'BNSS_to_CrPC_268',
  'question': 'What does the chunk_id BNSS_to_CrPC_268 imply about the source and target statutes involved?',
  'collection': 'collection_map'},
 {'q_id': 'BNSS_to_CrPC_268',
  'question': 'In the absence of textual content, what type of questions would be most appropriate to evaluate the BNSS_to_CrPC_268 mapping?',
  'collection': 'collection_map'},
 {'q_id': 'BNSS_to_CrPC_268',
  'question': 'If you were to populate this chunk with CrPC section 268 content, what would be the most likely legal issue area to focus on?',
  'collection': 'collection_map'},
 {'q_id': 'BNSS_to_CrPC_268',
  'question': 'How would you annotate this chunk to indicate its empty text and its purpose within a larger dataset?',
  'collection': 'collection_map'},
 {'q_id': 'BNSS_to_CrPC_268',
  'question': 'What metadata fields would be necessary to locate the actual legal text corresponding to BNSS_to_CrPC_268 in a repository?',
  'collection': 'collection_map'},
 {'q_id': 'BSA_to_IE

## Evaluation Results

In [53]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)


In [55]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)


In [57]:

def evaluate(all_questions, search_function):
    relevance_total = []

    for q in tqdm(all_questions):
        q_id = q['q_id']
        collection_origin = q['collection']
    
        # Run the search
        results = elastic_search_all(query=q['question'])
        chunks = results[collection_origin]
    
        # Compute relevance for this collection
        relevance = [d['chunk_id'] == q_id for d in chunks]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

#### ElasticSearch Evaluation Results

In [58]:
evaluate(all_questions, lambda q: elastic_search_all(q['question']))

100%|██████████| 6700/6700 [01:32<00:00, 72.24it/s]


{'hit_rate': 0.6080597014925373, 'mrr': 0.5757641554134102}

#### VectorSearch Evaluation Results