In [4]:

import ollama
import json
import numpy as np
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tqdm

def filter_by_metadata(query, metadata_list):
    """
    Returns indices of documents whose metadata matches the query.
    It checks if the query contains words from 'chapter' or 'file' metadata.
    """
    query_lower = query.lower()
    candidate_indices = []
    for i, meta in enumerate(metadata_list):
        chapter = meta.get("chapter", "").lower()
        file_name = meta.get("file", "").lower()
        # If the query includes either the chapter or file keywords, add the document index.
        if chapter in query_lower or file_name in query_lower:
            candidate_indices.append(i)
    return candidate_indices


def extract_keywords(content):
    """Extract legal keywords from content"""
    response = ollama.generate(
        model="deepseek-r1:32b",
        prompt=f"Extract 5-7 legal keywords from this text:\n{content}\n Keywords:  Output as JSON"
    )
    raw_text = response["response"].strip()
    raw_output = response["response"].strip()
    if '```json' in raw_output:
        raw_output = raw_output.split('```json')[1].split('```')[0]
   
        
    
    result = json.loads(raw_output)
    return result["keywords"]


def extract_fields(content,fields):
    """Extract legal keywords from content"""
    response = ollama.generate(
        model="deepseek-r1:32b",
        prompt=f"Extract filed for :\n{content}\n from those fields:{fields} example output as JSON jsut 'field': (one the fields) "
    )
    raw_text = response["response"].strip()
    raw_output = response["response"].strip()
    try :
        if '```json' in raw_output:
            raw_output = raw_output.split('```json')[1].split('```')[0]
    
            
        
        result = json.loads(raw_output)
        return result["field"]
    except : 
        print(raw_output)


# 1. Load Documents (Already Split)
with open("C:\dev\Ramzey\MOUSTACHAR\RAG\\new_laws\\All_laws.json", "r", encoding="utf-8") as file:
    laws = json.load(file)
with open("C:\dev\Ramzey\MOUSTACHAR\RAG\\new_laws\\fields.txt", "r",encoding="utf-8") as file:
    fields = file.read()


documents = []
metadata = {
    'indices': [],
    'title': [],
    'keywords': [],
    'field': []
}
pbar = tqdm.tqdm(total=len(laws))
for idx, law in enumerate(laws):
    law_title = list(law.keys())[0]
    content = str(law[law_title])
    
    # Store document and metadata
    documents.append(content)
    metadata['indices'].append(idx)
    metadata['title'].append(law_title)
    metadata['keywords'].append(extract_keywords(content))  # Implement keyword extraction
    metadata['field'].append(extract_fields(content,fields))  # Implement type classification
    pbar.update(1)
pbar.close()


# 2. Compute Embeddings for Each Document Using Ollama
all_embeddings = []
for doc in documents:
    response = ollama.embeddings(
        model="snowflake-arctic-embed2",
        prompt=doc
    )
    all_embeddings.append(response["embedding"])

embeddings_np = np.array(all_embeddings, dtype=np.float32)
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)
print(f"Faiss index size: {index.ntotal}")



# 6. Querying Function Using Faiss


  0%|          | 22/5887 [08:07<35:42:25, 21.92s/it]

TypeError: list indices must be integers or slices, not str