In [1]:

import ollama
import json
import numpy as np
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tqdm

def filter_by_metadata(query, metadata_list):
    """
    Return the indices of documents whose metadata is mentioned in the query.

    A document is selected when its non-empty 'chapter' or 'file' metadata
    value occurs (case-insensitively) as a substring of the query.

    Args:
        query: Free-text user query. ``None`` is treated as an empty query.
        metadata_list: Sequence of per-document dicts that may carry
            'chapter' and/or 'file' entries.

    Returns:
        List of positions in ``metadata_list`` that matched, in input order.
    """
    query_lower = (query or "").lower()
    candidate_indices = []
    for i, meta in enumerate(metadata_list):
        # Missing or None values are normalised to "" so they can be skipped.
        chapter = str(meta.get("chapter") or "").strip().lower()
        file_name = str(meta.get("file") or "").strip().lower()
        # An empty string is a substring of every string, so empty metadata
        # must be excluded explicitly or every document would match.
        chapter_hit = bool(chapter) and chapter in query_lower
        file_hit = bool(file_name) and file_name in query_lower
        if chapter_hit or file_hit:
            candidate_indices.append(i)
    return candidate_indices


def extract_keywords(content):
    """
    Ask the LLM for 5-7 legal keywords describing ``content``.

    The model reply is cleaned before JSON parsing:
      * anything up to and including a closing ``</think>`` tag is dropped
        (reasoning models may emit their chain of thought there first);
      * a fenced block (```json ... ``` or plain ``` ... ```) is unwrapped.

    Args:
        content: Text of the document to analyse.

    Returns:
        The keywords from the reply: the value stored under "keywords" when
        the reply is a JSON object, or the list itself when it is a bare
        JSON list.

    Raises:
        json.JSONDecodeError: The cleaned reply is not valid JSON.
        KeyError: The reply is a JSON object without a "keywords" entry.
    """
    response = ollama.generate(
        model="deepseek-r1:32b",
        prompt=f"Extract 5-7 legal keywords from this text:\n{content}\n Keywords:  Output as JSON"
    )
    # rpartition returns the whole string as the last element when the tag is absent.
    raw_output = response["response"].rpartition("</think>")[2].strip()
    if '```json' in raw_output:
        raw_output = raw_output.split('```json', 1)[1].split('```', 1)[0]
    elif '```' in raw_output:
        raw_output = raw_output.split('```', 1)[1].split('```', 1)[0]

    result = json.loads(raw_output)
    if isinstance(result, list):
        # The model answered with a bare list instead of {"keywords": [...]}.
        return result
    return result["keywords"]


def extract_fields(content,fields):
    """
    Ask the LLM which of the candidate ``fields`` best describes ``content``.

    The reply is cleaned before JSON parsing: anything up to and including a
    closing ``</think>`` tag is dropped, and a fenced block
    (```json ... ``` or plain ``` ... ```) is unwrapped.

    Args:
        content: Text of the document to classify.
        fields: Text listing the candidate fields, inserted verbatim into the prompt.

    Returns:
        The value stored under "field" in the model's JSON reply, or ``None``
        when the reply cannot be parsed (the cleaned reply is printed so the
        failure can be inspected).
    """
    response = ollama.generate(
        model="deepseek-r1:32b",
        prompt=f"Extract filed for :\n{content}\n from those fields:{fields} example output as JSON jsut 'field': (one the fields) "
    )
    # rpartition returns the whole string as the last element when the tag is absent.
    raw_output = response["response"].rpartition("</think>")[2].strip()
    try:
        if '```json' in raw_output:
            raw_output = raw_output.split('```json', 1)[1].split('```', 1)[0]
        elif '```' in raw_output:
            raw_output = raw_output.split('```', 1)[1].split('```', 1)[0]

        result = json.loads(raw_output)
        return result["field"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only parsing/shape problems are tolerated; KeyboardInterrupt and
        # other unrelated errors propagate instead of being swallowed.
        print(raw_output)
        return None


# 1. Load Documents (Already Split)
# Forward slashes are accepted on Windows as well as Linux/macOS, so both
# paths use the same portable form (a backslash path only resolves on Windows).
with open("new_laws/All_laws.json", "r", encoding="utf-8") as file:
    laws = json.load(file)
# Candidate legal fields, read as raw text and inserted into LLM prompts.
with open("new_laws/fields.txt", "r", encoding="utf-8") as file:
    fields = file.read()


# Parallel per-document stores: position i in `documents` and in every list
# of `metadata` describes the same law.
documents = []
metadata = {
    'indices': [],
    'title': [],
    'keywords': [],
    'field': []
}
# The context manager closes the progress bar even if an LLM call raises or
# the (long-running) loop is interrupted.
with tqdm.tqdm(total=len(laws)) as pbar:
    for idx, law in enumerate(laws):
        # The first key of each entry is used as the title and its value as
        # the content (assumes one title -> content pair per entry; TODO confirm).
        law_title = next(iter(law))
        content = str(law[law_title])

        # Store document and metadata
        documents.append(content)
        metadata['indices'].append(idx)
        metadata['title'].append(law_title)
        metadata['keywords'].append(extract_keywords(content))  # LLM keyword extraction
        metadata['field'].append(extract_fields(content,fields))  # LLM field classification
        pbar.update(1)


# 2. Embed every document with Ollama, then load the vectors into Faiss
all_embeddings = []
for doc in documents:
    # One embedding request per document.
    response = ollama.embeddings(prompt=doc, model="snowflake-arctic-embed2")
    all_embeddings.append(response["embedding"])

# Stack the vectors into one float32 matrix with a row per document.
embeddings_np = np.array(all_embeddings, dtype=np.float32)

# Exact (brute-force) L2 index sized to the embedding dimension.
index = faiss.IndexFlatL2(embeddings_np.shape[1])
index.add(embeddings_np)
print(f"Faiss index size: {index.ntotal}")



# 3. Query helpers (keyword boosting, filtered Faiss search, RAG pipeline) are defined in the next cell


  0%|          | 11/5887 [03:39<33:28:19, 20.51s/it]

KeyboardInterrupt: 

In [None]:
def apply_keyword_boost(results, keywords, top_k=5):
    """
    Re-rank search results, favouring documents that mention the query keywords.

    A keyword counts as a match when it occurs (case-insensitively) in the
    document text or equals one of the document's stored keywords. Each match
    raises the score by 20% of its magnitude, so the boost moves the score
    upward for negative scores (e.g. negated distances) as well as positive ones.

    Args:
        results: Iterable of ``(document_index, score)`` pairs; higher is better.
        keywords: Query keywords. Empty entries are ignored.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` ``(document_index, boosted_score)`` pairs, best first.
    """
    # Empty keywords are dropped: "" is a substring of every document.
    query_keywords = [str(kw).lower() for kw in keywords if kw]
    boosted = []
    for idx, score in results:
        stored = metadata['keywords'][idx] or []
        if isinstance(stored, str):
            stored = [stored]
        # Lower-case the stored keywords so the comparison is case-insensitive on both sides.
        doc_keywords = {str(kw).lower() for kw in stored}
        content = documents[idx].lower()

        # Calculate keyword matches
        matches = sum(
            1 for kw in query_keywords
            if kw in content or kw in doc_keywords
        )
        # Equals score * (1 + 0.2 * matches) for score >= 0; for negative
        # scores a plain multiplication would push matching documents down.
        boosted.append((idx, score + abs(score) * matches * 0.2))

    return sorted(boosted, key=lambda x: x[1], reverse=True)[:top_k]

def parse_query(query,chapters,fields):
    """
    Extract metadata filters and keywords from a user query using the LLM.

    The reply is cleaned before JSON parsing: anything up to and including a
    closing ``</think>`` tag is dropped, and a fenced block
    (```json ... ``` or plain ``` ... ```) is unwrapped.

    Args:
        query: The user's question.
        chapters: Currently unused; kept so existing callers keep working.
        fields: Text listing the candidate fields, inserted verbatim into the prompt.

    Returns:
        The parsed JSON reply. When it is a dict, the keys "filters" (dict)
        and "keywords" (list) are guaranteed to be present, defaulting to
        empty values if the model omitted them.

    Raises:
        json.JSONDecodeError: The cleaned reply is not valid JSON.
    """
    response = ollama.generate(
        model="deepseek-r1:32b",
        prompt=f"""Analyze this legal query and output:
        1. Metadata filters (field , chapter name (Derived from the field and keywords)) 
        
        get field from those fields {fields}
        2. Important keywords
        3. ouput is french
        
        Query: {query}
        
        Output as JSON: {{
            "filters": {{"chapter": "...", "field": "..."}},
            "keywords": ["...", "..."]
        }}"""
    )

    # Clean the response
    # rpartition returns the whole string as the last element when the tag is absent.
    raw_output = response["response"].rpartition("</think>")[2].strip()
    if '```json' in raw_output:
        raw_output = raw_output.split('```json', 1)[1].split('```', 1)[0]
    elif '```' in raw_output:
        raw_output = raw_output.split('```', 1)[1].split('```', 1)[0]

    result = json.loads(raw_output)
    if isinstance(result, dict):
        # Callers index both keys; fill them in when the model left one out.
        result.setdefault("filters", {})
        result.setdefault("keywords", [])
    return result
def search_with_metadata(query_vector, eligible_indices, k=10):
    """
    Nearest-neighbour search restricted to a subset of the indexed documents.

    A temporary flat L2 index is built from the rows of ``embeddings_np``
    selected by ``eligible_indices`` and queried with ``query_vector``.

    Args:
        query_vector: Query embedding(s) passed to Faiss; only the results
            for the first row are returned.
        eligible_indices: Row positions in ``embeddings_np`` that may be returned.
        k: Maximum number of neighbours; capped at the number of eligible rows.

    Returns:
        List of ``(original_index, -distance)`` pairs, nearest first, so a
        larger score means a closer match. Empty when nothing is eligible
        or ``k`` is not positive.
    """
    eligible = list(eligible_indices)
    # Never request more neighbours than there are vectors in the subset.
    k = min(k, len(eligible))
    if k <= 0:
        return []

    # Create subset index
    subset_embeddings = embeddings_np[eligible]
    subset_index = faiss.IndexFlatL2(subset_embeddings.shape[1])
    subset_index.add(subset_embeddings)

    # Search subset
    distances, indices = subset_index.search(query_vector, k)

    # Map back to original indices. Faiss marks missing results with label -1;
    # skip them so they are not read as "last element" by Python's negative indexing.
    return [
        (eligible[i], -dist)
        for i, dist in zip(indices[0], distances[0])
        if i >= 0
    ]
def format_context(docs, metas):
    """
    Render retrieved documents and their metadata as one prompt-ready string.

    Args:
        docs: Document texts.
        metas: Per-document metadata dicts, paired with ``docs`` by position.
            Each must provide 'title'. The type line uses 'law_type' when
            present, otherwise 'field', otherwise the literal "unknown".

    Returns:
        One "Law / Type / Content" section per document, separated by blank
        lines; an empty string when there are no documents.
    """
    def _law_type(meta):
        # Accept either key: the metadata built in this notebook stores the
        # classification under 'field' and has no 'law_type' entry.
        if 'law_type' in meta:
            return meta['law_type']
        return meta.get('field', 'unknown')

    return "\n\n".join(
        f"Law: {meta['title']}\nType: {_law_type(meta)}\nContent: {doc}"
        for doc, meta in zip(docs, metas)
    )


def _select_eligible_indices(query_filters):
    """Return indices of documents whose stored 'field' matches the filter, or all documents when nothing matches."""
    all_indices = list(range(len(documents)))
    if not isinstance(query_filters, dict):
        return all_indices
    wanted = str(query_filters.get("field") or "").strip().lower()
    if not wanted:
        return all_indices
    matches = []
    for i, doc_field in enumerate(metadata['field']):
        if not doc_field:
            # Field classification may be missing for a document; skip it.
            continue
        stored = str(doc_field).strip().lower()
        if stored and (wanted in stored or stored in wanted):
            matches.append(i)
    # An over-specific or hallucinated filter must not empty the search space.
    return matches or all_indices


def rag_query(query: str, temperature: float = 0.7):
    """
    Answer a legal question with retrieval-augmented generation.

    Steps: parse the query into filters and keywords with the LLM, restrict
    the candidate documents by the 'field' filter, run a vector search over
    the candidates, re-rank with keyword boosting, and ask the generator
    model to answer using the top three documents as context.

    Args:
        query: The user's question.
        temperature: Sampling temperature passed to the generator model.

    Returns:
        The generated answer text.
    """
    # Extract filters and keywords. parse_query returns a dict and takes
    # (query, chapters, fields); no chapter list exists, so None is passed.
    parsed = parse_query(query, None, fields)
    if not isinstance(parsed, dict):
        parsed = {}
    query_filters = parsed.get("filters") or {}
    keywords = parsed.get("keywords") or []

    # Step 1: Filter by metadata
    eligible_indices = _select_eligible_indices(query_filters)

    # Step 2: Vector search with metadata filtering
    query_embedding = ollama.embeddings(
        model="snowflake-arctic-embed2",
        prompt=query
    )["embedding"]

    # Search with metadata filtering
    vector_results = search_with_metadata(
        np.array([query_embedding], dtype=np.float32),
        eligible_indices,
        k=10
    )

    # Step 3: Keyword boosting
    boosted_results = apply_keyword_boost(vector_results, keywords)

    # Get top 3 results
    top_indices = [idx for idx, _ in boosted_results[:3]]
    context_docs = [documents[i] for i in top_indices]
    context_meta = []
    for i in top_indices:
        meta = {key: metadata[key][i] for key in metadata}
        # format_context reads 'law_type'; expose the stored field under that name too.
        meta.setdefault("law_type", meta.get("field"))
        context_meta.append(meta)

    # Generate answer
    response = ollama.generate(
        model="mistral:latest",
        prompt=f"""Legal Context:
        {format_context(context_docs, context_meta)}
        
        Question: {query}
        Answer with legal citations:""",
        # Apply the caller's temperature instead of silently ignoring it.
        options={"temperature": temperature}
    )
    return response["response"]

# Demo: run one sample question (in French) through the RAG pipeline.
query = "c'est combien le prix de passeport  "
print("Question:", query)

answer = rag_query(query)

# Banner line followed by the generated answer.
print("-" * 38, "answer", "-" * 38)
print("Answer:", answer)


Question: c'est combien le prix de passeport  
---------------Context---------------
- A défaut de payement à l'échéance, le créancier peut, quinze jours après une simplesignification faite au débiteur et au tiers bailleur de gage, s'il y en a un, faire procéder à la ventepublique des objets donnés en gage.Sur la requête des parties, le président du tribunal peut désigner, pour y procéder, un agent de l'Etathabilité pour le faire.Toute clause qui autorise le créancier à s'approprier le gage ou à en disposer sans les formalités ci-dessus prescrites, est nulle.

Les passeports, à délivrer aux personnes indigentes et reconnues hors d’état d’en acquitter lemontant, sont exemptés du droit de timbre dans des conditions qui seront fixées par un texteréglementaire pris à l’initiative conjointe des autorités compétentes.

-  Toute vente amiable, promesse de vente et, plus généralement, toute cession defonds de commerce consentie même sous condition ou sous la forme d'un autre contrat, touteattr