In [10]:
from sklearn.model_selection import train_test_split
import json
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np

In [11]:
# Token-length helper used as the length measure when chunking text
def count_tokens(text):
    """Return how many tokens ``text`` encodes to.

    Relies on a module-level ``tokenizer`` object exposing ``encode(text)``;
    it is not defined in this cell (presumably a tiktoken encoding created
    elsewhere in the notebook -- TODO confirm).

    Args:
        text: The string to measure.

    Returns:
        int: ``len()`` of whatever ``tokenizer.encode(text)`` returns.
    """
    token_ids = tokenizer.encode(text)
    n_tokens = len(token_ids)
    return n_tokens

In [12]:
# Split a document into overlapping chunks sized by token count
def chunk_text(text, chunk_size=512, overlap=100):
    """Split ``text`` into overlapping chunks with a recursive character splitter.

    Lengths are measured with ``count_tokens`` rather than ``len``, so
    ``chunk_size`` and ``overlap`` are expressed in tokens, not characters.

    Args:
        text: The full text to split.
        chunk_size: Maximum chunk length, as measured by ``count_tokens``.
        overlap: Length shared between consecutive chunks, same unit.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": overlap,
        "length_function": count_tokens,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_text(text)
    return chunks

In [13]:
# Embed a batch of texts with the notebook-level embedding model
def generate_embeddings(texts):
    """Encode ``texts`` into embeddings, requesting tensor output.

    Relies on a module-level ``model`` object exposing
    ``encode(texts, convert_to_tensor=True)``; it is not defined in this cell
    (presumably a sentence-transformers model -- TODO confirm).

    Args:
        texts: The text or texts to embed, passed to ``model.encode`` as-is.

    Returns:
        Whatever ``model.encode`` returns when ``convert_to_tensor=True``.
    """
    embeddings = model.encode(texts, convert_to_tensor=True)
    return embeddings

In [14]:
# Function to retrieve the top-k (default 5) most similar chunks from the FAISS index
def retrieve_top_5(query, index, embedding_df, model, k=5):
    """Return metadata for the chunks nearest to ``query`` in ``index``.

    Args:
        query: The query string to embed and search for.
        index: Search index exposing ``search(matrix, k=...)`` and returning a
            ``(distances, ids)`` pair of 2-D arrays (FAISS-style).
        embedding_df: DataFrame whose positional rows line up with the ids
            stored in ``index`` and which has the columns ``file_name``,
            ``chunk_id`` and ``text``.
        model: Embedding model exposing ``encode(list_of_str)``.
        k: Maximum number of neighbours to request. Defaults to 5, which is
            the original behaviour.

    Returns:
        list[dict]: One dict per hit, in the order returned by the index, with
        keys ``file_name``, ``chunk_id``, ``text`` and ``similarity_score``.
        ``similarity_score`` is the raw value reported by the index; whether
        lower or higher means "more similar" depends on the metric the index
        was built with (for an L2 index, lower is more similar). The list may
        hold fewer than ``k`` entries when the index has fewer vectors.
    """
    # Generate the embedding for the query
    query_embedding = model.encode([query])[0]

    # FAISS expects a 2-D float32 matrix of shape (n_queries, dim)
    query_matrix = np.asarray([query_embedding], dtype=np.float32)

    # Search the index for the k closest matches
    distances, indices = index.search(query_matrix, k=k)

    results = []
    # Pair each id with its own distance positionally instead of re-searching
    # the id list, which is quadratic and wrong when an id appears twice.
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with -1; ``iloc[-1]`` would silently
        # return the last row of the frame as a bogus hit, so skip them.
        if idx < 0:
            continue
        row = embedding_df.iloc[int(idx)]
        results.append({
            "file_name": row["file_name"],
            "chunk_id": row["chunk_id"],
            "text": row["text"],
            "similarity_score": distance,
        })

    return results

In [15]:
# Build a context-augmented prompt and send it to the Llama model
def generate_response_with_llama(query, retrieved_docs, llm):
    """Answer ``query`` by prompting ``llm`` with the retrieved chunk texts.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of dicts, each with a ``"text"`` entry.
        llm: Object exposing ``invoke(prompt)``.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    # One retrieved chunk per line forms the context section of the prompt
    doc_texts = []
    for doc in retrieved_docs:
        doc_texts.append(doc["text"])
    context = "\n".join(doc_texts)

    prompt = f"Based on the following legal documents, answer the query:\n\n{context}\n\nQuery: {query}\nAnswer:"

    return llm.invoke(prompt)