In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tiktoken
from tqdm import tqdm
import time
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import ollama
from langchain_ollama import OllamaLLM
import ast
import pickle




In [2]:
%run ../utils/fine_tuning_util.ipynb
%run ../utils/save_and_load_util.ipynb

In [41]:
# Load the CSV file
df = pd.read_csv("eda_law_cases.csv")

### Chunking Text for Training & Retrieval

In [42]:
# Define a tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
max_chunk_size = 512

In [45]:
chunked_data = []
total_files = len(df)

start_time = time.time()

with tqdm(total=total_files, desc="Chunking Progress", unit="file") as pbar:
    for _, row in df.iterrows():
        file_name = row["file_name"]
        
        # Ensure 'text' is a valid string
        text = row["text"]
        if isinstance(text, str):  # Proceed only if text is a string
            chunks = chunk_text(text)
            for i, chunk in enumerate(chunks):
                chunked_data.append({"file_name": file_name, "chunk_id": i, "text": chunk})
        else:
            print(f"Skipping file {file_name} due to invalid text.")
        
        pbar.update(1)
    
    # Calculate time left and show it dynamically
    elapsed_time = time.time() - start_time
    remaining_time = (elapsed_time / pbar.n) * (total_files - pbar.n)
    pbar.set_postfix(remaining=f"{remaining_time:.2f}s")

Chunking Progress:  96%|█████████▌| 1833/1910 [00:48<00:01, 51.40file/s]

Skipping file sc_hc_la_89_2022.pdf due to invalid text.


Chunking Progress: 100%|██████████| 1910/1910 [00:50<00:00, 37.70file/s, remaining=0.00s]


In [46]:
# Convert to DataFrame
chunked_df = pd.DataFrame(chunked_data)

In [47]:
# Save the chunked data
chunked_df.to_csv("chunked_law_cases.csv", index=False)
end_time = time.time()

### Embedding Generation

In [3]:
chunked_df = pd.read_csv("chunked_law_cases.csv")

In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Create FAISS index
d = 384
index = faiss.IndexFlatL2(d)

In [6]:
chunked_data_with_embeddings = []
total_chunks = len(chunked_df)

start_time = time.time()

with tqdm(total=total_chunks, desc="Generating Embeddings", unit="chunk") as pbar:
    batch_size = 32  # Adjust the batch size based on available memory

    for i in range(0, total_chunks, batch_size):
        batch_texts = chunked_df["text"].iloc[i:i + batch_size].tolist()
        embeddings = generate_embeddings(batch_texts)

        # Add embeddings to the dataframe
        for j, emb in enumerate(embeddings):
            chunked_data_with_embeddings.append({
                "file_name": chunked_df["file_name"].iloc[i + j],
                "chunk_id": chunked_df["chunk_id"].iloc[i + j],
                "text": chunked_df["text"].iloc[i + j],
                "embedding": emb.cpu().numpy()
            })
        
        pbar.update(batch_size)

Generating Embeddings:   3%|▎         | 896/30646 [01:20<44:51, 11.05chunk/s] 

In [7]:
# Convert the chunked data with embeddings to a DataFrame
embedding_df = pd.DataFrame(chunked_data_with_embeddings)

In [8]:
# Save the embeddings to disk
embedding_df.to_csv("law_cases_with_embeddings.csv", index=False)

### Creating FAISS Index

In [4]:
# Load the chunked data with embeddings
embedding_df = pd.read_csv("law_cases_with_embeddings.csv")

In [16]:
# Convert string embeddings back to NumPy arrays
def parse_embedding(embedding_str):
    try:
        return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
    except:
        return np.zeros(384, dtype=np.float32)

In [17]:
embedding_df["embedding"] = embedding_df["embedding"].apply(parse_embedding)

In [19]:
# Convert embeddings to a 2D NumPy array
embeddings_matrix = np.vstack(embedding_df["embedding"].values)

In [20]:
# Define the FAISS index (L2 Normalized for cosine similarity search)
embedding_dim = embeddings_matrix.shape[1]  # Get the embedding dimension
index = faiss.IndexFlatL2(embedding_dim)

In [21]:
# Add embeddings to FAISS index
index.add(embeddings_matrix)

In [22]:
# Save FAISS index
faiss.write_index(index, "law_cases_index.index")

In [25]:
# Save metadata (file names & chunk IDs) for retrieval
metadata = embedding_df[["file_name", "chunk_id", "text"]].to_dict(orient="records")
with open("faiss_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

In [3]:
# Load FAISS index
index = faiss.read_index("law_cases_index.index")

In [4]:
# Load metadata (file names & text chunks)
with open("faiss_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

In [5]:
# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Same model used before

In [6]:
llm = OllamaLLM(model="llama3.1")

In [7]:
def retrieve_relevant_cases(query, top_k=5):
    # Convert query into an embedding
    query_embedding = embedding_model.encode([query])
    query_embedding = np.array(query_embedding, dtype=np.float32)  

    # Search FAISS for top 10 matches (to get diversity)
    distances, indices = index.search(query_embedding, top_k * 3)  

    # Keep track of unique documents
    unique_cases = {}
    
    for i in range(len(indices[0])):
        idx = indices[0][i]
        if idx < len(metadata):  # Ensure index is valid
            case = metadata[idx]
            file_name = case["file_name"]
            
            if file_name not in unique_cases:  # Add only if not already included
                unique_cases[file_name] = case

            if len(unique_cases) == top_k:  # Stop when we have top_k unique documents
                break

    return list(unique_cases.values())  # Return only unique cases

In [8]:
def generate_response_with_llama(query):
    """
    Given a legal query, retrieve relevant law cases and generate a response using Llama 3.1.
    """
    # Retrieve relevant legal cases
    relevant_cases = retrieve_relevant_cases(query)

    # Combine case texts
    case_texts = "\n\n".join([f"Case {i+1}: {case['text']}" for i, case in enumerate(relevant_cases)])

    # Construct Llama 3.1 prompt
    prompt = f"""
    You are a legal AI assistant. Answer the query based on the following legal cases:

    {case_texts}

    Query: {query}
    Answer:
    """

    # Get response from Llama 3.1
    response = llm.invoke(prompt)

    return response, relevant_cases

In [9]:
# Example usage
user_query = "What are the legal rights of a tenant in a lease agreement dispute?"
response, matched_cases = generate_response_with_llama(user_query)

In [10]:
# Print response
print("🔷 AI Legal Assistant Response:\n")
print(response)

🔷 AI Legal Assistant Response:

Based on general principles of law and the cases provided, I'll address the query. Please note that specific provisions may vary depending on jurisdiction (in this case, Sri Lanka).

In general, when it comes to a lease agreement dispute, the rights of a tenant can be summarized as follows:

1. **Right to Quiet Possession**: The tenant has the right to peaceful enjoyment of the property during the tenancy period. This means they should not be disturbed or evicted without proper notice and due process.
2. **Right to Repairs and Maintenance**: The landlord is responsible for maintaining the property in a habitable condition. If repairs are necessary, the tenant can request the landlord to take action.
3. **Right to Receive Rent Payments**: The tenant has the right to receive rent payments as agreed upon in the lease agreement. However, if there's a dispute over rent or late payment fees, the tenant may seek mediation or legal recourse.
4. **Right to Termin

In [11]:
# Print matched case file names
print("\n🔷 Top Matching Case Files:")
for case in matched_cases:
    print(f"📄 {case['file_name']}")


🔷 Top Matching Case Files:
📄 012009.pdf
📄 01a_01f_2017_tab.pdf
