In [69]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tiktoken
from tqdm import tqdm
import time
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import ollama
from langchain_ollama import OllamaLLM
import ast
import pickle
import time
import psutil
import evaluate

In [31]:
%run ../utils/fine_tuning_util.ipynb
%run ../utils/save_and_load_util.ipynb

In [32]:
# Load the CSV file
df = pd.read_csv("eda_law_cases.csv")

In [34]:
# Load the true questions and answers
qa_data = pd.read_csv("../data/processed/Questions & Answers.csv")
questions = qa_data["question"].tolist()
print(questions)

['How does the Supreme Court differentiate between an interlocutory order and a final order in civil appeals?', 'What is the legal significance of the immunity granted to Attorneys-at-Law regarding statements made in pleadings?', 'In tax disputes, how should conflicts between two statutes be resolved?', 'How does Sri Lankan law determine the ‘value’ of shares for stamp duty purposes?']


### Chunking Text for Training & Retrieval

In [35]:
# Define a tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")  # Using OpenAI's tokenizer
max_chunk_size = 512

In [36]:
chunked_data = []
total_files = len(df)

start_time = time.time()

with tqdm(total=total_files, desc="Chunking Progress", unit="file") as pbar:
    for _, row in df.iterrows():
        file_name = row["file_name"]
        
        # Ensure 'text' is a valid string
        text = row["text"]
        if isinstance(text, str):  # Proceed only if text is a string
            chunks = chunk_text(text)
            for i, chunk in enumerate(chunks):
                chunked_data.append({"file_name": file_name, "chunk_id": i, "text": chunk})
        else:
            print(f"Skipping file {file_name} due to invalid text.")
        
        pbar.update(1)
    
    # Calculate time left and show it dynamically
    elapsed_time = time.time() - start_time
    remaining_time = (elapsed_time / pbar.n) * (total_files - pbar.n)
    pbar.set_postfix(remaining=f"{remaining_time:.2f}s")

Chunking Progress:   5%|▌         | 157/2988 [00:02<00:27, 101.36file/s]

Skipping file ca_ba_43_2011.pdf due to invalid text.


Chunking Progress:  12%|█▏        | 347/2988 [00:03<00:20, 130.85file/s]

Skipping file ca_dc_horana_563_96_f.pdf due to invalid text.


Chunking Progress:  14%|█▍        | 433/2988 [00:04<00:19, 130.54file/s]

Skipping file ca_dc_matara_05_97.pdf due to invalid text.


Chunking Progress:  16%|█▋        | 493/2988 [00:05<00:19, 130.11file/s]

Skipping file ca_dc_panadura_416_92.pdf due to invalid text.
Skipping file ca_dc_ratnapura_646_99.pdf due to invalid text.


Chunking Progress:  22%|██▏       | 669/2988 [00:06<00:09, 237.09file/s]

Skipping file ca_pa_26_2011.pdf due to invalid text.


Chunking Progress:  26%|██▌       | 769/2988 [00:06<00:07, 285.31file/s]

Skipping file ca_phc_apn_27_2012.pdf due to invalid text.


Chunking Progress:  29%|██▉       | 880/2988 [00:07<00:09, 213.17file/s]

Skipping file ca_writ_1591_06.pdf due to invalid text.


Chunking Progress:  34%|███▍      | 1011/2988 [00:07<00:10, 190.68file/s]

Skipping file ca_writ_603_10.pdf due to invalid text.


Chunking Progress:  97%|█████████▋| 2910/2988 [00:58<00:01, 47.48file/s] 

Skipping file sc_hc_la_89_2022.pdf due to invalid text.


Chunking Progress: 100%|██████████| 2988/2988 [01:00<00:00, 49.53file/s, remaining=0.00s]


In [37]:
# Convert to DataFrame
chunked_df = pd.DataFrame(chunked_data)

In [38]:
# Save the chunked data
chunked_df.to_csv("chunked_law_cases.csv", index=False)
end_time = time.time()

### Embedding Generation

In [39]:
chunked_df = pd.read_csv("chunked_law_cases.csv")

In [40]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [41]:
# Convert questions to embeddings
query_vectors = np.array(model.encode(questions, convert_to_numpy=True))

In [42]:
# Create FAISS index
d = 384
index = faiss.IndexFlatL2(d)

In [43]:
chunked_data_with_embeddings = []
total_chunks = len(chunked_df)

start_time = time.time()

with tqdm(total=total_chunks, desc="Generating Embeddings", unit="chunk") as pbar:
    batch_size = 32  # Adjust the batch size based on available memory

    for i in range(0, total_chunks, batch_size):
        batch_texts = chunked_df["text"].iloc[i:i + batch_size].tolist()
        embeddings = generate_embeddings(batch_texts)

        # Add embeddings to the dataframe
        for j, emb in enumerate(embeddings):
            chunked_data_with_embeddings.append({
                "file_name": chunked_df["file_name"].iloc[i + j],
                "chunk_id": chunked_df["chunk_id"].iloc[i + j],
                "text": chunked_df["text"].iloc[i + j],
                "embedding": emb.cpu().numpy()
            })
        
        pbar.update(batch_size)

Generating Embeddings: 35008chunk [1:00:30,  9.64chunk/s]                        


In [44]:
# Convert the chunked data with embeddings to a DataFrame
embedding_df = pd.DataFrame(chunked_data_with_embeddings)

In [45]:
# Save the embeddings to disk
embedding_df.to_csv("law_cases_with_embeddings.csv", index=False)

### Creating FAISS Index

In [46]:
# Load the chunked data with embeddings
embedding_df = pd.read_csv("law_cases_with_embeddings.csv")

In [47]:
# Convert string embeddings back to NumPy arrays
def parse_embedding(embedding_str):
    try:
        return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
    except:
        return np.zeros(384, dtype=np.float32)

In [48]:
embedding_df["embedding"] = embedding_df["embedding"].apply(parse_embedding)

In [49]:
# Convert embeddings to a 2D NumPy array
embeddings_matrix = np.vstack(embedding_df["embedding"].values)

In [50]:
# Define the FAISS index (L2 Normalized for cosine similarity search)
embedding_dim = embeddings_matrix.shape[1]  # Get the embedding dimension
index = faiss.IndexFlatL2(embedding_dim)

In [51]:
# Add embeddings to FAISS index
index.add(embeddings_matrix)

In [52]:
# Save FAISS index
faiss.write_index(index, "law_cases_index.index")

In [53]:
# Save metadata (file names & chunk IDs) for retrieval
metadata = embedding_df[["file_name", "chunk_id", "text"]].to_dict(orient="records")
with open("faiss_metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

In [54]:
# Load FAISS index
index = faiss.read_index("law_cases_index.index")

In [55]:
# Load metadata (file names & text chunks)
with open("faiss_metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

In [56]:
# Load the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # Same model used before

In [57]:
llm = OllamaLLM(model="llama3.1")

In [58]:
def retrieve_relevant_cases(query, top_k=5):
    # Convert query into an embedding
    query_embedding = embedding_model.encode([query])
    query_embedding = np.array(query_embedding, dtype=np.float32)  

    # Search FAISS for top 10 matches (to get diversity)
    distances, indices = index.search(query_embedding, top_k * 3)  

    # Keep track of unique documents
    unique_cases = {}
    
    for i in range(len(indices[0])):
        idx = indices[0][i]
        if idx < len(metadata):  # Ensure index is valid
            case = metadata[idx]
            file_name = case["file_name"]
            
            if file_name not in unique_cases:  # Add only if not already included
                unique_cases[file_name] = case

            if len(unique_cases) == top_k:  # Stop when we have top_k unique documents
                break

    return list(unique_cases.values())  # Return only unique cases

In [59]:
# Perform FAISS search for the questions
k = 1  # Get only the top result
_, ground_truth_indices = index.search(query_vectors, k)

# Flatten the array since FAISS returns a list of lists
ground_truth_indices = ground_truth_indices.flatten()

In [60]:
def evaluate_faiss_index(index, query_vectors, ground_truth_indices, k=5):
    """Evaluates FAISS retrieval performance"""
    start_time = time.time()
    cpu_usage_before = psutil.cpu_percent(interval=None)
    
    _, retrieved_indices = index.search(query_vectors, k)
    
    cpu_usage_after = psutil.cpu_percent(interval=None)
    response_time = time.time() - start_time

    # Calculate accuracy (percentage of ground truth indices found in retrieved indices)
    correct_retrievals = sum([1 for gt, retrieved in zip(ground_truth_indices, retrieved_indices) if gt in retrieved])
    accuracy = correct_retrievals / len(ground_truth_indices)

    print(f"FAISS Evaluation:\n- Response Time: {response_time:.4f}s\n- CPU Usage: {cpu_usage_after - cpu_usage_before:.2f}%\n- Accuracy: {accuracy:.2%}")
    return response_time, cpu_usage_after - cpu_usage_before, accuracy

In [61]:
# Example usage
response_time, cpu_usage, accuracy = evaluate_faiss_index(index, query_vectors, ground_truth_indices)

FAISS Evaluation:
- Response Time: 0.0094s
- CPU Usage: 47.30%
- Accuracy: 100.00%


## Evaluation With Llama Model

In [62]:
def generate_response_with_llama(query):
    """
    Given a legal query, retrieve relevant law cases and generate a response using Llama 3.1.
    """
    # Retrieve relevant legal cases
    relevant_cases = retrieve_relevant_cases(query)

    # Combine case texts
    case_texts = "\n\n".join([f"Case {i+1}: {case['text']}" for i, case in enumerate(relevant_cases)])

    # Construct Llama 3.1 prompt
    prompt = f"""
    You are a legal AI assistant. Answer the query based on the following legal cases:

    {case_texts}

    Query: {query}
    Answer:
    """

    # Get response from Llama 3.1
    response = llm.invoke(prompt)

    return response, relevant_cases

In [63]:
# Load evaluation dataset
qa_data = pd.read_csv("../data/processed/Questions & Answers.csv")
questions = qa_data["question"].tolist()
ground_truth_answers = qa_data["answer"].tolist()  # True legal answers

In [64]:
# Track execution time
start_time = time.time()

# Initialize empty lists
llm_responses = []
retrieved_cases = []

# Monitor system resource usage
cpu_usage_list = []
memory_usage_list = []

In [65]:
# Process each question with a progress bar
for i, question in enumerate(tqdm(questions, desc="Processing Questions", unit="question")):
    question_start_time = time.time()  # Track time for each question

    # Record CPU & memory usage before processing
    cpu_before = psutil.cpu_percent(interval=None)
    memory_before = psutil.virtual_memory().percent

    # Generate response using LLaMA
    response, matched_cases = generate_response_with_llama(question)
    
    # Store results
    llm_responses.append(response)
    retrieved_cases.append(matched_cases)

    # Record CPU & memory usage after processing
    cpu_after = psutil.cpu_percent(interval=None)
    memory_after = psutil.virtual_memory().percent

    # Compute stats
    avg_cpu = (cpu_before + cpu_after) / 2
    avg_memory = (memory_before + memory_after) / 2
    elapsed_time_per_question = time.time() - question_start_time

    # Append to lists
    cpu_usage_list.append(avg_cpu)
    memory_usage_list.append(avg_memory)

    # Print log after every question
    print(f"✅ Completed {i + 1}/{len(questions)} | Time: {elapsed_time_per_question:.2f}s | "
          f"CPU: {avg_cpu:.2f}% | Memory: {avg_memory:.2f}%")

Processing Questions:  25%|██▌       | 1/4 [21:24<1:04:13, 1284.46s/question]

✅ Completed 1/4 | Time: 1284.46s | CPU: 33.15% | Memory: 82.90%


Processing Questions:  50%|█████     | 2/4 [34:36<33:09, 994.63s/question]   

✅ Completed 2/4 | Time: 791.71s | CPU: 40.85% | Memory: 87.95%


Processing Questions:  75%|███████▌  | 3/4 [43:12<12:56, 776.24s/question]

✅ Completed 3/4 | Time: 516.35s | CPU: 43.30% | Memory: 86.55%


Processing Questions: 100%|██████████| 4/4 [53:53<00:00, 808.30s/question]

✅ Completed 4/4 | Time: 640.64s | CPU: 33.00% | Memory: 86.60%





In [66]:
# Final execution time
total_time = time.time() - start_time
final_cpu_usage = sum(cpu_usage_list) / len(cpu_usage_list)
final_memory_usage = sum(memory_usage_list) / len(memory_usage_list)

print(f"\n✅ All {len(questions)} questions processed in {total_time:.2f} seconds.")
print(f"📊 Final Average CPU Usage: {final_cpu_usage:.2f}%")
print(f"📊 Final Average Memory Usage: {final_memory_usage:.2f}%")


✅ All 4 questions processed in 3680.47 seconds.
📊 Final Average CPU Usage: 37.58%
📊 Final Average Memory Usage: 86.00%


In [71]:
# Load ROUGE for text comparison
rouge = evaluate.load("rouge")

In [72]:
# Evaluate responses
llm_evaluation = rouge.compute(predictions=llm_responses, references=ground_truth_answers)

print("LLaMA Response Evaluation:", llm_evaluation)

LLaMA Response Evaluation: {'rouge1': 0.21996053611837663, 'rouge2': 0.06654455294784305, 'rougeL': 0.14734921260779954, 'rougeLsum': 0.16019937901302683}


In [74]:
bertscore = evaluate.load("bertscore")
bert_evaluation = bertscore.compute(predictions=llm_responses, references=ground_truth_answers, lang="en")

print("BERTScore:", bert_evaluation)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore: {'precision': [0.8117244243621826, 0.8116472959518433, 0.7999582290649414, 0.7952038645744324], 'recall': [0.8550233840942383, 0.863749623298645, 0.8507528901100159, 0.8673533201217651], 'f1': [0.8328114748001099, 0.836888313293457, 0.8245741128921509, 0.8297131061553955], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.45.2)'}


In [9]:
# Example usage
user_query = "What are the legal rights of a tenant in a lease agreement dispute?"
response, matched_cases = generate_response_with_llama(user_query)

In [10]:
# Print response
print("🔷 AI Legal Assistant Response:\n")
print(response)

🔷 AI Legal Assistant Response:

Based on general principles of law and the cases provided, I'll address the query. Please note that specific provisions may vary depending on jurisdiction (in this case, Sri Lanka).

In general, when it comes to a lease agreement dispute, the rights of a tenant can be summarized as follows:

1. **Right to Quiet Possession**: The tenant has the right to peaceful enjoyment of the property during the tenancy period. This means they should not be disturbed or evicted without proper notice and due process.
2. **Right to Repairs and Maintenance**: The landlord is responsible for maintaining the property in a habitable condition. If repairs are necessary, the tenant can request the landlord to take action.
3. **Right to Receive Rent Payments**: The tenant has the right to receive rent payments as agreed upon in the lease agreement. However, if there's a dispute over rent or late payment fees, the tenant may seek mediation or legal recourse.
4. **Right to Termin

In [11]:
# Print matched case file names
print("\n🔷 Top Matching Case Files:")
for case in matched_cases:
    print(f"📄 {case['file_name']}")


🔷 Top Matching Case Files:
📄 012009.pdf
📄 01a_01f_2017_tab.pdf
