In [3]:
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import joblib
import spacy
from sympy import symbols, Implies
import faiss
import torch
from langchain_ollama import OllamaLLM
from tqdm import tqdm
import time
import evaluate
import psutil

In [4]:
# Load the chunked legal cases dataset
chunked_cases_df = pd.read_csv("chunked_law_cases.csv")

In [5]:
# Initialize graph
G = nx.DiGraph()

In [6]:
# Load a sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
# Create node embeddings
chunk_embeddings = model.encode(chunked_cases_df["text"].tolist(), convert_to_tensor=True)

In [8]:
# Add nodes to the graph
for index, row in chunked_cases_df.iterrows():
    G.add_node(row["chunk_id"], text=row["text"], embedding=chunk_embeddings[index])

In [9]:
# Establish logical dependencies between chunks
for i, row in chunked_cases_df.iterrows():
    current_embedding = chunk_embeddings[i].cpu().numpy()
    similarities = cosine_similarity([current_embedding], chunk_embeddings.cpu().numpy())[0]
    
    # Find the top related chunks (excluding itself)
    top_related = np.argsort(similarities)[-6:-1]  # Get top 5 similar chunks
    
    for related_index in top_related:
        related_chunk_id = chunked_cases_df.iloc[related_index]["chunk_id"]
        G.add_edge(row["chunk_id"], related_chunk_id, weight=similarities[related_index])

In [10]:
# # Save graph using pickle (more suitable for complex objects)
# with open("../data/processed/logical_reasoning/graph_data.pkl", "wb") as f:
#     pickle.dump(G, f)

### Logical Reasoning Implementation

In [11]:
def is_logically_relevant(query, chunk_text):
    """
    Rule-based filtering to check if a chunk is logically relevant.
    - Prioritizes legal precedent cases.
    - Ensures the chunk discusses related legal principles.
    """
    keywords = ["precedent", "ruling", "judgment", "legal principle", "interpretation"]
    if any(keyword in chunk_text.lower() for keyword in keywords):
        return True  # Accept if it contains key legal reasoning terms
    return False  # Reject otherwise

In [12]:
# Convert embeddings to NumPy array
embedding_dim = chunk_embeddings.shape[1]  # Get the embedding size
faiss_index = faiss.IndexFlatL2(embedding_dim)  # L2 (Euclidean) distance index

In [13]:
# Convert PyTorch tensor to NumPy and add to FAISS
faiss_index.add(chunk_embeddings.cpu().numpy())  # Ensure it's a NumPy array

print("✅ FAISS index initialized with", faiss_index.ntotal, "legal case chunks.")

✅ FAISS index initialized with 34982 legal case chunks.


In [14]:
def retrieve_similar(query, top_k=5):
    """
    Retrieves top_k most similar legal case chunks using FAISS.
    """
    query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()
    
    # FAISS similarity search
    _, indices = faiss_index.search(query_embedding, top_k)

    # Retrieve the actual text of similar chunks
    similar_chunks = chunked_cases_df.iloc[indices[0]]['text'].tolist()
    
    return similar_chunks

In [15]:
def logical_expand_graph_rag(query):
    """
    Expands the context for retrieval by using graph-based reasoning.
    - Retrieves top similar chunks.
    - Expands context using graph neighbors.
    - Applies rule-based filtering for logical relevance.
    """
    similar_chunks = retrieve_similar(query)  # Step 1: Retrieve initial similar chunks
    expanded_context = set(similar_chunks)  # Step 2: Store similar chunks

    # Step 3: Expand using graph-based logical connections
    for chunk in similar_chunks:
        chunk_id = chunked_cases_df[chunked_cases_df['text'] == chunk]['chunk_id'].values[0]
        neighbors = list(G.neighbors(chunk_id))  # Get connected chunks (logical relations)

        for neighbor in neighbors:
            neighbor_text = G.nodes[neighbor]['text']

            # Step 4: Apply symbolic reasoning (filter relevant legal cases)
            if is_logically_relevant(query, neighbor_text):  
                expanded_context.add(neighbor_text)

    return "\n".join(expanded_context)

In [16]:
llm = OllamaLLM(model="llama3.1")

In [17]:
def generate_logically_reasoned_response(query):
    """
    Generates a response using LLaMA with logically expanded retrieval.
    - Retrieves expanded legal context.
    - Uses LLaMA to generate responses based on enriched knowledge.
    """
    expanded_context = logical_expand_graph_rag(query)  # Step 1: Retrieve expanded legal context

    # Step 2: Format the prompt for LLaMA
    prompt = f"""
    Given the following legal case references, provide a logically sound answer to the query.

    Context:
    {expanded_context}

    Query: {query}

    Response:
    """

    # Step 3: Generate response using LLaMA
    response = llm.generate([prompt], max_tokens=500)
    return response

In [18]:
# Load evaluation dataset
questions_df = pd.read_csv("../data/processed/Questions & Answers.csv")

In [20]:
# # Select test queries
# test_questions = questions_df["question"].sample(3).tolist()

# # Iterate with tqdm progress bar
# for i, test_query in enumerate(tqdm(test_questions, desc="Generating Responses", unit="query")):
#     start_time = time.time()  # Track time per query

#     print(f"\n🔹 **Test Query {i+1}:** {test_query}")

#     # Generate response
#     response = generate_logically_reasoned_response(test_query)

#     elapsed_time = time.time() - start_time  # Calculate elapsed time

#     print(f"✅ **LLaMA Response:**\n{response}")
#     print(f"⏳ Time Taken: {elapsed_time:.2f}s\n{'-'*80}\n")

### Evaluate Responses Using ROUGE & BERTScore

In [21]:
# Load evaluation metrics
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

In [22]:
# Store evaluation results
evaluation_results = []

In [23]:
# Process each question with a progress bar
for _, row in tqdm(questions_df.iterrows(), total=len(questions_df), desc="Evaluating Responses", unit="question"):
    question, ground_truth = row["question"], row["answer"]

    # Ensure the inputs are valid strings
    ground_truth = str(ground_truth) if ground_truth else ""
    
    # Measure system performance
    start_time = time.time()
    cpu_before = psutil.cpu_percent(interval=None)
    memory_before = psutil.virtual_memory().percent

    # Generate LLaMA response with logical reasoning
    generated_response = generate_logically_reasoned_response(question)
    
    # Ensure the generated response is valid
    generated_response = str(generated_response) if generated_response else ""

    # Measure system performance after generation
    cpu_after = psutil.cpu_percent(interval=None)
    memory_after = psutil.virtual_memory().percent
    response_time = time.time() - start_time

    # Compute ROUGE scores if response is non-empty
    if generated_response.strip() and ground_truth.strip():
        rouge_scores = rouge.compute(predictions=[generated_response], references=[ground_truth])
        bert_scores = bertscore.compute(predictions=[generated_response], references=[ground_truth], lang="en")
    else:
        rouge_scores = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}
        bert_scores = {"precision": [0.0], "recall": [0.0], "f1": [0.0]}

    # Store results
    evaluation_results.append({
        "question": question,
        "response": generated_response,
        "response_time": response_time,
        "cpu_usage": (cpu_before + cpu_after) / 2,
        "memory_usage": (memory_before + memory_after) / 2,
        "rouge_scores": rouge_scores,
        "bert_precision": bert_scores["precision"][0],
        "bert_recall": bert_scores["recall"][0],
        "bert_f1": bert_scores["f1"][0],
    })

    # Print log after each question
    print(f"✅ Processed: {question} | Time: {response_time:.2f}s | "
          f"CPU: {(cpu_before + cpu_after) / 2:.2f}% | "
          f"Memory: {(memory_before + memory_after) / 2:.2f}%")

Evaluating Responses:   0%|          | 0/4 [00:00<?, ?question/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating Responses:  25%|██▌       | 1/4 [36:59<1:50:57, 2219.04s/question]

✅ Processed: How does the Supreme Court differentiate between an interlocutory order and a final order in civil appeals? | Time: 2198.12s | CPU: 41.15% | Memory: 73.75%


Evaluating Responses:  50%|█████     | 2/4 [1:17:12<1:17:47, 2333.58s/question]

✅ Processed: What is the legal significance of the immunity granted to Attorneys-at-Law regarding statements made in pleadings? | Time: 2394.36s | CPU: 56.35% | Memory: 94.05%


Evaluating Responses:  75%|███████▌  | 3/4 [1:56:04<38:52, 2332.72s/question]  

✅ Processed: In tax disputes, how should conflicts between two statutes be resolved? | Time: 2317.97s | CPU: 47.70% | Memory: 96.45%


Evaluating Responses: 100%|██████████| 4/4 [2:33:21<00:00, 2300.34s/question]

✅ Processed: How does Sri Lankan law determine the ‘value’ of shares for stamp duty purposes? | Time: 2218.91s | CPU: 52.35% | Memory: 95.30%





In [24]:
# Save results for later analysis
evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_csv("../data/processed/logical_reasoning/evaluation_results.csv", index=False)

print("\n✅ Evaluation completed! Results saved to `evaluation_results.csv`.")


✅ Evaluation completed! Results saved to `evaluation_results.csv`.


In [25]:
# Compute averages
avg_response_time = evaluation_df["response_time"].mean()
avg_cpu_usage = evaluation_df["cpu_usage"].mean()
avg_memory_usage = evaluation_df["memory_usage"].mean()

# Compute average ROUGE scores
avg_rouge = {metric: evaluation_df["rouge_scores"].apply(lambda x: x[metric]).mean() for metric in ["rouge1", "rouge2", "rougeL"]}

# Compute average BERTScore (Precision, Recall, F1)
avg_bert_precision = evaluation_df["bert_precision"].mean()
avg_bert_recall = evaluation_df["bert_recall"].mean()
avg_bert_f1 = evaluation_df["bert_f1"].mean()

# Print summary
print("\n📊 **Final Evaluation Results**")
print(f"⏳ Average Response Time: {avg_response_time:.2f}s")
print(f"💾 Average CPU Usage: {avg_cpu_usage:.2f}%")
print(f"🖥️ Average Memory Usage: {avg_memory_usage:.2f}%")
print(f"📌 Average ROUGE Scores: {avg_rouge}")
print(f"🔎 Average BERT Precision: {avg_bert_precision:.4f}")
print(f"🔎 Average BERT Recall: {avg_bert_recall:.4f}")
print(f"🔎 Average BERT F1 Score: {avg_bert_f1:.4f}")


📊 **Final Evaluation Results**
⏳ Average Response Time: 2282.34s
💾 Average CPU Usage: 49.39%
🖥️ Average Memory Usage: 89.89%
📌 Average ROUGE Scores: {'rouge1': 0.0024840936686022766, 'rouge2': 0.0005235178621953041, 'rougeL': 0.0014819822873653153}
🔎 Average BERT Precision: 0.7491
🔎 Average BERT Recall: 0.8424
🔎 Average BERT F1 Score: 0.7926
