In [16]:
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import joblib
import spacy
from sympy import symbols, Implies
import faiss
import torch
from langchain_ollama import OllamaLLM
from tqdm import tqdm
import time
import evaluate
import psutil
from collections import Counter
from rouge_score import rouge_scorer
from bert_score import score as bert_score

In [None]:
# Load the chunked legal cases dataset into a DataFrame.
# NOTE(review): this cell carries no execution count (`In [None]`) and its path
# uses a different relative base than the "../data/processed/" path used for
# the Q&A file — confirm it resolves from the notebook's working directory.
chunked_cases_df = pd.read_csv("src/chunked_law_cases.csv")

In [3]:
# Load the evaluation dataset of question/answer pairs.
# Its "question" column is later used as the prompt and its "answer" column as
# the gold reference when the model output is scored.
qa_df = pd.read_csv("../data/processed/Questions & Answers.csv")

In [4]:
# Sanity check: preview the first rows of the chunked cases table
print("Chunked Cases Sample:", chunked_cases_df.head(), sep="\n")

Chunked Cases Sample:
    file_name  chunk_id                                               text
0  012009.pdf         0  IN THE SUPREME COURT OF THE DEMOCRATIC SOCIALI...
1  012009.pdf         1  Argued on :\nDecided on:\nJ.A.N. de Silva J\nT...
2  012009.pdf         2  the process culminated in the SLMC deciding to...
3  012009.pdf         3  full knowledge and acquiescence of the leader ...
4  012009.pdf         4  him that in order to be appointed as Chairman ...


In [5]:
# Sanity check: preview the first rows of the question/answer table
print("\nQuestions & Answers Sample:", qa_df.head(), sep="\n")


Questions & Answers Sample:
                                            question  \
0  How does the Supreme Court differentiate betwe...   
1  What is the legal significance of the immunity...   
2  In tax disputes, how should conflicts between ...   
3  How does Sri Lankan law determine the ‘value’ ...   

                                              answer  
0  The distinction is crucial for determining the...  
1  Attorneys-at-law enjoy absolute immunity conce...  
2  Conflicts between statutes, particularly in ta...  
3  The Supreme Court emphasized that the "value" ...  


### Prepare the Sample Data (3 Samples)

In [6]:
# Take an independent copy of the first three Q&A rows as the evaluation sample
sample_qa_df = qa_df.iloc[:3].copy()

In [7]:
# Show which question/answer pairs were picked for evaluation
print("Selected Samples for Evaluation:", sample_qa_df, sep="\n")

Selected Samples for Evaluation:
                                            question  \
0  How does the Supreme Court differentiate betwe...   
1  What is the legal significance of the immunity...   
2  In tax disputes, how should conflicts between ...   

                                              answer  
0  The distinction is crucial for determining the...  
1  Attorneys-at-law enjoy absolute immunity conce...  
2  Conflicts between statutes, particularly in ta...  


### Self-Consistency Decoding (Majority Voting for Logic)

In [9]:
# Initialize the Ollama-backed "llama3.1" model used for every generation below.
# No sampling options are passed here, so decoding uses whatever defaults apply.
# NOTE(review): majority voting over repeated generations is only meaningful if
# repeated calls can produce different outputs — confirm the effective temperature.
llm = OllamaLLM(model="llama3.1")

In [10]:
# Self-consistency decoding: sample several answers and keep the most frequent one
def generate_majority_answer(prompt, num_generations=5):
    """Query the module-level ``llm`` repeatedly and return the majority answer.

    The same prompt is sent ``num_generations`` times. Each response is
    whitespace-stripped and the most frequent exact string wins; when several
    strings share the top count, the one generated first is returned.

    Note that votes are counted on exact string equality, so responses that
    differ by even one character are treated as different answers.

    Args:
        prompt: Text sent unchanged to the model on every generation.
        num_generations: Number of responses to sample; must be at least 1.

    Returns:
        dict with the keys
            "final_answer": the most frequent stripped response,
            "all_answers": every stripped response, in generation order,
            "response_time_sec": elapsed time for all generations, rounded to
                two decimals,
            "cpu_usage_percent": system-wide CPU utilisation measured across
                the generation window, rounded to two decimals.

    Raises:
        ValueError: if ``num_generations`` is smaller than 1.
    """
    if num_generations < 1:
        raise ValueError("num_generations must be at least 1")

    # With interval=None, psutil reports utilisation since the *previous* call.
    # This first reading therefore describes whatever ran before us; it is
    # discarded and only serves to reset the measurement baseline.
    psutil.cpu_percent(interval=None)
    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start_time = time.perf_counter()

    # `invoke` is the supported entry point; calling the LLM object directly is
    # a deprecated code path.
    responses = [llm.invoke(prompt).strip() for _ in range(num_generations)]

    duration = time.perf_counter() - start_time
    # This reading covers exactly the interval since the baseline call above,
    # i.e. the generation window, so it is reported as-is rather than averaged
    # with the unrelated first reading.
    cpu_usage = psutil.cpu_percent(interval=None)

    # Get the most common response (ties resolve to the earliest generation)
    final_answer = Counter(responses).most_common(1)[0][0]

    return {
        "final_answer": final_answer,
        "all_answers": responses,
        "response_time_sec": round(duration, 2),
        "cpu_usage_percent": round(cpu_usage, 2)
    }

In [11]:
# Start with an empty container for the per-question evaluation records
results = list()

In [13]:
# Run self-consistency decoding on every sample question.
# `results` is rebuilt inside this cell so that re-running it replaces the
# previous run's records instead of appending duplicates to them.
results = []

# Number the samples by position (1-based) rather than by index label, so the
# progress message works whatever index the sample frame carries.
for sample_number, (_, row) in enumerate(sample_qa_df.iterrows(), start=1):
    question = row['question']
    gold_answer = row['answer']

    print(f"\nEvaluating Sample {sample_number}...")
    result = generate_majority_answer(prompt=question, num_generations=5)

    # One flat record per question: inputs, voted answer, raw samples, cost metrics
    results.append({
        "question": question,
        "gold_answer": gold_answer,
        "model_answer": result["final_answer"],
        "all_model_answers": result["all_answers"],
        "response_time_sec": result["response_time_sec"],
        "cpu_usage_percent": result["cpu_usage_percent"]
    })


Evaluating Sample 1...

Evaluating Sample 2...

Evaluating Sample 3...


In [14]:
# Convert the list of per-question record dicts to a DataFrame
# (one row per record; columns come from the dict keys).
results_df = pd.DataFrame(results)

In [15]:
# Show the voted answer for each question next to its time and CPU cost
print(
    "\nModel Outputs with Metrics:",
    results_df[["question", "model_answer", "response_time_sec", "cpu_usage_percent"]],
    sep="\n",
)


Model Outputs with Metrics:
                                            question  \
0  How does the Supreme Court differentiate betwe...   
1  What is the legal significance of the immunity...   
2  In tax disputes, how should conflicts between ...   

                                        model_answer  response_time_sec  \
0  In civil appeals, the Supreme Court differenti...            2746.88   
1  The immunity granted to attorneys-at-law regar...            2149.98   
2  When dealing with tax disputes involving confl...            2112.04   

   cpu_usage_percent  
0               45.0  
1               37.2  
2               44.0  


In [17]:
# Initialize ROUGE scorer for the three variants reported below
# (rouge1, rouge2, rougeL), with stemming enabled for token matching.
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [18]:
# Per-sample score accumulators: one independent empty list per metric
accuracy_list, rouge1_list, rouge2_list, rougeL_list, bertscore_list = (
    [] for _ in range(5)
)

In [19]:
# Score every model answer against its gold answer.
# The accumulators are reset inside this cell: if they were only created in an
# earlier cell, re-running this loop would append a second set of scores and
# the lists would no longer match the number of rows in `results_df`.
accuracy_list = []
rouge1_list = []
rouge2_list = []
rougeL_list = []

for gold_raw, pred_raw in zip(results_df["gold_answer"], results_df["model_answer"]):
    gold = gold_raw.strip()
    pred = pred_raw.strip()

    # Accuracy (case-insensitive exact match of the whole answer)
    accuracy = int(gold.lower() == pred.lower())
    accuracy_list.append(accuracy)

    # ROUGE scores: the reference goes first, the prediction second
    r_scores = rouge.score(gold, pred)
    rouge1_list.append(r_scores["rouge1"].fmeasure)
    rouge2_list.append(r_scores["rouge2"].fmeasure)
    rougeL_list.append(r_scores["rougeL"].fmeasure)

In [20]:
# BERTScore for all answers in a single batch: model answers are the
# candidates, gold answers the references. Only the F1 component is kept.
P, R, F1 = bert_score(
    results_df["model_answer"].to_list(),
    results_df["gold_answer"].to_list(),
    verbose=True,
    lang="en",
)
bertscore_list = F1.tolist()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 21.61 seconds, 0.14 sentences/sec


In [21]:
# Attach each metric to the results table as its own column, in report order
for column_name, column_values in (
    ("accuracy", accuracy_list),
    ("rouge1", rouge1_list),
    ("rouge2", rouge2_list),
    ("rougeL", rougeL_list),
    ("bertscore", bertscore_list),
):
    results_df[column_name] = column_values

In [22]:
# Show only the metric columns that were just added
print(
    "\nEvaluation Scores Added:",
    results_df[["accuracy", "rouge1", "rouge2", "rougeL", "bertscore"]],
    sep="\n",
)


Evaluation Scores Added:
   accuracy    rouge1    rouge2    rougeL  bertscore
0         0  0.236287  0.072034  0.139241   0.820778
1         0  0.245968  0.068826  0.149194   0.847917
2         0  0.230769  0.040650  0.121457   0.822547
