In [2]:
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pickle
import joblib
import spacy
from sympy import symbols, Implies
import faiss
import torch
from langchain_ollama import OllamaLLM
from tqdm import tqdm
import time
import evaluate
import psutil
from collections import Counter
from rouge_score import rouge_scorer
from bert_score import score as bert_score

In [9]:
# Load data
qa_df = pd.read_csv("../data/processed/Questions & Answers.csv")

In [10]:
# Initialize LLM and similarity model
llm = OllamaLLM(model="llama3.1")
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight and effective

In [11]:
# Contrastive Decoding Config
NUM_CANDIDATES = 5

In [12]:
# Init output list
results = []

In [13]:
for idx, row in qa_df.iterrows():
    question = row["question"]
    reference_answer = row["answer"]

    print(f"\n🔍 Processing question {idx+1}/{len(qa_df)}")

    # Generate multiple candidates
    candidates = []
    times = []
    cpu_usages = []

    for _ in range(NUM_CANDIDATES):
        start_time = time.time()
        cpu_before = psutil.cpu_percent(interval=None)

        answer = llm(question)  # You can optionally add chunked legal context here

        cpu_after = psutil.cpu_percent(interval=None)
        elapsed_time = time.time() - start_time

        candidates.append(answer)
        times.append(elapsed_time)
        cpu_usages.append(cpu_after)

    # Score candidates using semantic similarity to question (optional heuristic)
    question_embedding = embedder.encode(question, convert_to_tensor=True)
    candidate_embeddings = embedder.encode(candidates, convert_to_tensor=True)

    similarities = util.cos_sim(question_embedding, candidate_embeddings)[0]
    best_idx = similarities.argmax().item()
    best_candidate = candidates[best_idx]

    results.append({
        "question": question,
        "reference_answer": reference_answer,
        "generated_answer": best_candidate,
        "response_time": times[best_idx],
        "cpu_usage_percent": cpu_usages[best_idx]
    })


🔍 Processing question 1/4


  answer = llm(question)  # You can optionally add chunked legal context here



🔍 Processing question 2/4

🔍 Processing question 3/4

🔍 Processing question 4/4


In [None]:
# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("src/contrastive_decoding_generated_results.csv", index=False)
print("\n✅ Saved generated results to 'generated_results.csv'")


✅ Saved generated results to 'generated_results.csv'


### Evaluation

In [None]:
# Load CSVs
qa_df = pd.read_csv("../data/processed/Questions & Answers.csv")
gen_df = pd.read_csv("src/contrastive_decoding_generated_results.csv")

In [16]:
# Merge on 'question' for a complete comparison
df = pd.merge(qa_df, gen_df, on="question")

In [17]:
# Initialize scorers
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [18]:
rouge1_scores, rouge2_scores, rougeL_scores, accuracies = [], [], [], []

In [19]:
print("\n🔍 Calculating ROUGE & Accuracy...")

for ref, pred in zip(df['answer'], df['generated_answer']):
    scores = rouge.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

    # Exact match accuracy
    is_exact = int(ref.strip().lower() == pred.strip().lower())
    accuracies.append(is_exact)


🔍 Calculating ROUGE & Accuracy...


In [20]:
# Run BERTScore
print("🔍 Calculating BERTScore...")
P, R, F1 = bert_score(df['generated_answer'].tolist(), df['answer'].tolist(), lang="en", verbose=True)

🔍 Calculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 28.41 seconds, 0.14 sentences/sec


In [21]:
# Summary Output
print("\n✅ Evaluation Summary:")
print(f"Accuracy (Exact Match):       {np.mean(accuracies):.4f}")
print(f"Average ROUGE-1 F1 Score:     {np.mean(rouge1_scores):.4f}")
print(f"Average ROUGE-2 F1 Score:     {np.mean(rouge2_scores):.4f}")
print(f"Average ROUGE-L F1 Score:     {np.mean(rougeL_scores):.4f}")
print(f"Average BERTScore F1 Score:   {F1.mean().item():.4f}")
print(f"Average Response Time:        {df['response_time'].mean():.2f} sec")
if 'cpu_usage_percent' in df.columns:
    print(f"Average CPU Usage:            {df['cpu_usage_percent'].mean():.2f}%")


✅ Evaluation Summary:
Accuracy (Exact Match):       0.0000
Average ROUGE-1 F1 Score:     0.2642
Average ROUGE-2 F1 Score:     0.0833
Average ROUGE-L F1 Score:     0.1619
Average BERTScore F1 Score:   0.8346
Average Response Time:        352.95 sec
Average CPU Usage:            63.38%


In [None]:
df["rouge1_f1"] = rouge1_scores
df["rouge2_f1"] = rouge2_scores
df["rougeL_f1"] = rougeL_scores
df["bertscore_f1"] = F1.tolist()
df["accuracy"] = accuracies

df.to_csv("src/contrastive_decoding_evaluation_report.csv", index=False)
print("\n📁 Detailed evaluation report saved to 'contrastive_decoding_evaluation_report.csv'")


📁 Detailed evaluation report saved to 'contrastive_decoding_evaluation_report.csv'
