In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tiktoken
from tqdm import tqdm
import time
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import ollama
from langchain_ollama import OllamaLLM
import ast
import pickle
import networkx as nx
import psutil
from bert_score import score
from rouge import Rouge
from tqdm import tqdm




In [2]:
%run ../utils/fine_tuning_util.ipynb
%run ../utils/save_and_load_util.ipynb

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient embedding model
chunked_df = pd.read_csv("src/chunked_law_cases.csv")

In [7]:
chunked_df['embeddings'] = chunked_df['text'].apply(lambda x: model.encode(x))

In [8]:
G = nx.Graph()

In [9]:
for _, row in chunked_df.iterrows():
    G.add_node(row['chunk_id'], text=row['text'])
    if _ > 0:
        G.add_edge(row['chunk_id'], row['chunk_id'] - 1)  # Sequential linking

In [10]:
embeddings = np.array(chunked_df['embeddings'].tolist()).astype('float32')
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [11]:
def retrieve_similar(query):
    query_embedding = model.encode([query]).astype('float32')
    D, I = index.search(query_embedding, 5)  # Retrieve top 5 similar chunks
    return chunked_df.iloc[I[0]]['text'].tolist()

In [12]:
llm = OllamaLLM(model="llama3.1")

In [10]:
def query_llama(query):
    relevant_chunks = retrieve_similar(query)
    context = "\n".join(relevant_chunks)

    response = ollama.chat(model='llama3.1', messages=[
        {"role": "system", "content": "You are a legal AI assistant trained on case law."},
        {"role": "user", "content": f"Use this context:\n{context}\n\nQuestion: {query}"}
    ])
    return response['message']['content']

print(query_llama("What is the legal precedent for contract breaches?"))

The legal precedent for contract breaches in this case is that a party who breaks a contract does not discharge the other party's obligations, but rather gives the injured party the option to either:

1. Compel the guilty party to perform their promise (specific performance)
2. Sue them for damages

In this specific case, the Plaintiff continued to offer payment of the balance consideration and kept the contract "alive" beyond the period of three months, indicating that they did not accept the anticipatory breach by the 1st Defendant as a discharge of the obligations of the agreement to sell.

Therefore, the legal precedent is that even a breach sufficient to effect a discharge does not itself discharge the contract, but merely gives the other party an option to decide whether they will treat the contract as discharged.


In [13]:
def expand_graph_rag(query):
    similar_chunks = retrieve_similar(query)
    expanded_context = set(similar_chunks)

    for chunk in similar_chunks:
        neighbors = G.neighbors(chunked_df[chunked_df['text'] == chunk]['chunk_id'].values[0])
        expanded_context.update([G.nodes[n]['text'] for n in neighbors])

    return "\n".join(expanded_context)

In [14]:
def query_llama_graph_rag(query):
    context = expand_graph_rag(query)

    response = ollama.chat(model='llama3.1', messages=[
        {"role": "system", "content": "You are a legal AI assistant trained on case law using Graph RAG."},
        {"role": "user", "content": f"Use this context:\n{context}\n\nQuestion: {query}"}
    ])
    return response['message']['content']

In [18]:
# Evaluation
def measure_performance(query, answer):
    start_time = time.time()
    cpu_before = psutil.cpu_percent(interval=None)
    memory_before = psutil.virtual_memory().percent
    
    generated_answer = query_llama_graph_rag(query)
    
    cpu_after = psutil.cpu_percent(interval=None)
    memory_after = psutil.virtual_memory().percent
    end_time = time.time()
    
    response_time = end_time - start_time
    avg_cpu_usage = (cpu_before + cpu_after) / 2
    avg_memory_usage = (memory_before + memory_after) / 2
    
    rouge = Rouge()
    rouge_scores = rouge.get_scores(generated_answer, answer)[0]
    
    P, R, F1 = score([generated_answer], [answer], lang="en")
    bert_score = F1.mean().item()
    
    return response_time, avg_cpu_usage, avg_memory_usage, rouge_scores, bert_score, generated_answer

In [19]:
# Load evaluation dataset
eval_df = pd.read_csv("../data/processed/Questions & Answers.csv")

In [20]:
results = []
cpu_usage_list = []
memory_usage_list = []
llm_responses = []

In [21]:
start_time = time.time()
for i, row in tqdm(eval_df.iterrows(), total=len(eval_df), desc="Processing Questions", unit="question"):
    response_time, avg_cpu_usage, avg_memory_usage, rouge_scores, bert_score, generated_answer = measure_performance(row['question'], row['answer'])
    
    result = {
        "question": row['question'],
        "response": generated_answer,
        "response_time": response_time,
        "cpu_usage": avg_cpu_usage,
        "memory_usage": avg_memory_usage,
        "rouge_scores": rouge_scores,
        "bert_score": bert_score
    }
    results.append(result)
    cpu_usage_list.append(avg_cpu_usage)
    memory_usage_list.append(avg_memory_usage)
    llm_responses.append(generated_answer)
    
    print(f"✅ Completed {i + 1}/{len(eval_df)} | Time: {response_time:.2f}s | CPU: {avg_cpu_usage:.2f}% | Memory: {avg_memory_usage:.2f}%")

Processing Questions:   0%|          | 0/4 [00:00<?, ?question/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Questions:  25%|██▌       | 1/4 [34:36<1:43:49, 2076.58s/question]

✅ Completed 1/4 | Time: 2069.73s | CPU: 34.50% | Memory: 72.40%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Questions:  50%|█████     | 2/4 [1:24:02<1:26:39, 2599.69s/question]

✅ Completed 2/4 | Time: 2959.34s | CPU: 55.20% | Memory: 91.50%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Questions:  75%|███████▌  | 3/4 [2:12:14<45:33, 2733.24s/question]  

✅ Completed 3/4 | Time: 2881.47s | CPU: 58.70% | Memory: 89.55%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing Questions: 100%|██████████| 4/4 [2:42:17<00:00, 2434.27s/question]

✅ Completed 4/4 | Time: 1798.56s | CPU: 59.05% | Memory: 87.00%





In [22]:
# Compute final stats
total_time = time.time() - start_time
final_cpu_usage = sum(cpu_usage_list) / len(cpu_usage_list)
final_memory_usage = sum(memory_usage_list) / len(memory_usage_list)

In [24]:
# Extract ROUGE score keys
rouge_keys = results[0]["rouge_scores"].keys()

In [25]:
# Compute average ROUGE scores
avg_rouge = {
    k: sum(r["rouge_scores"][k]["f"] for r in results) / len(results) for k in rouge_keys
}

In [26]:
# Compute average BERTScore
avg_bert_score = sum(r["bert_score"] for r in results) / len(results)

In [32]:
if isinstance(results[0]["bert_score"], float):
    # If it's a float, assume it's the F1 score and set precision & recall to the same value
    avg_bert_precision = avg_bert_recall = avg_bert_f1 = sum(r["bert_score"] for r in results) / len(results)
else:
    # If it's a dictionary, compute proper averages
    avg_bert_precision = sum(r["bert_score"]["precision"] for r in results) / len(results)
    avg_bert_recall = sum(r["bert_score"]["recall"] for r in results) / len(results)
    avg_bert_f1 = sum(r["bert_score"]["f1"] for r in results) / len(results)

In [33]:
print(f"\n✅ All {len(eval_df)} questions processed in {total_time:.2f} seconds.")
print(f"📊 Final Average CPU Usage: {final_cpu_usage:.2f}%")
print(f"📊 Final Average Memory Usage: {final_memory_usage:.2f}%")
print(f"📊 Final Average ROUGE Scores: {avg_rouge}")
print(f"📊 Final Average BERTScore: {avg_bert_score}")
print(f"📊 Final BERTScore Precision: {avg_bert_precision:.4f}")
print(f"📊 Final BERTScore Recall: {avg_bert_recall:.4f}")
print(f"📊 Final BERTScore F1-score: {avg_bert_f1:.4f}")


✅ All 4 questions processed in 14153.64 seconds.
📊 Final Average CPU Usage: 51.86%
📊 Final Average Memory Usage: 85.11%
📊 Final Average ROUGE Scores: {'rouge-1': 0.2978928795317439, 'rouge-2': 0.08846930858822495, 'rouge-l': 0.2780273975007121}
📊 Final Average BERTScore: 0.8590720742940903
📊 Final BERTScore Precision: 0.8591
📊 Final BERTScore Recall: 0.8591
📊 Final BERTScore F1-score: 0.8591
