## Evaluating an LLM Locally

In [None]:
from ctransformers import AutoModelForCausalLM
import json
import nltk
nltk.download("punkt")
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from bert_score import score as bert_score
import numpy as np

# Tokenizer data for nltk.word_tokenize, which the evaluation loop below uses
# for BLEU. Recent NLTK releases read the "punkt_tab" resource instead of the
# pickled "punkt" models, so fetch both; on an older NLTK the extra request
# just reports that the package is unknown.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# === Load the GGUF Model ===
# Quantized Mistral-7B-Instruct weights loaded from a local folder through
# ctransformers. The generation cap and thread count are fixed here, so every
# later `model(prompt)` call uses them.
model = AutoModelForCausalLM.from_pretrained(
    "C:/myworks/MODELS",
    model_file="mistral-7b-instruct-v0.1.Q2_K.gguf",
    model_type="mistral",
    max_new_tokens=200,
    threads=4,
)

# === Sample Evaluation Data ===
# (question, gold answer) pairs, expanded into the dict records the loop reads.
_QA_PAIRS = (
    ("What is the capital of France?", "Paris"),
    ("Who wrote Romeo and Juliet?", "William Shakespeare"),
    ("What is the chemical formula of water?", "H2O"),
)
eval_data = [
    {"question": question, "answer": answer} for question, answer in _QA_PAIRS
]

# === Embedding Model for Cosine Similarity ===
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# === ROUGE & BLEU Settings ===
smoothie = SmoothingFunction().method4
rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# === Store Results ===
# `results` holds one dict per question; the two flat lists feed the batch
# BERTScore call after the loop.
results, all_preds, all_refs = [], [], []

# === Evaluation Loop ===
# For each QA pair: generate an answer, then score it against the reference
# with exact match, smoothed sentence BLEU, ROUGE-L F-measure and
# embedding cosine similarity.
for item in eval_data:
    prompt = f"### Question: {item['question']}\n### Answer:"
    prediction = model(prompt).strip()
    reference = item["answer"].strip()

    # Save for batch metrics
    all_preds.append(prediction)
    all_refs.append(reference)

    # === Metrics ===
    # Both strings are already stripped, so only case needs normalizing.
    exact_match = prediction.lower() == reference.lower()
    bleu = sentence_bleu(
        [nltk.word_tokenize(reference)],
        nltk.word_tokenize(prediction),
        smoothing_function=smoothie,
    )
    rouge_l = rouge.score(reference, prediction)["rougeL"].fmeasure
    # cosine_similarity returns a NumPy array, so [0][0] is a NumPy scalar
    # (float32 when the embeddings are float32). json.dumps cannot serialize
    # that, and `results` is dumped to JSON further down, so store a built-in
    # float instead.
    cosine = float(
        cosine_similarity(
            embedder.encode([reference]), embedder.encode([prediction])
        )[0][0]
    )

    results.append({
        "question": item["question"],
        "reference": reference,
        "prediction": prediction,
        "exact_match": exact_match,
        "bleu": bleu,
        "rougeL": rouge_l,
        "cosine_similarity": cosine,
    })

# === BERTScore (Batch) ===
# Scored once over every prediction/reference pair collected by the loop.
P, R, F1 = bert_score(all_preds, all_refs, lang="en", rescale_with_baseline=True)


def _mean_of(metric):
    """Return the mean of `metric` across `results` as a built-in float.

    np.mean keeps the input dtype (e.g. float32), and rounding a NumPy scalar
    returns a NumPy scalar; casting here keeps `aggregate` serializable by
    json.dumps.
    """
    return float(np.mean([r[metric] for r in results]))


# === Aggregate Metrics ===
aggregate = {
    "exact_match (%)": round(_mean_of("exact_match") * 100, 2),
    "bleu (avg)": round(_mean_of("bleu"), 4),
    "rougeL (avg)": round(_mean_of("rougeL"), 4),
    "cosine_similarity (avg)": round(_mean_of("cosine_similarity"), 4),
    "bert_score_f1 (avg)": round(float(F1.mean()), 4),
}

# === Save to File ===
# The report is two headed sections, each followed by an indented JSON dump.
report_path = "C:/myworks/AI_Agent_AS_LLM_Judge_Evaluator/llm_eval_full_metrics.txt"
report_sections = (
    ("=== Detailed Results ===\n", results),
    ("\n\n=== Aggregated Metrics ===\n", aggregate),
)
with open(report_path, "w", encoding="utf-8") as report:
    for heading, payload in report_sections:
        report.write(heading)
        report.write(json.dumps(payload, indent=2))

# Echo the aggregate figures to the console as well.
summary_json = json.dumps(aggregate, indent=2)
print("✅ Evaluation Complete. Summary:")
print(summary_json)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SREEKANTHVS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SREEKANTHVS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

KeyboardInterrupt



## AI Agent as LLM Judge Evaluator

###  Let’s now turn the LLM Judge into an AI Agent – one that acts autonomously to evaluate candidate answers against reference answers with chain-of-thought reasoning and decision logic.

An AI agent LLM Judge:

- Uses a deliberate step-by-step reasoning chain.

- Follows rules (like rubric-based scoring).

- May critique, score, and even revise evaluations if they are inconsistent.

- Can be reused across multiple tasks — like grading, summarizing, or ranking.
```
┌────────────┐
│ Prompt     │
│ Reference  │
│ Candidate  │
└─────┬──────┘
      │
      ▼
┌───────────────────────────────┐
│ AI Agent Judge (LLM pipeline) │
│ - Step 1: Understand intent   │
│ - Step 2: Compare answers     │
│ - Step 3: Reason on quality   │
│ - Step 4: Generate judgment   │
└────────────┬──────────────────┘
             │
             ▼
     Score, Explanation, Critique
```


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import pandas as pd
import matplotlib.pyplot as plt
import bert_score
import numpy as np

# Load LLM
# Tokenizer and half-precision weights for the judge model, wrapped in a
# text-generation pipeline capped at 300 new tokens per call.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"  # Or another instruct model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
judge = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
)

# Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Dataset
# Each sample is (prompt, reference answer, candidate answer to be judged).
_SAMPLE_FIELDS = ("prompt", "reference", "candidate")
_SAMPLES = (
    (
        "Explain the greenhouse effect.",
        "The greenhouse effect traps heat in Earth's atmosphere due to gases like CO2 and methane.",
        "The atmosphere traps heat because of gases like carbon dioxide, causing Earth to warm up.",
    ),
    (
        "What causes deforestation?",
        "Deforestation is caused by logging, farming, mining, and urban sprawl.",
        "People cut trees for farming and for building towns.",
    ),
)
data = [dict(zip(_SAMPLE_FIELDS, sample)) for sample in _SAMPLES]
df = pd.DataFrame(data)

# Agent Prompt Template
def agent_prompt(prompt, reference, candidate):
    """Build the seven-step judging instructions for one sample.

    Args:
        prompt: The task or question the answers respond to.
        reference: The gold answer.
        candidate: The answer to be evaluated.

    Returns:
        The instruction text with the three values filled in, followed by the
        response format the judge is asked to use.
    """
    template = """
You are an expert evaluation AI agent.

You will follow a step-by-step reasoning process to evaluate how well a candidate answer responds to a prompt compared to a reference answer. Your steps:

Step 1: Restate the intent of the prompt.
Step 2: Summarize the reference answer concisely.
Step 3: Summarize the candidate answer.
Step 4: Identify key differences.
Step 5: Judge the candidate's relevance, accuracy, and completeness compared to the reference.
Step 6: Assign a score from 1 (poor) to 5 (excellent).
Step 7: Provide a concise explanation for the score.

PROMPT: {prompt}
REFERENCE ANSWER: {reference}
CANDIDATE ANSWER: {candidate}

Respond using this format:
Intent: ...
Reference Summary: ...
Candidate Summary: ...
Comparison: ...
Evaluation: ...
Score: <1-5>
Explanation: ...
"""
    # Only the template is parsed for fields; braces inside the supplied
    # values are inserted verbatim.
    return template.format(prompt=prompt, reference=reference, candidate=candidate)

# Cosine similarity
def cosine_sim(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level `embedder`; the
    result is the dot product divided by the product of the vector norms.
    """
    vec_a = embedder.encode([text1])[0]
    vec_b = embedder.encode([text2])[0]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / norm_product

# Score extractor
def extract_score(agent_output):
    """Parse the judge's 1-5 rating out of its text output.

    Only a line that *starts* with a ``Score:`` / ``Score =`` label (optionally
    wrapped in markdown ``*`` emphasis) followed by a single digit 1-5 counts.
    This matters because the judged text may contain the instructions
    themselves: a rubric line such as "Step 6: Assign a score from 1 (poor)
    to 5 (excellent)." or the format placeholder "Score: <1-5>" must not be
    read as a rating, and "Score: 4/5" must yield 4 rather than 45.

    Args:
        agent_output: Text produced by the judge; anything that is not a
            string is treated as "no score".

    Returns:
        The first rating found as an int in 1..5, or None when no line
        carries a valid rating.
    """
    import re

    if not isinstance(agent_output, str):
        return None

    rating_line = re.compile(
        r"^[ \t]*\**[ \t]*score[ \t]*\**[ \t]*[:=][ \t]*\**[ \t]*"
        r"[<\[(]?[ \t]*([1-5])(?![\d-])",  # reject "45" and the "<1-5>" range
        re.IGNORECASE | re.MULTILINE,
    )
    found = rating_line.search(agent_output)
    return int(found.group(1)) if found else None

# Run the AI agent
# One pass over the dataset: ask the judge, parse its score, and record the
# embedding similarity between reference and candidate.
results = []
all_references = []
all_candidates = []

for idx, row in df.iterrows():
    print(f"Evaluating #{idx+1}")
    reference, candidate = row["reference"], row["candidate"]

    verdict_text = judge(agent_prompt(row["prompt"], reference, candidate))[0]["generated_text"]
    rating = extract_score(verdict_text)
    similarity = cosine_sim(reference, candidate)

    all_references.append(reference)
    all_candidates.append(candidate)

    results.append(
        {
            "prompt": row["prompt"],
            "score": rating,
            "cosine_similarity": similarity,
            "agent_output": verdict_text,
        }
    )

# Compute BERTScore
# Scored in one batch, then the per-sample F1 is attached to its result row.
P, R, F1 = bert_score.score(all_candidates, all_references, lang="en", verbose=False)
for record, f1_value in zip(results, F1.tolist()):
    record["bertscore_f1"] = f1_value

# Results
results_df = pd.DataFrame(results)
table_columns = ["prompt", "score", "cosine_similarity", "bertscore_f1"]
print(results_df[table_columns])

# Summary
# One averaged line per metric column, formatted to two decimals.
print("\n📊 Summary Metrics:")
for label, column in (
    ("Avg Score", "score"),
    ("Avg Cosine Similarity", "cosine_similarity"),
    ("Avg BERTScore F1", "bertscore_f1"),
):
    print(f"{label}: {results_df[column].mean():.2f}")

# Plot
# Unit-wide bins with edges 1..6; rows without a parsed score are dropped.
parsed_scores = results_df["score"].dropna()
plt.hist(parsed_scores, bins=list(range(1, 7)), edgecolor="black")
plt.title("AI Agent Judge Score Distribution")
plt.xlabel("Score")
plt.ylabel("Frequency")
plt.show()


# Multi-Agent Voting

This section extends the AI Judge Agent with multi-agent voting: multiple LLMs act as judges and each one scores the candidate answer. We then:

🧠 Aggregate their scores (this implementation averages the valid scores; a majority vote is an alternative)

📝 Save all judge outputs, scores, and metrics to `llm_evaluation_results.txt`

| Feature                 | Implemented? |
| ----------------------- | ------------ |
| AI Agent LLM Judge      | ✅ Yes        |
| Multi-Model Judging     | ✅ Yes        |
| Voting/Averaging Scores | ✅ Yes        |
| Save Results to File    | ✅ Yes        |


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import bert_score

# Load AI judge models (you can expand this list)
# Maps a short display name to the checkpoint identifier to load.
JUDGE_MODELS = {
    "Mistral": "mistralai/Mistral-7B-Instruct-v0.1",
    "LLaMA": "meta-llama/Llama-2-7b-chat-hf",  # swap this with another local model if needed
}

# Load models and tokenizers
# Every judge becomes its own text-generation pipeline, keyed by display name.
judges = {}
for name, model_id in JUDGE_MODELS.items():
    print(f"Loading judge model: {name}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    judges[name] = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=300,
    )

# Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Sample dataset
# Each sample is (prompt, reference answer, candidate answer to be judged).
_SAMPLE_FIELDS = ("prompt", "reference", "candidate")
_SAMPLES = (
    (
        "Explain the greenhouse effect.",
        "The greenhouse effect traps heat in Earth's atmosphere due to gases like CO2 and methane.",
        "The atmosphere traps heat because of gases like carbon dioxide, causing Earth to warm up.",
    ),
    (
        "What causes deforestation?",
        "Deforestation is caused by logging, farming, mining, and urban sprawl.",
        "People cut trees for farming and for building towns.",
    ),
)
data = [dict(zip(_SAMPLE_FIELDS, sample)) for sample in _SAMPLES]
df = pd.DataFrame(data)

# Format prompt for agents
def agent_prompt(prompt, reference, candidate):
    """Build the six-step judging instructions shared by all judges.

    Args:
        prompt: The task or question the answers respond to.
        reference: The gold answer.
        candidate: The answer to be evaluated.

    Returns:
        The instruction text with the three values filled in, followed by the
        response format the judge is asked to use.
    """
    template = """
You are an expert evaluator agent.

Step 1: Understand the prompt.
Step 2: Summarize the reference and candidate.
Step 3: Compare candidate to reference.
Step 4: Judge accuracy, completeness, and clarity.
Step 5: Score from 1 (poor) to 5 (excellent).
Step 6: Explain the score.

PROMPT: {prompt}
REFERENCE: {reference}
CANDIDATE: {candidate}

Respond using:
Intent: ...
Reference Summary: ...
Candidate Summary: ...
Comparison: ...
Score: <1-5>
Explanation: ...
"""
    # Only the template is parsed for fields; braces inside the supplied
    # values are inserted verbatim.
    return template.format(prompt=prompt, reference=reference, candidate=candidate)

# Score extractor
def extract_score(output_text):
    """Parse a judge's 1-5 rating out of its text output.

    Only a line that *starts* with a ``Score:`` / ``Score =`` label (optionally
    wrapped in markdown ``*`` emphasis) followed by a single digit 1-5 counts.
    This matters because the judged text may contain the instructions
    themselves: a rubric line such as "Step 5: Score from 1 (poor) to 5
    (excellent)." or the format placeholder "Score: <1-5>" must not be read
    as a rating, and "Score: 4/5" must yield 4 rather than 45.

    Args:
        output_text: Text produced by a judge; anything that is not a string
            is treated as "no score".

    Returns:
        The first rating found as an int in 1..5, or None when no line
        carries a valid rating.
    """
    import re

    if not isinstance(output_text, str):
        return None

    rating_line = re.compile(
        r"^[ \t]*\**[ \t]*score[ \t]*\**[ \t]*[:=][ \t]*\**[ \t]*"
        r"[<\[(]?[ \t]*([1-5])(?![\d-])",  # reject "45" and the "<1-5>" range
        re.IGNORECASE | re.MULTILINE,
    )
    found = rating_line.search(output_text)
    return int(found.group(1)) if found else None

# Cosine similarity
def cosine_sim(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level `embedder`; the
    result is the dot product divided by the product of the vector norms.
    """
    first_vec = embedder.encode([text1])[0]
    second_vec = embedder.encode([text2])[0]
    dot_product = np.dot(first_vec, second_vec)
    return dot_product / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))

# Evaluate
# Every sample is shown to every judge; the sample's score is the mean of the
# ratings that could be parsed.
results = []
all_refs = []
all_cands = []

for idx, row in df.iterrows():
    print(f"\nEvaluating Sample #{idx+1}")
    reference, candidate = row["reference"], row["candidate"]
    prompt_text = agent_prompt(row["prompt"], reference, candidate)
    all_refs.append(reference)
    all_cands.append(candidate)

    # Judge responses and scores
    judge_outputs = {}
    for judge_name, judge_pipe in judges.items():
        try:
            output = judge_pipe(prompt_text)[0]["generated_text"]
            judge_outputs[judge_name] = {
                "score": extract_score(output),
                "explanation": output,
            }
        except Exception as e:
            # A failing judge is recorded with no score and the error text,
            # so the remaining judges can still vote.
            judge_outputs[judge_name] = {"score": None, "explanation": str(e)}

    # Voting: average score
    # Failed or unparseable judges carry a None score and are left out.
    valid_scores = [
        verdict["score"]
        for verdict in judge_outputs.values()
        if verdict["score"] is not None
    ]
    avg_score = np.mean(valid_scores) if valid_scores else None
    sim = cosine_sim(reference, candidate)

    results.append(
        {
            "prompt": row["prompt"],
            "candidate": candidate,
            "average_score": avg_score,
            "cosine_similarity": sim,
            "judge_outputs": judge_outputs,
        }
    )

# BERTScore
# Scored in one batch, then the per-sample F1 is attached to its result row.
P, R, F1 = bert_score.score(all_cands, all_refs, lang="en", verbose=False)
for record, f1_value in zip(results, F1.tolist()):
    record["bertscore_f1"] = f1_value

# Save results to text file
# One section per sample: the metrics first, then each judge's raw response.
with open("llm_evaluation_results.txt", "w", encoding="utf-8") as f:
    for sample_no, entry in enumerate(results, start=1):
        # average_score is None when no judge produced a usable rating;
        # formatting None with ":.2f" raises TypeError, so print a placeholder.
        avg = entry["average_score"]
        avg_text = "N/A" if avg is None else f"{avg:.2f}"

        f.write(f"\n### Evaluation Sample {sample_no} ###\n")
        f.write(f"Prompt: {entry['prompt']}\n")
        f.write(f"Candidate Answer: {entry['candidate']}\n")
        f.write(f"Cosine Similarity: {entry['cosine_similarity']:.3f}\n")
        f.write(f"BERTScore F1: {entry['bertscore_f1']:.3f}\n")
        f.write(f"Average Judge Score: {avg_text}\n")
        f.write("\n--- Judge Responses ---\n")
        for judge_name, response in entry["judge_outputs"].items():
            f.write(f"\n[{judge_name}]\n")
            f.write(f"Score: {response['score']}\n")
            f.write("Explanation:\n")
            f.write(response["explanation"] + "\n")

print("\n✅ All results saved to `llm_evaluation_results.txt`.")

# Summary (optional visualization)
# Samples without any valid judge score are left out of the histogram.
scores = [r["average_score"] for r in results if r["average_score"] is not None]
plt.hist(scores, bins=[1, 2, 3, 4, 5, 6], edgecolor="black")
plt.title("Multi-Agent Score Distribution")
plt.xlabel("Average Score")
plt.ylabel("Frequency")
plt.show()
