In [2]:
!pip install google-generativeai --quiet

In [34]:
import google.generativeai as genai
from dotenv import load_dotenv, find_dotenv
import os

# Dotenv file that holds GEMINI_API_KEY on the original author's machine.
DOTENV_PATH = 'C:/Users/Nisharg/notebooks/LLM/.env/env.TXT'

# load_dotenv() returns False when it loaded nothing from the given path (for
# example on a machine where that file does not exist). In that case fall back
# to whatever `.env` file find_dotenv() can locate, so the notebook is not tied
# to one absolute local path.
if not load_dotenv(DOTENV_PATH):
    load_dotenv(find_dotenv())

# The key is read from the environment, never hard-coded in the notebook.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# `model` is the shared Gemini handle used by evaluate_with_gemini().
model = genai.GenerativeModel("gemini-2.0-flash")

In [36]:
def build_prompt(question, reference, prediction):
    """Build the LLM-as-judge prompt for one question/answer pair.

    Args:
        question: The question that was asked.
        reference: The reference (gold) answer.
        prediction: The model-generated answer to be scored.

    Returns:
        A multi-line prompt string. It begins and ends with a newline, asks for
        a 1-5 score on faithfulness, completeness and clarity, and embeds the
        three arguments under their own ``###`` headings.
    """
    lines = [
        "",  # leading blank line: the prompt starts with a newline
        "You are an expert evaluator of question-answering systems. Given a question, the reference answer, and a model-generated answer, your job is to score the model's answer from 1 to 5 based on:",
        "- Faithfulness (accuracy to the reference)",
        "- Completeness (does it cover key points)",
        "- Clarity (is it understandable and well-written)",
        "",
        "Give a score and a short justification.",
        "",
        "### Question:",
        f"{question}",
        "",
        "### Reference Answer:",
        f"{reference}",
        "",
        "### Model's Answer:",
        f"{prediction}",
        "",
        "### Evaluation:",
        "Score (1-5):",
        "Justification:",
        "",  # trailing empty entry: the prompt ends with a newline
    ]
    return "\n".join(lines)

In [58]:
# Evaluation set: each record holds a question, its reference answer, and the
# model-generated prediction that the judge will score.
qa_data = [
    dict(
        question="What causes global warming?",
        reference="Global warming is caused by greenhouse gases from burning fossil fuels and deforestation.",
        prediction="It is caused by human-released gases like CO2 and methane which trap heat.",
    ),
    dict(
        question="What is the capital of France?",
        reference="The capital of France is Paris.",
        prediction="London is the capital city of France.",
    ),
    # Add more..
]

# Function to call GPT-4 for evaluation (given for reference as alternative to gemini)
def evaluate_with_gpt(prompt):
    """Send `prompt` to GPT-4 as a single user message and return the reply text.

    Args:
        prompt: The full judge prompt, e.g. the string from build_prompt().

    Returns:
        The content of the first choice's message, or the string
        ``"Error: <message>"`` if anything goes wrong (including the `openai`
        package not being installed). Exceptions are never propagated.
    """
    try:
        # Imported here because no cell in view imports `openai`; without it
        # the call below fails with a NameError that gets reported as "Error: ...".
        import openai

        messages = [{"role": "user", "content": prompt}]

        if hasattr(openai, "OpenAI"):
            # openai>=1.0: module-level ChatCompletion was removed in favour of
            # a client object; responses are objects rather than dicts.
            client = openai.OpenAI()
            response = client.chat.completions.create(
                model="gpt-4",
                messages=messages,
                temperature=0.2
            )
            return response.choices[0].message.content

        # Legacy (pre-1.0) module-level API.
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=messages,
            temperature=0.2
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort by design: the caller stores the text either way.
        return f"Error: {str(e)}"

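# To judge with GPT-4 instead, call evaluate_with_gpt(prompt) in place of evaluate_with_gemini(prompt) in the loop below and store its return value in `result` the same way.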

In [60]:
import time

def evaluate_with_gemini(prompt):
    """Send `prompt` to the module-level Gemini `model` and return its text reply.

    Args:
        prompt: The full judge prompt to pass to ``model.generate_content``.

    Returns:
        The ``.text`` of the response, or ``"Error: <message>"`` if the call or
        the text access raises. Exceptions are never propagated.
    """
    try:
        return model.generate_content(prompt).text
    except Exception as err:
        # Best-effort: the failure is reported as text so a batch run continues.
        return f"Error: {err}"

# Judge every record in qa_data and collect one output row per record.
results = []

for i, item in enumerate(qa_data):
    # Progress line; the question is truncated to its first 50 characters.
    print(f"Evaluating Q{i+1}: {item['question'][:50]}...")

    prompt = build_prompt(item["question"], item["reference"], item["prediction"])
    result = evaluate_with_gemini(prompt)

    # Keep the three inputs next to the judge's verdict (or its "Error: ..." text).
    results.append(
        dict(
            question=item["question"],
            reference=item["reference"],
            prediction=item["prediction"],
            evaluation=result,
        )
    )

    time.sleep(1)  # Avoid rate limits


Evaluating Q1: What causes global warming?...
Evaluating Q2: What is the capital of France?...


In [61]:
import pandas as pd

# Tabulate the collected verdicts, write them to CSV without the index column,
# and preview the first rows as the cell's output.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="gemini_llm_judge_results.csv", index=False)
df.head(n=5)


Unnamed: 0,question,reference,prediction,evaluation
0,What causes global warming?,Global warming is caused by greenhouse gases f...,It is caused by human-released gases like CO2 ...,Score: 5\nJustification: The model answer is f...
1,What is the capital of France?,The capital of France is Paris.,London is the capital city of France.,Score (1-5): 1\nJustification: The model's ans...
