In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import time
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from a local .env file before the client is built
# (presumably this is where the OpenAI API key comes from -- confirm).
load_dotenv()
client = OpenAI()

# Questions about Gemini that are asked in every experiment run.
TEST_QUERIES = [
    "What are the key capabilities of Gemini models?",
    "How does Gemini compare to other multimodal models?",
    "What are the different versions of Gemini?",
    "What training data was used for Gemini?",
    "What are the safety measures in Gemini models?",
]

# Sampling temperatures swept for each query.
temperatures = [
    0.0,
    0.2,
    0.5,
    0.8,
    1.2,
]

In [2]:
# LLM Judge for Answer Evaluation
_JUDGE_FIELDS = ("accuracy", "completeness", "clarity", "overall", "reasoning")


def _parse_judge_reply(evaluation):
    """Extract the five labelled fields from the judge's free-text reply.

    Args:
        evaluation: Raw text returned by the judge model (may be None or empty).

    Returns:
        dict with the keys in ``_JUDGE_FIELDS``; a field the reply does not
        contain is reported as 'N/A'.
    """
    parsed = {}
    last_field = None

    for raw_line in (evaluation or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        label, sep, value = line.partition(":")
        # Chat models frequently decorate labels ("**Accuracy:** Good",
        # "1. Accuracy: Good", "- Accuracy: Good"); strip that decoration so
        # the label still matches instead of silently falling back to 'N/A'.
        field = label.strip(" \t*_#-0123456789.)").lower()

        if sep and field in _JUDGE_FIELDS and field not in parsed:
            if field == "reasoning":
                parsed[field] = value.strip().lstrip("*_ ")
            else:
                # Ratings may come back wrapped as "[Good]" or "**Good**".
                parsed[field] = value.strip(" \t*_[]`")
            last_field = field
        elif last_field == "reasoning":
            # Anything after the Reasoning label belongs to the explanation,
            # even if it contains a colon or repeats an earlier label; the
            # first occurrence of each rating is the one that counts.
            parsed["reasoning"] = f"{parsed['reasoning']} {line}".strip()

    return {field: parsed.get(field, "N/A") for field in _JUDGE_FIELDS}


def llm_judge(query, answer, judge_model="gpt-4o-mini"):
    """Ask a chat model to grade ``answer`` as a response to ``query``.

    Args:
        query: The question that was asked.
        answer: The answer text to evaluate.
        judge_model: Name of the chat model used as the judge.

    Returns:
        dict with keys 'accuracy', 'completeness', 'clarity', 'overall' and
        'reasoning'. Fields missing from the judge's reply are 'N/A'. If the
        API call fails, every rating is 'Error' and 'reasoning' carries the
        exception message, so a single failure does not abort a sweep.
    """
    judge_prompt = f"""
You are an expert evaluator. Assess this answer to the given question.

QUESTION: {query}

ANSWER: {answer}

Evaluate on these 3 criteria using this scale:
- Poor: Incorrect, incomplete, or irrelevant
- Below Average: Partially correct but missing key information  
- Average: Correct but basic, meets minimum requirements
- Good: Comprehensive, accurate, and well-structured
- Excellent: Outstanding accuracy, completeness, and clarity

CRITERIA:
1. Accuracy: Is the answer factually correct?
2. Completeness: Does it address all parts of the question?
3. Clarity: Is it well-structured and easy to understand?

Respond in this exact format:
Accuracy: [Rating]
Completeness: [Rating] 
Clarity: [Rating]
Overall: [Rating]
Reasoning: [Brief explanation]
"""

    try:
        response = client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.1,  # near-deterministic grading
            max_tokens=300
        )
        return _parse_judge_reply(response.choices[0].message.content)

    except Exception as e:
        # Deliberate best-effort: record the failure in the result row.
        return {
            'accuracy': 'Error',
            'completeness': 'Error',
            'clarity': 'Error',
            'overall': 'Error',
            'reasoning': f'Evaluation failed: {str(e)}'
        }

print("LLM Judge function ready")

LLM Judge function ready


In [3]:
# Part 1: Non-RAG Temperature Experiment
def run_non_rag_experiment(query, temp, model="gpt-4o-mini", max_tokens=200):
    """Answer ``query`` without retrieved context and have the LLM judge grade it.

    Args:
        query: Question sent to the model as the only user message.
        temp: Sampling temperature for the generation call.
        model: Chat model used to generate the answer.
        max_tokens: Completion cap for the generated answer.

    Returns:
        dict (one result row) with the query, temperature, answer, latency in
        seconds (generation call only, rounded to 2 decimals), total token
        usage, and the judge's accuracy / completeness / clarity /
        overall_rating / reasoning.
    """
    # NOTE(review): in the recorded runs every row for a given query reports
    # the same total token count at all temperatures, which suggests the
    # answers are being cut off at the cap. Raise ``max_tokens`` if the
    # judge's completeness rating should reflect full answers -- confirm by
    # inspecting finish_reason.
    start = time.time()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        temperature=temp,
        max_tokens=max_tokens
    )
    # Stop the clock before judging so latency covers generation only.
    latency = time.time() - start

    answer = response.choices[0].message.content

    # Get LLM judge evaluation
    evaluation = llm_judge(query, answer)

    return {
        "query": query,
        "temperature": temp,
        "answer": answer,
        "latency": round(latency, 2),
        "tokens": response.usage.total_tokens,
        "accuracy": evaluation['accuracy'],
        "completeness": evaluation['completeness'],
        "clarity": evaluation['clarity'],
        "overall_rating": evaluation['overall'],
        "reasoning": evaluation['reasoning']
    }

# Run Non-RAG experiments
print("Running Non-RAG temperature experiments...")

# Every (query, temperature) pair in query-major order, as a nested loop would visit them.
non_rag_grid = [(q, t) for q in TEST_QUERIES for t in temperatures]

non_rag_results = []
for query, temp in non_rag_grid:
    print(f"Testing query: '{query[:50]}...' at temperature {temp}")
    result = run_non_rag_experiment(query, temp)
    non_rag_results.append(result)
    time.sleep(1)  # Rate limiting

# Build the frame once from the accumulated row dicts.
non_rag_df = pd.DataFrame(non_rag_results)
print(f"\nCompleted {len(non_rag_results)} Non-RAG experiments")

Running Non-RAG temperature experiments...
Testing query: 'What are the key capabilities of Gemini models?...' at temperature 0.0
Testing query: 'What are the key capabilities of Gemini models?...' at temperature 0.2
Testing query: 'What are the key capabilities of Gemini models?...' at temperature 0.5
Testing query: 'What are the key capabilities of Gemini models?...' at temperature 0.8
Testing query: 'What are the key capabilities of Gemini models?...' at temperature 1.2
Testing query: 'How does Gemini compare to other multimodal models...' at temperature 0.0
Testing query: 'How does Gemini compare to other multimodal models...' at temperature 0.2
Testing query: 'How does Gemini compare to other multimodal models...' at temperature 0.5
Testing query: 'How does Gemini compare to other multimodal models...' at temperature 0.8
Testing query: 'How does Gemini compare to other multimodal models...' at temperature 1.2
Testing query: 'What are the different versions of Gemini?...' at temper

In [4]:
# Show the collected non-RAG result rows as this cell's output.
non_rag_df

Unnamed: 0,query,temperature,answer,latency,tokens,accuracy,completeness,clarity,overall_rating,reasoning
0,What are the key capabilities of Gemini models?,0.0,"As of my last update in October 2023, Gemini m...",5.19,216,Good,Below Average,Good,Good,The answer accurately describes the capabiliti...
1,What are the key capabilities of Gemini models?,0.2,"As of my last update in October 2023, Gemini m...",4.38,216,Good,Below Average,Good,Good,The answer accurately describes the capabiliti...
2,What are the key capabilities of Gemini models?,0.5,"Gemini models, developed by Google DeepMind, a...",4.71,216,Good,Below Average,Good,Good,The answer accurately describes the capabiliti...
3,What are the key capabilities of Gemini models?,0.8,As of my last knowledge update in October 2023...,7.62,216,Good,Below Average,Good,Good,The answer is factually correct regarding the ...
4,What are the key capabilities of Gemini models?,1.2,"The Gemini models, developed by Google DeepMin...",4.48,216,Good,Below Average,Good,Good,The answer accurately describes the capabiliti...
5,How does Gemini compare to other multimodal mo...,0.0,"As of my last update in October 2023, Gemini i...",4.79,217,Good,Below Average,Good,Below Average,The answer is factually correct regarding Gemi...
6,How does Gemini compare to other multimodal mo...,0.2,"As of my last update in October 2023, Gemini i...",3.9,217,Good,Below Average,Good,Below Average,The answer provides accurate information about...
7,How does Gemini compare to other multimodal mo...,0.5,As of my last knowledge update in October 2023...,4.53,217,Good,Below Average,Good,Below Average,The answer provides a generally accurate overv...
8,How does Gemini compare to other multimodal mo...,0.8,"As of my last update in October 2023, Gemini i...",3.95,217,Good,Below Average,Good,Good,The answer is factually correct regarding Gemi...
9,How does Gemini compare to other multimodal mo...,1.2,As of my last training cut-off in October 2023...,4.55,217,Good,Below Average,Good,Below Average,The answer provides a generally accurate overv...


In [6]:
# Load RAG embeddings
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at a trusted, locally produced artifact.
with open('../data/rag_embeddings.pkl', 'rb') as pkl_file:
    rag_data = pickle.load(pkl_file)

# The payload is read through two keys: the chunks and their embeddings.
chunks = rag_data['chunks']
embeddings = rag_data['embeddings']

print(f"Loaded {len(chunks)} chunks for RAG experiments")

# RAG retrieval function
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced by single spaces before the request.

    Args:
        text: String to embed.
        model: Embedding model name.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    flattened = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[flattened], model=model)
    first_item = response.data[0]
    return first_item.embedding

def retrieve_chunks(query, chunks, embeddings, k=3):
    """Return the ``k`` chunks whose embeddings are most similar to ``query``.

    Args:
        query: Text to embed and compare against the stored embeddings.
        chunks: Sequence of chunks, index-aligned with ``embeddings``.
        embeddings: Stored embedding vectors, one per chunk.
        k: Number of top matches to return.

    Returns:
        List of dicts ordered by descending cosine similarity, each with the
        keys 'chunk' and 'similarity'.
    """
    scores = cosine_similarity([get_embedding(query)], embeddings)[0]

    # argsort is ascending, so reverse it and keep the first k positions.
    ranked = np.argsort(scores)[::-1]

    return [
        {'chunk': chunks[position], 'similarity': scores[position]}
        for position in ranked[:k]
    ]

Loaded 82 chunks for RAG experiments


In [7]:
# Part 2: RAG Temperature Experiment
_RAG_JUDGE_FIELDS = ("accuracy", "completeness", "clarity", "overall", "reasoning")


def _parse_rag_judge_reply(evaluation):
    """Extract the five labelled fields from the judge's reply; missing ones become 'N/A'."""
    parsed = {}
    last_field = None

    for raw_line in (evaluation or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        label, sep, value = line.partition(":")
        # Tolerate decorated labels such as "**Accuracy:** Good",
        # "1. Accuracy: Good" or "- Accuracy: Good".
        field = label.strip(" \t*_#-0123456789.)").lower()

        if sep and field in _RAG_JUDGE_FIELDS and field not in parsed:
            if field == "reasoning":
                parsed[field] = value.strip().lstrip("*_ ")
            else:
                # Ratings may come back wrapped as "[Good]" or "**Good**".
                parsed[field] = value.strip(" \t*_[]`")
            last_field = field
        elif last_field == "reasoning":
            # Later lines extend the explanation; they must not overwrite a
            # rating even when they start with a rating label.
            parsed["reasoning"] = f"{parsed['reasoning']} {line}".strip()

    return {field: parsed.get(field, "N/A") for field in _RAG_JUDGE_FIELDS}


def _judge_rag_answer(query, answer, context, judge_model="gpt-4o-mini"):
    """Grade ``answer`` against ``context`` with a chat model; never raises."""
    judge_prompt = f"""
You are an expert evaluator. Assess this RAG answer based on the given context.

CONTEXT:
{context}

QUESTION: {query}

ANSWER: {answer}

Evaluate on these 3 criteria using this scale:
- Poor: Incorrect, incomplete, or irrelevant
- Below Average: Partially correct but missing key information  
- Average: Correct but basic, meets minimum requirements
- Good: Comprehensive, accurate, and well-structured
- Excellent: Outstanding accuracy, completeness, and clarity

CRITERIA:
1. Accuracy: Is the answer factually correct based on the context?
2. Completeness: Does it address all parts of the question?
3. Clarity: Is it well-structured and easy to understand?

Respond in this exact format:
Accuracy: [Rating]
Completeness: [Rating] 
Clarity: [Rating]
Overall: [Rating]
Reasoning: [Brief explanation]
"""

    try:
        judge_response = client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.1,  # near-deterministic grading
            max_tokens=300
        )
        return _parse_rag_judge_reply(judge_response.choices[0].message.content)

    except Exception as e:
        # Deliberate best-effort: a judge failure is recorded in the row
        # instead of aborting the whole sweep.
        return {
            'accuracy': 'Error',
            'completeness': 'Error',
            'clarity': 'Error',
            'overall': 'Error',
            'reasoning': f'Evaluation failed: {str(e)}'
        }


def run_rag_experiment(query, temp, k=3, model="gpt-4o-mini", max_tokens=200,
                       judge_model="gpt-4o-mini"):
    """Answer ``query`` using retrieved context and have an LLM judge grade it.

    Reads the module-level ``chunks`` and ``embeddings`` for retrieval.

    Args:
        query: Question to answer.
        temp: Sampling temperature for the generation call.
        k: Number of chunks to retrieve as context.
        model: Chat model used to generate the answer.
        max_tokens: Completion cap for the generated answer.
        judge_model: Chat model used to grade the answer.

    Returns:
        dict (one result row) with the query, temperature, answer, context
        length in characters, latency in seconds (retrieval plus generation,
        rounded to 2 decimals), total token usage of the generation call, and
        the judge's accuracy / completeness / clarity / overall_rating /
        reasoning.
    """
    start = time.time()

    # Retrieve relevant chunks
    retrieved = retrieve_chunks(query, chunks, embeddings, k=k)
    context = "\n\n".join([item['chunk']['text'] for item in retrieved])

    # Generate response with context
    prompt = f"""
Use the context below to answer the question. Be accurate and cite specific information from the context.

Context:
{context}

Question: {query}

Answer:"""

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temp,
        max_tokens=max_tokens
    )

    # Stop the clock before judging so the judge call is not counted.
    latency = time.time() - start
    answer = response.choices[0].message.content

    # Get LLM judge evaluation (with context)
    eval_result = _judge_rag_answer(query, answer, context, judge_model=judge_model)

    return {
        "query": query,
        "temperature": temp,
        "answer": answer,
        "context_length": len(context),
        "latency": round(latency, 2),
        "tokens": response.usage.total_tokens,
        "accuracy": eval_result['accuracy'],
        "completeness": eval_result['completeness'],
        "clarity": eval_result['clarity'],
        "overall_rating": eval_result['overall'],
        "reasoning": eval_result['reasoning']
    }

# Run RAG experiments
print("\nRunning RAG temperature experiments...")

# Every (query, temperature) pair in query-major order, as a nested loop would visit them.
rag_grid = [(q, t) for q in TEST_QUERIES for t in temperatures]

rag_results = []
for query, temp in rag_grid:
    print(f"Testing RAG query: '{query[:50]}...' at temperature {temp}")
    result = run_rag_experiment(query, temp)
    rag_results.append(result)
    time.sleep(1)  # Rate limiting

# Build the frame once from the accumulated row dicts.
rag_df = pd.DataFrame(rag_results)
print(f"\nCompleted {len(rag_results)} RAG experiments")


Running RAG temperature experiments...
Testing RAG query: 'What are the key capabilities of Gemini models?...' at temperature 0.0
Testing RAG query: 'What are the key capabilities of Gemini models?...' at temperature 0.2
Testing RAG query: 'What are the key capabilities of Gemini models?...' at temperature 0.5
Testing RAG query: 'What are the key capabilities of Gemini models?...' at temperature 0.8
Testing RAG query: 'What are the key capabilities of Gemini models?...' at temperature 1.2
Testing RAG query: 'How does Gemini compare to other multimodal models...' at temperature 0.0
Testing RAG query: 'How does Gemini compare to other multimodal models...' at temperature 0.2
Testing RAG query: 'How does Gemini compare to other multimodal models...' at temperature 0.5
Testing RAG query: 'How does Gemini compare to other multimodal models...' at temperature 0.8
Testing RAG query: 'How does Gemini compare to other multimodal models...' at temperature 1.2
Testing RAG query: 'What are the di

In [8]:
# Show the collected RAG result rows as this cell's output.
rag_df

Unnamed: 0,query,temperature,answer,context_length,latency,tokens,accuracy,completeness,clarity,overall_rating,reasoning
0,What are the key capabilities of Gemini models?,0.0,The key capabilities of Gemini models include:...,10375,4.28,2528,Good,Good,Good,Good,The answer accurately reflects the key capabil...
1,What are the key capabilities of Gemini models?,0.2,The key capabilities of Gemini models include:...,10375,4.27,2528,Good,Average,Good,Good,The answer accurately captures the key capabil...
2,What are the key capabilities of Gemini models?,0.5,The key capabilities of Gemini models include:...,10375,4.54,2528,Good,Good,Good,Good,The answer accurately captures the key capabil...
3,What are the key capabilities of Gemini models?,0.8,The key capabilities of Gemini models include ...,10375,4.45,2528,Good,Good,Good,Good,The answer accurately reflects the key capabil...
4,What are the key capabilities of Gemini models?,1.2,The key capabilities of Gemini models include:...,10375,3.78,2528,Excellent,Good,Good,Good,The answer accurately reflects the key capabil...
5,How does Gemini compare to other multimodal mo...,0.0,"Gemini models, particularly the Gemini Ultra v...",10168,7.41,2305,Good,Good,Good,Good,The answer accurately reflects the advancement...
6,How does Gemini compare to other multimodal mo...,0.2,"Gemini models, particularly the Gemini Ultra v...",10168,4.08,2305,Excellent,Good,Good,Good,The answer accurately reflects the advancement...
7,How does Gemini compare to other multimodal mo...,0.5,"Gemini models, particularly the Gemini Ultra v...",10168,3.82,2305,Excellent,Good,Good,Good,The answer accurately reflects the information...
8,How does Gemini compare to other multimodal mo...,0.8,"Gemini models, particularly the Gemini Ultra, ...",10168,4.27,2305,Excellent,Good,Good,Good,The answer accurately reflects the strengths o...
9,How does Gemini compare to other multimodal mo...,1.2,Gemini is positioned as a highly capable multi...,10168,4.33,2305,Excellent,Good,Good,Good,The answer accurately reflects the strengths o...
