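## Top-k = limit the number of candidate tokens considered at each sampling step
## Top-p = limit sampling to the smallest set of tokens whose cumulative probability reaches p

Note: the experiments below vary top-p and retrieval-k (the number of retrieved chunks); the Chat Completions call used here does not accept a `top_k` argument.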

In [1]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import time
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load variables from a local .env file into the environment before the client
# is constructed (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
client = OpenAI()

# Test queries about Gemini.
# NOTE: the experiment loops in this notebook only use the first two entries
# (TEST_QUERIES[:2]).
TEST_QUERIES = [
    "What are the key capabilities of Gemini models?",
    "How does Gemini compare to other multimodal models?", 
    "What are the different versions of Gemini?",
    "What training data was used for Gemini?",
    "What are the safety measures in Gemini models?"
]

# Parameters to test
# TOP_P_VALUES: top_p settings swept by the Top-p experiment.
# RETRIEVAL_K_VALUES: numbers of retrieved chunks swept by the Retrieval-k experiment.
TOP_P_VALUES = [0.2, 0.5, 0.9, 1.0]
RETRIEVAL_K_VALUES = [1, 3, 5, 10, 15]

TEMPERATURE = 0.7  # Keep constant across every answer-generation call
MAX_TOKENS = 200  # max_tokens passed to every answer-generation call

In [2]:
# Load RAG embeddings
# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# open files produced by a trusted pipeline; never point this at a download.
with open('../data/rag_embeddings.pkl', 'rb') as f:
    rag_data = pickle.load(f)

chunks = rag_data['chunks']
embeddings = rag_data['embeddings']

# Retrieval indexes `chunks` with positions taken from `embeddings`, so the two
# must line up one-to-one. Fail here rather than return the wrong chunk later.
assert len(chunks) == len(embeddings), (
    f"chunks/embeddings length mismatch: {len(chunks)} vs {len(embeddings)}"
)

print(f"Loaded {len(chunks)} chunks for RAG experiments")

# RAG functions
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector the embeddings endpoint produces for ``text``.

    Newlines in ``text`` are replaced by single spaces before the request is
    sent. The text is submitted as a one-element batch and the embedding of
    that single item is returned.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

def retrieve_chunks(query, chunks, embeddings, k=5):
    """Return up to ``k`` chunks ranked by cosine similarity to ``query``.

    Args:
        query: Text to embed and compare against ``embeddings``.
        chunks: Sequence of chunk objects, indexed in parallel with
            ``embeddings``.
        embeddings: Sequence of embedding vectors, one per chunk.
        k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        A list of ``{'chunk': ..., 'similarity': ...}`` dicts ordered from most
        to least similar. It is shorter than ``k`` when fewer chunks exist.
    """
    # A negative k would make the `[:k]` slice below drop the *last* |k|
    # results instead of limiting the count. With k == 0 or an empty corpus
    # there is nothing to rank, so skip the embedding API call entirely.
    if k <= 0 or len(embeddings) == 0:
        return []

    query_embedding = get_embedding(query)
    similarities = cosine_similarity([query_embedding], embeddings)[0]
    # argsort is ascending; reverse it so the most similar chunk comes first.
    top_indices = np.argsort(similarities)[::-1][:k]

    return [
        {'chunk': chunks[idx], 'similarity': similarities[idx]}
        for idx in top_indices
    ]

Loaded 82 chunks for RAG experiments


In [3]:
# LLM Judge for RAG Answer Evaluation
def _parse_judge_evaluation(evaluation):
    """Extract the five expected ``Key: value`` fields from the judge's reply.

    Fields that are absent from the reply are reported as ``'N/A'``.
    """
    fields = ('accuracy', 'completeness', 'clarity', 'overall', 'reasoning')
    parsed = {}

    for line in evaluation.strip().split('\n'):
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        # Replies often decorate the requested format with markdown or list
        # markers ("**Accuracy:** Good", "- Clarity: Good", "1. Overall: Good").
        # Strip those so the key still matches.
        key = key.strip(' \t\r\n*-#.0123456789').lower()
        value = value.strip().strip('*').strip()
        if key != 'reasoning':
            # The prompt shows ratings as "[Rating]"; drop echoed brackets.
            value = value.strip('[]').strip()
        # First occurrence wins, so a later stray "Accuracy: ..." line (e.g.
        # inside a multi-line explanation) cannot overwrite the real rating.
        if key in fields and key not in parsed:
            parsed[key] = value

    return {name: parsed.get(name, 'N/A') for name in fields}


def llm_judge_rag(query, answer, context):
    """Ask an LLM judge to rate a RAG answer against its retrieval context.

    Args:
        query: The question that was asked.
        answer: The generated answer to evaluate.
        context: The retrieved context the answer was generated from.

    Returns:
        A dict with keys ``accuracy``, ``completeness``, ``clarity``,
        ``overall`` and ``reasoning``. A field missing from the judge's reply
        is ``'N/A'``. If the request or parsing raises, the four ratings are
        ``'Error'`` and ``reasoning`` holds the failure message, so one failed
        evaluation does not abort an experiment loop.
    """
    judge_prompt = f"""
You are an expert evaluator. Assess this RAG answer based on the given context.

CONTEXT:
{context}

QUESTION: {query}

ANSWER: {answer}

Evaluate on these 3 criteria using this scale:
- Poor: Incorrect, incomplete, or irrelevant
- Below Average: Partially correct but missing key information  
- Average: Correct but basic, meets minimum requirements
- Good: Comprehensive, accurate, and well-structured
- Excellent: Outstanding accuracy, completeness, and clarity

CRITERIA:
1. Accuracy: Is the answer factually correct based on the context?
2. Completeness: Does it address all parts of the question?
3. Clarity: Is it well-structured and easy to understand?

Respond in this exact format:
Accuracy: [Rating]
Completeness: [Rating] 
Clarity: [Rating]
Overall: [Rating]
Reasoning: [Brief explanation]
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.1,
            max_tokens=300
        )

        evaluation = response.choices[0].message.content
        if evaluation is None:
            # Report a clear reason instead of an AttributeError on None.
            raise ValueError("judge returned no message content")

        return _parse_judge_evaluation(evaluation)

    except Exception as e:
        # Deliberate best-effort: report the failure in-band.
        return {
            'accuracy': 'Error',
            'completeness': 'Error',
            'clarity': 'Error',
            'overall': 'Error',
            'reasoning': f'Evaluation failed: {str(e)}'
        }

print("LLM Judge function ready")

LLM Judge function ready


In [4]:
# Experiment 1: Top-p Effect on RAG Responses
def run_top_p_rag_experiment(query, top_p, retrieval_k=5):
    """Run one retrieve -> generate -> judge pass with the given ``top_p``.

    Args:
        query: Question to answer.
        top_p: ``top_p`` value passed to the chat completion request.
        retrieval_k: Number of chunks requested from ``retrieve_chunks``.

    Returns:
        A flat dict (one results-table row) holding the inputs, the generated
        answer, the context length in characters, latency in seconds, total
        token usage and the judge's ratings. ``latency`` covers retrieval plus
        generation; the judge call is made afterwards and is not included.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    # Retrieve chunks
    retrieved = retrieve_chunks(query, chunks, embeddings, k=retrieval_k)
    context = "\n\n".join([item['chunk']['text'] for item in retrieved])

    # Generate response with different top_p
    prompt = f"""
Use the context below to answer the question. Be accurate and cite specific information from the context.

Context:
{context}

Question: {query}

Answer:"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=TEMPERATURE,
        top_p=top_p,
        max_tokens=MAX_TOKENS
    )

    latency = time.perf_counter() - start
    answer = response.choices[0].message.content

    # Get evaluation (intentionally outside the timed section)
    evaluation = llm_judge_rag(query, answer, context)

    return {
        "experiment": "top_p",
        "query": query,
        "top_p": top_p,
        "retrieval_k": retrieval_k,
        "answer": answer,
        "context_length": len(context),
        "latency": round(latency, 2),
        "tokens": response.usage.total_tokens,
        "accuracy": evaluation['accuracy'],
        "completeness": evaluation['completeness'],
        "clarity": evaluation['clarity'],
        "overall_rating": evaluation['overall'],
        "reasoning": evaluation['reasoning']
    }

# Run Top-p experiments: sweep every top_p value over a subset of the queries
print("Running Top-p RAG experiments...")
top_p_results = []

# Only the first 2 queries are used, to save time.
top_p_grid = [
    (query, top_p)
    for query in TEST_QUERIES[:2]
    for top_p in TOP_P_VALUES
]

for query, top_p in top_p_grid:
    print(f"Testing top_p={top_p} for query: '{query[:50]}...'")
    result = run_top_p_rag_experiment(query, top_p, retrieval_k=5)
    top_p_results.append(result)
    time.sleep(1)  # pause between runs (presumably to stay under API rate limits)

top_p_df = pd.DataFrame(top_p_results)
print(f"\nCompleted {len(top_p_results)} Top-p experiments")

Running Top-p RAG experiments...
Testing top_p=0.2 for query: 'What are the key capabilities of Gemini models?...'
Testing top_p=0.5 for query: 'What are the key capabilities of Gemini models?...'
Testing top_p=0.9 for query: 'What are the key capabilities of Gemini models?...'
Testing top_p=1.0 for query: 'What are the key capabilities of Gemini models?...'
Testing top_p=0.2 for query: 'How does Gemini compare to other multimodal models...'
Testing top_p=0.5 for query: 'How does Gemini compare to other multimodal models...'
Testing top_p=0.9 for query: 'How does Gemini compare to other multimodal models...'
Testing top_p=1.0 for query: 'How does Gemini compare to other multimodal models...'

Completed 8 Top-p experiments


In [6]:
top_p_df

Unnamed: 0,experiment,query,top_p,retrieval_k,answer,context_length,latency,tokens,accuracy,completeness,clarity,overall_rating,reasoning
0,top_p,What are the key capabilities of Gemini models?,0.2,5,Gemini models exhibit several key capabilities...,17216,6.97,3818,Good,Good,Good,Good,The answer accurately reflects the key capabil...
1,top_p,What are the key capabilities of Gemini models?,0.5,5,Gemini models exhibit several key capabilities...,17216,5.33,3818,Good,Good,Good,Good,The answer accurately reflects the key capabil...
2,top_p,What are the key capabilities of Gemini models?,0.9,5,Gemini models exhibit several key capabilities...,17216,5.48,3818,Good,Average,Good,Good,The answer accurately describes the key capabi...
3,top_p,What are the key capabilities of Gemini models?,1.0,5,The key capabilities of Gemini models include:...,17216,5.64,3818,Good,Good,Good,Good,The answer accurately captures the key capabil...
4,top_p,How does Gemini compare to other multimodal mo...,0.2,5,"Gemini models, particularly the Gemini Ultra v...",16853,4.71,3737,Good,Good,Good,Good,The answer accurately reflects the context pro...
5,top_p,How does Gemini compare to other multimodal mo...,0.5,5,"Gemini models, particularly the Gemini Ultra v...",16853,6.65,3737,Good,Good,Good,Good,The answer accurately reflects the advancement...
6,top_p,How does Gemini compare to other multimodal mo...,0.9,5,"The Gemini models, particularly the Gemini Ult...",16853,6.52,3737,Good,Good,Good,Good,The answer accurately reflects the advancement...
7,top_p,How does Gemini compare to other multimodal mo...,1.0,5,"Gemini models, particularly the Gemini Ultra v...",16853,5.11,3737,Excellent,Good,Good,Good,The answer accurately reflects the context pro...


In [5]:
# Experiment 2: Retrieval-k Effect on RAG Responses  
def run_retrieval_k_experiment(query, retrieval_k, top_p=0.9):
    """Run one retrieve -> generate -> judge pass with ``retrieval_k`` chunks.

    The pipeline is the same one ``run_top_p_rag_experiment`` runs; only the
    swept parameter and the ``"experiment"`` label differ. This function
    delegates to it instead of keeping a second copy of the prompt, request
    and result-row code that could drift apart.

    Args:
        query: Question to answer.
        retrieval_k: Number of chunks to retrieve for the context.
        top_p: ``top_p`` value for generation, held fixed while k is swept.

    Returns:
        The result-row dict produced by ``run_top_p_rag_experiment`` with
        ``"experiment"`` set to ``"retrieval_k"``.
    """
    result = run_top_p_rag_experiment(query, top_p, retrieval_k=retrieval_k)
    # Relabel in place: "experiment" is already the first key, so column order
    # in the resulting DataFrame is unchanged.
    result["experiment"] = "retrieval_k"
    return result

# Run Retrieval-k experiments: sweep every k over a subset of the queries
print("\nRunning Retrieval-k experiments...")
retrieval_k_results = []

# Only the first 2 queries are used.
retrieval_k_grid = [
    (query, k)
    for query in TEST_QUERIES[:2]
    for k in RETRIEVAL_K_VALUES
]

for query, k in retrieval_k_grid:
    print(f"Testing retrieval_k={k} for query: '{query[:50]}...'")
    result = run_retrieval_k_experiment(query, k, top_p=0.9)
    retrieval_k_results.append(result)
    time.sleep(1)  # pause between runs (presumably to stay under API rate limits)

retrieval_k_df = pd.DataFrame(retrieval_k_results)
print(f"\nCompleted {len(retrieval_k_results)} Retrieval-k experiments")


Running Retrieval-k experiments...
Testing retrieval_k=1 for query: 'What are the key capabilities of Gemini models?...'
Testing retrieval_k=3 for query: 'What are the key capabilities of Gemini models?...'
Testing retrieval_k=5 for query: 'What are the key capabilities of Gemini models?...'
Testing retrieval_k=10 for query: 'What are the key capabilities of Gemini models?...'
Testing retrieval_k=15 for query: 'What are the key capabilities of Gemini models?...'
Testing retrieval_k=1 for query: 'How does Gemini compare to other multimodal models...'
Testing retrieval_k=3 for query: 'How does Gemini compare to other multimodal models...'
Testing retrieval_k=5 for query: 'How does Gemini compare to other multimodal models...'
Testing retrieval_k=10 for query: 'How does Gemini compare to other multimodal models...'
Testing retrieval_k=15 for query: 'How does Gemini compare to other multimodal models...'

Completed 10 Retrieval-k experiments


In [7]:
retrieval_k_df

Unnamed: 0,experiment,query,top_p,retrieval_k,answer,context_length,latency,tokens,accuracy,completeness,clarity,overall_rating,reasoning
0,retrieval_k,What are the key capabilities of Gemini models?,0.9,1,"The key capabilities of Gemini models, particu...",3393,7.66,877,Excellent,Good,Good,Good,The answer accurately reflects the key capabil...
1,retrieval_k,What are the key capabilities of Gemini models?,0.9,3,The key capabilities of Gemini models include:...,10375,4.83,2528,Excellent,Good,Good,Good,The answer accurately reflects the key capabil...
2,retrieval_k,What are the key capabilities of Gemini models?,0.9,5,The key capabilities of Gemini models include:...,17216,4.58,3818,Good,Good,Good,Good,The answer accurately reflects the key capabil...
3,retrieval_k,What are the key capabilities of Gemini models?,0.9,10,Gemini models exhibit several key capabilities...,33928,5.22,7204,Good,Average,Good,Good,The answer accurately describes key capabiliti...
4,retrieval_k,What are the key capabilities of Gemini models?,0.9,15,Gemini models exhibit several key capabilities...,50224,6.43,11217,Good,Good,Excellent,Good,The answer accurately reflects the key capabil...
5,retrieval_k,How does Gemini compare to other multimodal mo...,0.9,1,"The Gemini models, particularly the Gemini Ult...",3449,4.43,925,Good,Below Average,Good,Average,The answer accurately describes the evaluation...
6,retrieval_k,How does Gemini compare to other multimodal mo...,0.9,3,"Gemini, specifically the Gemini Ultra model, d...",10168,4.48,2305,Excellent,Good,Good,Good,The answer accurately reflects the capabilitie...
7,retrieval_k,How does Gemini compare to other multimodal mo...,0.9,5,"Gemini models, particularly the Gemini Ultra v...",16853,4.29,3737,Good,Good,Good,Good,The answer accurately reflects the advancement...
8,retrieval_k,How does Gemini compare to other multimodal mo...,0.9,10,"Gemini, particularly its Ultra variant, demons...",33892,4.98,7784,Excellent,Good,Good,Good,The answer accurately highlights Gemini Ultra'...
9,retrieval_k,How does Gemini compare to other multimodal mo...,0.9,15,"Gemini models, particularly the Gemini Ultra v...",50561,5.18,11547,Good,Average,Good,Good,The answer accurately highlights Gemini Ultra'...


In [11]:
# Key Insights (expectations to test, not established findings: in the recorded
# outputs of this notebook the overall rating was "Good" in 17 of 18 runs, so
# this small sample does not separate the settings):
# 1. Top-p affects response creativity vs consistency in RAG
# 2. Retrieval-k affects information completeness vs noise
# 3. More chunks (higher k) = more context but potential information overload
# 4. Lower top-p = more conservative, consistent answers
# 5. Higher top-p = more creative but potentially less accurate answers

# No cell in this notebook writes CSV files, so point the reader at the
# in-memory result tables instead.
print("Notebook complete! Inspect top_p_df and retrieval_k_df for detailed analysis.")

TypeError: Completions.create() got an unexpected keyword argument 'top_k'. Did you mean 'top_p'?