# 03 - Temperature Parameter Comparison

This notebook compares different temperature values for RAG applications:
- Temperature: 0.0, 0.2, 0.5, 0.8, 1.2, 1.5, 2.0

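We'll evaluate each response with an LLM judge on accuracy, completeness, clarity, creativity, diversity, and consistency, plus an overall response-quality rating. We also record latency, token usage, and lexical diversity for every run.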

In [None]:
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import re

# Load variables from a local .env file (presumably the OpenAI API key --
# confirm against your environment) and create the client shared by all cells.
load_dotenv()
client = OpenAI()

# Temperature values to test
TEMPERATURES = [0.0, 0.2, 0.5, 0.8, 1.2, 1.5, 2.0]

# Test queries about Gemini; every query is run once per temperature.
TEST_QUERIES = [
    "What are the key capabilities of Gemini models?",
    "How does Gemini compare to other multimodal models?",
    "What are the different versions of Gemini?",
    "What training data was used for Gemini?",
    "What are the safety measures in Gemini models?",
    "How does Gemini perform on benchmarks?",
    "What is the architecture of Gemini?",
    "What are the limitations of Gemini?",
]

# Report the size of the sweep before any API call is made.
print(
    f"Testing {len(TEMPERATURES)} temperature values",
    f"Using {len(TEST_QUERIES)} test queries",
    sep="\n",
)

In [None]:
# Enhanced LLM Judge for Temperature-Specific Evaluation
# Canonical rating labels the judge is asked to use, keyed by their 1-5 score.
_RATING_LABELS = {1: 'Poor', 2: 'Below Average', 3: 'Average', 4: 'Good', 5: 'Excellent'}

# Criteria whose values are ratings (everything except the free-text reasoning).
_JUDGE_CRITERIA = ('accuracy', 'completeness', 'clarity', 'creativity',
                   'diversity', 'consistency', 'overall')


def _normalize_rating(raw):
    """Map a free-form rating ('[Good]', 'Good (4)', '4', '**good**') to its canonical label.

    Text that cannot be recognised is returned stripped but otherwise unchanged,
    so callers still see what the judge actually wrote.
    """
    cleaned = re.sub(r'[*_`\[\]()]', ' ', raw).strip()
    lowered = cleaned.lower()
    # Longest label first so 'Below Average' is never mistaken for 'Average'.
    for label in sorted(_RATING_LABELS.values(), key=len, reverse=True):
        if lowered.startswith(label.lower()):
            return label
    digit = re.search(r'\b([1-5])\b', cleaned)
    if digit:
        return _RATING_LABELS[int(digit.group(1))]
    return raw.strip()


def _parse_judge_reply(evaluation):
    """Split the judge's 'Key: value' lines into a dict keyed by lower-case criterion name."""
    parsed = {}
    for line in evaluation.strip().split('\n'):
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        # Drop markdown/list decoration such as '**Accuracy**' or '1. Accuracy'.
        key = re.sub(r'[^a-z ]', '', key.lower()).strip()
        # First occurrence wins: a later line of the reasoning that happens to
        # start with 'Accuracy: ...' must not overwrite the real rating.
        parsed.setdefault(key, value.strip())
    return parsed


def llm_judge_temperature(query, answer, context, judge_model="gpt-4o-mini"):
    """Ask an LLM judge to rate a RAG answer on six criteria plus an overall score.

    Args:
        query: The user question that was answered.
        answer: The generated answer to evaluate.
        context: Retrieved context text; only the first 2000 characters are
            shown to the judge.
        judge_model: Chat model used as the judge.

    Returns:
        dict with keys 'accuracy', 'completeness', 'clarity', 'creativity',
        'diversity', 'consistency', 'overall' and 'reasoning'. Ratings are
        normalised to 'Poor' / 'Below Average' / 'Average' / 'Good' /
        'Excellent' whenever the judge's text can be recognised; a criterion
        missing from the reply is 'N/A'. If the API call or parsing raises,
        every rating is 'Error' and 'reasoning' carries the exception message.
    """
    # Only add the ellipsis when the context really was cut, so the judge is
    # not told that a complete context is partial.
    context_excerpt = context[:2000]
    if len(context) > 2000:
        context_excerpt += "..."

    judge_prompt = f"""
You are an expert evaluator assessing RAG responses with focus on temperature effects.

CONTEXT:
{context_excerpt}

QUESTION: {query}

ANSWER: {answer}

Evaluate on these 6 criteria using this scale:
- Poor (1): Severely lacking
- Below Average (2): Partially adequate
- Average (3): Meets basic requirements
- Good (4): High quality
- Excellent (5): Outstanding

CRITERIA:
1. Accuracy: Factual correctness based on context
2. Completeness: Addresses all parts of the question
3. Clarity: Well-structured and understandable
4. Creativity: Novel insights or creative explanations
5. Diversity: Varied vocabulary and expression
6. Consistency: Logical flow and coherence

Respond in this exact format:
Accuracy: [Rating]
Completeness: [Rating]
Clarity: [Rating]
Creativity: [Rating]
Diversity: [Rating]
Consistency: [Rating]
Overall: [Rating]
Reasoning: [Brief explanation]
"""

    try:
        response = client.chat.completions.create(
            model=judge_model,
            messages=[{"role": "user", "content": judge_prompt}],
            temperature=0.1,  # near-deterministic judging
            max_tokens=400
        )

        parsed = _parse_judge_reply(response.choices[0].message.content)

        scores = {
            criterion: _normalize_rating(parsed[criterion]) if criterion in parsed else 'N/A'
            for criterion in _JUDGE_CRITERIA
        }
        scores['reasoning'] = parsed.get('reasoning', 'N/A')
        return scores

    except Exception as e:
        # Deliberate best-effort: a failed evaluation is recorded, not raised,
        # so one bad judge call does not abort a long experiment.
        failure = {criterion: 'Error' for criterion in _JUDGE_CRITERIA}
        failure['reasoning'] = f'Evaluation failed: {str(e)}'
        return failure

print("Enhanced LLM Judge function ready")

In [None]:
# Load RAG embeddings and setup
# NOTE(security): pickle.load can execute arbitrary code while unpickling.
# Only load a file that you produced yourself (presumably written by an
# earlier notebook in this series -- confirm).
EMBEDDING_MODEL = 'text-embedding-3-small'

with open('../data/rag_embeddings.pkl', 'rb') as f:
    rag_data = pickle.load(f)

chunks = rag_data['chunks']
embedding_results = rag_data['embedding_results']

# Use the best performing embedding model (text-embedding-3-small as default).
# Queries must be embedded with the same model as the stored chunk vectors.
embeddings = embedding_results[EMBEDDING_MODEL]['embeddings']

# Retrieval looks chunks up by embedding row index, so the two must align.
assert len(chunks) == len(embeddings), (
    f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) are misaligned"
)

print(f"Loaded {len(chunks)} chunks for RAG experiments")
print(f"Using embeddings from {EMBEDDING_MODEL} model")

# RAG retrieval function
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced with single spaces before the request
    is sent; ``model`` selects the embedding model.
    """
    single_line = " ".join(text.split("\n"))
    reply = client.embeddings.create(input=[single_line], model=model)
    first_item = reply.data[0]
    return first_item.embedding

def retrieve_chunks(query, chunks, embeddings, k=3):
    """Return the ``k`` chunks whose embeddings are most similar to ``query``.

    The query is embedded with ``get_embedding`` and compared against every
    row of ``embeddings`` by cosine similarity. The result is a list of
    ``{'chunk': ..., 'similarity': ...}`` dicts ordered from most to least
    similar; ``chunks[i]`` is paired with ``embeddings[i]``.
    """
    scores = cosine_similarity([get_embedding(query)], embeddings)[0]
    # argsort is ascending, so reverse it before taking the first k positions.
    best_positions = np.argsort(scores)[::-1][:k]
    return [
        {'chunk': chunks[pos], 'similarity': scores[pos]}
        for pos in best_positions
    ]

In [None]:
# Temperature RAG Experiment Function
def run_temperature_rag_experiment(query, temp):
    """Run one retrieve -> generate -> judge cycle at a given sampling temperature.

    Args:
        query: The question to answer.
        temp: Sampling temperature passed to the generation call.

    Returns:
        dict with the query, temperature, generated answer, context length in
        characters, latency in seconds (retrieval plus generation; the judge
        call is not timed), total token usage reported by the API, word-level
        statistics of the answer, and the judge's ratings and reasoning.
    """
    start = time.time()

    # Retrieve relevant chunks
    retrieved = retrieve_chunks(query, chunks, embeddings, k=3)
    context = "\n\n".join([item['chunk']['text'] for item in retrieved])

    # Generate response with context
    prompt = f"""
Use the context below to answer the question. Be accurate and cite specific information from the context.

Context:
{context}

Question: {query}

Answer:"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=temp,
        max_tokens=200
    )

    latency = time.time() - start
    # The SDK types message.content as Optional; treat a missing reply as an
    # empty answer instead of crashing on .split() below.
    answer = response.choices[0].message.content or ""

    # Get enhanced LLM judge evaluation
    evaluation = llm_judge_temperature(query, answer, context)

    # Whitespace-tokenised lexical statistics (case-insensitive for uniqueness).
    words = answer.split()
    word_count = len(words)
    unique_words = len({word.lower() for word in words})
    lexical_diversity = unique_words / word_count if word_count > 0 else 0

    return {
        "query": query,
        "temperature": temp,
        "answer": answer,
        "context_length": len(context),
        "latency": round(latency, 2),
        "tokens": response.usage.total_tokens,
        "word_count": word_count,
        "unique_words": unique_words,
        "lexical_diversity": round(lexical_diversity, 3),
        "accuracy": evaluation['accuracy'],
        "completeness": evaluation['completeness'],
        "clarity": evaluation['clarity'],
        "creativity": evaluation['creativity'],
        "diversity": evaluation['diversity'],
        "consistency": evaluation['consistency'],
        "overall_rating": evaluation['overall'],
        "reasoning": evaluation['reasoning']
    }

print("Temperature RAG experiment function ready")

In [None]:
# Run Temperature RAG Experiments
# Each (query, temperature) pair makes several API calls. A failure in one
# pair is logged and recorded in `failed_runs` rather than aborting the whole
# sweep and discarding the results collected so far.
print("Running Temperature RAG experiments...")
results = []
failed_runs = []  # dicts with query, temperature and error message

for query in TEST_QUERIES:
    for temp in TEMPERATURES:
        print(f"Testing query: '{query[:50]}...' at temperature {temp}")
        try:
            results.append(run_temperature_rag_experiment(query, temp))
        except Exception as exc:
            failed_runs.append({"query": query, "temperature": temp, "error": str(exc)})
            print(f"  FAILED ({type(exc).__name__}): {exc}")
        time.sleep(1)  # Rate limiting

df = pd.DataFrame(results)
print(f"\nCompleted {len(results)} Temperature RAG experiments")
print(f"Total combinations: {len(TEST_QUERIES)} queries × {len(TEMPERATURES)} temperatures = {len(TEST_QUERIES) * len(TEMPERATURES)}")
if failed_runs:
    print(f"{len(failed_runs)} combinations failed and were skipped; see `failed_runs`")

In [None]:
# Preview the first five experiment rows as a rich table.
df.head(n=5)

In [None]:
# Write the flat results table to CSV (without the index column).
csv_path = '../data/temperature_rag_results.csv'
df.to_csv(csv_path, index=False)
print("Results saved to temperature_rag_results.csv")

# Pickle the raw result dicts together with the inputs that produced them.
detailed_payload = dict(
    results=results,
    test_queries=TEST_QUERIES,
    temperatures=TEMPERATURES,
    chunks=chunks,
    embeddings=embeddings,
)
with open('../data/temperature_rag_detailed.pkl', 'wb') as out_file:
    pickle.dump(detailed_payload, out_file)
print("Detailed results saved to temperature_rag_detailed.pkl")

In [None]:
# Convert ratings to numeric for analysis
rating_map = {'Poor': 1, 'Below Average': 2, 'Average': 3, 'Good': 4, 'Excellent': 5}


def rating_to_numeric(value):
    """Convert one judge rating to its 1-5 score, or NaN when it is not a rating.

    Accepts the plain labels in ``rating_map`` as well as decorated forms the
    judge may emit, such as 'Good (4)', '[Good]', '**good**' or a bare '4'.
    'N/A', 'Error' and non-string values map to NaN.
    """
    if not isinstance(value, str):
        return np.nan
    text = re.sub(r'[*_`\[\]()]', ' ', value).strip().lower()
    # Longest label first so 'Below Average' is never mistaken for 'Average'.
    for label in sorted(rating_map, key=len, reverse=True):
        if text.startswith(label.lower()):
            return rating_map[label]
    digit = re.search(r'\b([1-5])\b', text)
    return int(digit.group(1)) if digit else np.nan


rating_cols = ['accuracy', 'completeness', 'clarity', 'creativity',
               'diversity', 'consistency', 'overall_rating']

for col in rating_cols:
    df[f'{col}_numeric'] = df[col].map(rating_to_numeric)

print("Converted ratings to numeric values for analysis")

# Surface unparsed ratings so they do not silently vanish from the means below.
unparsed_counts = {
    col: int(df[f'{col}_numeric'].isna().sum()) for col in rating_cols
}
if any(unparsed_counts.values()):
    print(f"Ratings that could not be converted (now NaN): {unparsed_counts}")

In [None]:
# Per-temperature mean and standard deviation of every numeric measure.
summary_columns = [
    'accuracy_numeric',
    'completeness_numeric',
    'clarity_numeric',
    'creativity_numeric',
    'diversity_numeric',
    'consistency_numeric',
    'overall_rating_numeric',
    'latency',
    'tokens',
    'lexical_diversity',
]

summary_stats = (
    df.groupby('temperature')
    .agg({name: ['mean', 'std'] for name in summary_columns})
    .round(3)
)

print("Temperature Performance Summary:")
print(summary_stats)

In [None]:
# Create comprehensive visualizations: one box plot per judged criterion,
# grouped by temperature.
plt.style.use('default')
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# `metrics` and `titles` are reused by the plotting/analysis cells below.
metrics = ['accuracy_numeric', 'completeness_numeric', 'clarity_numeric',
          'creativity_numeric', 'diversity_numeric', 'consistency_numeric']
titles = ['Accuracy', 'Completeness', 'Clarity', 'Creativity', 'Diversity', 'Consistency']

for ax, metric, title in zip(axes.flat, metrics, titles):
    # Box plot for each temperature
    df.boxplot(column=metric, by='temperature', ax=ax)
    ax.set_title(f'{title} by Temperature')
    ax.set_xlabel('Temperature')
    ax.set_ylabel(f'{title} Score (1-5)')
    ax.grid(True, alpha=0.3)

# pandas' grouped boxplot replaces the figure suptitle with
# "Boxplot grouped by temperature" on every call, so set ours last.
fig.suptitle('Temperature Effects on RAG Performance', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Mean judge score per temperature, one line-plot panel per criterion.
temp_means = df.groupby('temperature')[metrics].mean()

line_fig, line_axes = plt.subplots(2, 3, figsize=(15, 10))

for panel, metric, title in zip(line_axes.flat, metrics, titles):
    panel.plot(temp_means.index, temp_means[metric], 'o-', linewidth=2, markersize=8)
    panel.set_title(f'{title} vs Temperature')
    panel.set_xlabel('Temperature')
    panel.set_ylabel(f'{title} Score')
    panel.grid(True, alpha=0.3)
    # Fix the y-range to the rating scale so panels are directly comparable.
    panel.set_ylim(1, 5)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between temperature, the measured statistics and the
# numeric judge scores, drawn as an annotated heatmap.
corr_cols = ['temperature', 'lexical_diversity', 'latency', *metrics]
correlation_matrix = df[corr_cols].corr()

heat_fig, heat_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.3f',
    ax=heat_ax,
)
heat_ax.set_title('Temperature and Performance Metrics Correlation')
plt.tight_layout()
plt.show()

In [None]:
# Rank temperatures by overall rating and report the best one per metric.
print("\n=== TEMPERATURE COMPARISON ANALYSIS ===")

by_temperature = df.groupby('temperature')

# Mean / std / count of the overall rating, best temperature first.
overall_performance = (
    by_temperature['overall_rating_numeric']
    .agg(['mean', 'std', 'count'])
    .rename(columns={'mean': 'Mean_Score', 'std': 'Std_Dev', 'count': 'Count'})
    .sort_values('Mean_Score', ascending=False)
)

print("\nOverall Performance Ranking:")
print(overall_performance)

print("\nBest Temperature for Each Metric:")
for metric, title in zip(metrics, titles):
    metric_means = by_temperature[metric].mean()
    best_temp = metric_means.idxmax()
    best_score = metric_means.max()
    print(f"{title}: Temperature {best_temp} (Score: {best_score:.3f})")

# These three series are read again by the recommendations cell.
print("\nTemperature Insights:")
creativity_by_temp = by_temperature['creativity_numeric'].mean()
consistency_by_temp = by_temperature['consistency_numeric'].mean()
diversity_by_temp = by_temperature['lexical_diversity'].mean()

for headline, series in (
    ("Highest Creativity", creativity_by_temp),
    ("Highest Consistency", consistency_by_temp),
    ("Highest Lexical Diversity", diversity_by_temp),
):
    print(f"{headline}: Temperature {series.idxmax()} ({series.max():.3f})")

In [None]:
# Final Recommendations
# The first three sections are derived from the measured scores; the
# "use case" bullets at the end are fixed guidance, not computed from the data.
print("\n=== TEMPERATURE RECOMMENDATIONS ===")

# overall_performance is sorted best-first, so its first index is the winner.
best_overall = overall_performance.index[0]
best_creativity = creativity_by_temp.idxmax()
best_consistency = consistency_by_temp.idxmax()

print(f"\n🏆 BEST OVERALL: Temperature {best_overall}")
print(f"   - Highest overall rating: {overall_performance.loc[best_overall, 'Mean_Score']:.3f}")
print(f"   - Standard deviation: {overall_performance.loc[best_overall, 'Std_Dev']:.3f}")

print(f"\n🎨 MOST CREATIVE: Temperature {best_creativity}")
print(f"   - Creativity score: {creativity_by_temp[best_creativity]:.3f}")

print(f"\n🎯 MOST CONSISTENT: Temperature {best_consistency}")
print(f"   - Consistency score: {consistency_by_temp[best_consistency]:.3f}")

print("\n📊 USE CASE RECOMMENDATIONS:")
print("• Factual Q&A: Use lower temperatures (0.0-0.5) for accuracy")
print("• Creative content: Use higher temperatures (1.2-2.0) for diversity")
print("• Balanced responses: Use medium temperatures (0.5-0.8)")
print("• Production systems: Consider consistency vs creativity trade-offs")