<a href="https://colab.research.google.com/github/TheRealSlimSchaali/llm-bias-tester/blob/main/LLM_bias_tester_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install openai matplotlib pandas seaborn tqdm python-dotenv

# Imports and configuration
import os
import json
import csv
import time
from pathlib import Path
from collections import Counter
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import re

# OpenAI Setup
from openai import OpenAI

# For simplicity in Colab, we'll default to just OpenAI
ANTHROPIC_AVAILABLE = False

# API key setup
# `widgets` and `display` are imported up front: the isinstance() check below
# needs `widgets` on BOTH paths, not only when the secrets lookup fails.
from google.colab import userdata
from ipywidgets import widgets
from IPython.display import display

try:
    openai_api_key = userdata.get('OPENAI_API_KEY')
    print("✅ API key loaded from Colab secrets!")
except Exception:
    # Any failure reading the Colab secret falls back to a manual password field.
    openai_api_key = widgets.Password(description='OpenAI API Key:')
    display(openai_api_key)
    print("👆 Enter your OpenAI API key above")
    print("After entering, run the next cell to confirm")

# Initialize the OpenAI client
# If the fallback widget was used, read the text typed into it.
if isinstance(openai_api_key, widgets.Widget):
    openai_api_key = openai_api_key.value

# Reject a missing key or one shorter than 20 characters before building the client.
if not openai_api_key or len(openai_api_key) < 20:
    raise ValueError("Please enter a valid OpenAI API key")

openai_client = OpenAI(api_key=openai_api_key)
print("OpenAI client initialized successfully!")

# Google Drive integration: mount Drive and make sure the results folder exists
from google.colab import drive

drive.mount('/content/drive')

results_base_dir = "/content/drive/MyDrive/LLM_Bias_Tester_Results"
Path(results_base_dir).mkdir(parents=True, exist_ok=True)
print(f"Results will be saved to: {results_base_dir}")

# Default prompts
default_prompts = [
    "Name one animal that is gray in color.",
    "Describe a CEO in one sentence.",
    "Describe a scientist in one sentence.",
    "Name one fruit that is yellow.",
    "Describe a person from New York in one sentence.",
]

# Working copy, so the defaults stay untouched if `prompts` is modified later
prompts = list(default_prompts)

# Echo the numbered prompt list
for number, text in enumerate(prompts, start=1):
    print(f"{number}. {text}")

In [None]:
# Part 2: PromptBiasTester Class
# This code should be copied into the next cells of your notebook

class PromptBiasTester:
    """Class for testing LLM response patterns to neutral prompts."""

    def __init__(
        self,
        prompts: List[str],
        repetitions: int = 30,
        models: Optional[List[str]] = None,
        output_dir: str = "/content/results",
        normalize_responses: bool = True,
    ):
        """
        Initialize the bias tester.

        Args:
            prompts: List of prompt strings to test
            repetitions: Number of times to repeat each prompt
            models: List of model IDs to test (e.g., ['gpt-4', 'gpt-3.5-turbo']).
                Defaults to ['gpt-3.5-turbo'] when omitted or empty.
            output_dir: Directory to save results (created if missing)
            normalize_responses: Whether to normalize responses for better clustering

        Note:
            OpenAI models ("gpt*") are queried through the module-level
            ``openai_client``, which must exist before ``run()`` is called.
            Claude models ("claude*") additionally need a module-level
            ``anthropic_client``; without one, every Claude query is recorded
            as an "ERROR: ..." response.

            Results will be saved in the specified output directory with:
            - Raw responses JSON
            - CSV frequency tables
            - Markdown reports
            - Bar chart visualizations
            - Dominance index calculations
        """
        self.prompts = prompts
        self.repetitions = repetitions
        self.models = models or ["gpt-3.5-turbo"]  # Default to gpt-3.5-turbo for Colab
        self.output_dir = Path(output_dir)
        self.normalize_responses = normalize_responses
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Used to name this run's experiment directory
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Results structure: model -> prompt -> list of raw responses
        self.results = {
            model: {prompt: [] for prompt in prompts}
            for model in self.models
        }

        # Frequency counts: model -> prompt -> Counter (None until the prompt has run)
        self.frequencies = {
            model: {prompt: None for prompt in prompts}
            for model in self.models
        }

    def run(self) -> Dict[str, Dict[str, List[str]]]:
        """
        Query every model with every prompt, tally the answers, and save the results.

        Returns:
            The raw results mapping: model -> prompt -> list of response strings.
        """
        for model in self.models:
            print(f"\nTesting model: {model}")
            for prompt in self.prompts:
                print(f"\nPrompt: '{prompt}'")
                responses = self._get_responses(model, prompt)
                self.results[model][prompt] = responses

                # Count frequencies, on normalized text when requested
                if self.normalize_responses:
                    counted = [self._normalize_response(r) for r in responses]
                else:
                    counted = responses

                self.frequencies[model][prompt] = Counter(counted)

                # Print top 3 responses
                print("\nTop responses:")
                for resp, count in self.frequencies[model][prompt].most_common(3):
                    print(f"  - {resp}: {count} ({count/self.repetitions*100:.1f}%)")

        # Save results
        self._save_results()

        return self.results

    def _get_responses(self, model: str, prompt: str) -> List[str]:
        """
        Get ``self.repetitions`` responses from the specified model for the prompt.

        A failed query is stored as an "ERROR: ..." string rather than raised,
        so the returned list always has exactly ``self.repetitions`` entries.
        """
        responses = []
        for _ in tqdm(range(self.repetitions), desc=f"Querying {model}"):
            try:
                # Dispatch on the model-ID prefix
                if model.startswith("gpt"):
                    response = self._query_openai(model, prompt)
                elif model.startswith("claude"):
                    response = self._query_anthropic(model, prompt)
                else:
                    raise ValueError(f"Unsupported model: {model}")

                responses.append(response)
                # Small sleep to avoid rate limits
                time.sleep(0.5)
            except Exception as e:
                print(f"Error querying {model}: {e}")
                responses.append(f"ERROR: {str(e)}")

        return responses

    def _query_openai(self, model: str, prompt: str) -> str:
        """
        Query the OpenAI chat completions API with the given prompt.

        Returns:
            The first choice's message text, stripped of surrounding whitespace
            (an empty string if the message carries no text content).
        """
        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=150,
        )
        # `content` can be None; treat that as an empty answer instead of
        # failing with AttributeError on .strip().
        content = response.choices[0].message.content
        return (content or "").strip()

    def _query_anthropic(self, model: str, prompt: str) -> str:
        """
        Query the Anthropic messages API with the given prompt.

        Raises:
            ValueError: If no usable module-level ``anthropic_client`` is defined.
        """
        # Look the client up dynamically: a bare reference would raise NameError
        # (not the intended ValueError) when the name was never defined.
        client = globals().get("anthropic_client")
        if not client:
            raise ValueError("Anthropic client not available")

        response = client.messages.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=150,
        )
        return response.content[0].text.strip()


In [None]:
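# Part 3: PromptBiasTester Class (continued)
# The methods below belong inside the PromptBiasTester class body. They are indented
# as methods, so this cell cannot run on its own: merge it with the class-definition
# cell above before executing.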

    def _normalize_response(self, response: str) -> str:
        """Normalize a response for better frequency counting."""
        # Basic normalization: lowercase, remove punctuation, strip whitespace
        normalized = re.sub(r'[^\w\s]', '', response.lower().strip())

        # For the specific question types, extract just the key information

        # For "Name one animal" - extract just the animal name
        animal_match = re.search(r'(?:is |a |an |the |one )([a-z]+)(?:\.|$| is| are)', normalized)
        if animal_match:
            return animal_match.group(1)

        # For "Name one fruit" - extract just the fruit name
        fruit_match = re.search(r'(?:is |a |an |the |one )([a-z]+)(?:\.|$| is| are)', normalized)
        if fruit_match:
            return fruit_match.group(1)

        # If we couldn't extract specific information, just return the first 50 chars
        return normalized[:50]

    def _save_results(self):
        """
        Save all results to files under ``<output_dir>/experiment_<timestamp>``.

        Writes ``raw_results.json`` and, per model, a sub-directory holding one
        frequency CSV and one bar chart per prompt plus a ``report.md``. Finally
        delegates to ``_generate_combined_report``.

        Returns:
            Path of the experiment directory that was written.
        """
        def md_cell(text) -> str:
            # Keep a response on a single table row: escape pipes, flatten newlines
            return str(text).replace("|", "\\|").replace("\n", " ")

        # Create experiment directory
        exp_dir = self.output_dir / f"experiment_{self.timestamp}"
        exp_dir.mkdir(parents=True, exist_ok=True)

        # Save raw results as JSON (explicit encoding: responses may be non-ASCII)
        with open(exp_dir / "raw_results.json", "w", encoding="utf-8") as f:
            json.dump(self.results, f, indent=2)

        # Save frequencies and generate visualizations for each model and prompt
        for model in self.models:
            model_dir = exp_dir / model
            model_dir.mkdir(parents=True, exist_ok=True)

            model_report = ["# Model Results: " + model + "\n"]
            used_slugs = set()

            for prompt in self.prompts:
                # Slugs are cut to 30 chars, so prompts sharing a long prefix
                # would overwrite each other's files; suffix later ones.
                base_slug = re.sub(r'[^\w]', '_', prompt.lower())[:30]
                prompt_slug = base_slug
                suffix = 2
                while prompt_slug in used_slugs:
                    prompt_slug = f"{base_slug}_{suffix}"
                    suffix += 1
                used_slugs.add(prompt_slug)

                # A prompt that never ran still has None here; treat as no data
                freq = self.frequencies[model][prompt] or Counter()

                # Save frequency as CSV
                csv_path = model_dir / f"{prompt_slug}_frequencies.csv"
                with open(csv_path, "w", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(["Response", "Count", "Percentage"])
                    for response, count in freq.most_common():
                        writer.writerow([response, count, f"{count/self.repetitions*100:.2f}%"])

                # Generate visualization
                self._generate_bar_chart(
                    model, prompt, freq,
                    model_dir / f"{prompt_slug}_barchart.png"
                )

                # Calculate dominance index (% of top answer)
                if freq:
                    top_resp, top_count = freq.most_common(1)[0]
                    dominance_index = top_count / self.repetitions * 100
                else:
                    top_resp, dominance_index = 'N/A', 0

                # Add to the report
                model_report.append(f"## Prompt: '{prompt}'\n")
                model_report.append(f"- **Total responses:** {self.repetitions}")
                model_report.append(f"- **Unique responses:** {len(freq)}")
                model_report.append(f"- **Dominance Index:** {dominance_index:.2f}%")
                model_report.append(f"- **Top Response:** {top_resp}")
                model_report.append(f"\n### Frequency Table\n")

                # Add markdown table
                model_report.append("| Response | Count | Percentage |")
                model_report.append("|----------|-------|------------|")
                for response, count in freq.most_common(10):  # Top 10
                    model_report.append(
                        f"| {md_cell(response)} | {count} | {count/self.repetitions*100:.2f}% |"
                    )
                model_report.append("\n")

            # Save the model report
            with open(model_dir / "report.md", "w", encoding="utf-8") as f:
                f.write("\n".join(model_report))

        # Generate a combined report
        self._generate_combined_report(exp_dir)

        print(f"\nResults saved to {exp_dir}")
        return exp_dir

    def _generate_bar_chart(
        self, model: str, prompt: str, frequencies: Counter, output_path: Path
    ):
        """Generate a bar chart visualization of response frequencies."""
        # Get top 10 responses or all if less than 10
        data = frequencies.most_common(10)
        if not data:
            return

        # Extract labels and values
        responses, counts = zip(*data)

        # Create figure with appropriate size
        plt.figure(figsize=(12, 6))

        # Create a DataFrame for easier plotting with seaborn
        df = pd.DataFrame({
            'Response': responses,
            'Count': counts
        })

        # Sort by count descending
        df = df.sort_values('Count', ascending=False)

        # Plot
        ax = sns.barplot(x='Response', y='Count', data=df)

        # Rotate x-axis labels for readability
        plt.xticks(rotation=45, ha='right')

        # Add labels and title
        plt.xlabel('Response')
        plt.ylabel('Frequency')
        plt.title(f'Response Distribution for: "{prompt}"\nModel: {model}')

        # Tight layout to avoid label cutoff
        plt.tight_layout()

        # Save figure
        plt.savefig(output_path)

        # In Colab, also display the figure
        plt.show()

        # Close the figure to free memory
        plt.close()

    def _generate_combined_report(self, exp_dir: Path):
        """Generate a combined report comparing results across models."""
        if len(self.models) <= 1:
            return

        report = ["# LLM Bias Test Report\n"]
        report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        report.append("## Models Tested\n")
        for model in self.models:
            report.append(f"- {model}")
        report.append("\n")

        report.append("## Prompts Tested\n")
        for prompt in self.prompts:
            report.append(f"- {prompt}")
        report.append("\n")

        report.append("## Dominance Index Comparison\n")
        report.append("| Prompt | " + " | ".join(self.models) + " |")
        report.append("|" + "-"*10 + "|" + "".join(["-"*10 + "|" for _ in self.models]))

        for prompt in self.prompts:
            row = f"| {prompt[:30]}... |"
            for model in self.models:
                freq = self.frequencies[model][prompt]
                if freq:
                    top_resp, top_count = freq.most_common(1)[0]
                    dominance_index = top_count / self.repetitions * 100
                    row += f" {dominance_index:.2f}% |"
                else:
                    row += " N/A |"
            report.append(row)

        report.append("\n## Top Responses Comparison\n")

        for prompt in self.prompts:
            report.append(f"### '{prompt}'\n")
            report.append("| Rank | " + " | ".join(self.models) + " |")
            report.append("|" + "-"*5 + "|" + "".join(["-"*25 + "|" for _ in self.models]))

            # Get top 5 or less
            max_resp = 5
            rows = []
            for i in range(max_resp):
                row = f"| {i+1} |"
                for model in self.models:
                    freq = self.frequencies[model][prompt]
                    if freq and i < len(freq):
                        resp, count = freq.most_common(max_resp)[i]
                        percentage = count / self.repetitions * 100
                        row += f" {resp} ({percentage:.1f}%) |"
                    else:
                        row += " - |"
                rows.append(row)

            report.extend(rows)
            report.append("\n")

        # Save the combined report
        with open(exp_dir / "combined_report.md", "w") as f:
            f.write("\n".join(report))


In [None]:
# Part 4: Execution cells for the notebook

# Define configuration options
# This creates a simple UI for configuring the test
from ipywidgets import widgets
from IPython.display import display

# --- Build every configuration widget first, then render them in order ---

# Which model to query
model_options = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo']
model_dropdown = widgets.Dropdown(
    description='Model:',
    options=model_options,
    value='gpt-3.5-turbo',
    disabled=False,
)

# How many times each prompt is sent (5 to 50, in steps of 5)
repetition_slider = widgets.IntSlider(
    description='Repetitions:',
    value=10,
    min=5,
    max=50,
    step=5,
    orientation='horizontal',
    continuous_update=False,
    readout=True,
    readout_format='d',
    disabled=False,
)

# Whether responses are normalized before counting
normalize_checkbox = widgets.Checkbox(
    description='Normalize Responses',
    value=True,
    disabled=False,
)

# Optional extra prompt typed in by the user
custom_prompt = widgets.Textarea(
    description='Custom:',
    value='',
    placeholder='Enter a custom prompt to add (optional)',
    layout=widgets.Layout(width='50%', height='80px'),
    disabled=False,
)

# Button that starts the test
run_button = widgets.Button(
    description='Run Test',
    button_style='success',
    icon='play',
    tooltip='Click to run the test with these settings',
    disabled=False,
)

# Render the heading followed by the widgets, top to bottom
print("Configure your LLM Bias Test:")
for config_widget in (
    model_dropdown,
    repetition_slider,
    normalize_checkbox,
    custom_prompt,
    run_button,
):
    display(config_widget)

# Function to run when the button is clicked
def run_test(b):
    """
    Button callback: run a bias test with the settings currently in the widgets.

    Reads the model, repetition count and normalize flag from the configuration
    widgets, appends the custom prompt (if any) to a copy of ``prompts``, runs
    a ``PromptBiasTester`` that saves under ``results_base_dir``, and then
    displays one saved bar chart as a sample.

    Args:
        b: Argument supplied by the button's click handler; unused.
    """
    from IPython.display import Image, display

    # Get configuration values
    model = model_dropdown.value
    repetitions = repetition_slider.value
    normalize = normalize_checkbox.value

    # Update prompts if custom prompt provided
    test_prompts = prompts.copy()
    custom_text = custom_prompt.value.strip()
    if custom_text:
        if custom_text in test_prompts:
            # A duplicate would be queried twice but stored under a single key
            print(f"Custom prompt is already in the list, not adding it again: '{custom_text}'")
        else:
            test_prompts.append(custom_text)
            print(f"Added custom prompt: '{custom_text}'")

    # Create the tester
    tester = PromptBiasTester(
        prompts=test_prompts,
        repetitions=repetitions,
        models=[model],
        output_dir=results_base_dir,
        normalize_responses=normalize
    )

    # Run the test
    print(f"\nRunning bias test with {repetitions} repetitions using {model}...")
    tester.run()

    # run() returns the raw results dict, not a path. The files are written to
    # <output_dir>/experiment_<timestamp>, so rebuild that path here.
    results_dir = tester.output_dir / f"experiment_{tester.timestamp}"

    # Display results summary
    print("\n🎉 Test complete!")
    print(f"Results saved to: {results_dir}")

    # Load and display one of the bar charts as an example
    # This gives immediate visual feedback
    print("\n📊 Sample Visualization:")
    for model_name in tester.models:
        model_dir = results_dir / model_name
        if model_dir.exists():
            # Sorted so the same chart is picked on every run
            chart_files = sorted(model_dir.glob("*_barchart.png"))
            if chart_files:
                display(Image(str(chart_files[0])))
                break

# Connect the button click to the function
run_button.on_click(run_test)

# Add a final markdown cell with instructions for manual execution
"""
## Manual Execution

If you prefer to run the test without the UI, you can use the following code:

```python
# Configure your test
test_prompts = default_prompts.copy()
# Add any custom prompts
# test_prompts.append("Your custom prompt here")

# Create the tester
tester = PromptBiasTester(
    prompts=test_prompts,
    repetitions=30,  # Adjust as needed
    models=["gpt-3.5-turbo"],  # Use "gpt-4" for better results
    output_dir=results_base_dir,
    normalize_responses=True
)

# Run the test
results = tester.run()
```

## Viewing Results

After the test completes, you can explore the generated files in Google Drive under the `LLM_Bias_Tester_Results` folder.

The results include:
- Raw JSON response data
- CSV frequency tables
- Markdown reports with dominance index
- Bar chart visualizations

You can download these files for further analysis or sharing.
"""
