# LLM Prompting Research Tool

This is a small notebook for experimenting with LLM prompts for MHC. Written on 06-20-25.

In [None]:
import json
import requests
import os
from google.colab import userdata
from typing import List, Dict, Any, Tuple
import pandas as pd
from datetime import datetime
import re

## Configuration

Set up the OpenAI API key and model configuration:

In [None]:
# Configuration
# The API key is fetched through Colab's `userdata` helper under the name
# 'OPENAI_API_KEY' (presumably a notebook secret -- confirm it is set up).
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
MODEL = 'gpt-3.5-turbo'  # default model used by LLMPromptTester
MAX_TOKENS = 1000  # sent as 'max_tokens' in each chat completion request
TEMPERATURE = 0.7  # sent as 'temperature' in each chat completion request
MAX_RETRIES = 3  # default number of attempts per API call

# NOTE: printed unconditionally; the key's value is not checked here.
print("✅ API key configured")

✅ API key configured


## Core LLM Interface

Taken from the planNudges.ts implementation:

In [None]:
class LLMPromptTester:
    """Send prompts to the OpenAI chat completions API and validate the replies.

    Each call to ``test_prompt`` appends one result record to
    ``results_history`` so a whole session can be analyzed or exported later.
    """

    def __init__(self, api_key: str, model: str = MODEL, max_tokens: int = MAX_TOKENS, temperature: float = TEMPERATURE):
        """Store the API credentials and the request parameters.

        Args:
            api_key: Bearer token sent in the ``Authorization`` header.
            model: Model name sent in the request body.
            max_tokens: Value of ``max_tokens`` in the request body.
            temperature: Value of ``temperature`` in the request body.
        """
        self.api_key = api_key
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # One dict per test_prompt() call, in call order.
        self.results_history = []

    async def call_openai_api(self, prompt: str, retries: int = MAX_RETRIES) -> Dict[str, Any]:
        """Send ``prompt`` as a single user message, retrying on any failure.

        The HTTP request itself is made with the blocking ``requests``
        library; only the pause between attempts is awaited.

        Args:
            prompt: Text sent as the content of the user message.
            retries: Maximum number of attempts.

        Returns:
            A dict with keys ``success`` (bool), ``content`` (the reply text,
            or ``None`` on failure), ``attempt`` (the attempt that succeeded,
            or ``retries`` on failure) and ``error`` (the last error message,
            or ``None`` on success).
        """
        # Local import: the notebook's import cell does not import asyncio.
        import asyncio

        last_error = None

        for attempt in range(1, retries + 1):
            try:
                response = requests.post(
                    'https://api.openai.com/v1/chat/completions',
                    headers={
                        'Authorization': f'Bearer {self.api_key}',
                        'Content-Type': 'application/json'
                    },
                    json={
                        'model': self.model,
                        'messages': [{'role': 'user', 'content': prompt}],
                        'max_tokens': self.max_tokens,
                        'temperature': self.temperature
                    },
                    timeout=30
                )

                if not response.ok:
                    raise Exception(f'OpenAI API error: {response.status_code} {response.text}')

                data = response.json()
                content = data['choices'][0]['message']['content']

                return {
                    'success': True,
                    'content': content,
                    'attempt': attempt,
                    'error': None
                }

            # Deliberately broad: network errors, HTTP errors and unexpected
            # response shapes are all reported and retried the same way.
            except Exception as error:
                last_error = str(error)
                print(f"Attempt {attempt}/{retries} failed: {error}")

                if attempt < retries:
                    # Progressive backoff (1s, 2s, ...). Awaited rather than
                    # time.sleep() so the coroutine does not block the event
                    # loop while it waits.
                    await asyncio.sleep(attempt)

        return {
            'success': False,
            'content': None,
            'attempt': retries,
            'error': last_error
        }

    def validate_nudge_response(self, content: str, expected_count: int = 7) -> Dict[str, Any]:
        """Validate LLM response format based on planNudges.ts validation logic.

        The response must be a JSON array of ``expected_count`` objects, each
        with non-blank string ``title`` and ``body`` fields. Titles longer
        than 100 characters and bodies longer than 300 characters only
        produce warnings.

        Args:
            content: Raw text returned by the model.
            expected_count: Required number of items in the array.

        Returns:
            A dict with keys ``is_valid`` (True only when no errors were
            recorded), ``parsed_data`` (the decoded JSON, or ``None`` if
            decoding failed), ``errors`` and ``warnings`` (lists of messages).
        """
        validation_result = {
            'is_valid': False,
            'parsed_data': None,
            'errors': [],
            'warnings': []
        }
        errors = validation_result['errors']
        warnings = validation_result['warnings']

        # (field name, length above which a warning is issued)
        field_limits = (('title', 100), ('body', 300))

        try:
            # Try to parse JSON
            parsed_nudges = json.loads(content)
            validation_result['parsed_data'] = parsed_nudges

            # Check if it's an array
            if not isinstance(parsed_nudges, list):
                errors.append('Response is not an array')
                return validation_result

            # Check expected count
            if len(parsed_nudges) != expected_count:
                errors.append(f'Expected {expected_count} items, got {len(parsed_nudges)}')

            # Validate each nudge structure
            for i, nudge in enumerate(parsed_nudges):
                if not isinstance(nudge, dict):
                    errors.append(f'Item {i} is not an object')
                    continue

                for field, max_length in field_limits:
                    if field not in nudge:
                        errors.append(f'Item {i} missing "{field}" field')
                        continue

                    value = nudge[field]
                    if not isinstance(value, str):
                        # Non-string values are reported once and skipped for
                        # the length check: len() is undefined for numbers and
                        # meaningless for lists/dicts.
                        errors.append(f'Item {i} has invalid {field}')
                        continue

                    if not value.strip():
                        errors.append(f'Item {i} has invalid {field}')

                    # Check for reasonable length
                    if len(value) > max_length:
                        warnings.append(f'Item {i} {field} is very long ({len(value)} chars)')

            # If no errors, mark as valid
            if not errors:
                validation_result['is_valid'] = True

        except json.JSONDecodeError as e:
            errors.append(f'Invalid JSON: {str(e)}')
        except Exception as e:
            # e.g. content is None, which json.loads rejects with a TypeError
            errors.append(f'Validation error: {str(e)}')

        return validation_result

    async def test_prompt(self, prompt: str, expected_count: int = 7, description: str = "") -> Dict[str, Any]:
        """Send a prompt, validate the reply, print a report and record the result.

        Args:
            prompt: Prompt text to send.
            expected_count: Number of nudges the reply must contain.
            description: Label used in the printed report and stored in the
                result record.

        Returns:
            The result record (also appended to ``results_history``). Its
            ``validation`` entry is ``None`` when the API call failed.
        """
        print(f"\n🧪 Testing prompt: {description or 'Unnamed test'}")
        print(f"📝 Prompt length: {len(prompt)} characters")

        # Call API
        api_result = await self.call_openai_api(prompt)

        result = {
            'timestamp': datetime.now().isoformat(),
            'description': description,
            'prompt': prompt,
            'api_success': api_result['success'],
            'api_attempts': api_result['attempt'],
            'api_error': api_result['error'],
            'raw_response': api_result['content'],
            'validation': None
        }

        if api_result['success']:
            print(f"✅ API call successful (attempt {api_result['attempt']})")

            # Validate response
            validation = self.validate_nudge_response(api_result['content'], expected_count)
            result['validation'] = validation

            if validation['is_valid']:
                print("✅ Response validation passed")
                if validation['warnings']:
                    print(f"⚠️  Warnings: {len(validation['warnings'])}")
                    for warning in validation['warnings']:
                        print(f"   - {warning}")
            else:
                print("❌ Response validation failed")
                print(f"🔍 Errors: {len(validation['errors'])}")
                for error in validation['errors']:
                    print(f"   - {error}")
        else:
            print(f"❌ API call failed after {api_result['attempt']} attempts")
            print(f"🔍 Error: {api_result['error']}")

        # Store result
        self.results_history.append(result)

        return result

## Predefined Test Prompts

Based on the nudge generation prompts from planNudges.ts:

In [None]:
# Original English prompt from planNudges.ts
# NOTE: these are plain (non-f) strings, so the double braces such as
# {{step_count}} are sent to the model literally.

ORIGINAL_ENGLISH_PROMPT = """Generate 7 motivational sports and exercise nudges for a heart health study participant. Each nudge should:
- Be encouraging and positive
- Focus on different types of physical activities and sports
- Be personalized and engaging
- Include a clear call to action
- Be suitable for someone in a heart health study
- Optionally include dynamic data variables (e.g., {{step_count}}, {{weekly_goal}}, {{last_activity}}, {{weather}}, etc.) that can be filled in later

Return the response as a JSON array with exactly 7 objects, each having "title" and "body" fields.
Example format:
[
  {"title": "Morning Energy Boost", "body": "Start your day with a 15-minute walk! Your heart will love the gentle cardio."},
  {"title": "Step It Up", "body": "You're just {{step_count}} steps away from your goal—keep moving!"}
]

Make each nudge unique and focus on different activities like walking, swimming, dancing, team sports, strength training, yoga, etc."""

# Original Spanish prompt from planNudges.ts
# NOTE: unlike the English prompt, this one has no bullet about dynamic data
# variables, and its example array ends with a literal "..." line.
ORIGINAL_SPANISH_PROMPT = """Genera 7 recordatorios motivacionales de deportes y ejercicio para un participante en un estudio de salud cardíaca. Cada recordatorio debe:
- Ser alentador y positivo
- Enfocarse en diferentes tipos de actividades físicas y deportes
- Ser personalizado y atractivo
- Incluir una llamada clara a la acción
- Ser adecuado para alguien en un estudio de salud cardíaca

Devuelve la respuesta como un array JSON con exactamente 7 objetos, cada uno con campos "title" y "body".
Formato de ejemplo:
[
  {"title": "Impulso de Energía Matutino", "body": "¡Comienza tu día con una caminata de 15 minutos! Tu corazón amará el cardio suave."},
  ...
]

Haz cada recordatorio único y enfócate en diferentes actividades como caminar, nadar, bailar, deportes de equipo, entrenamiento de fuerza, yoga, etc."""

# Test prompts for experimentation
# Keys are the names used to select a prompt in the cells below; values are
# the prompt text sent to the model.
TEST_PROMPTS = {
    'original_english': ORIGINAL_ENGLISH_PROMPT,
    'original_spanish': ORIGINAL_SPANISH_PROMPT,

    # Minimal instructions: no example and no explicit item-count constraint
    # on the JSON output beyond "Create 7".
    'simplified_english': """Create 7 exercise reminders for heart health study participants. Each should have a "title" and "body". Return as JSON array.
Make them motivational and focus on different activities.""",

    # Numbered requirements plus explicit field-length limits.
    'detailed_english': """Generate exactly 7 motivational exercise nudges for heart health study participants. Requirements:
1. Each nudge must be encouraging and positive
2. Focus on diverse physical activities (walking, swimming, cycling, dancing, strength training, yoga, team sports)
3. Include specific, actionable advice
4. Appropriate for cardiovascular health improvement
5. Vary the tone and approach for each nudge

CRITICAL: Return ONLY a valid JSON array with exactly 7 objects. Each object must have exactly these fields:
- "title": string (maximum 50 characters)
- "body": string (maximum 150 characters)

Example format:
[{"title": "Morning Walk Challenge", "body": "Start today with a 10-minute walk. Your heart will thank you!"}]""",

    # Frames the model as a JSON generator and forbids text outside the JSON.
    'json_focused': """You are a JSON generator. Generate exactly 7 exercise nudges.

Output ONLY valid JSON in this exact format:
[{"title": "Exercise Title", "body": "Exercise description with call to action"}]

Requirements:
- Exactly 7 items
- Heart health focus
- Different activities each
- Positive tone
- No additional text outside JSON"""
}

## Initialize Tester and Run Tests

In [None]:
# Build the shared tester instance used by every cell below
tester = LLMPromptTester(OPENAI_API_KEY)

# Report the active configuration, one setting per line
startup_banner = "\n".join([
    "🚀 LLM Prompt Tester initialized",
    f"📊 Model: {MODEL}",
    f"🌡️  Temperature: {TEMPERATURE}",
    f"📝 Max tokens: {MAX_TOKENS}",
])
print(startup_banner)

🚀 LLM Prompt Tester initialized
📊 Model: gpt-3.5-turbo
🌡️  Temperature: 0.7
📝 Max tokens: 1000


## Test Individual Prompts

Run this cell to test a specific prompt:

In [None]:
# Test a specific prompt
prompt_name = 'ORIGINAL_ENGLISH_PROMPT'  # Change this to test different prompts
result = await tester.test_prompt(
    TEST_PROMPTS[prompt_name],
    description=f"Testing {prompt_name} prompt"
)

# Display the result
if result['api_success'] and result['validation']['is_valid']:
    print("\n📋 Generated Nudges:")
    for i, nudge in enumerate(result['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")

NameError: name 'TEST_PROMPTS' is not defined

## Batch Test All Prompts

Run all test prompts and compare results:

In [None]:
# Test all prompts
print("🔬 Running batch tests on all prompts...\n")

batch_results = {}
for prompt_name, prompt_text in TEST_PROMPTS.items():
    result = await tester.test_prompt(
        prompt_text,
        description=f"Batch test: {prompt_name}"
    )
    batch_results[prompt_name] = result

    # Small delay to avoid rate limiting
    import time
    time.sleep(1)

print("\n✅ Batch testing complete!")

🔬 Running batch tests on all prompts...


🧪 Testing prompt: Batch test: original_english
📝 Prompt length: 698 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: original_spanish
📝 Prompt length: 790 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: simplified_english
📝 Prompt length: 183 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: detailed_english
📝 Prompt length: 720 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: json_focused
📝 Prompt length: 321 characters
✅ API call successful (attempt 1)
✅ Response validation passed

✅ Batch testing complete!


## Results Analysis

In [None]:
# Condense each batch result into one flat row for tabular display
results_summary = []

for prompt_name, result in batch_results.items():
    # 'validation' is None when the API call itself failed
    validation = result['validation']
    summary = {
        'prompt_name': prompt_name,
        'api_success': result['api_success'],
        'api_attempts': result['api_attempts'],
        'validation_success': validation['is_valid'] if validation else False,
        'error_count': len(validation['errors']) if validation else 0,
        'warning_count': len(validation['warnings']) if validation else 0,
        'prompt_length': len(result['prompt']),
    }
    results_summary.append(summary)

# Tabulate the rows
df_results = pd.DataFrame(results_summary)
print("📊 Results Summary:")
print(df_results.to_string(index=False))

# Share of prompts whose response passed validation, as a percentage
passed_count = df_results['validation_success'].sum()
total_count = len(df_results)
success_rate = (passed_count / total_count) * 100
print(f"\n🎯 Overall success rate: {success_rate:.1f}%")

📊 Results Summary:
  original_english         True             1                True            0              0            698
  original_spanish         True             1                True            0              0            790
simplified_english         True             1                True            0              0            183
  detailed_english         True             1                True            0              0            720
      json_focused         True             1                True            0              0            321

🎯 Overall success rate: 100.0%


## Custom Prompt Testing

Use this section to test your own custom prompts:

In [None]:
# Custom prompt for testing
custom_prompt = """
Write your custom prompt here...
"""

# Test custom prompt
if custom_prompt.strip():
    custom_result = await tester.test_prompt(
        custom_prompt,
        description="Custom prompt test"
    )

    if custom_result['api_success'] and custom_result['validation']['is_valid']:
        print("\n📋 Custom Prompt Results:")
        for i, nudge in enumerate(custom_result['validation']['parsed_data'], 1):
            print(f"{i}. {nudge['title']}")
            print(f"   {nudge['body']}\n")
else:
    print("✏️  Enter a custom prompt in the cell above to test it")


🧪 Testing prompt: Custom prompt test
📝 Prompt length: 34 characters
✅ API call successful (attempt 1)
❌ Response validation failed
🔍 Errors: 1
   - Invalid JSON: Expecting value: line 1 column 1 (char 0)


## Export Results

Export test results for further analysis:

In [None]:
# Export results to JSON
import json
from datetime import datetime

# Take the time once so the session timestamp and the filename always agree.
exported_at = datetime.now()

export_data = {
    'test_session': {
        'timestamp': exported_at.isoformat(),
        'model': MODEL,
        'temperature': TEMPERATURE,
        'max_tokens': MAX_TOKENS
    },
    'results': tester.results_history
}

filename = f"llm_test_results_{exported_at.strftime('%Y%m%d_%H%M%S')}.json"
# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII text (e.g. the
# Spanish prompts and responses) readable in the file instead of \uXXXX
# escapes, independent of the platform's default encoding.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(export_data, f, indent=2, ensure_ascii=False)

print(f"📁 Results exported to: {filename}")
print(f"📊 Total tests conducted: {len(tester.results_history)}")

## Validation Utilities

Additional utilities for response validation:

In [None]:
def analyze_response_quality(responses: List[Dict]) -> Dict[str, Any]:
    """Summarize length and vocabulary statistics for a list of nudges.

    Args:
        responses: Nudge dicts; the ``title`` and ``body`` entries are read
            when present.

    Returns:
        A dict with ``title_lengths`` and ``body_lengths`` (character counts),
        ``common_words`` (frequency of body words longer than three
        characters, lower-cased), ``top_words`` (up to ten ``(word, count)``
        pairs, most frequent first), and the always-empty ``activity_types``
        and ``sentiment_indicators`` lists. ``avg_title_length`` and
        ``avg_body_length`` are included only when at least one title or
        body, respectively, was seen.
    """
    title_sizes = [len(item['title']) for item in responses if 'title' in item]
    body_texts = [item['body'] for item in responses if 'body' in item]
    body_sizes = [len(text) for text in body_texts]

    # Word frequencies over all bodies, ignoring words of three letters or fewer
    frequencies = {}
    for text in body_texts:
        for token in re.findall(r'\b\w+\b', text.lower()):
            if len(token) <= 3:
                continue
            frequencies[token] = frequencies.get(token, 0) + 1

    report = {
        'title_lengths': title_sizes,
        'body_lengths': body_sizes,
        'common_words': frequencies,
        'activity_types': [],
        'sentiment_indicators': [],
    }

    # Averages are only defined when there is something to average
    if title_sizes:
        report['avg_title_length'] = sum(title_sizes) / len(title_sizes)
    if body_sizes:
        report['avg_body_length'] = sum(body_sizes) / len(body_sizes)

    # Stable sort: words with equal counts keep first-seen order
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    report['top_words'] = ranked[:10]

    return report

# Example usage: pool the nudges from every run that passed validation
successful_responses = []
for result in tester.results_history:
    if not result['api_success']:
        continue
    if not result['validation'] or not result['validation']['is_valid']:
        continue
    successful_responses.extend(result['validation']['parsed_data'])

if not successful_responses:
    print("No successful responses to analyze yet. Run some tests first!")
else:
    quality_analysis = analyze_response_quality(successful_responses)
    top_five_words = [word for word, count in quality_analysis['top_words'][:5]]
    print("📈 Response Quality Analysis:")
    print(f"Average title length: {quality_analysis.get('avg_title_length', 0):.1f} characters")
    print(f"Average body length: {quality_analysis.get('avg_body_length', 0):.1f} characters")
    print(f"Most common words: {top_five_words}")