# LLM Prompting Research Tool

This is a small notebook for testing LLM prompting for MHC. Written on 06-20-25.

In [None]:
import json
import requests
import os
from google.colab import userdata
from typing import List, Dict, Any, Tuple
import pandas as pd
from datetime import datetime
import re

## Configuration

Set up the OpenAI API key and model configuration:

In [None]:
# Notebook-wide settings.
# The API key is read from Colab's secret store rather than hard-coded.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

# Request tuning knobs; LLMPromptTester uses these as its defaults.
MAX_RETRIES = 3
TEMPERATURE = 0.7
MAX_TOKENS = 1000
MODEL = 'gpt-3.5-turbo'

print("✅ API key configured")

✅ API key configured


## Core LLM Interface

Taken from the planNudges.ts implementation:

In [ ]:
class LLMPromptTester:
    """Send prompts to the OpenAI chat completions API and validate nudge-style replies.

    Each run made through ``test_prompt`` is appended to ``results_history``
    so later cells can compare prompts against each other.
    """

    def __init__(self, api_key: str, model: str = MODEL, max_tokens: int = MAX_TOKENS, temperature: float = TEMPERATURE):
        """Store the credentials and sampling settings used for every request."""
        self.api_key = api_key
        self.model = model
        self.max_tokens = max_tokens
        self.temperature = temperature
        # One entry per test_prompt() call, in call order.
        self.results_history = []

    async def call_openai_api(self, prompt: str, retries: int = MAX_RETRIES) -> Dict[str, Any]:
        """POST ``prompt`` as a single user message, retrying on any failure.

        Args:
            prompt: Text sent as the only message of the conversation.
            retries: Maximum number of attempts.

        Returns:
            A dict with keys ``success``, ``content`` (reply text or ``None``),
            ``attempt`` (the attempt that succeeded, or ``retries`` on failure)
            and ``error`` (last error message or ``None``). Failures are
            reported through this dict; nothing is raised to the caller.
        """
        # Function-scope imports keep this cell independent of the import cell.
        import asyncio
        import functools

        send_request = functools.partial(
            requests.post,
            'https://api.openai.com/v1/chat/completions',
            headers={
                'Authorization': f'Bearer {self.api_key}',
                'Content-Type': 'application/json'
            },
            json={
                'model': self.model,
                'messages': [{'role': 'user', 'content': prompt}],
                'max_tokens': self.max_tokens,
                'temperature': self.temperature
            },
            timeout=30
        )
        loop = asyncio.get_running_loop()
        last_error = None

        for attempt in range(1, retries + 1):
            try:
                # requests is blocking, so run it in the default executor
                # instead of stalling the event loop from inside a coroutine.
                response = await loop.run_in_executor(None, send_request)

                if not response.ok:
                    raise Exception(f'OpenAI API error: {response.status_code} {response.text}')

                data = response.json()
                content = data['choices'][0]['message']['content']

                return {
                    'success': True,
                    'content': content,
                    'attempt': attempt,
                    'error': None
                }

            except Exception as error:
                # Any failure (network, HTTP status, malformed payload) counts
                # as one attempt; only the last message is kept.
                last_error = str(error)
                print(f"Attempt {attempt}/{retries} failed: {error}")

                if attempt < retries:
                    await asyncio.sleep(attempt)  # Progressive backoff: 1s, 2s, ...

        return {
            'success': False,
            'content': None,
            'attempt': retries,
            'error': last_error
        }

    def validate_nudge_response(self, content: str, expected_count: int = 7) -> Dict[str, Any]:
        """Check that ``content`` is a JSON array of ``{"title", "body"}`` objects.

        The original notes say this follows the planNudges.ts validation logic.

        Args:
            content: Raw reply text from the model.
            expected_count: Required number of array items.

        Returns:
            A dict with ``is_valid``, ``parsed_data`` (the decoded JSON, if it
            parsed), ``errors`` and ``warnings``. Errors make the response
            invalid; warnings (over-long title/body) do not.
        """
        validation_result = {
            'is_valid': False,
            'parsed_data': None,
            'errors': [],
            'warnings': []
        }

        try:
            parsed_nudges = json.loads(content)
            validation_result['parsed_data'] = parsed_nudges

            # Anything other than a top-level array cannot be checked further.
            if not isinstance(parsed_nudges, list):
                validation_result['errors'].append('Response is not an array')
                return validation_result

            if len(parsed_nudges) != expected_count:
                validation_result['errors'].append(f'Expected {expected_count} items, got {len(parsed_nudges)}')

            for i, nudge in enumerate(parsed_nudges):
                if not isinstance(nudge, dict):
                    validation_result['errors'].append(f'Item {i} is not an object')
                    continue

                if 'title' not in nudge:
                    validation_result['errors'].append(f'Item {i} missing "title" field')
                elif not isinstance(nudge['title'], str) or not nudge['title'].strip():
                    validation_result['errors'].append(f'Item {i} has invalid title')

                if 'body' not in nudge:
                    validation_result['errors'].append(f'Item {i} missing "body" field')
                elif not isinstance(nudge['body'], str) or not nudge['body'].strip():
                    validation_result['errors'].append(f'Item {i} has invalid body')

                # Length warnings only make sense for strings. A non-string
                # field is already reported above, and calling len() on it
                # could raise and cut the validation of later items short.
                title = nudge.get('title')
                if isinstance(title, str) and len(title) > 100:
                    validation_result['warnings'].append(f'Item {i} title is very long ({len(title)} chars)')

                body = nudge.get('body')
                if isinstance(body, str) and len(body) > 300:
                    validation_result['warnings'].append(f'Item {i} body is very long ({len(body)} chars)')

            if not validation_result['errors']:
                validation_result['is_valid'] = True

        except json.JSONDecodeError as e:
            validation_result['errors'].append(f'Invalid JSON: {str(e)}')
        except Exception as e:
            # e.g. content is None because the API returned no text.
            validation_result['errors'].append(f'Validation error: {str(e)}')

        return validation_result

    def simulate_step_count_aggregation(self, recent_step_data: List[int]) -> int:
        """Return the rounded mean of ``recent_step_data``, or 0 for an empty list.

        Stands in for the step-count aggregation over
        HealthObservations_HKQuantityTypeIdentifierStepCount described in the
        original notes.
        """
        if not recent_step_data:
            return 0
        return round(sum(recent_step_data) / len(recent_step_data))

    def generate_personalized_prompt(self, language: str = 'en', recent_step_average: int = None,
                                   education_level: str = None, user_id: str = "test_user") -> str:
        """Build the nudge-generation prompt, optionally with participant context.

        The original notes say the structure mirrors planNudges.ts.

        Args:
            language: ``'es'`` selects the Spanish prompt; any other value
                selects the English one.
            recent_step_average: Average daily steps to mention, or ``None``
                to omit the line.
            education_level: Education label to mention; omitted when falsy.
            user_id: Accepted for call-site symmetry; not used in the prompt.

        Returns:
            The complete prompt text.
        """
        is_spanish = language == 'es'

        personalization_context = []
        if recent_step_average is not None:
            personalization_context.append(
                f"Promedio de pasos diarios recientes: {recent_step_average}" if is_spanish
                else f"Recent daily step count average: {recent_step_average}"
            )
        if education_level:
            personalization_context.append(
                f"Nivel educativo: {education_level}" if is_spanish
                else f"Education level: {education_level}"
            )

        context_str = ""
        if personalization_context:
            header = "Información del participante:" if is_spanish else "Participant information:"
            bullet_lines = "\n".join(f"- {ctx}" for ctx in personalization_context)
            # Real newlines: the previous "\\n" escapes put literal
            # backslash-n sequences into the prompt text.
            context_str = f"\n\n{header}\n{bullet_lines}\n"

        if is_spanish:
            base_prompt = f"""Genera 7 recordatorios motivacionales de deportes y ejercicio para un participante en un estudio de salud cardíaca.{context_str}
Cada recordatorio debe:
- Ser alentador y positivo
- Enfocarse en diferentes tipos de actividades físicas y deportes
- Ser personalizado y atractivo basado en la información del participante
- Incluir una llamada clara a la acción
- Ser adecuado para alguien en un estudio de salud cardíaca
- Adaptar el lenguaje y las sugerencias al nivel educativo del participante
- Incorporar referencias al conteo de pasos cuando sea relevante

Devuelve la respuesta como un array JSON con exactamente 7 objetos, cada uno con campos "title" y "body".
Formato de ejemplo:
[
  {{"title": "Impulso de Energía Matutino", "body": "¡Comienza tu día con una caminata de 15 minutos! Tu corazón amará el cardio suave."}},
  ...
]

Haz cada recordatorio único y enfócate en diferentes actividades como caminar, nadar, bailar, deportes de equipo, entrenamiento de fuerza, yoga, etc."""
        else:
            base_prompt = f"""Generate 7 motivational sports and exercise nudges for a heart health study participant.{context_str}
Each nudge should:
- Be encouraging and positive
- Focus on different types of physical activities and sports
- Be personalized and engaging based on the participant's information
- Include a clear call to action
- Be suitable for someone in a heart health study
- Adapt language and suggestions to the participant's education level
- Incorporate step count references when relevant

Return the response as a JSON array with exactly 7 objects, each having "title" and "body" fields.
Example format:
[
  {{"title": "Morning Energy Boost", "body": "Start your day with a 15-minute walk! Your heart will love the gentle cardio."}},
  ...
]

Make each nudge unique and focus on different activities like walking, swimming, dancing, team sports, strength training, yoga, etc."""

        return base_prompt

    async def test_prompt(self, prompt: str, expected_count: int = 7, description: str = "") -> Dict[str, Any]:
        """Send ``prompt``, validate the reply, print a report and record the run.

        Args:
            prompt: Prompt text to send.
            expected_count: Number of nudges the reply must contain.
            description: Label used in the printed report and stored result.

        Returns:
            The result dict that was appended to ``results_history``; its
            ``validation`` entry is ``None`` when the API call failed.
        """
        print(f"\n🧪 Testing prompt: {description or 'Unnamed test'}")
        print(f"📝 Prompt length: {len(prompt)} characters")

        api_result = await self.call_openai_api(prompt)

        result = {
            'timestamp': datetime.now().isoformat(),
            'description': description,
            'prompt': prompt,
            'api_success': api_result['success'],
            'api_attempts': api_result['attempt'],
            'api_error': api_result['error'],
            'raw_response': api_result['content'],
            'validation': None
        }

        if api_result['success']:
            print(f"✅ API call successful (attempt {api_result['attempt']})")

            validation = self.validate_nudge_response(api_result['content'], expected_count)
            result['validation'] = validation

            if validation['is_valid']:
                print(f"✅ Response validation passed")
                if validation['warnings']:
                    print(f"⚠️  Warnings: {len(validation['warnings'])}")
                    for warning in validation['warnings']:
                        print(f"   - {warning}")
            else:
                print(f"❌ Response validation failed")
                print(f"🔍 Errors: {len(validation['errors'])}")
                for error in validation['errors']:
                    print(f"   - {error}")
        else:
            print(f"❌ API call failed after {api_result['attempt']} attempts")
            print(f"🔍 Error: {api_result['error']}")

        # Failed runs are recorded too, so the history reflects every attempt.
        self.results_history.append(result)

        return result

    async def test_personalized_prompt(self, language: str = 'en', recent_step_average: int = None,
                                     education_level: str = None, user_id: str = "test_user") -> Dict[str, Any]:
        """Generate a personalized prompt and run it through ``test_prompt``.

        Arguments are forwarded to ``generate_personalized_prompt``; the run's
        description is built from the language plus whichever personalization
        values were supplied.
        """
        prompt = self.generate_personalized_prompt(language, recent_step_average, education_level, user_id)

        description = f"Personalized {language} prompt"
        if recent_step_average is not None:
            description += f" (avg steps: {recent_step_average})"
        if education_level:
            description += f" (edu: {education_level})"

        return await self.test_prompt(prompt, description=description)

## Predefined Test Prompts

Based on the nudge generation prompts from planNudges.ts:

In [ ]:
# Sample inputs and prompt variants used by the test cells.
# Per the original notes, the profile fields follow the Firebase user document structure.

# Seven-day step samples standing in for
# HealthObservations_HKQuantityTypeIdentifierStepCount data.
SAMPLE_STEP_DATA = dict(
    low_activity=[1200, 1500, 800, 2100, 1800, 1400, 1600],  # mean ~ 1486
    moderate_activity=[4500, 5200, 3800, 6100, 4900, 5300, 4700],  # mean ~ 4929
    high_activity=[8200, 9100, 7800, 8900, 8500, 8700, 9200],  # mean ~ 8629
    very_low_activity=[300, 500, 200, 800, 400, 600, 350],  # mean = 450
)

# educationLevel values; the original notes say to use them exactly as stored
# in user documents.
EDUCATION_LEVELS = [
    "High school diploma or GED",
    "Some college, no degree",
    "Associate degree",
    "Bachelor's degree",
    "Master's degree",
    "Doctoral or professional degree",
    "Trade/technical/vocational training",
    "Less than high school",
]

# Test profiles: each pairs one step sample (shared with SAMPLE_STEP_DATA, not
# copied) with an education level and a human-readable description.
PERSONALIZED_EXAMPLES = {
    profile_key: {
        'recent_step_data': SAMPLE_STEP_DATA[activity_key],
        'education_level': education,
        'description': summary,
    }
    for profile_key, activity_key, education, summary in (
        ('low_steps_graduate', 'low_activity', "Master's degree",
         'Low activity participant with graduate education'),
        ('high_steps_highschool', 'high_activity', "High school diploma or GED",
         'High activity participant with high school education'),
        ('moderate_steps_bachelor', 'moderate_activity', "Bachelor's degree",
         'Moderate activity participant with bachelor education'),
        ('very_low_steps_trade', 'very_low_activity', "Trade/technical/vocational training",
         'Very low activity participant with technical education'),
        ('moderate_steps_some_college', 'moderate_activity', "Some college, no degree",
         'Moderate activity participant with some college'),
    )
}

# Non-personalized baseline prompts (attributed to planNudges.ts in the
# original notes), kept for comparison.
ORIGINAL_ENGLISH_PROMPT = """Generate 7 motivational sports and exercise nudges for a heart health study participant. Each nudge should:
- Be encouraging and positive
- Focus on different types of physical activities and sports
- Be personalized and engaging
- Include a clear call to action
- Be suitable for someone in a heart health study

Return the response as a JSON array with exactly 7 objects, each having "title" and "body" fields.
Example format:
[
  {"title": "Morning Energy Boost", "body": "Start your day with a 15-minute walk! Your heart will love the gentle cardio."},
  ...
]

Make each nudge unique and focus on different activities like walking, swimming, dancing, team sports, strength training, yoga, etc."""

ORIGINAL_SPANISH_PROMPT = """Genera 7 recordatorios motivacionales de deportes y ejercicio para un participante en un estudio de salud cardíaca. Cada recordatorio debe:
- Ser alentador y positivo
- Enfocarse en diferentes tipos de actividades físicas y deportes
- Ser personalizado y atractivo
- Incluir una llamada clara a la acción
- Ser adecuado para alguien en un estudio de salud cardíaca

Devuelve la respuesta como un array JSON con exactamente 7 objetos, cada uno con campos "title" y "body".
Formato de ejemplo:
[
  {"title": "Impulso de Energía Matutino", "body": "¡Comienza tu día con una caminata de 15 minutos! Tu corazón amará el cardio suave."},
  ...
]

Haz cada recordatorio único y enfócate en diferentes actividades como caminar, nadar, bailar, deportes de equipo, entrenamiento de fuerza, yoga, etc."""

# Prompt variants keyed by the name used in the batch-test output.
TEST_PROMPTS = dict(
    original_english=ORIGINAL_ENGLISH_PROMPT,
    original_spanish=ORIGINAL_SPANISH_PROMPT,
    simplified_english="""Create 7 exercise reminders for heart health study participants. Each should have a "title" and "body". Return as JSON array.
Make them motivational and focus on different activities.""",
    detailed_english="""Generate exactly 7 motivational exercise nudges for heart health study participants. Requirements:
1. Each nudge must be encouraging and positive
2. Focus on diverse physical activities (walking, swimming, cycling, dancing, strength training, yoga, team sports)
3. Include specific, actionable advice
4. Appropriate for cardiovascular health improvement
5. Vary the tone and approach for each nudge

CRITICAL: Return ONLY a valid JSON array with exactly 7 objects. Each object must have exactly these fields:
- "title": string (maximum 50 characters)
- "body": string (maximum 150 characters)

Example format:
[{"title": "Morning Walk Challenge", "body": "Start today with a 10-minute walk. Your heart will thank you!"}]""",
    json_focused="""You are a JSON generator. Generate exactly 7 exercise nudges.

Output ONLY valid JSON in this exact format:
[{"title": "Exercise Title", "body": "Exercise description with call to action"}]

Requirements:
- Exactly 7 items
- Heart health focus
- Different activities each
- Positive tone
- No additional text outside JSON""",
)

# Quick sanity report of what was loaded.
print("📊 Sample data loaded:")
print(f"Education levels: {len(EDUCATION_LEVELS)} options")
print(f"User profiles: {len(PERSONALIZED_EXAMPLES)} profiles")
print(f"Step data samples: {list(SAMPLE_STEP_DATA)}")

## Initialize Tester and Run Tests

In [None]:
# Build the shared tester instance; it picks up MODEL, MAX_TOKENS and
# TEMPERATURE through the constructor defaults.
tester = LLMPromptTester(api_key=OPENAI_API_KEY)

# Echo the active configuration as a four-line banner.
print("\n".join([
    "🚀 LLM Prompt Tester initialized",
    f"📊 Model: {MODEL}",
    f"🌡️  Temperature: {TEMPERATURE}",
    f"📝 Max tokens: {MAX_TOKENS}",
]))

🚀 LLM Prompt Tester initialized
📊 Model: gpt-3.5-turbo
🌡️  Temperature: 0.7
📝 Max tokens: 1000


## Test Individual Prompts

Run this cell to test a specific prompt:

In [None]:
# Test a specific prompt
prompt_name = 'ORIGINAL_ENGLISH_PROMPT'  # Change this to test different prompts
result = await tester.test_prompt(
    TEST_PROMPTS[prompt_name],
    description=f"Testing {prompt_name} prompt"
)

# Display the result
if result['api_success'] and result['validation']['is_valid']:
    print("\n📋 Generated Nudges:")
    for i, nudge in enumerate(result['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")

NameError: name 'TEST_PROMPTS' is not defined

## Batch Test All Prompts

Run all test prompts and compare results:

In [None]:
# Test all prompts
print("🔬 Running batch tests on all prompts...\n")

batch_results = {}
for prompt_name, prompt_text in TEST_PROMPTS.items():
    result = await tester.test_prompt(
        prompt_text,
        description=f"Batch test: {prompt_name}"
    )
    batch_results[prompt_name] = result

    # Small delay to avoid rate limiting
    import time
    time.sleep(1)

print("\n✅ Batch testing complete!")

🔬 Running batch tests on all prompts...


🧪 Testing prompt: Batch test: original_english
📝 Prompt length: 698 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: original_spanish
📝 Prompt length: 790 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: simplified_english
📝 Prompt length: 183 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: detailed_english
📝 Prompt length: 720 characters
✅ API call successful (attempt 1)
✅ Response validation passed

🧪 Testing prompt: Batch test: json_focused
📝 Prompt length: 321 characters
✅ API call successful (attempt 1)
✅ Response validation passed

✅ Batch testing complete!


## Results Analysis

In [None]:
# Condense each batch result into one table row.
results_summary = []

for prompt_name, result in batch_results.items():
    validation = result['validation']
    if validation:
        is_valid = validation['is_valid']
        n_errors = len(validation['errors'])
        n_warnings = len(validation['warnings'])
    else:
        # No validation was recorded (the API call failed).
        is_valid, n_errors, n_warnings = False, 0, 0

    summary = dict(
        prompt_name=prompt_name,
        api_success=result['api_success'],
        api_attempts=result['api_attempts'],
        validation_success=is_valid,
        error_count=n_errors,
        warning_count=n_warnings,
        prompt_length=len(result['prompt']),
    )
    results_summary.append(summary)

# Tabulate and print without the index column.
df_results = pd.DataFrame(results_summary)
print("📊 Results Summary:")
print(df_results.to_string(index=False))

# Share of prompts whose reply passed validation, as a percentage.
valid_runs = df_results['validation_success'].sum()
success_rate = (valid_runs / len(df_results)) * 100
print(f"\n🎯 Overall success rate: {success_rate:.1f}%")

📊 Results Summary:
  original_english         True             1                True            0              0            698
  original_spanish         True             1                True            0              0            790
simplified_english         True             1                True            0              0            183
  detailed_english         True             1                True            0              0            720
      json_focused         True             1                True            0              0            321

🎯 Overall success rate: 100.0%


## Personalized Prompt Testing

Test personalized prompts with different user profiles:

In [ ]:
# Test personalized prompts with real Firebase user structure
profile_name = 'low_steps_graduate'  # Change this to test different profiles
profile = PERSONALIZED_EXAMPLES[profile_name]

# Simulate step count aggregation (matching planNudges.ts:184)
recent_step_average = tester.simulate_step_count_aggregation(profile['recent_step_data'])

print(f"🎯 Testing profile: {profile['description']}")
print(f"📊 Recent step data: {profile['recent_step_data']}")
print(f"📈 Average daily steps: {recent_step_average}")
print(f"🎓 Education level: {profile['education_level']}")

# Test English personalized prompt
result_en = await tester.test_personalized_prompt(
    language='en',
    recent_step_average=recent_step_average,
    education_level=profile['education_level'],
    user_id=f"test_{profile_name}"
)

# Display results
if result_en['api_success'] and result_en['validation']['is_valid']:
    print("\n📋 Personalized English Nudges:")
    for i, nudge in enumerate(result_en['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")

# Test Spanish personalized prompt
print("\n" + "="*50)
result_es = await tester.test_personalized_prompt(
    language='es',
    recent_step_average=recent_step_average,
    education_level=profile['education_level'],
    user_id=f"test_{profile_name}_es"
)

# Display results
if result_es['api_success'] and result_es['validation']['is_valid']:
    print("\n📋 Personalized Spanish Nudges:")
    for i, nudge in enumerate(result_es['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")

## Batch Test All Personalized Profiles

Test all user profiles to compare personalization effectiveness:

In [ ]:
# Test all personalized profiles with real Firebase data structure
print("🔬 Running batch tests on all personalized profiles...\n")

personalized_results = {}
for profile_name, profile_data in PERSONALIZED_EXAMPLES.items():
    print(f"Testing profile: {profile_data['description']}")
    
    # Simulate step count aggregation (matching planNudges.ts getRecentStepCount method)
    recent_step_average = tester.simulate_step_count_aggregation(profile_data['recent_step_data'])
    
    # Test English
    result_en = await tester.test_personalized_prompt(
        language='en',
        recent_step_average=recent_step_average,
        education_level=profile_data['education_level'],
        user_id=f"test_{profile_name}_en"
    )
    
    # Test Spanish
    result_es = await tester.test_personalized_prompt(
        language='es',
        recent_step_average=recent_step_average,
        education_level=profile_data['education_level'],
        user_id=f"test_{profile_name}_es"
    )
    
    personalized_results[f"{profile_name}_en"] = result_en
    personalized_results[f"{profile_name}_es"] = result_es
    
    # Log the step aggregation for verification
    print(f"  - Step data: {profile_data['recent_step_data']}")
    print(f"  - Calculated average: {recent_step_average} steps/day")
    print(f"  - Education: {profile_data['education_level']}")
    
    # Small delay to avoid rate limiting
    import time
    time.sleep(2)

print("\n✅ Personalized batch testing complete!")

## Personalization Analysis

Analyze how personalization affects the generated nudges:

In [ ]:
# Analyze personalization effectiveness
def analyze_personalization(results: Dict[str, Any]) -> Dict[str, Any]:
    """Count keyword-based personalization signals across validated nudges.

    Only results whose API call succeeded and whose reply validated are
    inspected. Matching is a case-insensitive substring test against English
    keywords, so "walk" also matches "walking".

    Args:
        results: Mapping of result name to a result dict as produced by
            ``LLMPromptTester.test_prompt``.

    Returns:
        A dict with:
        - ``step_references``: nudges mentioning a step-related keyword.
        - ``complexity_scores``: one score per nudge (+1 per complex word,
          -0.5 per simple word).
        - ``activity_mentions``: activity keyword -> number of nudges.
        - ``avg_complexity``: mean of ``complexity_scores``; 0.0 when no
          nudge was analysed, so the key is always present.
        - ``education_adaptations`` / ``personalization_indicators``:
          placeholders that this function never fills in.
    """
    # Keyword lists do not change between nudges, so build them once.
    step_words = ('step', 'walk', 'pace', 'distance')
    complex_words = ('cardiovascular', 'metabolism', 'endurance', 'intensity', 'optimize')
    simple_words = ('move', 'fun', 'easy', 'try', 'start')
    activities = ('walk', 'run', 'swim', 'dance', 'yoga', 'bike', 'gym', 'sport')

    analysis = {
        'step_references': 0,
        'education_adaptations': 0,
        'complexity_scores': [],
        'activity_mentions': {},
        'personalization_indicators': [],
        'avg_complexity': 0.0,
    }
    activity_mentions = analysis['activity_mentions']

    for result in results.values():
        if not (result['api_success'] and result['validation'] and result['validation']['is_valid']):
            continue

        for nudge in result['validation']['parsed_data']:
            title = nudge.get('title', '').lower()
            body = nudge.get('body', '').lower()
            content = f"{title} {body}"

            if any(word in content for word in step_words):
                analysis['step_references'] += 1

            # Complex vocabulary raises the score; simple vocabulary lowers
            # it at half weight.
            complexity_score = sum(1 for word in complex_words if word in content)
            complexity_score -= sum(1 for word in simple_words if word in content) * 0.5
            analysis['complexity_scores'].append(complexity_score)

            for activity in activities:
                if activity in content:
                    activity_mentions[activity] = activity_mentions.get(activity, 0) + 1

    scores = analysis['complexity_scores']
    if scores:
        analysis['avg_complexity'] = sum(scores) / len(scores)

    return analysis

# Report on the personalized batch, if that cell has been run.
if 'personalized_results' not in locals():
    print("No personalized results to analyze. Run the batch test first!")
else:
    personalization_analysis = analyze_personalization(personalized_results)

    # Three most frequently mentioned activities, highest count first.
    top_activities = sorted(
        personalization_analysis['activity_mentions'].items(),
        key=lambda x: x[1],
        reverse=True,
    )[:3]

    print("📈 Personalization Analysis:")
    print(f"Step references found: {personalization_analysis['step_references']}")
    print(f"Average complexity score: {personalization_analysis.get('avg_complexity', 0):.2f}")
    print(f"Most mentioned activities: {top_activities}")

    # One row per successful, validated run. Result names look like
    # "<profile>_<language>", so split on the last underscore.
    comparison_data = [
        {
            'profile': name.rpartition('_')[0],
            'language': name.rpartition('_')[2],
            'success': run['validation']['is_valid'],
            'prompt_length': len(run['prompt']),
            'description': run['description'],
        }
        for name, run in personalized_results.items()
        if run['api_success'] and run['validation'] and run['validation']['is_valid']
    ]

    if comparison_data:
        df_personalized = pd.DataFrame(comparison_data)
        print("\n📊 Personalized Results Summary:")
        print(df_personalized.to_string(index=False))

In [ ]:
# Custom personalized testing
custom_step_count = 3000  # Modify these values to test different user profiles
custom_education = "College degree"  # Options: "High school", "College degree", "Graduate degree", "Technical/vocational training"
custom_language = "en"  # "en" or "es"

print(f"🎯 Testing custom profile:")
print(f"📊 Step count: {custom_step_count}")
print(f"🎓 Education level: {custom_education}")
print(f"🌐 Language: {custom_language}")

# Generate and display the personalized prompt
custom_prompt = tester.generate_personalized_prompt(
    language=custom_language,
    step_count=custom_step_count,
    education_level=custom_education
)

print(f"\n📝 Generated prompt preview (first 300 chars):")
print(custom_prompt[:300] + "..." if len(custom_prompt) > 300 else custom_prompt)

# Test the custom personalized prompt
custom_result = await tester.test_personalized_prompt(
    language=custom_language,
    step_count=custom_step_count,
    education_level=custom_education,
    user_id="custom_test_user"
)

# Display results
if custom_result['api_success'] and custom_result['validation']['is_valid']:
    print(f"\n📋 Custom Personalized Nudges ({custom_language.upper()}):")
    for i, nudge in enumerate(custom_result['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")
        
    # Analyze this specific result for personalization indicators
    nudges = custom_result['validation']['parsed_data']
    step_mentions = sum(1 for nudge in nudges 
                       if any(word in f"{nudge.get('title', '')} {nudge.get('body', '')}".lower() 
                             for word in ['step', 'walk', 'pace', 'move']))
    
    print(f"📈 Personalization analysis:")
    print(f"   - Step/movement references: {step_mentions}/7 nudges")
    print(f"   - Average title length: {sum(len(n.get('title', '')) for n in nudges) / len(nudges):.1f} chars")
    print(f"   - Average body length: {sum(len(n.get('body', '')) for n in nudges) / len(nudges):.1f} chars")
else:
    print("❌ Custom test failed. Check the error messages above.")

# Custom personalized testing with real Firebase structure
# Modify these values to test different user profiles

# Sample step count data (7 days as collected from HealthObservations_HKQuantityTypeIdentifierStepCount)
custom_step_data = [3000, 3500, 2800, 4200, 3800, 3200, 3100]  # Sample 7-day step data
custom_education = "Bachelor's degree"  # Use exact values from EDUCATION_LEVELS
custom_language = "en"  # "en" or "es"

# Calculate average (matching planNudges.ts aggregation logic)
custom_step_average = tester.simulate_step_count_aggregation(custom_step_data)

print(f"🎯 Testing custom profile:")
print(f"📊 Raw step data (7 days): {custom_step_data}")
print(f"📈 Calculated average: {custom_step_average} steps/day")
print(f"🎓 Education level: {custom_education}")
print(f"🌐 Language: {custom_language}")

# Verify education level is valid
if custom_education not in EDUCATION_LEVELS:
    print(f"⚠️  Warning: '{custom_education}' not in standard education levels")
    print(f"Valid options: {EDUCATION_LEVELS}")

# Generate and display the personalized prompt
custom_prompt = tester.generate_personalized_prompt(
    language=custom_language,
    recent_step_average=custom_step_average,
    education_level=custom_education
)

print(f"\n📝 Generated prompt preview (first 300 chars):")
print(custom_prompt[:300] + "..." if len(custom_prompt) > 300 else custom_prompt)

# Test the custom personalized prompt
custom_result = await tester.test_personalized_prompt(
    language=custom_language,
    recent_step_average=custom_step_average,
    education_level=custom_education,
    user_id="custom_test_user"
)

# Display results
if custom_result['api_success'] and custom_result['validation']['is_valid']:
    print(f"\n📋 Custom Personalized Nudges ({custom_language.upper()}):")
    for i, nudge in enumerate(custom_result['validation']['parsed_data'], 1):
        print(f"{i}. {nudge['title']}")
        print(f"   {nudge['body']}\n")
        
    # Analyze this specific result for personalization indicators
    nudges = custom_result['validation']['parsed_data']
    step_mentions = sum(1 for nudge in nudges 
                       if any(word in f"{nudge.get('title', '')} {nudge.get('body', '')}".lower() 
                             for word in ['step', 'walk', 'pace', 'move', 'distance']))
    
    # Check for education-appropriate language complexity
    complex_words = ['cardiovascular', 'metabolism', 'endurance', 'optimize', 'intensity']
    simple_words = ['move', 'fun', 'easy', 'try', 'start', 'great']
    
    complex_count = sum(1 for nudge in nudges
                       for word in complex_words
                       if word.lower() in f"{nudge.get('title', '')} {nudge.get('body', '')}".lower())
    
    simple_count = sum(1 for nudge in nudges  
                      for word in simple_words
                      if word.lower() in f"{nudge.get('title', '')} {nudge.get('body', '')}".lower())
    
    print(f"📈 Personalization analysis:")
    print(f"   - Step/movement references: {step_mentions}/7 nudges")
    print(f"   - Complex vocabulary usage: {complex_count} instances")
    print(f"   - Simple vocabulary usage: {simple_count} instances")
    print(f"   - Average title length: {sum(len(n.get('title', '')) for n in nudges) / len(nudges):.1f} chars")
    print(f"   - Average body length: {sum(len(n.get('body', '')) for n in nudges) / len(nudges):.1f} chars")
    print(f"   - Data used: {custom_step_average} avg steps, '{custom_education}' education")
else:
    print("❌ Custom test failed. Check the error messages above.")