# Single-Shot Target Interaction Module

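This module loads generated adversarial prompts, sends each one once (single-shot) to a target LLM through an OpenAI-compatible API, and captures the response together with the reasoning trace when the server exposes one. Results are exported to `./logs/single_shot/`.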

## 1. Setup & Dependencies

In [None]:
# Import libraries
import pandas as pd
import json
import csv
import uuid
import datetime
import os
import time
import random
from typing import Dict, List, Optional, Union, Any, Tuple
import openai
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load environment variables first so the settings below can pick them up
load_dotenv()

# Make sure the directory used for exported interaction logs is present
os.makedirs('./logs/single_shot', exist_ok=True)

# Target model settings: each one is read from the environment and falls
# back to the default shown here when the variable is not set
TARGET_MODEL_BASE_URL = os.environ.get("TARGET_MODEL_BASE_URL", "http://localhost:1234/v1")
TARGET_MODEL_API_KEY = os.environ.get("TARGET_MODEL_API_KEY", "not-needed")
TARGET_MODEL_NAME = os.environ.get("TARGET_MODEL_NAME", "local-model")

# Client used by every request sent to the target model
target_client = openai.OpenAI(api_key=TARGET_MODEL_API_KEY, base_url=TARGET_MODEL_BASE_URL)

print(f"Target model endpoint configured: {TARGET_MODEL_BASE_URL}")

## 2. Load Generated Prompts

In [None]:
def load_generated_prompts(file_path: str) -> pd.DataFrame:
    """
    Load generated adversarial prompts from a CSV or JSON file.

    The format is selected from the file extension. The extension is
    compared case-insensitively (``prompts.CSV`` works) and path-like
    objects such as ``pathlib.Path`` are accepted in addition to strings.

    Args:
        file_path (str): Path to the generated prompts file

    Returns:
        pd.DataFrame: DataFrame containing the prompts. An empty DataFrame
        is returned when loading fails for any reason (unsupported
        extension, missing file, parse error); the error is printed rather
        than raised so the calling cell can keep running.
    """
    try:
        # Normalise once: os.fspath accepts str and PathLike, lower() makes
        # the extension check case-insensitive.
        normalized_path = os.fspath(file_path).lower()

        if normalized_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif normalized_path.endswith('.json'):
            df = pd.read_json(file_path)
        else:
            raise ValueError("Unsupported file format. Please use CSV or JSON.")

        print(f"Successfully loaded {len(df)} prompts from {file_path}")
        return df
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty frame
        print(f"Error loading prompts: {e}")
        return pd.DataFrame()

# Load generated prompts
prompts_file = "./generated_prompts/sample_prompts.csv"  # Update with your actual path
prompts_df = load_generated_prompts(prompts_file)

if not prompts_df.empty:
    print(f"Loaded {len(prompts_df)} prompts")
    print("Columns:", list(prompts_df.columns))
    # A bare `prompts_df.head()` inside an `if` block is evaluated and then
    # discarded, so the preview has to be printed explicitly to be visible.
    print(prompts_df.head())
else:
    print("No prompts loaded. Please check the file path and format.")

## 3. Single-Shot Target Interaction Function

In [None]:
def _extract_reasoning_trace(message: Any) -> Optional[Any]:
    """Return the first non-None reasoning attribute found on *message*, else None."""
    # 'reasoning_trace' is the attribute this module originally looked for.
    # NOTE(review): 'reasoning_content' and 'reasoning' are alternative names
    # believed to be used by some OpenAI-compatible servers - confirm against
    # the server actually being targeted.
    for attr_name in ('reasoning_trace', 'reasoning_content', 'reasoning'):
        value = getattr(message, attr_name, None)
        if value is not None:
            return value
    return None


def run_single_shot_interaction(prompt_data: Dict, target_model_config: Dict) -> Dict:
    """
    Send a single prompt to target LLM and capture response with reasoning trace

    The request goes through the module-level ``target_client``. Only the
    'system_prompt', 'model', 'temperature', 'max_tokens' and 'provider'
    entries of ``target_model_config`` are read here.

    Args:
        prompt_data (dict): Dictionary containing prompt information. The
            keys 'prompt', 'scenario' and 'id' are read; a missing 'id' is
            replaced by a freshly generated UUID on success.
        target_model_config (dict): Configuration for the target model API

    Returns:
        dict: Contains prompt, response, reasoning trace, and metadata.
        'status' is "success" or "error"; error records additionally carry
        an 'error' message and have no response or reasoning. Exceptions
        raised while talking to the model are reported this way instead of
        being propagated.
    """
    try:
        # Extract prompt information
        prompt_text = prompt_data.get('prompt', '')
        scenario = prompt_data.get('scenario', 'Unknown')
        prompt_id = prompt_data.get('id', str(uuid.uuid4()))

        # Prepare messages for target model
        messages = [
            {"role": "user", "content": prompt_text}
        ]

        # Add system prompt if provided
        if target_model_config.get('system_prompt'):
            messages.insert(0, {"role": "system", "content": target_model_config['system_prompt']})

        # Send request to target model
        response = target_client.chat.completions.create(
            model=target_model_config.get('model', TARGET_MODEL_NAME),
            messages=messages,
            temperature=target_model_config.get('temperature', 0.7),
            max_tokens=target_model_config.get('max_tokens', 500)
        )

        # Only the first choice is recorded
        message = response.choices[0].message
        response_content = message.content

        # Reasoning is optional: stays None when the message carries none
        reasoning_trace = _extract_reasoning_trace(message)

        # Create result dictionary
        result = {
            "interaction_id": str(uuid.uuid4()),
            "prompt_id": prompt_id,
            "scenario": scenario,
            "attacker_prompt": prompt_text,
            "target_response": response_content,
            "target_reasoning": reasoning_trace,
            "timestamp": datetime.datetime.now().isoformat(),
            "target_model": target_model_config.get('model', TARGET_MODEL_NAME),
            "model_provider": target_model_config.get('provider', 'unknown'),
            "status": "success"
        }

        return result

    except Exception as e:
        # Any failure becomes an error record so a batch run can keep going
        print(f"Error in single-shot interaction for prompt {prompt_data.get('id', 'unknown')}: {e}")
        return {
            "interaction_id": str(uuid.uuid4()),
            "prompt_id": prompt_data.get('id', 'unknown'),
            "scenario": prompt_data.get('scenario', 'Unknown'),
            "attacker_prompt": prompt_data.get('prompt', ''),
            "target_response": None,
            "target_reasoning": None,
            "error": str(e),
            "timestamp": datetime.datetime.now().isoformat(),
            "target_model": target_model_config.get('model', TARGET_MODEL_NAME),
            "model_provider": target_model_config.get('provider', 'unknown'),
            "status": "error"
        }

# Smoke-test the interaction function on the first loaded prompt
if not prompts_df.empty:
    # First row of the prompt table, as a plain dict
    test_prompt = prompts_df.iloc[0].to_dict()

    # Settings describing the target model for this run
    target_model_config = dict(
        provider="lmstudio",  # or "openai", "ollama"
        api_key=TARGET_MODEL_API_KEY,
        base_url=TARGET_MODEL_BASE_URL,
        model=TARGET_MODEL_NAME,
        temperature=0.7,
        max_tokens=500,
    )

    # Send the prompt once and dump the resulting record
    test_result = run_single_shot_interaction(test_prompt, target_model_config)
    print("Test interaction result:")
    print(json.dumps(test_result, indent=2))

## 4. Process All Generated Prompts

In [None]:
def process_all_prompts(prompts_df: pd.DataFrame, target_model_config: Dict,
                        delay_seconds: float = 0.5) -> List[Dict]:
    """
    Process all generated prompts through single-shot interactions

    Rows are handled one at a time, in DataFrame order, by
    ``run_single_shot_interaction``; every returned record (success or
    error) is collected.

    Args:
        prompts_df (pd.DataFrame): DataFrame containing generated prompts
        target_model_config (dict): Configuration for the target model API
        delay_seconds (float): Pause between consecutive requests, to avoid
            overwhelming the API. Values <= 0 disable the pause.

    Returns:
        list: List of interaction results (empty when there are no prompts)
    """
    interaction_results = []

    if prompts_df.empty:
        print("No prompts to process.")
        return interaction_results

    total = len(prompts_df)
    print(f"Processing {total} prompts...")

    # Count positions with enumerate: the DataFrame index label is not
    # guaranteed to be a 0-based integer (filtered frames, string indexes).
    for position, (_, row) in enumerate(prompts_df.iterrows(), start=1):
        prompt_data = row.to_dict()

        print(f"Processing prompt {position}/{total}: {prompt_data.get('scenario', 'Unknown')}")

        # Run single-shot interaction
        result = run_single_shot_interaction(prompt_data, target_model_config)
        interaction_results.append(result)

        # Pause only between requests; nothing follows the last one
        if delay_seconds > 0 and position < total:
            time.sleep(delay_seconds)

    print(f"Completed processing {len(interaction_results)} prompts.")
    return interaction_results

# Process all prompts
if not prompts_df.empty:
    # Define target model configuration
    target_model_config = {
        "provider": "lmstudio",  # or "openai", "ollama"
        "api_key": TARGET_MODEL_API_KEY,
        "base_url": TARGET_MODEL_BASE_URL,
        "model": TARGET_MODEL_NAME,
        "temperature": 0.7,
        "max_tokens": 500
    }

    # Process all prompts
    all_results = process_all_prompts(prompts_df, target_model_config)
    print(f"Processed {len(all_results)} prompts.")

    # Show summary
    if all_results:
        successful_interactions = [r for r in all_results if r.get('status') == 'success']
        errored_interactions = [r for r in all_results if r.get('status') == 'error']

        print(f"Successful interactions: {len(successful_interactions)}")
        print(f"Errored interactions: {len(errored_interactions)}")

        # Show sample successful result
        if successful_interactions:
            sample = successful_interactions[0]
            print("\nSample successful interaction:")
            print(f"  Scenario: {sample.get('scenario', 'Unknown')}")
            print(f"  Prompt: {sample.get('attacker_prompt', '')[:100]}...")
            # The key can be present with a None value (a successful call
            # with no text content); `.get(key, '')` would still return
            # None then, so fall back with `or` before measuring.
            response_text = sample.get('target_response') or ''
            print(f"  Response length: {len(response_text)} characters")
else:
    print("No prompts to process.")
    # Keep the name defined so the export and summary cells can run
    all_results = []

## 5. Export Results

In [None]:
def export_interaction_results(results: List[Dict], format: str = "json"):
    """
    Export interaction results to JSON or CSV format

    Files are written to ``./logs/single_shot/`` (created if missing) as
    ``interaction_results_<YYYYmmdd_HHMMSS>.<ext>``. The CSV export keeps a
    fixed set of columns; keys absent from a result are written as empty
    cells, and a missing reasoning trace is written as an empty cell rather
    than the text "None". An empty ``results`` list still produces a file
    (``[]`` for JSON, a header-only file for CSV).

    Args:
        results (list): List of interaction results
        format (str): Export format ("json" or "csv"), case-insensitive

    Returns:
        str or None: Path of the written file, or None when the export
        failed (unsupported format, I/O or serialization error). Failures
        are printed, not raised.
    """
    output_dir = "./logs/single_shot"
    csv_columns = ["interaction_id", "prompt_id", "scenario", "attacker_prompt",
                   "target_response", "target_reasoning", "timestamp",
                   "target_model", "model_provider", "status", "error"]

    try:
        export_format = format.lower()
        if export_format not in ("json", "csv"):
            raise ValueError(f"Unsupported format: {format}")

        # Do not rely on another cell having created the directory
        os.makedirs(output_dir, exist_ok=True)

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{output_dir}/interaction_results_{timestamp}.{export_format}"

        if export_format == "json":
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"Exported {len(results)} results to {filename}")
            return filename

        # Flatten data for CSV
        csv_data = []
        for result in results:
            row = {column: result.get(column, "") for column in csv_columns}
            # Reasoning may be a non-string object; stringify it, but keep
            # a missing trace as an empty cell instead of "None".
            reasoning = row["target_reasoning"]
            row["target_reasoning"] = "" if reasoning is None else str(reasoning)
            csv_data.append(row)

        if csv_data:
            df = pd.DataFrame(csv_data, columns=csv_columns)
            df.to_csv(filename, index=False)
            print(f"Exported {len(csv_data)} results to {filename}")
        else:
            # Create empty CSV with headers
            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                writer.writerow(csv_columns)
            print(f"Created empty CSV file with headers: {filename}")

        return filename

    except Exception as e:
        print(f"Error exporting results: {e}")
        return None

# Write the collected interactions to disk in both supported formats
if not all_results:
    print("No results to export.")
    # Nothing was collected: still produce empty placeholder files
    export_interaction_results([], "json")
    export_interaction_results([], "csv")
else:
    json_file = export_interaction_results(all_results, "json")
    csv_file = export_interaction_results(all_results, "csv")

    # Report where each export ended up
    print(
        "\nExport completed!",
        f"JSON file: {json_file}",
        f"CSV file: {csv_file}",
        sep="\n",
    )

## 6. Summary and Next Steps

In [None]:
print("=== Single-Shot Target Interaction Complete ===")

# Headline totals; an empty prompt table counts as zero prompts
prompt_total = 0 if prompts_df.empty else len(prompts_df)
print(f"Total prompts processed: {prompt_total}")

# Guard against the results list never having been created in this session
results_defined = 'all_results' in locals()
interaction_total = len(all_results) if results_defined else 0
print(f"Total interactions completed: {interaction_total}")

# Detailed breakdown, only when there is at least one result
if results_defined and all_results:
    successful = list(filter(lambda r: r.get('status') == 'success', all_results))
    errored = list(filter(lambda r: r.get('status') == 'error', all_results))

    print(f"Successful interactions: {len(successful)}")
    print(f"Errored interactions: {len(errored)}")

    # Tally how many interactions belong to each scenario
    scenario_counts = {}
    for result in all_results:
        scenario = result.get('scenario', 'Unknown')
        if scenario in scenario_counts:
            scenario_counts[scenario] += 1
        else:
            scenario_counts[scenario] = 1

    print("\nInteraction count by scenario:")
    for scenario, count in sorted(scenario_counts.items()):
        print(f"  {scenario}: {count}")

# Closing guidance for whoever runs the notebook
for next_step_line in (
    "\nNext steps:",
    "1. Review the exported JSON and CSV files in ./logs/single_shot/",
    "2. Analyze target responses for safety compliance and vulnerability patterns",
    "3. Use the results for judge LLM evaluation",
    "4. Identify areas for improved safety mechanisms",
):
    print(next_step_line)