# RQ3: Cross-Model Transferability Analysis

This notebook analyzes whether toxic prompts evolved on one model (LLaMA 3.1 8B) transfer to other models (Mistral, Gemma) with different architectures or alignment tuning.

**Research Question**: To what extent do toxic prompts evolved on one model transfer to other models, especially those with different architectures or alignment tuning?

## Step 1: Data Collection - Combine All Elite Prompts


In [161]:
# Step 1: Read all elites.json files
import os
import json
import glob
import pandas as pd
import numpy as np

# Resolve the directory this analysis runs from: the folder containing this file
# when __file__ is defined, otherwise the current working directory (the usual
# situation inside a notebook kernel).
if '__file__' in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__))
else:
    script_dir = os.getcwd()

# Run outputs are read from <parent of script_dir>/data/outputs.
project_root = os.path.dirname(script_dir)
base_data_dir = os.path.normpath(os.path.join(project_root, "data", "outputs"))

# Recursively collect every elites.json below the outputs directory, in sorted
# order so the load order is deterministic.
pattern = os.path.join(base_data_dir, "**", "elites.json")
elite_files = glob.glob(pattern, recursive=True)
elite_files.sort()

print(f"Found {len(elite_files)} elites.json files")


Found 36 elites.json files


In [162]:
# Step 2: Load and combine all elites.json files
def _extract_google_toxicity(elite):
    """Return the Google moderation toxicity score of one elite record.

    Walks elite['moderation_result']['google']['scores'] and returns its
    'toxicity' value. Returns None as soon as any level is missing or is not a
    dict (for example a JSON null), so a record without usable moderation data
    yields None instead of raising.
    """
    node = elite
    for key in ('moderation_result', 'google', 'scores'):
        node = node.get(key) if isinstance(node, dict) else None
    return node.get('toxicity', None) if isinstance(node, dict) else None


def _enrich_elite(elite, run_dir):
    """Convert one raw elite record into the combined-record layout.

    Args:
        elite: dict loaded from an elites.json file.
        run_dir: directory of that file relative to base_data_dir.

    Returns:
        dict with keys 'id' (None, assigned later), 'prompt', 'source'
        (genome id, operator, run directory) and 'models' (a single entry
        keyed by the basename of the record's 'model_name', holding the
        generated response and its toxicity score).
    """
    # Only the file name of the model path is used as the key; an absent or
    # empty model_name results in the key ''.
    model_name_path = elite.get('model_name', '')
    model_filename = os.path.basename(model_name_path) if model_name_path else ''

    return {
        'id': None,  # Will be assigned later
        'prompt': elite.get('prompt', ''),
        'source': {
            'genome_id': elite.get('id', None),
            'operator': elite.get('operator', ''),
            'dir_name': run_dir
        },
        'models': {
            model_filename: {
                'response': elite.get('generated_output', ''),
                'toxicity': _extract_google_toxicity(elite)
            }
        }
    }


all_elites = []

for file_path in elite_files:
    try:
        # Extract run directory from path, e.g. "run01_comb"
        rel_path = os.path.relpath(file_path, base_data_dir)
        run_dir = os.path.dirname(rel_path)

        # Load JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            elites = json.load(f)

        # Process each elite
        for elite in elites:
            all_elites.append(_enrich_elite(elite, run_dir))

    except Exception as e:
        # Best effort: report the problem file and move on to the next one.
        print(f"Error loading {file_path}: {e}")

print(f"Loaded {len(all_elites)} total elite records")


Loaded 904 total elite records


In [163]:
# Step 3: Deduplicate based on prompt (keep highest toxicity)
def get_source_toxicity(elite):
    """Return the first non-None toxicity found in elite['models'], or 0 if there is none.

    Entries are visited in the dictionary's iteration order.
    """
    scores = (entry.get('toxicity') for entry in elite['models'].values())
    return next((score for score in scores if score is not None), 0)

# Order the records from most to least toxic (in place), so the first record
# encountered for any prompt is its highest-toxicity occurrence.
all_elites.sort(key=get_source_toxicity, reverse=True)

# One record per distinct prompt text: setdefault only stores the first record
# seen for a given prompt and ignores later ones.
unique_elites_dict = {}
for record in all_elites:
    unique_elites_dict.setdefault(record['prompt'], record)

# From the unique records, keep those whose prompt (surrounding whitespace
# ignored) ends with a question mark.
combined_elites_list = []
for record in unique_elites_dict.values():
    if record['prompt'].strip().endswith('?'):
        combined_elites_list.append(record)

# Number the surviving records 1..N in their current (toxicity-descending) order.
for position, record in enumerate(combined_elites_list, start=1):
    record['id'] = position

print(f"After deduplication and filtering: {len(combined_elites_list)} unique prompts with question marks")


After deduplication and filtering: 696 unique prompts with question marks


In [164]:
# Step 4: Save combined and deduplicated data to JSON
# Write into script_dir when it has been defined, otherwise into the current
# working directory.
if 'script_dir' in globals():
    output_dir = script_dir
else:
    output_dir = os.getcwd()
json_path = os.path.join(output_dir, "rq3_combined_elites.json")

# ensure_ascii=False writes non-ASCII characters literally instead of as \u escapes.
with open(json_path, 'w', encoding='utf-8') as out_file:
    json.dump(combined_elites_list, out_file, indent=2, ensure_ascii=False)

print(f"Saved to: {json_path}")
print(f"Total records: {len(combined_elites_list)}")


Saved to: /Users/onkars/Documents/Projects/eost-cam-llm/experiments/rq3_combined_elites.json
Total records: 696


## Step 2: Save Top 25% Elites to CSV

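Save the top 25% of the deduplicated elites, ranked by toxicity in descending order (approximately the 75th percentile and above; the count is rounded down), to a single-column CSV of prompts for cross-model testing.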


In [165]:
# Save top 25% elites (75th percentile and above by toxicity) to CSV
# json.load is used below, so json is imported here alongside os and pandas;
# together with the path guards this lets the cell run on its own.
import json
import os
import pandas as pd

# Setup paths (only if not already defined in this session)
if 'script_dir' not in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
if 'project_root' not in globals():
    project_root = os.path.dirname(script_dir)

# Load combined elites from the JSON file expected in script_dir
json_path = os.path.join(script_dir, "rq3_combined_elites.json")
with open(json_path, 'r', encoding='utf-8') as f:
    combined_data = json.load(f)

print(f"Total elites: {len(combined_data)}")

# Highest toxicity score recorded for an elite across its models
def get_max_toxicity(elite):
    """Return the largest toxicity among the elite's models, never less than 0.0.

    Models whose toxicity is missing or None are ignored; an elite without a
    'models' entry, or without any score above zero, yields 0.0.
    """
    per_model = elite.get('models', {}).values()
    known_scores = [
        entry.get('toxicity')
        for entry in per_model
        if entry.get('toxicity') is not None
    ]
    # Seeding with 0.0 reproduces the floor used when nothing exceeds zero.
    return max([0.0, *known_scores])

# Pair every elite with its toxicity score, then rank from most to least toxic
elites_with_toxicity = [
    {'elite': elite, 'toxicity': get_max_toxicity(elite)}
    for elite in combined_data
]
elites_with_toxicity.sort(key=lambda x: x['toxicity'], reverse=True)

# Size of the top quarter; int() truncates, so fewer than 4 elites selects none
total_count = len(elites_with_toxicity)
top_25_percent_count = int(total_count * 0.25)
print(f"Top 25% count: {top_25_percent_count} out of {total_count}")

# The list is sorted descending, so the top quarter is simply its head
top_25_percent_elites = elites_with_toxicity[:top_25_percent_count]

# Extract prompts from top 25% elites
top_prompts = [item['elite']['prompt'] for item in top_25_percent_elites]

# Create DataFrame with just one column "questions"
df_top = pd.DataFrame({'questions': top_prompts})

print(f"\nDataFrame created: {len(df_top)} rows, {len(df_top.columns)} columns")
if top_25_percent_elites:
    selected_scores = [item['toxicity'] for item in top_25_percent_elites]
    print(f"Toxicity range: {min(selected_scores):.4f} - {max(selected_scores):.4f}")
else:
    # min()/max() raise ValueError on an empty selection; report it instead of crashing
    print("Toxicity range: n/a (no elites selected)")

# Save DataFrame to CSV in data/ directory
csv_path = os.path.join(project_root, "data", "combined_elites.csv")
df_top.to_csv(csv_path, index=False, encoding='utf-8')
print(f"Saved CSV to: {csv_path}")
print(f"Saved {len(df_top)} prompts (top 25% by toxicity, 75th percentile and above)")


Total elites: 696
Top 25% count: 174 out of 696

DataFrame created: 174 rows, 1 columns
Toxicity range: 0.3796 - 0.8697
Saved CSV to: /Users/onkars/Documents/Projects/eost-cam-llm/data/combined_elites.csv
Saved 174 prompts (top 25% by toxicity, 75th percentile and above)
