In [1]:
# !pip install pandas transformers scikit-learn matplotlib seaborn sentencepiece accelerate -q
# !pip install protobuf
# ! pip install h5py
# import pandas as pd
# hate_yes_data = pd.read_csv('/root/ccs_aisf/data/yes_no/hate_vs_antagonist_yes.csv')
# hate_no_data = pd.read_csv('/root/ccs_aisf/data/yes_no/hate_vs_antagonist_no.csv')

# torch.set_default_tensor_type(torch.cuda.HalfTensor)

## **1. Datasets.**

In [2]:
import re
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
# from transformers import EncoderDecoderModel, BertTokenizer, DistilBertTokenizer
# from transformers import AutoModelForCausalLM

from sklearn.linear_model import LogisticRegression
import numpy as np
from tqdm import tqdm

import sys
import os

# Add the project's code directory to the Python path so that project-local
# modules (presumably the ones imported in later cells, e.g.
# format_results_fixed / steering_gemma — TODO confirm) can be found.
# The location can be overridden with the CCS_AISF_CODE_DIR environment
# variable; the default is the original author's local checkout.
code_dir = os.environ.get(
    'CCS_AISF_CODE_DIR',
    '/Users/elenaericheva/ericheva_git/ccs_aisf/code',
)
if code_dir not in sys.path:
    # Insert at the front so these modules win over same-named installed ones.
    sys.path.insert(0, code_dir)

print(f"Added {code_dir} to Python path")

  from .autonotebook import tqdm as notebook_tqdm


Added /Users/elenaericheva/ericheva_git/ccs_aisf/code to Python path


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from itertools import combinations
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

## **2. Choose model.**

In [7]:
# Available Gemma2 variants:

# google/gemma-2-2b - 2B parameter base model
# google/gemma-2-9b - 9B parameter base model
# google/gemma-2-2b-it - 2B instruction-tuned model
# google/gemma-2-9b-it - 9B instruction-tuned model

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Short checkpoint name. It is also used by later cells as the prefix of the
# cached activation files (f'{YOUR_NAME}_pos.npz') and of the CCS pickle names.
YOUR_NAME = "gemma-2-2b"

# CHANGED: Using Gemma2 model instead of DeBERTa
# IMPORTANT: Must use AutoModelForCausalLM, NOT AutoModelForMaskedLM
gemma_tokenizer = AutoTokenizer.from_pretrained(f"google/{YOUR_NAME}")
gemma_model = AutoModelForCausalLM.from_pretrained(
    f"google/{YOUR_NAME}",
    torch_dtype=torch.float32,  # Full float32 precision (this is NOT half precision)
    device_map="auto"           # Automatically map to available GPUs
)

# Put the model in evaluation mode; the trailing ';' suppresses the cell output.
gemma_model.eval();
# Layer index used for the CCS pickle name and for steering in later cells.
# NOTE(review): the helper functions further down default to layer 18 when no
# index is passed — confirm that 16 is the intended layer everywhere.
LAYER_IDX = 16


Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.17s/it]


## **4. Plot.**

In [8]:
# Load the labelled statements and the cached hidden states, print sanity
# statistics, L2-normalise every hidden vector and draw the layer-wise PCA.

# path to big hate data
hate_data = pd.read_csv('../data/raw/total_hate_data.csv', index_col=0)

hate_total_yes_data = pd.read_csv('../data/yes_no/hate_total_yes_data.csv', index_col=0)
hate_total_no_data =  pd.read_csv('../data/yes_no/hate_total_no_data.csv', index_col=0)


def _print_dtype_and_shape(name, arr):
    """Print the dtype and shape of ``arr`` labelled with ``name``."""
    print(f"{name} dtype:", arr.dtype)
    print(f"{name} shape:", arr.shape)


def _print_summary_stats(name, arr):
    """Print max / min / mean / std / median of ``arr`` labelled with ``name``."""
    print(f"{name}.max()", arr.max())
    print(f"{name}.min()", arr.min())
    print(f"{name}.mean()", arr.mean())
    print(f"{name}.std()", arr.std())
    print(f"{name}.median()", np.median(arr))


def _l2_normalize_last_axis(arr):
    """Scale every vector along the last axis of ``arr`` to unit L2 norm."""
    # Flatten all leading axes so that each row is one hidden vector:
    # (n_samples * n_layers, n_features).
    flat = arr.reshape(-1, arr.shape[-1])
    return normalize(flat, norm='l2', axis=1).reshape(arr.shape)


_SEPARATOR = "######################################################################################## "

# Load the files correctly; the context manager closes both .npz archives
# once the arrays have been read into memory.
with np.load(f'{YOUR_NAME}_pos.npz') as X_pos_file, np.load(f'{YOUR_NAME}_neg.npz') as X_neg_file:
    # Extract the actual arrays
    X_pos = X_pos_file['arr_0']
    X_neg = X_neg_file['arr_0']

# Check data types and shapes first
_print_dtype_and_shape("X_pos", X_pos)
_print_dtype_and_shape("X_neg", X_neg)

print(_SEPARATOR)
_print_summary_stats("X_pos", X_pos)
_print_summary_stats("X_neg", X_neg)
print(_SEPARATOR)

# Check for any issues with the data
# NOTE(review): in the recorded run the first five values of X_pos and X_neg
# are identical — confirm this is expected for the first flattened entries.
print("X_pos sample:", X_pos.flat[:5])
print("X_neg sample:", X_neg.flat[:5])

# Convert to proper numeric type if needed (no copy when already float32)
X_pos = X_pos.astype(np.float32, copy=False)
X_neg = X_neg.astype(np.float32, copy=False)

# The arrays are unpacked as (samples, layers, features).
n_samples, n_layers, n_features = X_pos.shape

# Normalize the data
X_pos_normalized = _l2_normalize_last_axis(X_pos)
X_neg_normalized = _l2_normalize_last_axis(X_neg)

print("Normalization completed!")
print("X_pos_normalized shape:", X_pos_normalized.shape)
print("X_neg_normalized shape:", X_neg_normalized.shape)

# Check data types and shapes first
_print_dtype_and_shape("X_pos_normalized", X_pos_normalized)
_print_dtype_and_shape("X_neg_normalized", X_neg_normalized)

print(_SEPARATOR)
_print_summary_stats("X_pos_normalized", X_pos_normalized)
_print_summary_stats("X_neg_normalized", X_neg_normalized)
print(_SEPARATOR)

# Verify normalization worked - check L2 norms (each should be ~1.0)
print("X_pos_normalized sample norms:", np.linalg.norm(X_pos_normalized.reshape(-1, n_features)[:5], axis=1))
print("X_neg_normalized sample norms:", np.linalg.norm(X_neg_normalized.reshape(-1, n_features)[:5], axis=1))

# Plot PCA
# Project-local module; it is importable only after the code directory has
# been added to sys.path in the import cell.
from format_results_fixed import plot_pca_or_tsne_layerwise
plot_pca_or_tsne_layerwise(X_pos_normalized,
                           X_neg_normalized,
                           hate_data['is_harmfull_opposition'],
                           standardize=False, n_components=5,
                           components=[0, 1], mode='pca',
                           plot_title='PCA clustering, Gemma2b Base not pretr hate vs normal')

X_pos dtype: float32
X_pos shape: (1244, 27, 2304)
X_neg dtype: float32
X_neg shape: (1244, 27, 2304)
######################################################################################## 
X_pos.max() 335.68018
X_pos.min() -404.33563
X_pos.mean() 0.04543885
X_pos.std() 5.0773478
X_pos.median() 0.002463609
X_neg.max() 307.2693
X_neg.min() -354.6938
X_neg.mean() 0.058767434
X_neg.std() 4.8582144
X_neg.median() 0.012776405
######################################################################################## 
X_pos sample: [-0.41606918  0.8024701  -1.5463089   2.3808136  -1.8874665 ]
X_neg sample: [-0.41606918  0.8024701  -1.5463089   2.3808136  -1.8874665 ]
Normalization completed!
X_pos_normalized shape: (1244, 27, 2304)
X_neg_normalized shape: (1244, 27, 2304)
X_pos_normalized dtype: float32
X_pos_normalized shape: (1244, 27, 2304)
X_neg_normalized dtype: float32
X_neg_normalized shape: (1244, 27, 2304)
##############################################################################

KeyboardInterrupt: 

After that, we obtain the CCS probe and its weights for the selected layer. In the cell below the previously trained probe is loaded from disk.

In [None]:
# Load the pickled CCS results for layer LAYER_IDX of the selected model.
# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by this project.
with open(f'ccs_{YOUR_NAME}_l_{LAYER_IDX}_mixed_data_results.pkl', 'rb') as ccs_file:
    ccs = pickle.load(ccs_file)

# Second results file whose name carries no layer index.
# NOTE(review): the variable is spelled `css_orig` (not `ccs_orig`); the name
# is kept because other cells may reference it.
with open(f'ccs_{YOUR_NAME}_mixed_data_results.pkl', 'rb') as ccs_orig_file:
    css_orig = pickle.load(ccs_orig_file)

FileNotFoundError: [Errno 2] No such file or directory: 'ccs_gemma-2-2b_l_18_mixed_data_results.pkl'

# **6. Steering**

In [None]:
from steering_gemma import plot_steering_power, plot_boundary
from steering_gemma import PatchHook


In [None]:
# Sweep of 30 steering strengths between -0.05 and 0.05.
deltas = np.linspace(-0.05, 0.05, 30)

# First 257 samples, moved to the device the CCS object reports.
# NOTE(review): presumably rows [:257] are the harmful statements (the plot
# labels say "[harm]") — confirm the split index against the dataset.
X_pos_tensor = torch.tensor(X_pos_normalized[:257], dtype=torch.float32, device=ccs.device)
X_neg_tensor = torch.tensor(X_neg_normalized[:257], dtype=torch.float32, device=ccs.device)

# "ДА" / "НЕТ" in the labels are Russian for "YES" / "NO".
plot_steering_power(ccs, X_pos_tensor, X_neg_tensor, deltas, labels=["POS (statement + ДА) [harm]", "NEG (statement + НЕТ) [harm]"], 
                    title="Steering along opinion direction [harm]")

In [None]:
# Sweep of 30 steering strengths between -0.05 and 0.05.
deltas = np.linspace(-0.05, 0.05, 30)

# Remaining samples (index 257 onwards), moved to the device the CCS object reports.
# NOTE(review): presumably rows [257:] are the safe statements (the plot
# labels say "[safe]") — confirm the split index against the dataset.
X_pos_tensor = torch.tensor(X_pos_normalized[257:], dtype=torch.float32, device=ccs.device)
X_neg_tensor = torch.tensor(X_neg_normalized[257:], dtype=torch.float32, device=ccs.device)

# "ДА" / "НЕТ" in the labels are Russian for "YES" / "NO".
# The title now says "[safe]" to match the labels and the data slice; it was
# previously a copy of the "[harm]" title from the cell above.
plot_steering_power(ccs, X_pos_tensor, X_neg_tensor, deltas, labels=["POS (statement + ДА) [safe]", "NEG (statement + НЕТ) [safe]"],
                    title="Steering along opinion direction [safe]")

Manual calibration

In [None]:
# Manually steer a single hidden vector along the CCS direction and compare the
# probe output before and after.
# NOTE(review): `X_pos_normalized_data` is not defined in the cells shown
# above; `.loc` suggests a pandas object — confirm where it is created.
idx = 1  # label passed to .loc below (NOTE(review): with a 0-based index this is the second row, not the first)
h_orig = torch.tensor(X_pos_normalized_data.loc[idx], dtype=torch.float32, device=ccs.device)

# Fetch the probe weights; the second returned value is discarded.
# NOTE(review): nothing here converts `weights` to a NumPy array — the
# np.linalg.norm call below assumes it already is array-like.
weights, _ = ccs.get_weights()

# Unit-length steering direction (the 1e-6 avoids division by zero).
direction = weights / (np.linalg.norm(weights) + 1e-6)
# Shift the hidden vector by 0.025 along the direction.
# NOTE(review): h_orig is a torch tensor on ccs.device while `direction` comes
# from NumPy arithmetic — confirm this addition works on the target device.
h_steered =h_orig + 0.025*direction

# Probe outputs for the original and the steered vector (batch dimension added).
p_orig = ccs.best_probe(h_orig.unsqueeze(0)).item()
p_steered = ccs.best_probe(h_steered.unsqueeze(0)).item()

print(f"Original: {p_orig:.4f}, Steered: {p_steered:.4f}")

In [None]:

# Plot the CCS decision boundary using the helper imported from steering_gemma.
# NOTE(review): `X_pos_normalized_data` / `X_neg_normalized_data` are not
# defined in the cells shown above — confirm where they are created.
# NOTE(review): the meaning of the positional arguments `3` and `[0, 1]` is not
# visible here (presumably a component count and the components to draw) —
# check the plot_boundary signature.
plot_boundary(
    X_pos_normalized_data, 
    X_neg_normalized_data,  
    hate_data['is_harmfull_opposition'], 
    ccs, 
    3, 
    [0, 1]
)

# 6.1. Steering

Very important: `alpha_neg = -alpha_pos`

Negative texts (the statements suffixed with " No.") are steered with `-1 * alpha`.

In [None]:
# CHANGED: Disk-optimized steering with clean progress bars
# CHANGED: Replace verbose output with tqdm progress bars
# CHANGED: Much cleaner and more informative output

import torch
import gc
import pandas as pd
import numpy as np
import os
import h5py
from pathlib import Path
from tqdm import tqdm
from steering_gemma import create_steering_direction, prepare_gemma_inputs, PatchHook

# CHANGED: Configuration for disk caching
# Directory (relative to the notebook's working directory) that receives the
# HDF5 hidden-state files, the labels and the normalised tensors.
CACHE_DIR = Path("steering_cache")
# Create it on first use; exist_ok avoids an error when the cell is re-run.
CACHE_DIR.mkdir(exist_ok=True)

def gpu_cleanup():
    """Release cached CUDA memory; does nothing when CUDA is unavailable.

    Empties the CUDA cache, synchronises, and then runs three rounds of Python
    garbage collection, each followed by another cache flush.
    """
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    rounds_left = 3
    while rounds_left > 0:
        gc.collect()
        torch.cuda.empty_cache()
        rounds_left -= 1

def save_hidden_states_hdf5(hidden_states, filepath, layer_indices=None):
    """Write the last-position hidden state of selected layers to a new HDF5 file.

    An existing file at ``filepath`` is overwritten (mode ``'w'``). For every
    requested layer a gzip-compressed dataset named ``layer_<idx>`` is created
    whose first axis is unlimited, so further batches can be appended later.

    Args:
        hidden_states: Indexable collection of tensors, one per layer index.
            Each entry is sliced with ``[:, -1, :]`` (assumes a
            (batch, sequence, hidden) layout — TODO confirm).
        filepath: Destination HDF5 file.
        layer_indices: Layer indices to store; defaults to ``[18]``.

    Layer indices that are not smaller than ``len(hidden_states)`` are not
    written. A warning is emitted for them, because a file without the expected
    dataset makes later reads of that layer fail.
    """
    import warnings

    if layer_indices is None:
        layer_indices = [18]

    with h5py.File(filepath, 'w') as f:
        for layer_idx in layer_indices:
            if layer_idx >= len(hidden_states):
                warnings.warn(
                    f"layer index {layer_idx} is out of range for "
                    f"{len(hidden_states)} hidden states; nothing written for it"
                )
                continue

            # Keep only the last sequence position of every sample.
            # NOTE(review): if batches are right-padded, position -1 is a
            # padding token for shorter texts — confirm the padding side.
            layer_data = hidden_states[layer_idx][:, -1, :].detach().cpu().numpy()
            f.create_dataset(
                f'layer_{layer_idx}',
                data=layer_data,
                compression='gzip',
                # Unlimited first axis allows appending more rows later.
                maxshape=(None, layer_data.shape[1])
            )

def append_hidden_states_hdf5(hidden_states, filepath, layer_indices=None):
    """Append the last-position hidden state of selected layers to an HDF5 file.

    The file is opened in append mode and must already contain a resizable
    dataset ``layer_<idx>`` for every layer that gets written. Layer indices
    that are not smaller than ``len(hidden_states)`` are skipped.

    Args:
        hidden_states: Indexable collection of tensors, one per layer index;
            each entry is sliced with ``[:, -1, :]``.
        filepath: Existing HDF5 file to extend.
        layer_indices: Layer indices to append; defaults to ``[18]``.
    """
    wanted_layers = [18] if layer_indices is None else layer_indices

    with h5py.File(filepath, 'a') as h5_file:
        for layer_idx in wanted_layers:
            if layer_idx >= len(hidden_states):
                continue

            last_position = hidden_states[layer_idx][:, -1, :].cpu().numpy()
            dset = h5_file[f'layer_{layer_idx}']

            # Grow the dataset by one batch and fill the new tail rows.
            first_new_row = dset.shape[0]
            end_row = first_new_row + last_position.shape[0]
            dset.resize((end_row, dset.shape[1]))
            dset[first_new_row:end_row] = last_position

def load_hidden_states_hdf5(filepath, layer_idx, device=None):
    """Read dataset ``layer_<layer_idx>`` from an HDF5 file as a float32 tensor.

    Args:
        filepath: HDF5 file to read.
        layer_idx: Layer whose dataset is read.
        device: Target device; when ``None``, "cuda" is used if available,
            otherwise "cpu".

    Returns:
        torch.Tensor holding the whole dataset on ``device``.
    """
    target_device = device
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

    with h5py.File(filepath, 'r') as h5_file:
        # [:] reads the complete dataset into memory.
        values = h5_file[f'layer_{layer_idx}'][:]

    return torch.tensor(values, dtype=torch.float32, device=target_device)

def progress_stage1_collect_hidden_states(ccs, hate_data, gemma_tokenizer, gemma_model,
                                         layer_indices=[18], batch_size=16, max_samples=2000,
                                         alpha=0.049, token_idx=0):
    """Stage 1: run the model with a steering hook and cache hidden states on disk.

    Every statement is run twice: once with " Yes." appended and ``+alpha``
    (positive pass) and once with " No." appended and ``-alpha`` (negative
    pass). For each pass the last-position hidden state of every layer in
    ``layer_indices`` is written to an HDF5 file inside ``CACHE_DIR``; the
    labels are saved to ``CACHE_DIR / "labels.pt"``.

    Args:
        ccs: CCS object passed to ``create_steering_direction``.
        hate_data: Table with the columns 'statement' and
            'is_harmfull_opposition'.
        gemma_tokenizer: Tokenizer passed to ``prepare_gemma_inputs``.
        gemma_model: Model exposing ``model.layers``. It is moved to the
            working device and, at the end, to the CPU; both moves affect the
            caller's object.
        layer_indices: Layers to cache. Only the FIRST entry is hooked.
        batch_size: Number of texts per forward pass.
        max_samples: Upper bound on the number of rows used (``None`` = all).
        alpha: Steering strength handed to ``PatchHook``; the negative pass
            uses ``-alpha``.
        token_idx: Token position handed to ``PatchHook``.

    Returns:
        dict with the keys 'pos_file', 'neg_file', 'labels_file',
        'total_samples' and 'layer_indices'.

    NOTE(review): with Hugging Face ``output_hidden_states=True`` entry 0 of
    ``hidden_states`` is the embedding output, so ``hidden_states[k]`` is the
    input of ``model.layers[k]``, whereas the hook patches the output of
    ``model.layers[layer_indices[0]]``. The cached vectors for index k may
    therefore not contain the patch — confirm the intended layer offset.
    Likewise the hook receives ``token_idx`` (default 0) while position -1 is
    cached; confirm against ``PatchHook``.
    """
    print("="*80)
    print("🚀 STAGE 1: COLLECTING HIDDEN STATES WITH PROGRESS BARS")
    print("="*80)

    # Work on a copy so neither the shared default list nor the caller's list
    # is handed back in the result dict.
    layer_indices = list(layer_indices)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gpu_cleanup()

    # Setup steering
    direction = create_steering_direction(ccs, device=device)

    # Get and limit data
    true = hate_data['is_harmfull_opposition']
    texts = hate_data['statement']

    if max_samples is not None and len(texts) > max_samples:
        texts = texts[:max_samples]
        true = true[:max_samples]
        print(f"📊 Limited to {max_samples} samples")

    total_samples = len(texts)
    total_batches = (total_samples - 1) // batch_size + 1

    # Clean texts efficiently
    print("🧹 Cleaning text data...")
    clean_texts = [
        _stage1_clean_statement(text)
        for text in tqdm(texts, desc="Processing texts", leave=False)
    ]

    # Save labels
    true_values = true.values if hasattr(true, 'values') else true
    true_tensor = torch.tensor(true_values, dtype=torch.long, device=device)
    labels_file = CACHE_DIR / "labels.pt"
    torch.save(true_tensor.cpu(), labels_file)
    print(f"💾 Saved {len(true_tensor)} labels")

    # Prepare model
    gemma_model = gemma_model.to(device)
    gemma_model.eval()
    target_layer = gemma_model.model.layers[layer_indices[0]]

    print(f"⚙️  Processing {total_samples} samples in {total_batches} batches (size: {batch_size})")
    print(f"🎯 Target layer: {layer_indices[0]}")

    # Everything the two passes have in common.
    shared = dict(
        gemma_model=gemma_model, gemma_tokenizer=gemma_tokenizer,
        target_layer=target_layer, clean_texts=clean_texts, labels=true_tensor,
        direction=direction, token_idx=token_idx, device=device,
        layer_indices=layer_indices, batch_size=batch_size,
    )

    # CHANGED: Process POSITIVE steering with progress bar
    print(f"\n🔄 Processing POSITIVE steering...")
    pos_file = CACHE_DIR / "pos_hidden_states.h5"
    _stage1_run_steered_pass(suffix=" Yes.", alpha=alpha, out_file=pos_file,
                             desc="✅ Positive batches", **shared)
    print(f"✅ Positive hidden states saved to {pos_file}")

    # CHANGED: Process NEGATIVE steering with progress bar (negated alpha)
    print(f"\n🔄 Processing NEGATIVE steering...")
    neg_file = CACHE_DIR / "neg_hidden_states.h5"
    _stage1_run_steered_pass(suffix=" No.", alpha=-alpha, out_file=neg_file,
                             desc="🔴 Negative batches", **shared)
    print(f"✅ Negative hidden states saved to {neg_file}")

    # Clear model: .cpu() moves the caller's model object off the GPU; `del`
    # only drops this function's local reference.
    del shared, target_layer
    gemma_model = gemma_model.cpu()
    del gemma_model
    gpu_cleanup()

    print(f"\n🎉 STAGE 1 COMPLETED!")
    print(f"📁 Cache directory: {CACHE_DIR}")
    print(f"💾 Positive data: {pos_file}")
    print(f"💾 Negative data: {neg_file}")
    print(f"🗑️  GPU memory cleared")

    return {
        'pos_file': pos_file,
        'neg_file': neg_file,
        'labels_file': labels_file,
        'total_samples': total_samples,
        'layer_indices': layer_indices
    }


def _stage1_clean_statement(text, max_chars=200):
    """Return a stripped statement truncated to ``max_chars``, or a placeholder for missing/empty text."""
    if pd.isna(text) or text is None:
        return "Unknown statement"
    cleaned = str(text).strip()[:max_chars]
    return cleaned if len(cleaned) > 0 else "Empty statement"


def _stage1_run_steered_pass(gemma_model, gemma_tokenizer, target_layer, clean_texts, labels,
                             suffix, direction, alpha, token_idx, device, out_file,
                             layer_indices, batch_size, desc):
    """Run all batches of one steering pass and cache the hidden states in ``out_file``."""
    total_samples = len(clean_texts)
    total_batches = (total_samples - 1) // batch_size + 1

    # The context manager closes the bar even if a batch raises.
    with tqdm(total=total_batches, desc=desc,
              bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {postfix}') as progress:
        for start in range(0, total_samples, batch_size):
            stop = min(start + batch_size, total_samples)
            batch_texts = [text + suffix for text in clean_texts[start:stop]]
            batch_labels = labels[start:stop]

            progress.set_postfix({
                'GPU_MB': f"{torch.cuda.memory_allocated()/1024**2:.0f}" if torch.cuda.is_available() else "N/A",
                'Samples': f"{stop}/{total_samples}"
            })

            # Create inputs and hook
            inputs = prepare_gemma_inputs(batch_texts, gemma_tokenizer, device)
            hook_obj = PatchHook(token_idx=token_idx, direction=direction,
                                 character=batch_labels, alpha=alpha, device=device)
            hook_handle = target_layer.register_forward_hook(hook_obj)

            # Forward pass; always detach the hook, otherwise a failed batch
            # would leave the model permanently patched.
            try:
                with torch.no_grad():
                    outputs = gemma_model(**inputs, output_hidden_states=True)
            finally:
                hook_handle.remove()

            # The first batch creates the file, later batches extend it.
            if start == 0:
                save_hidden_states_hdf5(outputs.hidden_states, out_file, layer_indices)
            else:
                append_hidden_states_hdf5(outputs.hidden_states, out_file, layer_indices)

            # Cleanup
            del inputs, outputs, hook_obj
            gpu_cleanup()

            progress.update(1)

def progress_stage2_process_and_normalize(cache_info, layer_idx=18):
    """Stage 2: load the cached hidden states of one layer, L2-normalise and save them.

    NOTE: a later cell of this notebook defines a function with the same name
    (taking no ``cache_info``); once that cell has run, it replaces this one.

    Args:
        cache_info: Mapping with the keys 'labels_file', 'pos_file' and
            'neg_file' (the dict returned by stage 1 has them).
        layer_idx: Layer whose dataset ``layer_<idx>`` is read from both files.

    Returns:
        Tuple ``(X_pos_layerwise, X_neg_layerwise, labels)``; the two data
        tensors carry an extra axis of size 1 at position 1.

    Side effects: writes ``pos_norm_layer_<idx>.pt``, ``neg_norm_layer_<idx>.pt``,
    ``pos_plot_ready.pt`` and ``neg_plot_ready.pt`` into ``CACHE_DIR``.
    """
    print("="*80)
    print("🔄 STAGE 2: PROCESSING AND NORMALIZING")
    print("="*80)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gpu_cleanup()

    # Load labels
    # NOTE: torch.load unpickles the file — only load caches written by stage 1.
    print("📥 Loading labels...")
    labels = torch.load(cache_info['labels_file'], map_location=device)
    print(f"✅ Loaded {len(labels)} labels")

    # Positive split first, then negative.
    X_pos_norm = _stage2_load_normalize_and_cache(
        cache_info['pos_file'], layer_idx, device, word="positive", prefix="pos")
    X_neg_norm = _stage2_load_normalize_and_cache(
        cache_info['neg_file'], layer_idx, device, word="negative", prefix="neg")

    # Create layerwise format
    print("🔄 Creating layerwise format for plotting...")
    X_pos_layerwise = X_pos_norm.unsqueeze(1)  # Add layer dimension
    X_neg_layerwise = X_neg_norm.unsqueeze(1)  # Add layer dimension

    # Save final data
    torch.save(X_pos_layerwise.cpu(), CACHE_DIR / "pos_plot_ready.pt")
    torch.save(X_neg_layerwise.cpu(), CACHE_DIR / "neg_plot_ready.pt")

    print(f"\n🎉 STAGE 2 COMPLETED!")
    print(f"📊 Data ready for plotting:")
    print(f"   • Positive: {X_pos_layerwise.shape}")
    print(f"   • Negative: {X_neg_layerwise.shape}")
    print(f"   • Labels: {labels.shape}")

    return X_pos_layerwise, X_neg_layerwise, labels


def _stage2_load_normalize_and_cache(h5_file, layer_idx, device, word, prefix):
    """Load one split from HDF5, L2-normalise each row, save it to CACHE_DIR and return it."""
    print(f"📥 Loading {word} hidden states for layer {layer_idx}...")
    raw = load_hidden_states_hdf5(h5_file, layer_idx, device)
    print(f"✅ Loaded {word} data: {raw.shape}")

    print(f"🔄 Normalizing {word} data...")
    normalized = torch.nn.functional.normalize(raw, p=2, dim=1)
    print(f"✅ Normalized {word}: {normalized.shape}")

    # Save and clear the un-normalised copy before the next split is loaded.
    torch.save(normalized.cpu(), CACHE_DIR / f"{prefix}_norm_layer_{layer_idx}.pt")
    del raw
    gpu_cleanup()
    return normalized

# CHANGED: Execute with progress bars
print("🚀 STARTING PROGRESS BAR STEERING PIPELINE")
print("📊 Clean progress indicators instead of verbose output")

# CHANGED: Run Stage 1 with progress bars
# Uses the CCS object, data frame, tokenizer and model created in earlier
# cells; the returned dict holds the paths of the cache files that were written.
cache_info = progress_stage1_collect_hidden_states(
    ccs=ccs,
    hate_data=hate_data,
    gemma_tokenizer=gemma_tokenizer,
    gemma_model=gemma_model,
    layer_indices=[LAYER_IDX],
    batch_size=16,
    # NOTE(review): 1244 equals the sample count printed for X_pos above;
    # if hate_data has more rows than that, only the first 1244 are used.
    max_samples=1244  # Use full dataset
)

print(f"\n✨ STAGE 1 SUCCESS!")
print(f"🎯 Ready for Stage 2...")

print("\n" + "="*80)
print("🎯 READY FOR STAGE 2")
print("Run the next cell to process and normalize data...")
print("="*80)

In [None]:
# CHANGED: Stage 2 with progress bars and clean output
# CHANGED: Process cached data with visual progress indicators
# CHANGED: Professional-looking progress tracking

import torch
import gc
import numpy as np
from pathlib import Path
from tqdm import tqdm
import h5py

# Same cache directory as in the stage 1 cell. It is not created here; the
# stage 2 function raises if it does not exist.
CACHE_DIR = Path("steering_cache")

def gpu_cleanup():
    """Free cached CUDA memory; a no-op on machines without CUDA.

    NOTE: this repeats the definition from the stage 1 cell so that the
    stage 2 cell can run on its own.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # Collect Python garbage three times, flushing the CUDA cache after each.
    for _round in (1, 2, 3):
        gc.collect()
        torch.cuda.empty_cache()

def load_hidden_states_hdf5(filepath, layer_idx, device=None):
    """Read dataset ``layer_<layer_idx>`` from an HDF5 file as a float32 tensor.

    Prints a short progress line first. ``filepath`` must provide a ``.name``
    attribute (e.g. a ``pathlib.Path``), which is used in that message.

    Args:
        filepath: HDF5 file to read.
        layer_idx: Layer whose dataset is read.
        device: Target device; when ``None``, "cuda" is used if available,
            otherwise "cpu".

    Returns:
        torch.Tensor holding the whole dataset on ``device``.
    """
    target_device = device
    if target_device is None:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"📥 Loading layer {layer_idx} from {filepath.name}...")
    with h5py.File(filepath, 'r') as h5_file:
        # [:] reads the complete dataset into memory.
        values = h5_file[f'layer_{layer_idx}'][:]

    return torch.tensor(values, dtype=torch.float32, device=target_device)

def progress_stage2_process_and_normalize(layer_idx=None):
    """Stage 2: load the cached hidden states of one layer, L2-normalise and save them.

    Reads ``labels.pt``, ``pos_hidden_states.h5`` and ``neg_hidden_states.h5``
    from ``CACHE_DIR`` and writes ``pos_norm_layer_<idx>.pt``,
    ``neg_norm_layer_<idx>.pt``, ``pos_plot_ready.pt`` and
    ``neg_plot_ready.pt`` back into it.

    Args:
        layer_idx: Layer to process. When ``None`` (the default) the notebook
            global ``LAYER_IDX`` is used if it exists, otherwise 18.

    Returns:
        Tuple ``(X_pos_layerwise, X_neg_layerwise, labels)``; the two data
        tensors carry an extra axis of size 1 at position 1.

    Raises:
        ValueError: if the cache directory or one of the input files is missing.
    """
    print("="*80)
    print("🔄 STAGE 2: PROCESSING AND NORMALIZING DATA")
    print("="*80)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gpu_cleanup()

    # Verify cache directory exists
    if not CACHE_DIR.exists():
        raise ValueError(f"❌ Cache directory {CACHE_DIR} not found. Run Stage 1 first.")

    # Load labels with progress
    labels_file = CACHE_DIR / "labels.pt"
    if not labels_file.exists():
        raise ValueError(f"❌ Labels file not found. Run Stage 1 first.")

    # NOTE: torch.load unpickles the file — only load caches written by stage 1.
    print("📥 Loading labels...")
    labels = torch.load(labels_file, map_location=device)
    print(f"✅ Loaded {len(labels)} labels")

    # Get layer index: explicit argument first, then the notebook global.
    if layer_idx is None:
        layer_idx = LAYER_IDX if 'LAYER_IDX' in globals() else 18
    print(f"🎯 Processing layer {layer_idx}")

    # CHANGED: Process positive, then negative data with progress
    X_pos_norm = _stage2_normalize_cached_split("✅", "positive", "pos", layer_idx, device)
    X_neg_norm = _stage2_normalize_cached_split("🔴", "negative", "neg", layer_idx, device)

    # CHANGED: Create final format with progress
    print(f"\n🔄 Preparing final format...")

    with tqdm(total=3, desc="📊 Final processing",
              bar_format='{desc}: {bar} {n_fmt}/{total_fmt} [{elapsed}]') as pbar:

        pbar.set_description("📊 Creating layerwise format")
        X_pos_layerwise = X_pos_norm.unsqueeze(1)  # Add layer dimension
        X_neg_layerwise = X_neg_norm.unsqueeze(1)  # Add layer dimension
        pbar.update(1)

        pbar.set_description("📊 Saving plot-ready data")
        torch.save(X_pos_layerwise.cpu(), CACHE_DIR / "pos_plot_ready.pt")
        torch.save(X_neg_layerwise.cpu(), CACHE_DIR / "neg_plot_ready.pt")
        pbar.update(1)

        pbar.set_description("📊 Final cleanup")
        gpu_cleanup()
        pbar.update(1)

    print(f"\n🎉 STAGE 2 COMPLETED SUCCESSFULLY!")
    print(f"📊 Final data shapes:")
    print(f"   • Positive: {X_pos_layerwise.shape}")
    print(f"   • Negative: {X_neg_layerwise.shape}")
    print(f"   • Labels: {labels.shape}")

    return X_pos_layerwise, X_neg_layerwise, labels


def _stage2_normalize_cached_split(icon, word, prefix, layer_idx, device):
    """Load, L2-normalise and save one split ('positive'/'negative') of the stage 1 cache."""
    title = word.capitalize()

    print(f"\n🔄 Processing {word} data...")
    h5_path = CACHE_DIR / f"{prefix}_hidden_states.h5"

    if not h5_path.exists():
        raise ValueError(f"❌ {title} data file not found: {h5_path}")

    with tqdm(total=3, desc=f"{icon} {title} processing",
              bar_format='{desc}: {bar} {n_fmt}/{total_fmt} [{elapsed}]') as pbar:

        pbar.set_description(f"{icon} Loading {word} data")
        raw = load_hidden_states_hdf5(h5_path, layer_idx, device)
        pbar.update(1)

        pbar.set_description(f"{icon} Normalizing {word}")
        normalized = torch.nn.functional.normalize(raw, p=2, dim=1)
        pbar.update(1)

        pbar.set_description(f"{icon} Saving {word} norm")
        torch.save(normalized.cpu(), CACHE_DIR / f"{prefix}_norm_layer_{layer_idx}.pt")
        # Drop the un-normalised copy before the next split is loaded.
        del raw
        gpu_cleanup()
        pbar.update(1)

    print(f"✅ {title} data processed: {normalized.shape}")
    return normalized

def compare_predictions_with_progress(X_pos_norm, X_neg_norm, ccs, labels):
    """Run the CCS probe on steered activations and compare with the unsteered ones.

    Args:
        X_pos_norm: Tensor of steered positive activations; axis 1 is squeezed
            away before prediction.
        X_neg_norm: Tensor of steered negative activations, same layout.
        ccs: Object whose ``predict(neg, pos)`` returns ``(classes, probas)``
            (presumably NumPy arrays — TODO confirm).
        labels: Accepted for interface compatibility; not used by this function.

    Returns:
        Tuple ``(classes_st, probas_st)`` for the steered activations.

    When the notebook globals ``X_neg_normalized`` and ``X_pos_normalized``
    exist, the unsteered predictions are computed as well and a comparison is
    printed. For 3-D arrays the layer ``LAYER_IDX`` (or 18 if that global is
    missing) is selected.
    """
    print("\n" + "="*60)
    print("🔍 ANALYZING STEERING EFFECTS")
    print("="*60)

    # CHANGED: Process predictions with progress
    with tqdm(total=4, desc="🧠 CCS Analysis",
              bar_format='{desc}: {bar} {n_fmt}/{total_fmt} [{elapsed}]') as pbar:

        pbar.set_description("🧠 Converting to CPU")
        # .cpu() is a no-op for CPU tensors and also covers non-CUDA
        # accelerators, so no is_cuda branch is needed.
        X_pos_cpu = X_pos_norm.squeeze(1).detach().cpu().numpy()
        X_neg_cpu = X_neg_norm.squeeze(1).detach().cpu().numpy()
        pbar.update(1)

        pbar.set_description("🧠 Computing steering predictions")
        classes_st, probas_st = ccs.predict(X_neg_cpu, X_pos_cpu)
        pbar.update(1)

        pbar.set_description("🧠 Analyzing results")
        pos_rate = (classes_st == 1).mean() * 100
        prob_mean = probas_st.mean()
        prob_std = probas_st.std()
        pbar.update(1)

        pbar.set_description("🧠 Analysis complete")
        pbar.update(1)

    n_predictions = len(classes_st)

    print(f"✅ Steering predictions computed successfully!")
    print(f"📊 Results summary:")
    print(f"   • Positive predictions: {(classes_st == 1).sum()}/{n_predictions} ({pos_rate:.1f}%)")
    print(f"   • Probability stats: μ={prob_mean:.3f}, σ={prob_std:.3f}")
    print(f"   • Probability range: [{probas_st.min():.3f}, {probas_st.max():.3f}]")

    # Compare with original if available
    if 'X_neg_normalized' in globals() and 'X_pos_normalized' in globals():
        print(f"\n🔍 Comparing with original predictions...")

        layer_idx = LAYER_IDX if 'LAYER_IDX' in globals() else 18

        if len(X_neg_normalized.shape) == 3:
            # (samples, layers, features): pick the probed layer.
            X_neg_orig = X_neg_normalized[:, layer_idx, :]
            X_pos_orig = X_pos_normalized[:, layer_idx, :]
        else:
            X_neg_orig = X_neg_normalized
            X_pos_orig = X_pos_normalized

        classes_or, probas_or = ccs.predict(X_neg_orig, X_pos_orig)

        changed_predictions = (classes_st != classes_or).sum()
        # Guard against an empty prediction set.
        change_percentage = changed_predictions / n_predictions * 100 if n_predictions else 0.0
        original_pos_rate = (classes_or == 1).mean() * 100

        print(f"📈 Steering impact:")
        print(f"   • Original positive rate: {original_pos_rate:.1f}%")
        print(f"   • Steered positive rate: {pos_rate:.1f}%")
        print(f"   • Predictions changed: {changed_predictions}/{n_predictions} ({change_percentage:.1f}%)")

        if change_percentage > 5:  # Only show examples if significant changes
            prob_diff = np.abs(probas_st - probas_or)
            significant_changes = np.where((prob_diff > 0.2) & (classes_st != classes_or))[0]

            if len(significant_changes) > 0:
                print(f"🎯 Significant changes (first 3 examples):")
                for idx in significant_changes[:3]:
                    print(f"   Sample {idx}: {classes_or[idx]} → {classes_st[idx]} "
                          f"(Δprob: {prob_diff[idx]:.3f})")

    return classes_st, probas_st

# CHANGED: Execute Stage 2 with progress bars
print("🚀 STARTING STAGE 2 WITH PROGRESS TRACKING")
print("📊 Processing cached data with visual indicators...")

# Process the cached data
# Calls the zero-argument stage 2 function defined in this cell, which reads
# the files stage 1 wrote into CACHE_DIR.
X_pos_st_normalized, X_neg_st_normalized, labels_steering = progress_stage2_process_and_normalize()

print(f"\n✨ STAGE 2 SUCCESS!")
print(f"📊 Data loaded and ready:")
print(f"   • Device: {X_pos_st_normalized.device}")
print(f"   • Memory efficient: ✅")
print(f"   • Ready for plotting: ✅")

# Analyze steering effects
# `ccs` is the probe loaded from the pickle in an earlier cell.
print(f"\n🔍 ANALYZING STEERING EFFECTS...")
classes_st, probas_st = compare_predictions_with_progress(
    X_pos_st_normalized, X_neg_st_normalized, ccs, labels_steering
)

# Ready for plotting
print(f"\n🎨 READY FOR VISUALIZATION!")
print(f"📊 Use this command:")
print(f"plot_pca_or_tsne_layerwise(X_pos_st_normalized, X_neg_st_normalized, labels_steering, standardize=False, n_components=5, components=[1, 3])")

# Try to plot automatically if function exists
# The function is only present when the earlier PCA cell has imported it.
if 'plot_pca_or_tsne_layerwise' in globals():
    print(f"\n📊 GENERATING VISUALIZATION...")
    
    # Show a progress bar for plotting
    with tqdm(total=1, desc="🎨 Creating plot", 
                bar_format='{desc}: {bar} {elapsed}') as pbar:
        # NOTE(review): torch tensors are passed here, whereas the earlier PCA
        # cell passed NumPy arrays and a pandas Series — confirm the plotting
        # helper accepts tensors.
        plot_pca_or_tsne_layerwise(
            X_pos_st_normalized,
            X_neg_st_normalized, 
            labels_steering,
            standardize=False,
            n_components=5, 
            components=[1, 3]
        )
        pbar.update(1)
        print(f"✅ Plot generated successfully!")
else:
    print(f"⚠️  plot_pca_or_tsne_layerwise function not found")
    print(f"📝 Define the function and use the command above")

# CHANGED: Convert tensors to numpy for compatibility with CCS functions
# The same conversion is repeated in the first cell of the next section.
print(f"\n🔄 Converting tensors to numpy for CCS compatibility...")
X_pos_st_numpy = X_pos_st_normalized.cpu().numpy() if X_pos_st_normalized.is_cuda else X_pos_st_normalized.numpy()
X_neg_st_numpy = X_neg_st_normalized.cpu().numpy() if X_neg_st_normalized.is_cuda else X_neg_st_normalized.numpy()
labels_numpy = labels_steering.cpu().numpy() if labels_steering.is_cuda else labels_steering.numpy()

print(f"✅ Converted to numpy arrays:")
print(f"   • X_pos_st_numpy: {X_pos_st_numpy.shape} {X_pos_st_numpy.dtype}")
print(f"   • X_neg_st_numpy: {X_neg_st_numpy.shape} {X_neg_st_numpy.dtype}")
print(f"   • labels_numpy: {labels_numpy.shape} {labels_numpy.dtype}")

print(f"\n🎉 STEERING PIPELINE COMPLETED!")

# Final cleanup and memory report
gpu_cleanup()

if torch.cuda.is_available():
    print(f"\n📊 Final GPU Memory: {torch.cuda.memory_allocated()/1024**2:.1f} MB")

# Only prints the clean-up command; nothing is deleted here.
print(f"\n🗑️  To clean up cache files: import shutil; shutil.rmtree('{CACHE_DIR}')")
print("="*80)

# **7. PCA results and steered probe**

In [None]:
# CELL 1: Convert PyTorch tensors to NumPy arrays for CCS compatibility
# CHANGED: Handle tensor/numpy conversion for downstream analysis

import torch
import numpy as np
import pandas as pd

print("="*80)
print("🔄 CELL 1: CONVERTING TENSORS TO NUMPY")
print("="*80)

# CHANGED: Check if steered data exists
required_vars = ['X_pos_st_normalized', 'X_neg_st_normalized', 'labels_steering']
missing_vars = [var for var in required_vars if var not in globals()]

if missing_vars:
    print(f"❌ Missing variables: {missing_vars}")
    print("Please run the steering pipeline first!")
    # Everything below dereferences these names, so stop with an explicit
    # error instead of falling through to an unrelated-looking NameError.
    raise RuntimeError(f"Cannot convert steered data, missing variables: {missing_vars}")

print(f"✅ Found steered data variables")
print(f"   • X_pos_st_normalized: {X_pos_st_normalized.shape}")
print(f"   • X_neg_st_normalized: {X_neg_st_normalized.shape}")
print(f"   • labels_steering: {labels_steering.shape}")

# CHANGED: Convert tensors to numpy arrays
print(f"\n🔄 Converting PyTorch tensors to NumPy arrays...")

# Convert steered data.
# detach() is required for tensors that track gradients (Tensor.numpy() raises
# on those); cpu() is a no-op for host tensors, so no is_cuda branch is needed.
X_pos_st_numpy = X_pos_st_normalized.detach().cpu().numpy()
X_neg_st_numpy = X_neg_st_normalized.detach().cpu().numpy()
labels_numpy = labels_steering.detach().cpu().numpy()

print(f"✅ Conversion completed:")
print(f"   • X_pos_st_numpy: {X_pos_st_numpy.shape} {X_pos_st_numpy.dtype}")
print(f"   • X_neg_st_numpy: {X_neg_st_numpy.shape} {X_neg_st_numpy.dtype}")
print(f"   • labels_numpy: {labels_numpy.shape} {labels_numpy.dtype}")

# CHANGED: Convert labels to pandas Series for CCS compatibility
labels_series = pd.Series(labels_numpy)
print(f"   • labels_series: {labels_series.shape} (pandas Series)")

# Class counts assume binary labels encoded as 0 / 1.
print(f"\n📊 Data summary:")
print(f"   • Total samples: {len(labels_numpy)}")
print(f"   • Positive class: {(labels_numpy == 1).sum()} ({(labels_numpy == 1).mean()*100:.1f}%)")
print(f"   • Negative class: {(labels_numpy == 0).sum()} ({(labels_numpy == 0).mean()*100:.1f}%)")

print(f"\n✅ CELL 1 COMPLETED!")
print(f"📋 Variables created:")
print(f"   • X_pos_st_numpy, X_neg_st_numpy, labels_numpy (NumPy arrays)")
print(f"   • labels_series (Pandas Series)")
print("="*80)


# CELL 2: Create train/test split for CCS analysis
# CHANGED: Stratified split to ensure balanced classes in train/test

from sklearn.model_selection import train_test_split
import numpy as np

print("="*80)
print("📊 CELL 2: CREATING TRAIN/TEST SPLIT")
print("="*80)

# CHANGED: Check if converted data exists
if 'labels_numpy' not in globals():
    print("❌ Run Cell 1 first to convert tensors!")
else:
    # CHANGED: Create stratified train/test split
    # Only row indices are split; downstream code uses train_idx / test_idx to
    # index the embedding arrays and the labels consistently.
    n_samples = len(labels_numpy)
    indices = np.arange(n_samples)

    print(f"🔄 Creating stratified train/test split...")
    print(f"   • Total samples: {n_samples}")
    print(f"   • Test size: 20%")
    print(f"   • Random state: 42 (for reproducibility)")

    train_idx, test_idx = train_test_split(
        indices,
        test_size=0.2,
        random_state=42,
        stratify=labels_numpy,
        shuffle=True
    )

    # np.bincount only accepts integer input; the cast keeps the report working
    # when the labels arrive as floats (e.g. 0.0 / 1.0).
    train_class_counts = np.bincount(labels_numpy[train_idx].astype(int))
    test_class_counts = np.bincount(labels_numpy[test_idx].astype(int))

    print(f"\n✅ Split created successfully!")
    print(f"📊 Split statistics:")
    print(f"   • Train samples: {len(train_idx)} ({len(train_idx)/n_samples*100:.1f}%)")
    print(f"   • Test samples: {len(test_idx)} ({len(test_idx)/n_samples*100:.1f}%)")
    print(f"   • Train class distribution: {train_class_counts}")
    print(f"   • Test class distribution: {test_class_counts}")

    # CHANGED: Verify class balance
    train_pos_rate = (labels_numpy[train_idx] == 1).mean()
    test_pos_rate = (labels_numpy[test_idx] == 1).mean()

    print(f"\n🎯 Class balance verification:")
    print(f"   • Train positive rate: {train_pos_rate:.3f}")
    print(f"   • Test positive rate: {test_pos_rate:.3f}")
    print(f"   • Balance difference: {abs(train_pos_rate - test_pos_rate):.3f}")

    if abs(train_pos_rate - test_pos_rate) < 0.05:
        print(f"   ✅ Good class balance!")
    else:
        print(f"   ⚠️  Class imbalance detected")

    # Report completion only when the split was actually created; previously
    # this banner was printed even after the "Run Cell 1 first" failure.
    print(f"\n✅ CELL 2 COMPLETED!")
    print(f"📋 Variables created:")
    print(f"   • train_idx, test_idx (train/test indices)")

print("="*80)

In [None]:
# CELL 3: Simple plotting approach (no hidden closes)
# SIMPLE PLOTTING FIX: Direct plotting without the problematic function
# CHANGED: Use matplotlib directly to ensure plots are displayed

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns

# Section banner for the plotting cell (three lines: rule, title, rule).
print("=" * 80, "🎨 SIMPLE PLOTTING APPROACH (NO HIDDEN CLOSES)", "=" * 80, sep="\n")

def simple_plot_pca(X_pos, X_neg, labels, components=[0, 1], title="PCA Plot"):
    """
    Scatter-plot two principal components of the standardized difference
    ``X_pos - X_neg`` and show the figure immediately (no plt.close()).

    Args:
        X_pos, X_neg: numpy arrays or torch tensors of identical shape. Size-1
            axes are squeezed away; the result must be 2-D (samples, features).
        labels: per-sample values used to colour the points (torch tensor,
            pandas Series, or any array-like).
        components: zero-based indices of the two components to plot.
        title: figure title.

    Returns:
        (X_pca, pca): the projected data and the fitted sklearn PCA object.

    Raises:
        ValueError: if the squeezed difference is not 2-D, or a requested
            component index is not available for this input size.
    """
    # Calculate difference vectors
    X_diff = X_pos.squeeze() - X_neg.squeeze()  # Remove layer dimension

    print(f"   • Processing data shape: {X_diff.shape}")

    if X_diff.ndim != 2:
        raise ValueError(
            f"expected 2-D (samples, features) data after squeeze, got shape "
            f"{tuple(X_diff.shape)}; select a single layer before plotting"
        )

    # Standardize (done in the input's own array type, as before)
    X_std = (X_diff - X_diff.mean(0)) / (X_diff.std(0) + 1e-8)

    # sklearn cannot consume CUDA or grad-tracking tensors; hand it numpy.
    if hasattr(X_std, 'detach'):
        X_std = X_std.detach().cpu().numpy()

    # Keep the historical 5 components, grow only when a higher index is
    # requested, and never exceed what PCA can extract from this input.
    comp_x, comp_y = components[0], components[1]
    n_components = min(max(5, comp_x + 1, comp_y + 1), X_std.shape[0], X_std.shape[1])
    if max(comp_x, comp_y) >= n_components:
        raise ValueError(
            f"components={list(components)} not available: only {n_components} "
            f"principal components can be computed for data of shape {X_std.shape}"
        )

    # PCA; random_state is pinned so that a randomized SVD solver (which
    # sklearn may pick for large inputs) gives the same plot on every run.
    pca = PCA(n_components=n_components, random_state=42)
    X_pca = pca.fit_transform(X_std)

    # Convert labels to numpy. Tensors are tested first: torch.Tensor exposes
    # a `values` *method*, so a plain hasattr(labels, 'values') check would
    # mistake a tensor for a pandas object and pass a bound method to scatter.
    if hasattr(labels, 'detach'):
        labels_array = labels.detach().cpu().numpy()
    elif hasattr(labels, 'values') and not callable(labels.values):
        labels_array = labels.values
    else:
        labels_array = np.array(labels)

    # Plot
    plt.figure(figsize=(10, 8))

    scatter = plt.scatter(X_pca[:, comp_x], X_pca[:, comp_y],
                          c=labels_array, cmap='viridis', alpha=0.6, s=20)

    plt.xlabel(f'PC{comp_x+1} ({pca.explained_variance_ratio_[comp_x]:.2%} var)')
    plt.ylabel(f'PC{comp_y+1} ({pca.explained_variance_ratio_[comp_y]:.2%} var)')
    plt.title(title)
    plt.colorbar(scatter, label='Class')
    plt.grid(True, alpha=0.3)

    # IMPORTANT: Show immediately and don't close
    plt.show()

    return X_pca, pca

def simple_plot_tsne(X_pos, X_neg, labels, title="T-SNE Plot"):
    """
    Scatter-plot a 2-D t-SNE embedding of the standardized difference
    ``X_pos - X_neg`` and show the figure immediately.

    Args:
        X_pos, X_neg: numpy arrays or torch tensors of identical shape. Size-1
            axes are squeezed away; the result must be 2-D (samples, features).
        labels: per-sample values used to colour the points (torch tensor,
            pandas Series, or any array-like), in the same row order as X_pos.
        title: figure title.

    Returns:
        X_tsne: the 2-D embedding. When the input has more than 1000 rows it
        covers a fixed-seed random subsample of 1000 rows.

    Raises:
        ValueError: if the squeezed difference is not 2-D.
    """
    # Calculate difference vectors
    X_diff = X_pos.squeeze() - X_neg.squeeze()

    print(f"   • Processing data shape: {X_diff.shape}")

    if X_diff.ndim != 2:
        raise ValueError(
            f"expected 2-D (samples, features) data after squeeze, got shape "
            f"{tuple(X_diff.shape)}; select a single layer before plotting"
        )

    # Standardize (done in the input's own array type, as before)
    X_std = (X_diff - X_diff.mean(0)) / (X_diff.std(0) + 1e-8)

    # sklearn cannot consume CUDA or grad-tracking tensors; hand it numpy.
    if hasattr(X_std, 'detach'):
        X_std = X_std.detach().cpu().numpy()

    # Convert labels to a numpy array *before* any subsampling so that the
    # indexing below is positional. Indexing a pandas Series with an integer
    # array is label-based and misaligns (or raises) on a non-default index.
    # Tensors are tested first because torch.Tensor has a `values` method.
    if hasattr(labels, 'detach'):
        labels_array = labels.detach().cpu().numpy()
    elif hasattr(labels, 'values') and not callable(labels.values):
        labels_array = labels.values
    else:
        labels_array = np.array(labels)

    # T-SNE (limit samples if too many)
    n_samples = X_std.shape[0]
    if n_samples > 1000:
        print(f"   • Subsampling to 1000 points for T-SNE speed")
        # Seeded like the TSNE call below so the figure is reproducible.
        indices = np.random.default_rng(42).choice(n_samples, 1000, replace=False)
        X_tsne_input = X_std[indices]
        labels_array = labels_array[indices]
    else:
        X_tsne_input = X_std

    # sklearn requires perplexity < n_samples; keep 30 whenever that is valid.
    perplexity = min(30, max(1, X_tsne_input.shape[0] - 1))

    # Run T-SNE
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    X_tsne = tsne.fit_transform(X_tsne_input)

    # Plot
    plt.figure(figsize=(10, 8))

    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1],
                          c=labels_array, cmap='viridis', alpha=0.6, s=20)

    plt.xlabel('T-SNE 1')
    plt.ylabel('T-SNE 2')
    plt.title(title)
    plt.colorbar(scatter, label='Class')
    plt.grid(True, alpha=0.3)

    # IMPORTANT: Show immediately
    plt.show()

    return X_tsne

# CHANGED: Check data availability
# Every name dereferenced by the steered-data plots is checked up front, not
# only X_pos_st_normalized.
plot_inputs = ['X_pos_st_normalized', 'X_neg_st_normalized', 'hate_data']
if any(name not in globals() for name in plot_inputs):
    print("❌ Run previous cells first!")
else:
    print(f"✅ Data ready: {X_pos_st_normalized.shape}")

    # Colour every plot by the same label column.
    # NOTE(review): labels come from hate_data, not labels_steering — confirm
    # the two are row-aligned with the steered embeddings.
    plot_labels = hate_data['is_harmfull_opposition']

    # Plot 1: PCA Components [0, 1]
    print(f"\n🎨 Plot 1: PCA Components [0, 1]...")
    X_pca_01, pca_01 = simple_plot_pca(
        X_pos_st_normalized, X_neg_st_normalized,
        plot_labels,
        components=[0, 1],
        title="Steered Data - PCA Components [0, 1]"
    )

    # Plot 2: PCA Components [1, 3]
    print(f"\n🎨 Plot 2: PCA Components [1, 3]...")
    X_pca_13, pca_13 = simple_plot_pca(
        X_pos_st_normalized, X_neg_st_normalized,
        plot_labels,
        components=[1, 3],
        title="Steered Data - PCA Components [1, 3]"
    )

    # Plot 3: T-SNE
    print(f"\n🎨 Plot 3: T-SNE Visualization...")
    X_tsne = simple_plot_tsne(
        X_pos_st_normalized, X_neg_st_normalized,
        plot_labels,
        title="Steered Data - T-SNE Visualization"
    )

    # Plot 4: Original vs Steered comparison (if available)
    if 'X_pos_normalized' in globals() and 'X_neg_normalized' in globals():
        print(f"\n🎨 Plot 4a: Original Data PCA [1, 3]...")
        # Use layer 18 when it exists, otherwise the last layer. The fallback
        # must be a non-negative index: with -1 the slice below would be
        # [-1:0], which is empty.
        n_layers = X_pos_normalized.shape[1]
        layer_idx = 18 if n_layers > 18 else n_layers - 1
        X_orig_pos = X_pos_normalized[:, layer_idx:layer_idx+1, :]  # Keep layer dimension
        X_orig_neg = X_neg_normalized[:, layer_idx:layer_idx+1, :]

        X_pca_orig, _ = simple_plot_pca(
            X_orig_pos, X_orig_neg,
            plot_labels,
            components=[1, 3],
            title="Original Data - PCA Components [1, 3]"
        )

        print(f"\n🎨 Plot 4b: Steered Data PCA [1, 3] (for comparison)...")
        simple_plot_pca(
            X_pos_st_normalized, X_neg_st_normalized,
            plot_labels,
            components=[1, 3],
            title="Steered Data - PCA Components [1, 3] (Comparison)"
        )

    print(f"\n✅ ALL PLOTS COMPLETED!")

print("="*80)

In [None]:
# CELL 4: CCS Analysis on Steered Data - COMPLETE EXPLANATION
# CHANGED: Train CCS on steered embeddings with full layer analysis
# CHANGED: Detailed explanation of what each layer contains

from ccs import train_ccs_on_hidden_states
import pandas as pd
import pickle
import numpy as np

print("="*80)
print("🧠 CELL 4: CCS ANALYSIS ON STEERED DATA - COMPLETE VERSION")
print("="*80)

# CHANGED: Explanation of what we're doing
print("📚 WHAT THIS CELL DOES:")
print("   • Takes steered embeddings from ALL 29 layers of Gemma model")
print("   • Trains a CCS probe for EACH layer separately")
print("   • Each layer gets its own accuracy, silhouette, agreement scores")
print("   • Finds which layer works best with steering")
print("   • Compares different normalization methods")

# CHANGED: Check if required data exists
# LAYER_IDX and YOUR_NAME are included because the rest of this cell uses them
# for the per-layer report and the output file names.
required_vars = ['X_pos_st_numpy', 'X_neg_st_numpy', 'labels_series', 'train_idx', 'test_idx',
                 'LAYER_IDX', 'YOUR_NAME']
missing_vars = [var for var in required_vars if var not in globals()]

if missing_vars:
    print(f"❌ Missing variables: {missing_vars}")
    print("Please run previous cells first!")
    # The CCS loop that follows dereferences all of these names, so stop here
    # with an explicit error instead of failing later with a NameError.
    raise RuntimeError(f"Cannot run CCS on steered data, missing variables: {missing_vars}")

print(f"✅ All required data available")
print(f"   • X_pos_st_numpy: {X_pos_st_numpy.shape}")  # Should be (1244, 29, 2304)
print(f"   • X_neg_st_numpy: {X_neg_st_numpy.shape}")  # Should be (1244, 29, 2304)
print(f"   • Train samples: {len(train_idx)}")
print(f"   • Test samples: {len(test_idx)}")

# NOTE(review): the explanation below hard-codes the expected shape; compare
# it with the actual shapes printed above.
print(f"\n📊 DATA STRUCTURE EXPLANATION:")
print(f"   • Shape (1244, 29, 2304) means:")
print(f"     - 1244 samples")
print(f"     - 29 layers (0 to 28)")
print(f"     - 2304 features per layer")
print(f"   • Layer {LAYER_IDX} contains your STEERED data")
print(f"   • Other layers contain copies + noise for compatibility")

# CHANGED: Run CCS analysis with different normalization methods
normalization_methods = ['median', 'mean', 'l2']
# normalization name -> object returned by train_ccs_on_hidden_states, which
# the code below treats as a dict {layer index: dict of metrics}.
steered_results = {}

for norm_method in normalization_methods:
    print(f"\n" + "="*60)
    print(f"🔄 RUNNING CCS WITH '{norm_method.upper()}' NORMALIZATION")
    print("="*60)

    steered_ccs = train_ccs_on_hidden_states(
        X_pos_st_numpy,
        X_neg_st_numpy,
        labels_series,
        train_idx,
        test_idx,
        normalizing=norm_method
    )

    # CHANGED: Save each method's results with descriptive name
    method_save_path = f'ccs_steered_{YOUR_NAME}_{norm_method}_results.pkl'
    with open(method_save_path, 'wb') as f:
        pickle.dump(steered_ccs, f)

    steered_results[norm_method] = steered_ccs
    print(f"💾 Saved to: {method_save_path}")

    # CHANGED: Show COMPLETE analysis for this normalization method
    if steered_ccs and len(steered_ccs) > 0:
        print(f"\n📊 DETAILED RESULTS FOR '{norm_method.upper()}' METHOD:")
        print(f"{'Layer':<6} {'Accuracy':<10} {'Silhouette':<12} {'Agreement':<12} {'Notes':<20}")
        print("-" * 70)

        # One fixed ordering is used for both the table and the arg-max below,
        # so the reported best layer always matches its accuracy.
        layer_order = sorted(steered_ccs.keys())
        last_layer = layer_order[-1]

        accuracies = []
        silhouettes = []

        for layer_idx in layer_order:
            layer_results = steered_ccs[layer_idx]

            acc = layer_results.get('accuracy', 0)
            sil = layer_results.get('silhouette', 0)
            agr = layer_results.get('agreement', [0])
            # 'agreement' may be a per-sample sequence or a scalar.
            agr_mean = np.mean(agr) if hasattr(agr, '__len__') else agr

            # CHANGED: Add notes about what each layer represents
            notes = ""
            if layer_idx == LAYER_IDX:
                notes = "🎯 STEERED LAYER"
            elif layer_idx == 0:
                notes = "📍 First layer"
            elif layer_idx == last_layer:
                notes = "📍 Last layer"
            elif layer_idx in [6, 12, 18, 24]:
                notes = "📍 Key layer"

            print(f"{layer_idx:<6} {acc:<10.3f} {sil:<12.3f} {agr_mean:<12.3f} {notes:<20}")

            accuracies.append(acc)
            silhouettes.append(sil)

        # CHANGED: Summary statistics for this method
        avg_acc = np.mean(accuracies)
        max_acc = np.max(accuracies)
        # Index the same sorted list the accuracies were collected from.
        best_layer = layer_order[int(np.argmax(accuracies))]

        # Missing steered layer -> 0, matching the cross-method comparison,
        # instead of raising KeyError in the middle of the report.
        steered_acc = steered_ccs.get(LAYER_IDX, {}).get('accuracy', 0)

        print(f"\n📈 SUMMARY FOR '{norm_method.upper()}':")
        print(f"   • Total layers analyzed: {len(steered_ccs)}")
        print(f"   • Average accuracy: {avg_acc:.3f}")
        print(f"   • Best accuracy: {max_acc:.3f} (Layer {best_layer})")
        print(f"   • Target layer {LAYER_IDX} accuracy: {steered_acc:.3f}")

        # CHANGED: Compare steered layer vs others
        other_accs = [steered_ccs[i].get('accuracy', 0) for i in layer_order if i != LAYER_IDX]
        avg_other_acc = np.mean(other_accs) if other_accs else 0

        print(f"   • Steered layer vs others: {steered_acc:.3f} vs {avg_other_acc:.3f}")
        if steered_acc > avg_other_acc:
            print(f"   ✅ Steering improved performance!")
        else:
            print(f"   ⚠️  Steering may have reduced performance")
    else:
        print(f"❌ '{norm_method.upper()}': Failed to produce results")

# CHANGED: Comprehensive comparison across all methods
print(f"\n" + "="*80)
print("🏆 CROSS-METHOD COMPARISON")
print("="*80)

# normalization name -> summary statistics; only methods that produced at
# least one layer of results get an entry.
all_method_stats = {}

for norm_method, results in steered_results.items():
    # Truthiness, not `is not None`: an empty result must count as a failure
    # (np.max on an empty list raises), as in the per-method report.
    if results:
        # Sorted layer order so ties resolve to the lowest layer index.
        layer_order = sorted(results.keys())
        accuracies = [results[layer].get('accuracy', 0) for layer in layer_order]
        avg_acc = np.mean(accuracies)
        max_acc = np.max(accuracies)
        best_layer = layer_order[int(np.argmax(accuracies))]
        steered_layer_acc = results[LAYER_IDX].get('accuracy', 0) if LAYER_IDX in results else 0

        all_method_stats[norm_method] = {
            'avg_accuracy': avg_acc,
            'max_accuracy': max_acc,
            'best_layer': best_layer,
            'steered_layer_accuracy': steered_layer_acc,
            'num_layers': len(results)
        }

        print(f"\n🔹 {norm_method.upper()}:")
        print(f"   • Average accuracy: {avg_acc:.3f}")
        print(f"   • Best accuracy: {max_acc:.3f} (Layer {best_layer})")
        print(f"   • Steered layer {LAYER_IDX}: {steered_layer_acc:.3f}")
    else:
        print(f"\n❌ {norm_method.upper()}: Failed")

# CHANGED: Identify best performing normalization method
print(f"\n🏆 BEST METHOD SELECTION:")
best_method, best_avg_acc, best_steered_acc = None, 0, 0

for norm_method, stats in all_method_stats.items():
    avg_acc = stats['avg_accuracy']
    steered_acc = stats['steered_layer_accuracy']

    print(f"   • {norm_method}: Avg={avg_acc:.3f}, Steered={steered_acc:.3f}")

    # CHANGED: Choose based on steered layer performance (more relevant).
    # A method only takes the lead with a strictly higher steered-layer
    # accuracy, so the first of several tied methods is kept.
    if not steered_acc > best_steered_acc:
        continue
    best_method = norm_method
    best_steered_acc = steered_acc
    best_avg_acc = avg_acc

if not best_method:
    print(f"❌ No successful CCS analysis")
    steered_ccs_best = None
else:
    print(f"\n🥇 WINNER: {best_method} method")
    print(f"   • Best steered layer accuracy: {best_steered_acc:.3f}")
    print(f"   • Average across all layers: {best_avg_acc:.3f}")

    # CHANGED: Store best results for further analysis
    steered_ccs_best = steered_results[best_method]

    # CHANGED: SAVE THE BEST RESULTS
    best_results_path = f'ccs_steered_{YOUR_NAME}_best_results.pkl'
    with open(best_results_path, 'wb') as f:
        pickle.dump(steered_ccs_best, f)
    print(f"💾 Best results saved to: {best_results_path}")

    # CHANGED: Also save a summary report (winner, its scores, every method's
    # statistics and the winning per-layer results in one pickle)
    summary_report = dict(
        best_method=best_method,
        best_steered_accuracy=best_steered_acc,
        best_average_accuracy=best_avg_acc,
        steered_layer=LAYER_IDX,
        all_method_stats=all_method_stats,
        best_results=steered_ccs_best,
    )

    report_path = f'steered_analysis_summary_{YOUR_NAME}.pkl'
    with open(report_path, 'wb') as f:
        pickle.dump(summary_report, f)
    print(f"📋 Summary report saved to: {report_path}")

# Closing banner and a recap of what this cell produced.
print("\n" + "="*80)
print("✅ CELL 4 COMPLETED!")
print("="*80)

created_lines = [
    f"📋 WHAT WAS CREATED:",
    f"   • steered_results: All normalization methods, all layers",
    f"   • steered_ccs_best: Best method results (all layers)",
]
if best_method:
    # File names are only listed when something was actually saved.
    created_lines += [
        f"   • Best method: {best_method}",
        f"   • Files saved:",
        f"     - Individual methods: ccs_steered_{YOUR_NAME}_[method]_results.pkl",
        f"     - Best results: ccs_steered_{YOUR_NAME}_best_results.pkl",
        f"     - Summary report: steered_analysis_summary_{YOUR_NAME}.pkl",
    ]
print("\n".join(created_lines))

print("\n".join([
    f"\n💡 UNDERSTANDING THE RESULTS:",
    f"   • Layer 0-28: Each layer of Gemma model gets its own CCS probe",
    f"   • Layer {LAYER_IDX}: Contains your actual STEERED embeddings",
    f"   • Other layers: Copies + noise (for CCS function compatibility)",
    f"   • Best accuracy shows which layer benefits most from steering",
    f"   • You can now compare original vs steered performance!",
    "="*80,
]))

In [None]:
# CELL 5: Compare Original vs Steered Results
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from pathlib import Path

# CHANGED: Load original CCS results if not already in globals
# NOTE: pickle.load executes arbitrary code from the file; only load result
# files produced by this project.
if not ('ccs_results' in globals() or 'original_ccs_results' in globals()):
    print("🔍 Loading original CCS results...")

    # Preferred source: the multi-layer results file
    multi_layer_path = f'ccs_{YOUR_NAME}_mixed_data_results.pkl'
    if not Path(multi_layer_path).exists():
        # Fallback: the single-layer file for LAYER_IDX
        single_layer_path = f'ccs_{YOUR_NAME}_l_{LAYER_IDX}_mixed_data_results.pkl'
        if not Path(single_layer_path).exists():
            print(f"❌ No original CCS files found")
            ccs_results = original_ccs_results = None
        else:
            print(f"📁 Loading: {single_layer_path}")
            with open(single_layer_path, 'rb') as f:
                ccs_single = pickle.load(f)
            # Wrap in the per-layer dict format used by the multi-layer file.
            # The accuracy / silhouette values are 0.0 placeholders, not
            # measured scores.
            ccs_results = {
                LAYER_IDX: {'accuracy': 0.0, 'silhouette': 0.0, 'ccs_object': ccs_single}
            }
            original_ccs_results = ccs_results
            print(f"✅ Loaded single-layer CCS for layer {LAYER_IDX}")
    else:
        print(f"📁 Loading: {multi_layer_path}")
        with open(multi_layer_path, 'rb') as f:
            ccs_results = pickle.load(f)
        original_ccs_results = ccs_results
        print(f"✅ Loaded multi-layer CCS with {len(ccs_results)} layers")

# CHANGED: Load steered CCS results if not already in globals
if 'steered_ccs_best' not in globals():
    print("🔍 Loading steered CCS results...")

    steered_path = f'ccs_steered_{YOUR_NAME}_best_results.pkl'
    if not Path(steered_path).exists():
        print(f"❌ No steered CCS file found: {steered_path}")
        steered_ccs_best = None
    else:
        print(f"📁 Loading: {steered_path}")
        with open(steered_path, 'rb') as f:
            steered_ccs_best = pickle.load(f)
        print(f"✅ Loaded steered CCS with {len(steered_ccs_best)} layers")

# Check data availability
# A name can exist in globals() and still be None: the loaders above and
# Cell 4 assign None when nothing could be loaded or trained. Availability is
# therefore decided on the value, not on the mere presence of the name.
original_results = globals().get('ccs_results')
if original_results is None:
    original_results = globals().get('original_ccs_results')

has_original = original_results is not None
has_steered = globals().get('steered_ccs_best') is not None

print(f"\n🔍 Data availability:")
print(f"   • Original CCS: {'✅' if has_original else '❌'}")
print(f"   • Steered CCS: {'✅' if has_steered else '❌'}")

if not has_steered:
    print("❌ Run Cell 4 first!")
elif not has_original:
    # Show steered results only
    print("📊 Steered Results Summary:")
    steered_df_data = []

    for layer_idx, results in steered_ccs_best.items():
        steered_df_data.append({
            'Layer': layer_idx,
            'Accuracy': results.get('accuracy', 0),
            'Silhouette': results.get('silhouette', 0),
            'Agreement': np.mean(results.get('agreement', [0])),
            'Contradiction_Idx': np.mean(results.get('contradiction idx', [0])),
            'IM_Distance': np.mean(results.get('IM dist', [0]))
        })

    steered_df = pd.DataFrame(steered_df_data)
    print(steered_df.round(4))
    print("✅ Comparison completed")

else:
    # Full comparison
    print("🔍 Comparing original vs steered results...")

    # Only layers present in both result sets can be compared.
    common_layers = set(original_results.keys()) & set(steered_ccs_best.keys())
    print(f"   • Common layers: {sorted(common_layers)}")

    if not common_layers:
        # An empty frame has no columns, so the report below would fail with
        # a KeyError; say what is wrong instead.
        print("❌ Original and steered results share no layers; nothing to compare")
    else:
        # Create comparison dataframe
        comparison_data = []
        for layer_idx in sorted(common_layers):
            orig = original_results[layer_idx]
            steer = steered_ccs_best[layer_idx]

            comparison_data.append({
                'Layer': layer_idx,
                'Original_Acc': orig.get('accuracy', 0),
                'Steered_Acc': steer.get('accuracy', 0),
                'Acc_Change': steer.get('accuracy', 0) - orig.get('accuracy', 0),
                'Original_Sil': orig.get('silhouette', 0),
                'Steered_Sil': steer.get('silhouette', 0),
                'Sil_Change': steer.get('silhouette', 0) - orig.get('silhouette', 0)
            })

        comparison_df = pd.DataFrame(comparison_data)

        # Display comparison
        print("\n📊 Comparison Results:")
        display_df = comparison_df[['Layer', 'Original_Acc', 'Steered_Acc', 'Acc_Change',
                                    'Original_Sil', 'Steered_Sil', 'Sil_Change']].round(4)
        print(display_df.to_string(index=False))

        # Summary statistics
        print(f"\n📈 Summary:")
        print(f"   • Average accuracy change: {comparison_df['Acc_Change'].mean():.4f}")
        print(f"   • Layers improved: {(comparison_df['Acc_Change'] > 0).sum()}/{len(comparison_df)}")
        print(f"   • Best improvement: {comparison_df['Acc_Change'].max():.4f}")

        # Visualization: one row per metric; left panel shows both curves,
        # right panel shows the signed change (green = improved, red = not).
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Original vs Steered CCS Results', fontsize=16)

        metric_panels = [
            ('Acc', 'Accuracy', 'Accuracy by Layer',
             'Accuracy Change', 'Accuracy Change (Steered - Original)'),
            ('Sil', 'Silhouette Score', 'Silhouette Score by Layer',
             'Silhouette Change', 'Silhouette Change (Steered - Original)'),
        ]
        for row, (suffix, value_label, value_title, change_label, change_title) in enumerate(metric_panels):
            ax_values, ax_change = axes[row, 0], axes[row, 1]

            ax_values.plot(comparison_df['Layer'], comparison_df[f'Original_{suffix}'], 'o-', label='Original')
            ax_values.plot(comparison_df['Layer'], comparison_df[f'Steered_{suffix}'], 'o-', label='Steered')
            ax_values.set_xlabel('Layer')
            ax_values.set_ylabel(value_label)
            ax_values.set_title(value_title)
            ax_values.legend()
            ax_values.grid(True, alpha=0.3)

            colors = ['green' if x > 0 else 'red' for x in comparison_df[f'{suffix}_Change']]
            ax_change.bar(comparison_df['Layer'], comparison_df[f'{suffix}_Change'], color=colors, alpha=0.7)
            ax_change.axhline(y=0, color='black', linestyle='-', alpha=0.5)
            ax_change.set_xlabel('Layer')
            ax_change.set_ylabel(change_label)
            ax_change.set_title(change_title)
            ax_change.grid(True, alpha=0.3)

        plt.tight_layout()

        # A single plt.show() renders the figure. The former ion()/pause()/
        # fig.show()/ioff() sequence only added a non-GUI-backend warning and
        # reset the global interactive mode as a side effect.
        plt.show()

        comparison_results = comparison_df
        print("✅ Comparison completed")

In [None]:
# CELL 6: Detailed Steering Effect Analysis
# CHANGED: Deep dive into how steering affected the model representations

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

# Section banner (rule, title, rule).
print("="*80, "🔬 CELL 6: STEERING EFFECT ANALYSIS", "="*80, sep="\n")

# CHANGED: Check data availability (informational only; later sections
# re-check before using steered_ccs_best)
if 'steered_ccs_best' in globals():
    print(f"✅ Steered results available")
else:
    print("❌ Run Cell 4 first to get steered results!")

# CHANGED: Analyze prediction changes
print(f"\n🔍 ANALYZING PREDICTION CHANGES...")

if not ('classes_st' in globals() and 'probas_st' in globals()):
    print(f"⚠️  No prediction data available from earlier analysis")
else:
    print(f"   • Steered predictions available from earlier analysis")

    # CHANGED: Summary statistics of the probe confidences
    print(f"\n📊 CONFIDENCE ANALYSIS:")
    print(f"   • Mean confidence: {probas_st.mean():.3f}")
    print(f"   • Confidence std: {probas_st.std():.3f}")
    print(f"   • Confidence range: [{probas_st.min():.3f}, {probas_st.max():.3f}]")

    # CHANGED: Confidence distribution (left) and per-sample confidence
    # coloured by the true label (right)
    fig, (ax_hist, ax_samples) = plt.subplots(1, 2, figsize=(12, 4))

    ax_hist.hist(probas_st, bins=50, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('CCS Confidence')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Steered Data: CCS Confidence Distribution')
    ax_hist.grid(True, alpha=0.3)

    sample_points = ax_samples.scatter(range(len(probas_st)), probas_st,
                                       alpha=0.6, s=10, c=labels_numpy, cmap='viridis')
    ax_samples.set_xlabel('Sample Index')
    ax_samples.set_ylabel('CCS Confidence')
    ax_samples.set_title('Steered Data: Confidence by Sample')
    fig.colorbar(sample_points, ax=ax_samples, label='True Label')
    ax_samples.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# CHANGED: Analyze representation changes if original data available
if 'X_pos_normalized' in globals() and 'X_neg_normalized' in globals():
    print(f"\n🔄 ANALYZING REPRESENTATION CHANGES...")

    # CHANGED: Extract specific layer for analysis (use layer 18 or middle layer)
    target_layer = 18 if X_pos_normalized.shape[1] > 18 else X_pos_normalized.shape[1] // 2
    print(f"   • Analyzing layer {target_layer}")

    # CHANGED: Get original and steered representations
    X_orig_pos = X_pos_normalized[:, target_layer, :]
    X_orig_neg = X_neg_normalized[:, target_layer, :]
    # numpy / sklearn cannot consume CUDA or grad-tracking tensors, so torch
    # inputs are moved to host memory as plain arrays first.
    if hasattr(X_orig_pos, 'detach'):
        X_orig_pos = X_orig_pos.detach().cpu().numpy()
    if hasattr(X_orig_neg, 'detach'):
        X_orig_neg = X_orig_neg.detach().cpu().numpy()
    X_orig_diff = X_orig_pos - X_orig_neg

    # FIX: X_steer_pos was used below but never assigned (NameError). It is
    # taken from the positive steered array exactly like its negative twin.
    # NOTE(review): index 0 assumes the steered arrays hold a single layer, as
    # the original comment stated — confirm against the steering pipeline.
    X_steer_pos = X_pos_st_numpy[:, 0, :]
    X_steer_neg = X_neg_st_numpy[:, 0, :]  # Steered data has single layer
    X_steer_diff = X_steer_pos - X_steer_neg

    print(f"   • Original diff shape: {X_orig_diff.shape}")
    print(f"   • Steered diff shape: {X_steer_diff.shape}")

    # CHANGED: Compute representation similarity
    print(f"\n📏 REPRESENTATION SIMILARITY ANALYSIS:")

    # Cosine similarity between original and steered representations; the
    # diagonal of the pairwise matrix pairs sample i with sample i.
    cos_sim_pos = np.diag(cosine_similarity(X_orig_pos, X_steer_pos))
    cos_sim_neg = np.diag(cosine_similarity(X_orig_neg, X_steer_neg))
    cos_sim_diff = np.diag(cosine_similarity(X_orig_diff, X_steer_diff))

    print(f"   • Positive representations similarity: {cos_sim_pos.mean():.3f} ± {cos_sim_pos.std():.3f}")
    print(f"   • Negative representations similarity: {cos_sim_neg.mean():.3f} ± {cos_sim_neg.std():.3f}")
    print(f"   • Difference representations similarity: {cos_sim_diff.mean():.3f} ± {cos_sim_diff.std():.3f}")

    # CHANGED: Analyze representation magnitude changes
    orig_norms = np.linalg.norm(X_orig_diff, axis=1)
    steer_norms = np.linalg.norm(X_steer_diff, axis=1)
    norm_change = steer_norms - orig_norms

    print(f"\n📐 REPRESENTATION MAGNITUDE ANALYSIS:")
    print(f"   • Original avg norm: {orig_norms.mean():.3f}")
    print(f"   • Steered avg norm: {steer_norms.mean():.3f}")
    print(f"   • Average norm change: {norm_change.mean():.3f}")
    print(f"   • Norm change std: {norm_change.std():.3f}")

    # CHANGED: Visualization of representation changes (2 x 3 panel grid)
    plt.figure(figsize=(15, 10))

    # Similarity distributions
    plt.subplot(2, 3, 1)
    plt.hist(cos_sim_pos, bins=30, alpha=0.7, label='Positive', color='green')
    plt.hist(cos_sim_neg, bins=30, alpha=0.7, label='Negative', color='red')
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Frequency')
    plt.title('Original vs Steered Similarity')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Norm changes
    plt.subplot(2, 3, 2)
    plt.hist(norm_change, bins=50, alpha=0.7, edgecolor='black')
    plt.axvline(0, color='red', linestyle='--', alpha=0.7)
    plt.xlabel('Norm Change (Steered - Original)')
    plt.ylabel('Frequency')
    plt.title('Representation Magnitude Changes')
    plt.grid(True, alpha=0.3)

    # Similarity by class
    plt.subplot(2, 3, 3)
    for class_idx in [0, 1]:
        mask = labels_numpy == class_idx
        plt.hist(cos_sim_diff[mask], bins=20, alpha=0.6,
                 label=f'Class {class_idx}', density=True)
    plt.xlabel('Cosine Similarity (Diff Vectors)')
    plt.ylabel('Density')
    plt.title('Similarity by True Class')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Scatter plots; the dashed diagonal marks "norm unchanged by steering"
    plt.subplot(2, 3, 4)
    plt.scatter(orig_norms, steer_norms, alpha=0.6, s=10, c=labels_numpy, cmap='viridis')
    plt.plot([orig_norms.min(), orig_norms.max()],
             [orig_norms.min(), orig_norms.max()], 'r--', alpha=0.7)
    plt.xlabel('Original Norm')
    plt.ylabel('Steered Norm')
    plt.title('Norm Comparison')
    plt.colorbar(label='True Label')
    plt.grid(True, alpha=0.3)

    # Changes by confidence (only when probe confidences are available)
    if 'probas_st' in globals():
        plt.subplot(2, 3, 5)
        plt.scatter(probas_st.flatten(), cos_sim_diff, alpha=0.6, s=10, c=labels_numpy, cmap='viridis')
        plt.xlabel('CCS Confidence')
        plt.ylabel('Representation Similarity')
        plt.title('Similarity vs Confidence')
        plt.colorbar(label='True Label')
        plt.grid(True, alpha=0.3)

        # Correlation analysis
        correlation = np.corrcoef(probas_st.flatten(), cos_sim_diff)[0, 1]
        plt.text(0.05, 0.95, f'Correlation: {correlation:.3f}',
                 transform=plt.gca().transAxes, bbox=dict(boxstyle="round", facecolor='white', alpha=0.8))

    # Changes by sample index
    plt.subplot(2, 3, 6)
    plt.plot(norm_change, alpha=0.7)
    plt.axhline(0, color='red', linestyle='--', alpha=0.7)
    plt.xlabel('Sample Index')
    plt.ylabel('Norm Change')
    plt.title('Norm Changes Across Samples')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

else:
    print(f"⚠️  Original representations not available for comparison")

# CHANGED: Analyze steering direction effects
def collect_probe_params(results_by_layer):
    """Collect CCS probe parameters from every layer entry that stores weights.

    Args:
        results_by_layer: Mapping of layer key -> per-layer results dict.
            Entries without a 'weights' key are skipped.

    Returns:
        Tuple ``(layer_ids, weights, biases, weight_norms)`` of equally long
        lists, in the mapping's iteration order. ``biases`` falls back to 0
        when an entry has no 'bias'. ``weight_norms`` holds
        ``np.linalg.norm`` of each layer's weights as a Python float.
    """
    layer_ids, weights, biases, weight_norms = [], [], [], []
    for layer_idx, results in results_by_layer.items():
        if 'weights' not in results:
            continue
        layer_ids.append(layer_idx)
        weights.append(results['weights'])
        biases.append(results.get('bias', 0))
        weight_norms.append(float(np.linalg.norm(results['weights'])))
    return layer_ids, weights, biases, weight_norms


if 'steered_ccs_best' in globals():
    print("\n🎯 STEERING DIRECTION ANALYSIS:")

    # CHANGED: Analyze CCS weights/directions for steered data
    layer_ids, layer_weights, layer_biases, layer_norms = collect_probe_params(steered_ccs_best)

    for layer_idx, weight_norm, bias in zip(layer_ids, layer_norms, layer_biases):
        print(f"   • Layer {layer_idx}: Weight norm = {weight_norm:.3f}, Bias = {bias:.3f}")

    if layer_weights:
        # CHANGED: Analyze weight patterns
        print("\n📊 WEIGHT PATTERN ANALYSIS:")
        # Reuse the per-layer norms printed above, so the statistics and the
        # plot always agree with them, whatever the shape of each weight array.
        weight_norms = np.array(layer_norms)
        all_biases = np.array(layer_biases)

        print(f"   • Weight norms - Mean: {weight_norms.mean():.3f}")
        print(f"   • Weight norms - Std: {weight_norms.std():.3f}")
        print(f"   • Biases - Mean: {all_biases.mean():.3f}")
        print(f"   • Biases - Std: {all_biases.std():.3f}")

        # CHANGED: Weight visualization
        plt.figure(figsize=(12, 4))

        # Plot against the real layer keys: layers without weights are skipped,
        # so list positions do not necessarily match layer numbers.
        plt.subplot(1, 3, 1)
        plt.plot(layer_ids, weight_norms, 'o-')
        plt.xlabel('Layer')
        plt.ylabel('Weight Norm')
        plt.title('CCS Weight Norms by Layer')
        plt.grid(True, alpha=0.3)

        plt.subplot(1, 3, 2)
        plt.plot(layer_ids, all_biases, 'o-', color='orange')
        plt.xlabel('Layer')
        plt.ylabel('Bias')
        plt.title('CCS Biases by Layer')
        plt.grid(True, alpha=0.3)

        # Weight distribution of the first layer that has weights; the title
        # names that layer instead of assuming it is layer 0.
        plt.subplot(1, 3, 3)
        plt.hist(np.ravel(layer_weights[0]), bins=50, alpha=0.7, edgecolor='black')
        plt.xlabel('Weight Value')
        plt.ylabel('Frequency')
        plt.title(f'Weight Distribution (Layer {layer_ids[0]})')
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# CHANGED: Summary of steering effects
def summarize_layer_change(df, column, largest=True):
    """Describe the row of ``df`` with the largest (or smallest) ``column`` value.

    Args:
        df: DataFrame with a 'Layer' column and the metric column ``column``.
        column: Name of the change column to rank by (e.g. 'Acc_Change').
        largest: Pick the maximum when True, the minimum when False.

    Returns:
        ``"Layer <layer> (<signed change>)"``. The change is formatted with an
        explicit sign, so a negative "best" value is not rendered as ``+-0.01``.
        The layer is read with ``.at`` so its own dtype is kept, rather than
        the upcast that selecting a whole row can produce.
    """
    values = df[column]
    row_label = values.idxmax() if largest else values.idxmin()
    return f"Layer {df.at[row_label, 'Layer']} ({df.at[row_label, column]:+.4f})"


print("\n🎯 STEERING EFFECTS SUMMARY:")
print("="*50)

# An empty comparison table has no extreme rows; skip the section instead of failing.
if 'comparison_df' in globals() and not comparison_df.empty:
    print("🏆 BEST IMPROVEMENTS:")
    print(f"   • Accuracy: {summarize_layer_change(comparison_df, 'Acc_Change')}")

    if 'Sil_Change' in comparison_df.columns:
        print(f"   • Silhouette: {summarize_layer_change(comparison_df, 'Sil_Change')}")

    print("\n📉 LARGEST DECLINES:")
    print(f"   • Accuracy: {summarize_layer_change(comparison_df, 'Acc_Change', largest=False)}")

    overall_improvement = comparison_df['Acc_Change'].mean()
    if overall_improvement > 0:
        print(f"\n✅ OVERALL: Positive steering effect (+{overall_improvement:.4f} avg accuracy)")
    else:
        print(f"\n⚠️  OVERALL: Negative steering effect ({overall_improvement:.4f} avg accuracy)")

# Looked up in globals() like the other optional inputs of this cell.
has_similarity = 'cos_sim_diff' in globals()

if has_similarity:
    print("\n🔄 REPRESENTATION CHANGES:")
    print(f"   • Average similarity to original: {cos_sim_diff.mean():.3f}")
    print(f"   • High similarity samples (>0.9): {(cos_sim_diff > 0.9).sum()}/{len(cos_sim_diff)}")
    print(f"   • Low similarity samples (<0.5): {(cos_sim_diff < 0.5).sum()}/{len(cos_sim_diff)}")

print("\n✅ CELL 6 COMPLETED!")
print("📋 Analysis performed:")
print("   • Prediction confidence analysis")
if has_similarity:
    print("   • Representation similarity analysis")
    print("   • Magnitude change analysis")
print("   • Steering direction analysis")
print("   • Overall steering effect summary")
print("="*80)

## **7. Plot results tables.**

In [None]:
from format_results import get_results_table
# Load the saved CCS results for the original and the steered run.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load result files that were produced by this project.
# Context managers close both file handles as soon as loading is done.
with open(f'ccs_{YOUR_NAME}_mixed_data_results.pkl', 'rb') as results_file:
    orig_ccs = pickle.load(results_file)
with open(f'ccs_steer_{YOUR_NAME}_mixed_data_results.pkl', 'rb') as results_file:
    steered_ccs = pickle.load(results_file)

# Convert both result objects into tables used by the plots below.
orig_ccs_data = get_results_table(orig_ccs)
st_ccs_data = get_results_table(steered_ccs)

In [None]:
# Accuracy of the original vs. steered CCS results, plotted against the
# table index (labelled as the layer number).
fig, ax = plt.subplots(figsize=(18, 5))

ax.plot(orig_ccs_data['accuracy'], label='Orig_CCS')
ax.plot(st_ccs_data['accuracy'], label='St_CCS')

# axhline spans the full x-axis, so the reference line no longer assumes
# exactly eight layers (the previous hard-coded 0..7 extent).
ax.axhline(1, label='ideal', color='red', linewidth=3, linestyle='--')

ax.set_xlabel('Layer_number')
ax.set_ylabel('accuracy score ')
ax.legend(loc='upper right');

In [None]:
from format_results import get_results_table
# Rebuild the result tables so this cell can run on its own.
orig_ccs_data = get_results_table(orig_ccs)
st_ccs_data = get_results_table(steered_ccs)

# Contradiction index of the original vs. steered CCS results (lower is
# better, per the column name), plotted against the table index.
fig, ax = plt.subplots(figsize=(18, 5))

ax.plot(orig_ccs_data['contradiction idx ↓'], label='Orig_CCS')
ax.plot(st_ccs_data['contradiction idx ↓'], label='St_CCS')

# axhline spans the full x-axis, so the reference line no longer assumes
# exactly eight layers (the previous hard-coded 0..7 extent).
ax.axhline(0, label='ideal', color='red', linewidth=3, linestyle='--')

ax.set_xlabel('Layer_number')
ax.set_ylabel('Contradiction idx score ')
ax.legend(loc='upper right');

In [None]:
# Agreement score of the original vs. steered CCS results (lower is better,
# per the column name), plotted against the table index.
fig, ax = plt.subplots(figsize=(18, 5))

ax.plot(orig_ccs_data['agreement_score ↓'], label='Orig_CCS')
ax.plot(st_ccs_data['agreement_score ↓'], label='St_CCS')

# axhline spans the full x-axis, so the reference line no longer assumes
# exactly eight layers (the previous hard-coded 0..7 extent).
ax.axhline(0, label='ideal', color='red', linewidth=3, linestyle='--')

ax.set_xlabel('Layer_number')
ax.set_ylabel('Agreement score ')
ax.legend(loc='upper right');