In [1]:
%%bash
cat > steer_llm.py << 'EOF'
#!/usr/bin/env python3
"""
Enhanced Script for Steering LLM Behavior (e.g., Epistemic Stance, Stylistic Features)
using Activation Vector Manipulation in models like LLaMA-3-8B-Instruct.

Targeted for Research Publication - Incorporates learnings from iterative experiments.

Features:
- CLI with subcommands: compute-vector, steer, evaluate-perplexity, evaluate-style
- Flexible text pair input (JSON) for concept vector computation.
- Concept vector derived using a Linear Probe (Logistic Regression) on MLP activations.
- Options for token aggregation strategy (mean_all_tokens, last_token) for probe training.
- Optional normalization for the computed concept vector.
- Support for multi-layer MLP steering with specified strengths.
- Optional Activation Clipping during steering for fluency control.
- Perplexity evaluation (uses external model like GPT-2 for less bias).
- Style evaluation (Flesch Reading Ease, Flesch-Kincaid Grade, Avg Sentence Length) using textstat.
- Save/load concept vectors (dictionary mapping layer_index to vector).
- Comprehensive logging and CLI arguments for reproducibility.
"""
import os
import sys
import argparse
import json
import logging
import torch
import numpy as np
from torch.nn.functional import normalize as torch_normalize
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    set_seed
)
from datasets import load_dataset # Keep if used for other purposes, not directly in current script logic
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from huggingface_hub import login

# Attempt to import textstat, provide message if not found
try:
    import textstat
except ImportError:
    print("textstat library not found. Please install it using 'pip install textstat' to use the evaluate-style command.")
    textstat = None


# ----- Logging Configuration -----\n",
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(module)s:%(lineno)d %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
logger = logging.getLogger(__name__)

# Optional HF authentication
HF_TOKEN = os.environ.get('HF_TOKEN')
if HF_TOKEN:
    login(token=HF_TOKEN)

# Strip Jupyter args
orig_args = sys.argv[:]
clean_args = [orig_args[0]]
skip_next = False
for a in orig_args[1:]:
    if skip_next:
        skip_next = False
        continue
    if a == '-f':
        skip_next = True
        continue
    b = os.path.basename(a)
    if b.startswith('kernel-') and b.endswith('.json'):
        continue
    clean_args.append(a)
sys.argv = clean_args

# ----- Utility Functions -----
def load_model_and_tokenizer(model_name: str, device: str = 'cuda', dtype_str: str = 'bfloat16'):
    logger.info(f"Loading model {model_name} on {device} (dtype={dtype_str})")
    if dtype_str == 'bfloat16':
        dtype = torch.bfloat16
    elif dtype_str == 'float16':
        dtype = torch.float16
    else:
        dtype = torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=HF_TOKEN)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or tokenizer.pad_token_id
    tokenizer.padding_side = 'left'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map='auto',
        use_auth_token=HF_TOKEN
    )
    model.eval()
    return model, tokenizer

class ActivationExtractor:
    def __init__(self, model, mlp_layer_indices, extraction_type='mlp'):
        self.model = model
        self.layer_indices = mlp_layer_indices
        self.extraction_type = extraction_type
        self.handles = []
        self.activations = {idx: [] for idx in mlp_layer_indices}

    def _make_hook(self, layer_target_idx): # Renamed to avoid conflict
        def hook(module, inp, outp):
            activation_to_store = outp[0] if isinstance(outp, tuple) and self.extraction_type == 'full_layer' else outp
            self.activations[layer_target_idx].append(activation_to_store.detach().cpu())
        return hook

    def register(self):
        self.unregister() # Clear previous hooks
        self.activations = {idx: [] for idx in self.layer_indices} # Reset activations
        for idx_spec in self.layer_indices: # Use idx_spec to iterate
            actual_idx = idx_spec if idx_spec >= 0 else len(self.model.model.layers) + idx_spec
            if self.extraction_type == 'mlp':
                module = self.model.model.layers[actual_idx].mlp
            elif self.extraction_type == 'full_layer':
                module = self.model.model.layers[actual_idx]
            else:
                raise ValueError(f"Unknown extraction_type: {self.extraction_type}")

            handle = module.register_forward_hook(self._make_hook(actual_idx)) # Pass actual_idx to hook
            self.handles.append(handle)
        logger.info(f"Registered hooks at {'MLP' if self.extraction_type == 'mlp' else 'Full Decoder'} layers: {self.layer_indices} (actual: {[idx if idx >=0 else len(self.model.model.layers)+idx for idx in self.layer_indices]})")


    def unregister(self):
        for h in self.handles:
            h.remove()
        self.handles = []

# ----- Core Functionality -----
def compute_concept_vector_with_probe(
    model, tokenizer,
    positive_texts, negative_texts,
    mlp_layer_indices, batch_size: int,
    aggregation_strategy: str, normalize_vector: bool,
    seed: int, device: str, max_length: int,
    extraction_type: str = 'mlp' # Added extraction_type
):
    set_seed(seed)
    # Ensure layer_indices are actual, non-negative indices for the extractor
    actual_layer_indices = [idx if idx >= 0 else len(model.model.layers) + idx for idx in mlp_layer_indices]
    extractor = ActivationExtractor(model, actual_layer_indices, extraction_type=extraction_type)


    def collect_activations(texts, label): # label is for logging/debugging
        # Initialize activations for each call to collect_activations
        for idx in actual_layer_indices:
            extractor.activations[idx] = []

        all_acts_for_texts = {idx: [] for idx in actual_layer_indices}
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)

            extractor.register() # Register fresh hooks for this batch pass
            with torch.no_grad():
                model(**inputs)
            extractor.unregister() # Unregister after the forward pass for this batch

            for idx_loop in actual_layer_indices: # Use a different loop variable
                if not extractor.activations[idx_loop]:
                    logger.warning(f"No activations collected for layer {idx_loop} in batch {i//batch_size} for {label} texts. Skipping this layer for this batch.")
                    continue

                feats_batch_layer = torch.cat(extractor.activations[idx_loop], dim=0)
                # Reset for next batch pass *within* collect_activations for this layer
                extractor.activations[idx_loop] = []


                if aggregation_strategy == 'mean_all_tokens':
                    # Use attention mask to average only non-padded tokens
                    mask = inputs.attention_mask.unsqueeze(-1).expand_as(feats_batch_layer).float().cpu()
                    masked_activations = feats_batch_layer * mask
                    summed_activations = masked_activations.sum(dim=1)
                    num_tokens = mask.sum(dim=1)
                    num_tokens = torch.clamp(num_tokens, min=1e-9)
                    aggregated_feats = summed_activations / num_tokens
                elif aggregation_strategy == 'last_token':
                    lengths = inputs.attention_mask.sum(dim=1)
                    aggregated_feats = torch.stack([feats_batch_layer[k, lengths[k]-1] for k in range(feats_batch_layer.size(0))])
                else:
                    raise ValueError(f"Unknown aggregation strategy: {aggregation_strategy}")

                all_acts_for_texts[idx_loop].append(aggregated_feats)
        # Concatenate features from all batches for each layer
        final_acts = {idx: torch.cat(all_acts_for_texts[idx], dim=0) if all_acts_for_texts[idx] else torch.empty(0) for idx in actual_layer_indices}
        return final_acts


    logger.info(f"Collecting POSITIVE activations ({len(positive_texts)} texts)...")
    pos_acts = collect_activations(positive_texts, 'pos')
    logger.info(f"Collecting NEGATIVE activations ({len(negative_texts)} texts)...")
    neg_acts = collect_activations(negative_texts, 'neg')

    concept_vectors = {}
    for idx in actual_layer_indices:
        if not pos_acts[idx].numel() or not neg_acts[idx].numel():
             logger.warning(f"Skipping layer {idx} due to missing positive or negative activations.")
             continue
        X_pos = pos_acts[idx].numpy()
        X_neg = neg_acts[idx].numpy()

        if X_pos.shape[0] == 0 or X_neg.shape[0] == 0:
            logger.warning(f"Skipping layer {idx} due to empty positive ({X_pos.shape[0]}) or negative ({X_neg.shape[0]}) activation arrays after processing.")
            continue

        y = np.array([1]*X_pos.shape[0] + [0]*X_neg.shape[0])
        X = np.vstack([X_pos, X_neg])

        if X.shape[0] < 2 : # Need at least 2 samples for scaler and probe
            logger.warning(f"Skipping layer {idx} due to insufficient samples ({X.shape[0]}) for probe training.")
            continue

        scaler = StandardScaler(); X_scaled = scaler.fit_transform(X)
        probe = LogisticRegression(solver='liblinear', C=1.0, class_weight='balanced', random_state=seed)
        probe.fit(X_scaled, y)
        coef = probe.coef_[0]
        vec = torch_normalize(torch.tensor(coef, dtype=torch.float32).unsqueeze(0), dim=1).squeeze(0) if normalize_vector else torch.tensor(coef, dtype=torch.float32)
        concept_vectors[idx] = vec.to(device) # Store with actual index
        logger.info(f"Computed concept vector for layer {idx}, norm={vec.norm():.3f}, shape={vec.shape}")
    return concept_vectors


def steer_generation_with_mlp_intervention(
    model, tokenizer, prompt: str,
    concept_vectors: dict, strengths: dict, # strengths keys should be string of actual layer_idx
    max_new_tokens: int, temperature: float, do_sample: bool,
    clip_min: float, clip_max: float, device: str, seed: int,
    extraction_type_steering: str = 'mlp' # To determine which module to hook
):
    set_seed(seed)
    handles = []
    active_steering_info = []

    for layer_str_key, strength_val in strengths.items():
        try:
            layer_idx_steer = int(layer_str_key) # This should be the actual, non-negative index
        except ValueError:
            logger.warning(f"Invalid layer index '{layer_str_key}' in strengths_json. Skipping.")
            continue

        # The concept_vectors dict uses actual (non-negative) layer indices as keys
        concept_vec_to_apply = concept_vectors.get(layer_idx_steer)

        if concept_vec_to_apply is None or strength_val == 0:
            logger.info(f"No concept vector found for layer {layer_idx_steer} or strength is 0. Skipping hook.")
            continue

        def hook_fn_closure(cv, strength_hook, use_clip, c_min, c_max):
            def actual_hook(module, inp, outp_main):
                # For full_layer, outp_main is a tuple (hidden_states, ...)
                # For MLP, outp_main is the activation tensor
                if extraction_type_steering == 'full_layer' and isinstance(outp_main, tuple):
                    original_activation = outp_main[0]
                else:
                    original_activation = outp_main

                modified_activation = original_activation.clone()
                cv_device_dtype = cv.to(modified_activation.device, dtype=modified_activation.dtype)

                # Add to all tokens in the last dimension (hidden_size)
                # Assumes activation shape [batch_size, seq_len, hidden_size]
                modified_activation += (strength_hook * cv_device_dtype.unsqueeze(0).unsqueeze(0))

                if use_clip and c_min is not None and c_max is not None:
                    modified_activation = torch.clamp(modified_activation, c_min, c_max)

                if extraction_type_steering == 'full_layer' and isinstance(outp_main, tuple):
                    return (modified_activation,) + outp_main[1:]
                else:
                    return modified_activation
            return actual_hook

        # Determine the module to hook based on extraction_type_steering
        if extraction_type_steering == 'mlp':
            module_to_hook = model.model.layers[layer_idx_steer].mlp
        elif extraction_type_steering == 'full_layer':
            module_to_hook = model.model.layers[layer_idx_steer]
        else:
            logger.warning(f"Unsupported extraction_type_steering '{extraction_type_steering}' for layer {layer_idx_steer}. Skipping hook.")
            continue

        current_clip_active = clip_min is not None and clip_max is not None
        handles.append(module_to_hook.register_forward_hook(
            hook_fn_closure(concept_vec_to_apply, strength_val, current_clip_active, clip_min, clip_max)
        ))
        active_steering_info.append(f"L{layer_idx_steer}({extraction_type_steering}) alpha={strength_val} Clip={'ON' if current_clip_active else 'OFF'}")

    if active_steering_info:
        logger.info(f"Steering active: {'; '.join(active_steering_info)}")
    else:
        logger.info("No steering applied (baseline or no valid strengths/vectors).")


    gen_cfg = GenerationConfig(max_new_tokens=max_new_tokens, do_sample=do_sample, temperature=temperature if do_sample else None, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id)
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True).to(device) # Assuming max_length is handled by truncation implicitly
    with torch.no_grad():
        outputs = model.generate(**inputs, generation_config=gen_cfg)
    for h in handles:
        h.remove()
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def evaluate_perplexity_external(texts, ppl_model_name: str, device: str, batch_size: int, max_length_ppl: int = 1024):
    from transformers import AutoModelForCausalLM as PPLModel, AutoTokenizer as PPLTokenizer # Alias to avoid conflict
    logger.info(f"Loading PPL model: {ppl_model_name}")
    tok = PPLTokenizer.from_pretrained(ppl_model_name)
    mod = PPLModel.from_pretrained(ppl_model_name).to(device)
    mod.eval()
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    total_nll, total_tokens = 0.0, 0
    num_valid_texts = 0
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        # Filter out empty or non-string texts
        current_valid_texts = [t for t in batch_texts if isinstance(t, str) and t.strip()]
        if not current_valid_texts:
            continue
        num_valid_texts += len(current_valid_texts)

        enc = tok(current_valid_texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length_ppl).to(device)
        if enc.input_ids.numel() == 0: continue # Should not happen if current_valid_texts is not empty

        with torch.no_grad():
            out = mod(enc.input_ids, labels=enc.input_ids)
        # NLL is sum of losses, loss is average. So multiply by number of tokens (excluding pad)
        # For causal LM, labels are input_ids shifted, so effectively a seq_len of N has N-1 prediction steps.
        # The .loss attribute from HF models already gives the average neg log likelihood.
        # Number of tokens contributing to loss is what attention_mask sums to (for non-padded tokens in batch)
        # Or more precisely, where labels are not -100 (the ignore_index)

        # A simpler way: HF loss is already averaged. Total NLL = average_loss * number_of_tokens_in_batch_contributing_to_loss
        # For CausalLMs, typically all non-pad tokens contribute if labels=input_ids.
        # num_tokens_in_batch = enc.attention_mask.sum().item() # This is total non-pad tokens

        # The .loss is already the mean loss over the tokens.
        # So, total NLL for the batch is loss * number of tokens that contributed to this loss
        # Number of elements in the batch * (sequence_length - 1) for non-padded sequences
        # Or, more simply, if loss is per-token, then total NLL for batch is out.loss * number of "loss-event" tokens.
        # Let's use input_ids.numel() as an approximation if all tokens are used for loss, or refine if specific token counting is needed.
        # The common practice with HF is that loss is mean of NLLs.

        # Total NLL for the batch = mean_loss_for_batch * number_of_tokens_in_batch
        # The number of tokens is tricky; if labels are input_ids, it's usually all non-pad tokens.
        # If some tokens are masked in labels (e.g., set to -100), they don't contribute.
        # Let's assume input_ids.numel() for simplicity if using labels=input_ids and no manual masking.

        # The loss returned by Hugging Face models when labels are provided is the *average*
        # negative log likelihood over the (non-ignored) tokens.
        # So, to get the sum of NLL for the batch, we multiply by the number of tokens
        # that contributed to the loss.

        # Count non-ignored tokens (labels are input_ids, so all non-pad tokens)
        num_loss_tokens = (enc.input_ids != tok.pad_token_id).sum().item()

        if num_loss_tokens > 0:
            total_nll += out.loss.item() * num_loss_tokens # Sum of NLL for this batch
            total_tokens += num_loss_tokens                  # Total tokens processed for NLL calculation

    if total_tokens == 0 or num_valid_texts == 0 :
        logger.warning("No valid texts or tokens found for PPL calculation.")
        return float('inf')

    avg_nll = total_nll / total_tokens
    ppl = torch.exp(torch.tensor(avg_nll)).item()
    logger.info(f"Perplexity (over {num_valid_texts} valid texts, {total_tokens} tokens): {ppl:.4f}")
    return ppl


def evaluate_style_metrics(texts, output_metrics_file: str):
    if textstat is None:
        logger.error("textstat library is not installed. Cannot evaluate style metrics.")
        metrics = {"error": "textstat not installed"}
        with open(output_metrics_file, 'w') as f: json.dump(metrics, f)
        return metrics

    if not texts:
        logger.warning("No texts provided for style evaluation.")
        metrics = {
            "flesch_reading_ease": float('nan'), "flesch_kincaid_grade": float('nan'),
            "avg_sentence_length": float('nan'), "num_texts": 0
        }
        with open(output_metrics_file, 'w') as f: json.dump(metrics, f)
        return metrics

    f_ease_scores, fk_grade_scores, avg_sent_len_scores = [], [], []
    num_valid_texts = 0

    for text in texts:
        if not isinstance(text, str) or not text.strip():
            continue
        num_valid_texts +=1
        try:
            f_ease_scores.append(textstat.flesch_reading_ease(text))
            fk_grade_scores.append(textstat.flesch_kincaid_grade(text))
            # Calculate average sentence length: total words / total sentences
            # textstat.sentence_count might be 0 for very short texts, handle division by zero.
            sentences = textstat.sentence_count(text)
            words = textstat.lexicon_count(text, removepunct=True)
            if sentences > 0:
                avg_sent_len_scores.append(words / sentences)
            elif words > 0: # If one sentence and it has words
                avg_sent_len_scores.append(words)
            else: # No words, no sentences
                 avg_sent_len_scores.append(0) # Or skip, or handle as NaN later
        except Exception as e:
            logger.warning(f"Could not process text for style metrics: '{text[:50]}...'. Error: {e}")
            # Optionally append NaNs or skip these scores
            f_ease_scores.append(float('nan'))
            fk_grade_scores.append(float('nan'))
            avg_sent_len_scores.append(float('nan'))


    if num_valid_texts == 0:
        logger.warning("No valid texts found for style evaluation after filtering.")
        metrics = {
            "flesch_reading_ease": float('nan'), "flesch_kincaid_grade": float('nan'),
            "avg_sentence_length": float('nan'), "num_texts": 0
        }
    else:
        metrics = {
            "flesch_reading_ease": np.nanmean(f_ease_scores) if f_ease_scores else float('nan'),
            "flesch_kincaid_grade": np.nanmean(fk_grade_scores) if fk_grade_scores else float('nan'),
            "avg_sentence_length": np.nanmean(avg_sent_len_scores) if avg_sent_len_scores else float('nan'),
            "num_texts": num_valid_texts
        }
    logger.info(f"Style Metrics (based on {num_valid_texts} texts): FRE={metrics['flesch_reading_ease']:.2f}, FKG={metrics['flesch_kincaid_grade']:.2f}, ASL={metrics['avg_sentence_length']:.2f}")
    with open(output_metrics_file, 'w') as f:
        json.dump(metrics, f, indent=2)
    return metrics


# ----- CLI -----
def main():
    parser = argparse.ArgumentParser(description="LLM Concept Steering CLI")
    parser.add_argument('--model_name', type=str, default="meta-llama/Llama-3.1-8B-Instruct", help="Name of the Hugging Face model to use.")
    parser.add_argument('--dtype', type=str, default='bfloat16', choices=['bfloat16','float16','float32'], help="Data type for model loading.")
    parser.add_argument('--seed', type=int, default=42, help="Random seed for reproducibility.")
    sub = parser.add_subparsers(dest='cmd', required=True)

    cvp = sub.add_parser('compute-vector', help="Compute a concept vector using a linear probe.")
    cvp.add_argument('--positive_texts_json', required=True, help="JSON file with list of positive example texts.")
    cvp.add_argument('--negative_texts_json', required=True, help="JSON file with list of negative example texts.")
    cvp.add_argument('--mlp_layer_indices', nargs='+', type=int, default=[-1], help="List of MLP layer indices (negative for from end).")
    cvp.add_argument('--extraction_type', choices=['mlp', 'full_layer'], default='mlp', help="Type of activation to extract ('mlp' or 'full_layer').")
    cvp.add_argument('--batch_size', type=int, default=8, help="Batch size for processing texts.")
    cvp.add_argument('--aggregation_strategy', choices=['mean_all_tokens','last_token'], default='mean_all_tokens', help="How to aggregate token activations.")
    cvp.add_argument('--normalize_vector', action='store_true', help="Normalize the computed concept vector.")
    cvp.add_argument('--max_length', type=int, default=192, help="Max sequence length for tokenizer during activation gathering.")
    cvp.add_argument('--output_vector_file', required=True, help="File path to save the computed concept vector(s) (.pt).")

    stp = sub.add_parser('steer', help="Steer model generation using a concept vector.")
    stp.add_argument('--prompt', required=True, help="Prompt for generation.")
    stp.add_argument('--concept_vector_file', required=True, help="File path to the saved concept vector(s) (.pt).")
    stp.add_argument('--strengths_json', required=True, help="JSON file with strengths for each layer, e.g., {'-3': 1.5, '15': -1.0}.")
    stp.add_argument('--steering_extraction_type', choices=['mlp', 'full_layer'], default='mlp', help="Type of module to hook for steering ('mlp' or 'full_layer' activations). Should match how the vector was computed.")
    stp.add_argument('--max_new_tokens', type=int, default=60, help="Max new tokens for generation.")
    stp.add_argument('--temperature', type=float, default=0.7, help="Temperature for sampling (if do_sample is true).")
    stp.add_argument('--do_sample', action='store_true', help="Enable sampling for generation.")
    stp.add_argument('--clip_min', type=float, help="Min value for activation clipping.")
    stp.add_argument('--clip_max', type=float, help="Max value for activation clipping.")

    evp_ppl = sub.add_parser('evaluate-perplexity', help="Evaluate perplexity of generated texts.")
    evp_ppl.add_argument('--texts_json', required=True, help="JSON file with a list of texts to evaluate.")
    evp_ppl.add_argument('--ppl_model_name', default='gpt2', help="Model for perplexity calculation.")
    evp_ppl.add_argument('--batch_size', type=int, default=4, help="Batch size for perplexity evaluation.")
    evp_ppl.add_argument('--max_length_ppl', type=int, default=1024, help="Max sequence length for PPL model tokenizer.")


    evp_style = sub.add_parser('evaluate-style', help="Evaluate style metrics of generated texts.")
    evp_style.add_argument('--texts_json', required=True, help="JSON file with a list of texts to evaluate.")
    evp_style.add_argument('--output_metrics_file', required=True, help="JSON file to save the computed style metrics.")
    # Optionally, could add PPL calculation to evaluate-style as well
    evp_style.add_argument('--ppl_model_name', default='gpt2', help="Model for perplexity calculation (optional, if also calculating PPL here).")
    evp_style.add_argument('--batch_size_ppl', type=int, default=4, help="Batch size for PPL evaluation (if calculating PPL).")
    evp_style.add_argument('--max_length_ppl', type=int, default=1024, help="Max sequence length for PPL model (if calculating PPL).")


    args = parser.parse_args()
    set_seed(args.seed)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if args.cmd=='compute-vector':
        model, tok = load_model_and_tokenizer(args.model_name, device=device, dtype_str=args.dtype)
        with open(args.positive_texts_json, 'r') as f: pos = json.load(f)
        with open(args.negative_texts_json, 'r') as f: neg = json.load(f)
        if not pos or not neg:
            logger.error("Positive or negative text lists are empty. Aborting compute-vector.")
            return

        cvs = compute_concept_vector_with_probe(
            model, tok, pos, neg, args.mlp_layer_indices, args.batch_size,
            args.aggregation_strategy, args.normalize_vector, args.seed,
            device, args.max_length, extraction_type=args.extraction_type # Pass extraction_type
        )
        if cvs:
            torch.save(cvs, args.output_vector_file)
            logger.info(f"Saved concept vectors to {args.output_vector_file}")
        else:
            logger.error("Concept vector computation failed or yielded no vectors.")

    elif args.cmd=='steer':
        model, tok = load_model_and_tokenizer(args.model_name, device=device, dtype_str=args.dtype)
        cvs_loaded = torch.load(args.concept_vector_file, map_location=device) # Load to target device

        # Strengths_json keys should be strings of *actual* layer indices if that's how they are saved
        # Or, convert them if they are relative (-1, -3) and cvs_loaded keys are actual
        with open(args.strengths_json, 'r') as f: raw_strengths = json.load(f)

        strengths_processed = {}
        num_model_layers_for_steer = len(model.model.layers)
        for k_str, v_strength in raw_strengths.items():
            try:
                k_int = int(k_str)
                actual_k_steer = k_int if k_int >=0 else num_model_layers_for_steer + k_int
                if actual_k_steer in cvs_loaded: # Check if the vector for this actual layer exists
                     strengths_processed[str(actual_k_steer)] = v_strength # Use string of actual index
                else:
                    logger.warning(f"Strength specified for layer {k_str} (actual: {actual_k_steer}), but no such vector in concept_vector_file. Skipping.")
            except ValueError:
                 logger.warning(f"Invalid layer key '{k_str}' in strengths_json. Skipping.")


        if not strengths_processed:
            logger.warning("No valid strengths to apply after processing strengths_json. Performing baseline generation.")

        out_text = steer_generation_with_mlp_intervention(
            model, tok, args.prompt, cvs_loaded, strengths_processed,
            args.max_new_tokens, args.temperature, args.do_sample,
            args.clip_min, args.clip_max, device, args.seed,
            extraction_type_steering=args.steering_extraction_type # Pass steering_extraction_type
        )
        print(out_text) # Print output to stdout

    elif args.cmd=='evaluate-perplexity':
        with open(args.texts_json, 'r') as f: texts_to_eval = json.load(f)
        if not texts_to_eval:
            logger.warning("Texts JSON file is empty or contains no texts. Skipping perplexity evaluation.")
            print("Perplexity: NaN (No texts to evaluate)")
            return
        ppl = evaluate_perplexity_external(texts_to_eval, args.ppl_model_name, device, args.batch_size, args.max_length_ppl)
        # The function already logs, but main CLI can also print for clarity or if used in scripts
        print(f"Final Perplexity: {ppl:.4f}")


    elif args.cmd=='evaluate-style':
        if textstat is None:
            logger.error("textstat library not found. Aborting evaluate-style.")
            # Save an error JSON
            error_metrics = {"error": "textstat library not installed. Please run 'pip install textstat'."}
            with open(args.output_metrics_file, 'w') as f: json.dump(error_metrics, f, indent=2)
            return

        with open(args.texts_json, 'r') as f: texts_for_style = json.load(f)
        if not texts_for_style:
            logger.warning("Texts JSON file is empty. No texts to evaluate for style.")
            # Save empty/NaN metrics
            empty_metrics = {
                "flesch_reading_ease": float('nan'), "flesch_kincaid_grade": float('nan'),
                "avg_sentence_length": float('nan'), "perplexity": float('nan'), "num_texts": 0
            }
            with open(args.output_metrics_file, 'w') as f: json.dump(empty_metrics, f, indent=2)
            return

        style_metrics = evaluate_style_metrics(texts_for_style, args.output_metrics_file) # Saves within function
        logger.info(f"Style metrics saved to {args.output_metrics_file}: {style_metrics}")

        # Optionally, also calculate and add PPL to the same output file
        # This makes evaluate-style a more comprehensive evaluation endpoint
        # We'll add it to the style_metrics dict and re-save
        logger.info("Calculating perplexity as part of evaluate-style...")
        ppl_for_style_eval = evaluate_perplexity_external(texts_for_style, args.ppl_model_name, device, args.batch_size_ppl, args.max_length_ppl)
        style_metrics["perplexity"] = ppl_for_style_eval

        with open(args.output_metrics_file, 'w') as f: # Re-save with PPL
            json.dump(style_metrics, f, indent=2)
        logger.info(f"Updated style metrics with PPL and re-saved to {args.output_metrics_file}")


if __name__=='__main__':
    main()
EOF

In [2]:
# CELL 1: Installations
!pip uninstall torch torchvision torchaudio transformers -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 # Or cu121 if your Colab instance has a newer CUDA
!pip install transformers accelerate bitsandbytes scikit-learn matplotlib seaborn umap-learn huggingface_hub datasets textstat -q -U
!chmod +x steer_llm.py

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118

In [3]:
# CELL 2: Imports and Initial Model/Tokenizer Loading

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import os
import json
import subprocess
from huggingface_hub import login, HfFolder

# --- User Configuration ---
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct" # Make sure this matches your steer_llm.py default or pass explicitly
use_quantization = True # As per your original notebook
PPL_MODEL_FOR_SCRIPT = "gpt2" # For consistency in script calls

# --- Hugging Face Token ---
print("Attempting Hugging Face login...")
try:
    if HfFolder.get_token() is None:
        print("Hugging Face token not found. Please log in.")
        login() # This will prompt for token if not in cache
    else:
        print(f"Hugging Face token found in cache: {HfFolder.get_token()[:10]}... (masked)")
    print("Login check complete.")
except Exception as e:
    print(f"Error during Hugging Face login check: {e}")
    # Decide if you want to raise e or try to continue if token is not strictly needed for public models

# --- Load Main Model and Tokenizer (for Python-based vC extraction if needed, and reference) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print(f"Loading tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Tokenizer loaded.")

print(f"Loading model '{MODEL_NAME}' for Python direct use (e.g. full layer vC)...")
if use_quantization:
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16)
print(f"Model '{MODEL_NAME}' loaded for Python direct use.")

if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
        if hasattr(model, 'config') and model.config.pad_token_id is None: model.config.pad_token_id = tokenizer.pad_token_id
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))
        if hasattr(model, 'config'): model.config.pad_token_id = tokenizer.pad_token_id
print(f"Pad token set to: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id})")

# Create base output directory for this iteration
BASE_OUTPUT_DIR = "./iteration_8_combined_outputs"
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)

print("Cell 2 Setup Complete.")

Attempting Hugging Face login...
Hugging Face token found in cache: hf_KTFplid... (masked)
Login check complete.
Using device: cuda
Loading tokenizer for 'meta-llama/Meta-Llama-3-8B-Instruct'...


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Tokenizer loaded.
Loading model 'meta-llama/Meta-Llama-3-8B-Instruct' for Python direct use (e.g. full layer vC)...


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Model 'meta-llama/Meta-Llama-3-8B-Instruct' loaded for Python direct use.
Pad token set to: <|eot_id|> (ID: 128009)
Cell 2 Setup Complete.


In [4]:
# CELL 3: Data Preparation for Sentence Complexity (WikiAuto)

CONCEPT_NAME_P1 = "sentence_complexity_wiki_auto"
OUTPUT_DIR_P1 = os.path.join(BASE_OUTPUT_DIR, CONCEPT_NAME_P1)
os.makedirs(OUTPUT_DIR_P1, exist_ok=True)

N_SAMPLES_FOR_COMPLEXITY_VC = 100  # As per your notebook
WIKIAUTO_DATASET_PATH = "GEM/wiki_auto_asset_turk"
SPLITS_TO_TRY_P1 = ["test_turk", "test_asset"] # From your notebook

simple_texts_p1_list = []
complex_texts_p1_list = []
dataset_to_process_p1 = None
final_split_used_p1 = None

try:
    print(f"Attempting to load data for '{CONCEPT_NAME_P1}' from '{WIKIAUTO_DATASET_PATH}'")
    for split_name_attempt in SPLITS_TO_TRY_P1:
        try:
            dataset_to_process_p1 = load_dataset(WIKIAUTO_DATASET_PATH, split=split_name_attempt, trust_remote_code=True)
            final_split_used_p1 = split_name_attempt
            print(f"Successfully loaded split: '{final_split_used_p1}'. Entries: {len(dataset_to_process_p1)}")
            break
        except Exception:
            pass # Try next split

    if dataset_to_process_p1 is None: # Fallback if direct split load fails
        full_dataset_dict = load_dataset(WIKIAUTO_DATASET_PATH, trust_remote_code=True)
        for split_key_attempt in SPLITS_TO_TRY_P1:
            if split_key_attempt in full_dataset_dict:
                dataset_to_process_p1 = full_dataset_dict[split_key_attempt]
                final_split_used_p1 = split_key_attempt
                print(f"Successfully selected data from DatasetDict for split: '{final_split_used_p1}'. Entries: {len(dataset_to_process_p1)}")
                break
    if dataset_to_process_p1 is None:
        raise ValueError(f"Failed to load target splits from '{WIKIAUTO_DATASET_PATH}'.")

    shuffled_dataset_p1 = dataset_to_process_p1.shuffle(seed=42)
    n_entries_to_select_p1 = min(N_SAMPLES_FOR_COMPLEXITY_VC, len(shuffled_dataset_p1))

    for i in range(n_entries_to_select_p1):
        entry = shuffled_dataset_p1[i]
        original_text = entry.get('source')
        simplified_list = entry.get('references')
        if isinstance(original_text, str) and original_text.strip() and \
           isinstance(simplified_list, list) and len(simplified_list) > 0 and \
           isinstance(simplified_list[0], str) and simplified_list[0].strip():
            complex_texts_p1_list.append(original_text.strip())
            simple_texts_p1_list.append(simplified_list[0].strip())

    if not complex_texts_p1_list or not simple_texts_p1_list:
        raise ValueError("No valid complex/simple text pairs extracted for Phase 1.")

    print(f"Obtained {len(complex_texts_p1_list)} pairs for complexity from '{final_split_used_p1}'.")

except Exception as e:
    print(f"ERROR loading WikiAuto for Phase 1: {e}. Using fallback manual dataset.")
    simple_texts_p1_list = [
        "The cat sat on the mat.", "Birds fly in the blue sky.", "The sun is very bright today.",
        "She likes to read books.", "He plays games after school."
    ]
    complex_texts_p1_list = [
        "The feline, a creature of habit and grace, positioned itself comfortably upon the woven floor covering.",
        "Avian species, endowed with the remarkable ability of flight, navigate the cerulean expanse of the atmosphere.",
        "Today, the solar celestial body emits an exceptionally luminous and brilliant radiance.",
        "She derives considerable intellectual stimulation and pleasure from the perusal of literary works.",
        "Subsequent to the completion of his academic obligations, he engages in recreational digital pastimes."
    ]

# Save to JSON files for steer_llm.py
complex_texts_file_p1 = os.path.join(OUTPUT_DIR_P1, "complex_texts_p1.json")
simple_texts_file_p1 = os.path.join(OUTPUT_DIR_P1, "simple_texts_p1.json")
with open(complex_texts_file_p1, 'w') as f: json.dump(complex_texts_p1_list, f, indent=2)
with open(simple_texts_file_p1, 'w') as f: json.dump(simple_texts_p1_list, f, indent=2)
print(f"Saved complex texts for Phase 1 to {complex_texts_file_p1}")
print(f"Saved simple texts for Phase 1 to {simple_texts_file_p1}")

Attempting to load data for 'sentence_complexity_wiki_auto' from 'GEM/wiki_auto_asset_turk'


README.md:   0%|          | 0.00/34.8k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/89.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

test_asset-00000-of-00001.parquet:   0%|          | 0.00/204k [00:00<?, ?B/s]

test_turk-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

test_contract-00000-of-00001.parquet:   0%|          | 0.00/194k [00:00<?, ?B/s]

test_wiki-00000-of-00001.parquet:   0%|          | 0.00/180k [00:00<?, ?B/s]

(…)enge_train_sample-00000-of-00001.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

(…)validation_sample-00000-of-00001.parquet:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

(…)t_backtranslation-00000-of-00001.parquet:   0%|          | 0.00/186k [00:00<?, ?B/s]

(…)_test_asset_bfp02-00000-of-00001.parquet:   0%|          | 0.00/187k [00:00<?, ?B/s]

(…)_test_asset_bfp05-00000-of-00001.parquet:   0%|          | 0.00/188k [00:00<?, ?B/s]

(…)test_asset_nopunc-00000-of-00001.parquet:   0%|          | 0.00/186k [00:00<?, ?B/s]

(…)k_backtranslation-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

(…)e_test_turk_bfp02-00000-of-00001.parquet:   0%|          | 0.00/176k [00:00<?, ?B/s]

(…)e_test_turk_bfp05-00000-of-00001.parquet:   0%|          | 0.00/177k [00:00<?, ?B/s]

(…)_test_turk_nopunc-00000-of-00001.parquet:   0%|          | 0.00/174k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/483801 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating test_asset split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating test_turk split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating test_contract split:   0%|          | 0/659 [00:00<?, ? examples/s]

Generating test_wiki split:   0%|          | 0/720 [00:00<?, ? examples/s]

Generating challenge_train_sample split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating challenge_validation_sample split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating challenge_test_asset_backtranslation split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_asset_bfp02 split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_asset_bfp05 split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_asset_nopunc split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_turk_backtranslation split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_turk_bfp02 split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_turk_bfp05 split:   0%|          | 0/359 [00:00<?, ? examples/s]

Generating challenge_test_turk_nopunc split:   0%|          | 0/359 [00:00<?, ? examples/s]

Successfully loaded split: 'test_turk'. Entries: 359
Obtained 100 pairs for complexity from 'test_turk'.
Saved complex texts for Phase 1 to ./iteration_8_combined_outputs/sentence_complexity_wiki_auto/complex_texts_p1.json
Saved simple texts for Phase 1 to ./iteration_8_combined_outputs/sentence_complexity_wiki_auto/simple_texts_p1.json


In [5]:
# CELL 4: Compute Sentence Complexity Concept Vector (MLP-based) using steer_llm.py

CONCEPT_VECTOR_FILE_P1_MLP = os.path.join(OUTPUT_DIR_P1, "concept_vector_complexity_mlp_p1.pt")
MLP_LAYERS_COMPLEXITY_P1 = [-1, -3, -5]
MAX_LEN_ACTIVATION_P1 = 192

print(f"Computing MLP-based concept vector for '{CONCEPT_NAME_P1}'...")
command_compute_complexity_mlp = [
    "python", "steer_llm.py",
    # Global arguments should come BEFORE the subcommand
    "--model_name", MODEL_NAME, # From your Cell 2
    "--seed", "42",
    # Subcommand
    "compute-vector",
    # Arguments for compute-vector
    "--positive_texts_json", complex_texts_file_p1,
    "--negative_texts_json", simple_texts_file_p1,
    "--mlp_layer_indices"
] + [str(l) for l in MLP_LAYERS_COMPLEXITY_P1] + [
    "--extraction_type", "mlp",
    "--batch_size", "4",
    "--aggregation_strategy", "mean_all_tokens",
    "--normalize_vector",
    "--max_length", str(MAX_LEN_ACTIVATION_P1),
    "--output_vector_file", CONCEPT_VECTOR_FILE_P1_MLP
    # Removed --seed from here as it's now a global argument
]

try:
    process = subprocess.Popen(command_compute_complexity_mlp, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    print("Stdout:\n", stdout.decode())
    if stderr: print("Stderr:\n", stderr.decode())
    if process.returncode == 0 and os.path.exists(CONCEPT_VECTOR_FILE_P1_MLP):
        print(f"Successfully computed MLP complexity vector. Saved to {CONCEPT_VECTOR_FILE_P1_MLP}")
    else:
        print(f"Error computing MLP complexity vector. Return code: {process.returncode}")
        if stderr:
            print(f"Error details: {stderr.decode()}")
except Exception as e:
    print(f"An exception occurred during MLP complexity vector computation: {e}")

Computing MLP-based concept vector for 'sentence_complexity_wiki_auto'...
Stdout:
 
Stderr:
 2025-05-24 20:47:55.633159: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748119675.654081    3109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748119675.660874    3109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 20:47:58 [INFO] steer_llm:81 Loading model meta-llama/Meta-Llama-3-8B-Instruct on cuda (dtype=bfloat16)
2025-05-24 20:48:01 [INFO] modeling:991 We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Lo

In [6]:
# CELL 5: Steer for Sentence Complexity (using MLP vC)

TEST_PROMPTS_P1 = [ # Reusing from your notebook's Iteration 8
    "The weather forecast predicts",
    "Artificial intelligence is a field that",
    "To bake a cake, one must first",
    "The history of ancient Rome shows",
    "Environmental conservation efforts aim to"
]
MAX_NEW_GEN_TOKENS_P1 = 60 # As per your notebook
ALPHAS_COMPLEXITY_P1 = [0.0, 0.75, 1.5, -0.75, -1.5] # Baseline (0.0) and steering alphas
CLIP_MIN_P1, CLIP_MAX_P1 = -25.0, 25.0 # From your notebook

# Use one of the layers for which the vector was computed, e.g., the last one specified
# Ensure MLP_LAYERS_COMPLEXITY_P1 is defined from Cell 4
STEERING_LAYER_IDX_STR_P1_MLP = str(MLP_LAYERS_COMPLEXITY_P1[-1]) if MLP_LAYERS_COMPLEXITY_P1 else None


iteration_results_p1_mlp = {}

if not os.path.exists(CONCEPT_VECTOR_FILE_P1_MLP) and any(a != 0.0 for a in ALPHAS_COMPLEXITY_P1):
    print(f"MLP Complexity vector not found at {CONCEPT_VECTOR_FILE_P1_MLP}. Skipping steered (non-baseline) runs for Phase 1.")
elif STEERING_LAYER_IDX_STR_P1_MLP is None and any(a != 0.0 for a in ALPHAS_COMPLEXITY_P1):
    print("MLP_LAYERS_COMPLEXITY_P1 is not defined or empty. Cannot determine steering layer. Skipping steered runs.")
else:
    for alpha_val in ALPHAS_COMPLEXITY_P1:
        use_clipping = True if alpha_val > 0 else False # Clip for "more complex" (positive alpha)
        clip_status_str = "ON" if use_clipping else "OFF"

        run_name = f"P1_MLP_Complex_L{STEERING_LAYER_IDX_STR_P1_MLP}_alpha_{alpha_val:.2f}_clip_{clip_status_str}"
        is_baseline_run = (alpha_val == 0.0)
        if is_baseline_run:
            run_name = "P1_MLP_Baseline_Complexity"

        # For baseline (alpha=0), steer_llm.py steer still needs the args, but strength 0 means no actual steering
        # The script's steer_generation_with_mlp_intervention should handle strength 0 correctly by not adding hooks.
        strengths_p1_mlp = {STEERING_LAYER_IDX_STR_P1_MLP: alpha_val}
        strengths_file_p1_mlp = os.path.join(OUTPUT_DIR_P1, f"strengths_{run_name}.json")
        with open(strengths_file_p1_mlp, 'w') as f: json.dump(strengths_p1_mlp, f)


        output_texts_file_p1_mlp = os.path.join(OUTPUT_DIR_P1, f"{run_name}_outputs.json")
        current_run_outputs = []

        print(f"\\n--- Generating for: {run_name} ---")
        for prompt_base in TEST_PROMPTS_P1:
            prompt_to_steer = f"Explain the following with an appropriate level of sentence complexity: {prompt_base}"

            command_steer = [
                "python", "steer_llm.py",
                # Global arguments
                "--model_name", MODEL_NAME,
                "--seed", "101", # Consistent seed for comparability across alphas
                # Subcommand
                "steer",
                # Arguments for steer
                "--prompt", prompt_to_steer,
                "--max_new_tokens", str(MAX_NEW_GEN_TOKENS_P1),
                "--temperature", "0.7", "--do_sample"
            ]

            # --concept_vector_file and --strengths_json are always required by the script's steer command.
            # For baseline (alpha=0), the strength in the JSON file will be 0, so no effective steering.
            command_steer.extend([
                "--concept_vector_file", CONCEPT_VECTOR_FILE_P1_MLP, # Provide even for baseline
                "--strengths_json", strengths_file_p1_mlp         # Provide even for baseline
            ])

            if not is_baseline_run: # Only add steering_extraction_type if actually steering
                 command_steer.extend(["--steering_extraction_type", "mlp"])

            if use_clipping and not is_baseline_run: # Clipping only for actual steered positive alpha runs
                command_steer.extend(["--clip_min", str(CLIP_MIN_P1), "--clip_max", str(CLIP_MAX_P1)])

            try:
                process = subprocess.Popen(command_steer, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = process.communicate()
                if process.returncode == 0:
                    output_text = stdout.decode().strip()
                    completion = output_text.split(prompt_to_steer, 1)[-1].strip() if prompt_to_steer in output_text else output_text
                    print(f"  Prompt: '{prompt_base[:50]}...' -> Completion: '{completion[:100]}...'")
                    current_run_outputs.append(completion)
                else:
                    print(f"  Error steering for prompt '{prompt_base}'. Return code: {process.returncode}")
                    print(f"  Stderr: {stderr.decode()}")
                    current_run_outputs.append(f"ERROR: {stderr.decode()}")
            except Exception as e:
                print(f"  An exception occurred for prompt '{prompt_base}': {e}")
                current_run_outputs.append(f"EXCEPTION: {str(e)}")

        with open(output_texts_file_p1_mlp, 'w') as f:
            json.dump(current_run_outputs, f, indent=2)
        iteration_results_p1_mlp[run_name] = {"outputs_file": output_texts_file_p1_mlp, "outputs": current_run_outputs, "alpha": alpha_val, "clipping": clip_status_str if not is_baseline_run else "N/A"}
        print(f"Saved outputs for {run_name} to {output_texts_file_p1_mlp}")

\n--- Generating for: P1_MLP_Baseline_Complexity ---
  Prompt: 'The weather forecast predicts...' -> Completion: 'a chance of rain tomorrow, and therefore, we should pack an umbrella.
The sentence is structured as ...'
  Prompt: 'Artificial intelligence is a field that...' -> Completion: 'aims to create machines that can perform tasks that typically require human intelligence, such as un...'
  Prompt: 'To bake a cake, one must first...' -> Completion: 'preheat the oven to the correct temperature. Then, the ingredients must be measured and combined in ...'
  Prompt: 'The history of ancient Rome shows...' -> Completion: 'that even the most powerful civilizations can rise and fall. The rise of ancient Rome was marked by ...'
  Prompt: 'Environmental conservation efforts aim to...' -> Completion: 'protect and preserve the natural world, including ecosystems, species, and habitats, from the impact...'
Saved outputs for P1_MLP_Baseline_Complexity to ./iteration_8_combined_outputs/sentence_co

In [7]:
# CELL 6: Evaluate Sentence Complexity Outputs (MLP vC) with Quantitative Metrics

print("\\n--- Phase 1: Evaluating MLP-Steered Sentence Complexity Outputs ---")
evaluation_summary_p1_mlp = {}

for run_name, run_data in iteration_results_p1_mlp.items():
    texts_file_to_eval = run_data["outputs_file"]

    if not os.path.exists(texts_file_to_eval) or os.path.getsize(texts_file_to_eval) == 0:
        print(f"Skipping {run_name} as texts file is missing or empty: {texts_file_to_eval}")
        evaluation_summary_p1_mlp[run_name] = {"error": "Missing or empty input file"}
        continue

    # Check if the JSON file contains an empty list
    with open(texts_file_to_eval, 'r') as f_check:
        try:
            content_check = json.load(f_check)
            if not content_check: # Handles empty list case
                 print(f"Skipping {run_name} as texts file contains an empty list: {texts_file_to_eval}")
                 evaluation_summary_p1_mlp[run_name] = {"error": "Input file contains an empty list"}
                 continue
        except json.JSONDecodeError:
            print(f"Skipping {run_name} due to JSON decode error in: {texts_file_to_eval}")
            evaluation_summary_p1_mlp[run_name] = {"error": "JSON decode error"}
            continue


    print(f"\\nEvaluating metrics for: {run_name}")
    metrics_output_file = os.path.join(OUTPUT_DIR_P1, f"{run_name}_metrics.json")

    command_eval_style = [
        "python", "steer_llm.py", "evaluate-style",
        "--texts_json", texts_file_to_eval,
        "--output_metrics_file", metrics_output_file,
        "--ppl_model_name", PPL_MODEL_FOR_SCRIPT # For PPL within evaluate-style
    ]
    try:
        process = subprocess.Popen(command_eval_style, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        print("Stdout (evaluate-style):\n", stdout.decode())
        if stderr: print("Stderr (evaluate-style):\n", stderr.decode())

        if process.returncode == 0 and os.path.exists(metrics_output_file):
            with open(metrics_output_file, 'r') as mf:
                metrics = json.load(mf)
            evaluation_summary_p1_mlp[run_name] = metrics
            print(f"Successfully evaluated {run_name}. Metrics: {metrics}")
        else:
            evaluation_summary_p1_mlp[run_name] = {"error": f"Style evaluation script failed. Stderr: {stderr.decode()}"}
            print(f"Error evaluating {run_name}. Stderr: {stderr.decode()}")

    except Exception as e:
        evaluation_summary_p1_mlp[run_name] = {"error": str(e)}
        print(f"Exception during evaluation of {run_name}: {e}")


print("\\n--- Phase 1 MLP Sentence Complexity Evaluation Summary ---")
for run_name, metrics in evaluation_summary_p1_mlp.items():
    original_run_data = iteration_results_p1_mlp.get(run_name, {})
    print(f"Run: {run_name} (Alpha: {original_run_data.get('alpha', 'N/A'):.2f}, Clip: {original_run_data.get('clipping', 'N/A')})")
    if "error" in metrics:
        print(f"  Error: {metrics['error']}")
    else:
        print(f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A'):.2f}" if isinstance(metrics.get('perplexity'), float) else f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A')}")
        print(f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A'):.2f}" if isinstance(metrics.get('flesch_reading_ease'), float) else f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A')}")
        print(f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A'):.2f}" if isinstance(metrics.get('flesch_kincaid_grade'), float) else f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A')}")
        print(f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A'):.2f}" if isinstance(metrics.get('avg_sentence_length'), float) else f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A')}")

\n--- Phase 1: Evaluating MLP-Steered Sentence Complexity Outputs ---
\nEvaluating metrics for: P1_MLP_Baseline_Complexity
Stdout (evaluate-style):
 
Stderr (evaluate-style):
 2025-05-24 20:57:12.381453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748120232.403171    6894 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748120232.409727    6894 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 20:57:16 [INFO] steer_llm:436 Style Metrics (based on 5 texts): FRE=45.56, FKG=11.04, ASL=16.63
2025-05-24 20:57:16 [INFO] steer_llm:575 Style metrics saved to ./iteration_8_combined_outputs/sentence_complexity_wiki_auto/P1_MLP_Baseline_Complexity_me

In [8]:
# CELL 7: Data Preparation for Degree of Certainty

CONCEPT_NAME_P2 = "degree_of_certainty"
OUTPUT_DIR_P2 = os.path.join(BASE_OUTPUT_DIR, CONCEPT_NAME_P2)
os.makedirs(OUTPUT_DIR_P2, exist_ok=True)

high_certainty_texts_p2 = [
    "The results conclusively demonstrate that the treatment is effective.",
    "It is unequivocally true that the earth revolves around the sun.",
    "There is no doubt that this is the optimal solution.",
    "The evidence clearly indicates a positive correlation.",
    "We are absolutely certain of our findings.",
    "This will definitely improve the outcome.",
    "It is undeniably a breakthrough in the field.",
    "The facts confirm this theory without question.",
    "This is certainly the best approach to take.",
    "We can state with high confidence that the model is accurate."
] * 10 # Multiply to get more samples for probe

low_certainty_texts_p2 = [
    "The results might suggest that the treatment could be effective.",
    "It seems possible that the earth revolves around the sun, based on some observations.",
    "There is a chance that this could be a viable solution.",
    "The evidence perhaps indicates a potential correlation.",
    "We are somewhat unsure of our findings at this stage.",
    "This may possibly improve the outcome.",
    "It could perhaps be a step forward in the field.",
    "The facts might support this theory to some extent.",
    "This is possibly a good approach to consider.",
    "We tentatively believe that the model might be fairly accurate."
] * 10 # Multiply for more samples

high_certainty_file_p2 = os.path.join(OUTPUT_DIR_P2, "high_certainty_texts_p2.json")
low_certainty_file_p2 = os.path.join(OUTPUT_DIR_P2, "low_certainty_texts_p2.json")

with open(high_certainty_file_p2, 'w') as f: json.dump(high_certainty_texts_p2, f, indent=2)
with open(low_certainty_file_p2, 'w') as f: json.dump(low_certainty_texts_p2, f, indent=2)
print(f"Saved {len(high_certainty_texts_p2)} high certainty texts for Phase 2 to {high_certainty_file_p2}")
print(f"Saved {len(low_certainty_texts_p2)} low certainty texts for Phase 2 to {low_certainty_file_p2}")

# Define MLP layer indices for certainty vector (can be same or different)
MLP_LAYERS_CERTAINTY_P2 = [-1, -3, -5, -10, -15] # Example, matching earlier suggestion
MAX_LEN_ACTIVATION_P2 = 128

Saved 100 high certainty texts for Phase 2 to ./iteration_8_combined_outputs/degree_of_certainty/high_certainty_texts_p2.json
Saved 100 low certainty texts for Phase 2 to ./iteration_8_combined_outputs/degree_of_certainty/low_certainty_texts_p2.json


In [10]:
# CELL 8: Compute Certainty Concept Vector (MLP-based) using steer_llm.py

CONCEPT_VECTOR_FILE_P2_MLP = os.path.join(OUTPUT_DIR_P2, "concept_vector_certainty_mlp_p2.pt")

print(f"Computing MLP-based concept vector for '{CONCEPT_NAME_P2}'...")
command_compute_certainty_mlp = [
    "python", "steer_llm.py",
    # Global arguments should come BEFORE the subcommand
    "--model_name", MODEL_NAME, # From your Cell 2
    "--seed", "43",             # Seed for this specific command run
    # Subcommand
    "compute-vector",
    # Arguments for compute-vector
    "--positive_texts_json", high_certainty_file_p2,
    "--negative_texts_json", low_certainty_file_p2,
    "--mlp_layer_indices"
] + [str(l) for l in MLP_LAYERS_CERTAINTY_P2] + [
    "--extraction_type", "mlp",
    "--batch_size", "4",
    "--aggregation_strategy", "mean_all_tokens",
    "--normalize_vector",
    "--max_length", str(MAX_LEN_ACTIVATION_P2),
    "--output_vector_file", CONCEPT_VECTOR_FILE_P2_MLP
]

try:
    process = subprocess.Popen(command_compute_certainty_mlp, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    print("Stdout:\n", stdout.decode())
    if stderr and stderr.decode().strip(): # Only print stderr if it's not empty
        print("Stderr:\n", stderr.decode())
    if process.returncode == 0 and os.path.exists(CONCEPT_VECTOR_FILE_P2_MLP):
        print(f"Successfully computed MLP certainty vector. Saved to {CONCEPT_VECTOR_FILE_P2_MLP}")
    else:
        print(f"Error computing MLP certainty vector. Return code: {process.returncode}")
        if stderr and stderr.decode().strip():
             print(f"Error details: {stderr.decode()}")
except Exception as e:
    print(f"An exception occurred during MLP certainty vector computation: {e}")

Computing MLP-based concept vector for 'degree_of_certainty'...
Stdout:
 
Stderr:
 2025-05-24 21:01:52.688370: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748120512.709983    8425 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748120512.716618    8425 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 21:01:55 [INFO] steer_llm:81 Loading model meta-llama/Meta-Llama-3-8B-Instruct on cuda (dtype=bfloat16)
2025-05-24 21:01:59 [INFO] modeling:991 We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading chec

In [12]:
# CELL 9: Steer for Degree of Certainty (using MLP vC)

import json
import os
import subprocess

TEST_PROMPTS_P2 = [
    "The economic outlook for next year suggests",
    "Regarding the new policy, its impact will likely be",
    "The possibility of life on other planets is",
    "Climate change will affect coastal cities by",
    "The new drug candidate shows potential for"
]
MAX_NEW_GEN_TOKENS_P2 = 50
ALPHAS_CERTAINTY_P2 = [0.0, 1.0, 2.0, -1.0, -2.0]
CLIP_MIN_P2, CLIP_MAX_P2 = -20.0, 20.0

# Ensure MLP_LAYERS_CERTAINTY_P2 is defined from Cell 7 (or earlier if globally)
# And CONCEPT_VECTOR_FILE_P2_MLP from Cell 8
STEERING_LAYER_IDX_STR_P2_MLP = str(MLP_LAYERS_CERTAINTY_P2[-1]) if MLP_LAYERS_CERTAINTY_P2 else None

iteration_results_p2_mlp = {}

if not os.path.exists(CONCEPT_VECTOR_FILE_P2_MLP) and any(a != 0.0 for a in ALPHAS_CERTAINTY_P2):
    print(f"MLP Certainty vector not found at {CONCEPT_VECTOR_FILE_P2_MLP}. Skipping steered (non-baseline) runs for Phase 2.")
elif STEERING_LAYER_IDX_STR_P2_MLP is None and any(a != 0.0 for a in ALPHAS_CERTAINTY_P2):
    print("MLP_LAYERS_CERTAINTY_P2 is not defined or empty. Cannot determine steering layer. Skipping steered runs.")
else:
    for alpha_val in ALPHAS_CERTAINTY_P2:
        use_clipping_p2 = True if alpha_val > 0 else False
        clip_status_str_p2 = "ON" if use_clipping_p2 else "OFF"

        run_name = f"P2_MLP_Certainty_L{STEERING_LAYER_IDX_STR_P2_MLP}_alpha_{alpha_val:.2f}_clip_{clip_status_str_p2}"
        is_baseline_run = (alpha_val == 0.0)
        if is_baseline_run:
            run_name = "P2_MLP_Baseline_Certainty"

        strengths_p2_mlp = {STEERING_LAYER_IDX_STR_P2_MLP: alpha_val} # Strength will be 0.0 for baseline
        strengths_file_p2_mlp = os.path.join(OUTPUT_DIR_P2, f"strengths_{run_name}.json")
        with open(strengths_file_p2_mlp, 'w') as f: json.dump(strengths_p2_mlp, f)

        output_texts_file_p2_mlp = os.path.join(OUTPUT_DIR_P2, f"{run_name}_outputs.json")
        current_run_outputs = []

        print(f"\\n--- Generating for: {run_name} ---")
        for prompt_base in TEST_PROMPTS_P2:
            prompt_to_steer = f"Provide an explanation with an appropriate degree of certainty: {prompt_base}"

            command_steer = [
                "python", "steer_llm.py",
                # Global arguments
                "--model_name", MODEL_NAME,
                "--seed", "202", # Consistent seed for this phase
                # Subcommand
                "steer",
                # Arguments for steer
                "--prompt", prompt_to_steer,
                "--max_new_tokens", str(MAX_NEW_GEN_TOKENS_P2),
                "--temperature", "0.7", "--do_sample",
                # --concept_vector_file and --strengths_json are always required by the script
                "--concept_vector_file", CONCEPT_VECTOR_FILE_P2_MLP,
                "--strengths_json", strengths_file_p2_mlp
            ]

            if not is_baseline_run: # Only add steering_extraction_type if actually steering
                 command_steer.extend(["--steering_extraction_type", "mlp"])

            if use_clipping_p2 and not is_baseline_run: # Clipping only for actual steered positive alpha runs
                command_steer.extend(["--clip_min", str(CLIP_MIN_P2), "--clip_max", str(CLIP_MAX_P2)])

            try:
                process = subprocess.Popen(command_steer, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = process.communicate()
                if process.returncode == 0:
                    output_text = stdout.decode().strip()
                    completion = output_text.split(prompt_to_steer, 1)[-1].strip() if prompt_to_steer in output_text else output_text
                    print(f"  Prompt: '{prompt_base[:50]}...' -> Completion: '{completion[:100]}...'")
                    current_run_outputs.append(completion)
                else:
                    print(f"  Error steering for prompt '{prompt_base}'. Return code: {process.returncode}")
                    print(f"  Stderr: {stderr.decode()}")
                    current_run_outputs.append(f"ERROR: {stderr.decode()}")
            except Exception as e:
                print(f"  An exception occurred for prompt '{prompt_base}': {e}")
                current_run_outputs.append(f"EXCEPTION: {str(e)}")

        with open(output_texts_file_p2_mlp, 'w') as f:
            json.dump(current_run_outputs, f, indent=2)
        iteration_results_p2_mlp[run_name] = {"outputs_file": output_texts_file_p2_mlp, "outputs": current_run_outputs, "alpha": alpha_val, "clipping": clip_status_str_p2 if not is_baseline_run else "N/A"}
        print(f"Saved outputs for {run_name} to {output_texts_file_p2_mlp}")

\n--- Generating for: P2_MLP_Baseline_Certainty ---
  Prompt: 'The economic outlook for next year suggests...' -> Completion: 'that the global economy will continue to experience steady growth, with an expected GDP growth rate ...'
  Prompt: 'Regarding the new policy, its impact will likely b...' -> Completion: 'significant, but it is difficult to predict with certainty the exact nature of its effects. While so...'
  Prompt: 'The possibility of life on other planets is...' -> Completion: 'not yet conclusively proven, but there are many compelling reasons to believe that the probability o...'
  Prompt: 'Climate change will affect coastal cities by...' -> Completion: 'causing sea-level rise and increased storm surges.
The statement "Climate change will affect coastal...'
  Prompt: 'The new drug candidate shows potential for...' -> Completion: 'the treatment of a specific type of cancer, but more research is needed to confirm its efficacy.
I'd...'
Saved outputs for P2_MLP_Baseline_Certain

In [13]:
# CELL 10: Evaluate Degree of Certainty Outputs (MLP vC)

print("\\n--- Phase 2: Evaluating MLP-Steered Degree of Certainty Outputs ---")
evaluation_summary_p2_mlp = {}

for run_name, run_data in iteration_results_p2_mlp.items():
    texts_file_to_eval = run_data["outputs_file"]

    if not os.path.exists(texts_file_to_eval) or os.path.getsize(texts_file_to_eval) == 0:
        print(f"Skipping {run_name} as texts file is missing or empty: {texts_file_to_eval}")
        evaluation_summary_p2_mlp[run_name] = {"error": "Missing or empty input file"}
        continue
    with open(texts_file_to_eval, 'r') as f_check: # Check for empty list inside JSON
        try:
            if not json.load(f_check):
                 print(f"Skipping {run_name} as texts file contains an empty list: {texts_file_to_eval}")
                 evaluation_summary_p2_mlp[run_name] = {"error": "Input file contains an empty list"}
                 continue
        except json.JSONDecodeError:
            print(f"Skipping {run_name} due to JSON decode error in: {texts_file_to_eval}")
            evaluation_summary_p2_mlp[run_name] = {"error": "JSON decode error"}
            continue

    print(f"\\nEvaluating metrics for: {run_name}")
    metrics_output_file = os.path.join(OUTPUT_DIR_P2, f"{run_name}_metrics.json")

    command_eval_style = [
        "python", "steer_llm.py", "evaluate-style",
        "--texts_json", texts_file_to_eval,
        "--output_metrics_file", metrics_output_file,
        "--ppl_model_name", PPL_MODEL_FOR_SCRIPT
    ]
    try:
        process = subprocess.Popen(command_eval_style, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        print("Stdout (evaluate-style):\n", stdout.decode())
        if stderr: print("Stderr (evaluate-style):\n", stderr.decode())

        if process.returncode == 0 and os.path.exists(metrics_output_file):
            with open(metrics_output_file, 'r') as mf:
                metrics = json.load(mf)
            evaluation_summary_p2_mlp[run_name] = metrics
            print(f"Successfully evaluated {run_name}. Metrics: {metrics}")
        else:
            evaluation_summary_p2_mlp[run_name] = {"error": f"Style evaluation script failed. Stderr: {stderr.decode()}"}
            print(f"Error evaluating {run_name}. Stderr: {stderr.decode()}")
    except Exception as e:
        evaluation_summary_p2_mlp[run_name] = {"error": str(e)}
        print(f"Exception during evaluation of {run_name}: {e}")


print("\\n--- Phase 2 MLP Degree of Certainty Evaluation Summary ---")
for run_name, metrics in evaluation_summary_p2_mlp.items():
    original_run_data = iteration_results_p2_mlp.get(run_name, {})
    print(f"Run: {run_name} (Alpha: {original_run_data.get('alpha', 'N/A'):.2f}, Clip: {original_run_data.get('clipping', 'N/A')})")
    if "error" in metrics:
        print(f"  Error: {metrics['error']}")
    else:
        print(f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A'):.2f}" if isinstance(metrics.get('perplexity'), float) else f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A')}")
        print(f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A'):.2f}" if isinstance(metrics.get('flesch_reading_ease'), float) else f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A')}")
        print(f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A'):.2f}" if isinstance(metrics.get('flesch_kincaid_grade'), float) else f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A')}")
        print(f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A'):.2f}" if isinstance(metrics.get('avg_sentence_length'), float) else f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A')}")
        # Add subjective analysis or specific certainty metric analysis here

\n--- Phase 2: Evaluating MLP-Steered Degree of Certainty Outputs ---
\nEvaluating metrics for: P2_MLP_Baseline_Certainty
Stdout (evaluate-style):
 
Stderr (evaluate-style):
 2025-05-24 21:14:16.064771: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748121256.086076   13942 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748121256.093006   13942 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 21:14:20 [INFO] steer_llm:436 Style Metrics (based on 5 texts): FRE=50.05, FKG=10.24, ASL=15.93
2025-05-24 21:14:20 [INFO] steer_llm:575 Style metrics saved to ./iteration_8_combined_outputs/degree_of_certainty/P2_MLP_Baseline_Certainty_metrics.json: 

In [15]:
# CELL 11: Compute Sentence Complexity Concept Vector (Full Layer) using steer_llm.py

CONCEPT_NAME_P3 = "sentence_complexity_wiki_auto_full_layer"
OUTPUT_DIR_P3 = os.path.join(BASE_OUTPUT_DIR, CONCEPT_NAME_P3)
os.makedirs(OUTPUT_DIR_P3, exist_ok=True)

CONCEPT_VECTOR_FILE_P3_FULL_LAYER = os.path.join(OUTPUT_DIR_P3, "concept_vector_complexity_full_layer_p3.pt")
# Use the same simple/complex texts from Phase 1
# complex_texts_file_p1, simple_texts_file_p1 (ensure these are accessible from Cell 3)

FULL_LAYER_INDICES_COMPLEXITY_P3 = [-3, -5]
MAX_LEN_ACTIVATION_P3 = 192

print(f"Computing FULL LAYER concept vector for '{CONCEPT_NAME_P3}'...")
command_compute_complexity_full_layer = [
    "python", "steer_llm.py",
    # Global arguments
    "--model_name", MODEL_NAME, # From Cell 2
    "--seed", "44",             # Seed for this run
    # Subcommand
    "compute-vector",
    # Arguments for compute-vector
    "--positive_texts_json", complex_texts_file_p1,
    "--negative_texts_json", simple_texts_file_p1,
    "--mlp_layer_indices" # This argument name is used by the script for layer targeting
] + [str(l) for l in FULL_LAYER_INDICES_COMPLEXITY_P3] + [
    "--extraction_type", "full_layer", # Specify full_layer extraction
    "--batch_size", "2",
    "--aggregation_strategy", "mean_all_tokens",
    "--normalize_vector",
    "--max_length", str(MAX_LEN_ACTIVATION_P3),
    "--output_vector_file", CONCEPT_VECTOR_FILE_P3_FULL_LAYER
]

try:
    process = subprocess.Popen(command_compute_complexity_full_layer, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    print("Stdout:\n", stdout.decode())
    if stderr and stderr.decode().strip():
        print("Stderr:\n", stderr.decode())
    if process.returncode == 0 and os.path.exists(CONCEPT_VECTOR_FILE_P3_FULL_LAYER):
        print(f"Successfully computed Full Layer complexity vector. Saved to {CONCEPT_VECTOR_FILE_P3_FULL_LAYER}")
        # Verify content
        loaded_cv_full_layer = torch.load(CONCEPT_VECTOR_FILE_P3_FULL_LAYER)
        print(f"Loaded full_layer vector. Keys (actual layer indices): {list(loaded_cv_full_layer.keys())}")
        for layer_idx, vector in loaded_cv_full_layer.items():
             print(f"  Layer {layer_idx} vector shape: {vector.shape}, norm: {vector.norm().item():.4f}")
    else:
        print(f"Error computing Full Layer complexity vector. Return code: {process.returncode}")
        if stderr and stderr.decode().strip():
            print(f"Error details: {stderr.decode()}")
except Exception as e:
    print(f"An exception occurred during Full Layer complexity vector computation: {e}")

Computing FULL LAYER concept vector for 'sentence_complexity_wiki_auto_full_layer'...
Stdout:
 
Stderr:
 2025-05-24 21:17:09.670453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748121429.693057   15006 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748121429.699806   15006 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 21:17:12 [INFO] steer_llm:81 Loading model meta-llama/Meta-Llama-3-8B-Instruct on cuda (dtype=bfloat16)
2025-05-24 21:17:16 [INFO] modeling:991 We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your o

In [17]:
# CELL 12: Steer for Sentence Complexity (using Full Layer vC)

import json
import os
import subprocess

# Ensure CONCEPT_VECTOR_FILE_P3_FULL_LAYER is defined from Cell 11
# and STEERING_LAYER_IDX_STR_P3_FULL is also correctly defined (actual, non-negative index as string)

iteration_results_p3_full_layer = {}

if not os.path.exists(CONCEPT_VECTOR_FILE_P3_FULL_LAYER) and any(a != 0.0 for a in ALPHAS_COMPLEXITY_P1):
    print(f"Full Layer Complexity vector not found at {CONCEPT_VECTOR_FILE_P3_FULL_LAYER}. Skipping steered (non-baseline) runs for Phase 3.")
elif STEERING_LAYER_IDX_STR_P3_FULL is None and any(a != 0.0 for a in ALPHAS_COMPLEXITY_P1):
    print("STEERING_LAYER_IDX_STR_P3_FULL is not defined or empty. Cannot determine steering layer. Skipping steered runs for Phase 3.")
else:
    print(f"--- Phase 3: Steering with FULL LAYER Complexity vC (Layer {STEERING_LAYER_IDX_STR_P3_FULL}) ---")
    for alpha_val in ALPHAS_COMPLEXITY_P1: # Reusing alphas from Phase 1 for comparison
        use_clipping_p3 = True if alpha_val > 0 else False
        clip_status_str_p3 = "ON" if use_clipping_p3 else "OFF"

        run_name = f"P3_FullLayer_Complex_L{STEERING_LAYER_IDX_STR_P3_FULL}_alpha_{alpha_val:.2f}_clip_{clip_status_str_p3}"
        is_baseline_run_p3 = (alpha_val == 0.0)
        if is_baseline_run_p3:
            run_name = "P3_FullLayer_Baseline_Complexity" # This will be a no-steer run

        # For baseline (alpha=0), steer_llm.py steer still needs the args, but strength 0 means no actual steering.
        strengths_p3_full = {STEERING_LAYER_IDX_STR_P3_FULL: alpha_val} if STEERING_LAYER_IDX_STR_P3_FULL else {}
        strengths_file_p3_full = os.path.join(OUTPUT_DIR_P3, f"strengths_{run_name}.json")
        with open(strengths_file_p3_full, 'w') as f: json.dump(strengths_p3_full, f)


        output_texts_file_p3_full = os.path.join(OUTPUT_DIR_P3, f"{run_name}_outputs.json")
        current_run_outputs = []

        print(f"\\n--- Generating for: {run_name} ---")
        for prompt_base in TEST_PROMPTS_P1: # Reusing prompts from Phase 1
            prompt_to_steer = f"Explain the following with an appropriate level of sentence complexity: {prompt_base}"

            command_steer_full_layer = [
                "python", "steer_llm.py",
                # Global arguments
                "--model_name", MODEL_NAME,
                "--seed", "303", # Consistent seed for this set of experiments
                # Subcommand
                "steer",
                # Arguments for steer
                "--prompt", prompt_to_steer,
                "--max_new_tokens", str(MAX_NEW_GEN_TOKENS_P1), # from Phase 1
                "--temperature", "0.7", "--do_sample",
                # --concept_vector_file and --strengths_json are always required
                "--concept_vector_file", CONCEPT_VECTOR_FILE_P3_FULL_LAYER,
                "--strengths_json", strengths_file_p3_full
            ]

            if not is_baseline_run_p3: # Only add steering_extraction_type if actually steering
                 command_steer_full_layer.extend(["--steering_extraction_type", "full_layer"]) # Key change

            if use_clipping_p3 and not is_baseline_run_p3: # Clipping for steered positive alpha
                command_steer_full_layer.extend(["--clip_min", str(CLIP_MIN_P1), "--clip_max", str(CLIP_MAX_P1)])

            try:
                process_steer_fl = subprocess.Popen(command_steer_full_layer, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout_fl, stderr_fl = process_steer_fl.communicate()
                if process_steer_fl.returncode == 0:
                    output_text_fl = stdout_fl.decode().strip()
                    completion = output_text_fl.split(prompt_to_steer, 1)[-1].strip() if prompt_to_steer in output_text_fl else output_text_fl
                    print(f"  Prompt: '{prompt_base[:50]}...' -> Completion: '{completion[:100]}...'")
                    current_run_outputs.append(completion)
                else:
                    print(f"  Error steering (Full Layer) for prompt '{prompt_base}'. Return code: {process_steer_fl.returncode}")
                    print(f"  Stderr: {stderr_fl.decode()}")
                    current_run_outputs.append(f"ERROR: {stderr_fl.decode()}")
            except Exception as e:
                print(f"  An exception occurred for prompt '{prompt_base}': {e}")
                current_run_outputs.append(f"EXCEPTION: {str(e)}")

        with open(output_texts_file_p3_full, 'w') as f:
            json.dump(current_run_outputs, f, indent=2)
        iteration_results_p3_full_layer[run_name] = {"outputs_file": output_texts_file_p3_full, "outputs": current_run_outputs, "alpha": alpha_val, "clipping": clip_status_str_p3 if not is_baseline_run_p3 else "N/A"}
        print(f"Saved outputs for {run_name} to {output_texts_file_p3_full}")

--- Phase 3: Steering with FULL LAYER Complexity vC (Layer 27) ---
\n--- Generating for: P3_FullLayer_Baseline_Complexity ---
  Prompt: 'The weather forecast predicts...' -> Completion: 'a high chance of rain today. However, I am not convinced that this prediction is accurate. In fact, ...'
  Prompt: 'Artificial intelligence is a field that...' -> Completion: 'deals with the development of intelligent machines that can perform tasks that typically require hum...'
  Prompt: 'To bake a cake, one must first...' -> Completion: 'preheat the oven to 350 degrees Fahrenheit. Then, one must mix together the dry ingredients (flour, ...'
  Prompt: 'The history of ancient Rome shows...' -> Completion: 'that the empire was a major power in the ancient world, with a long-lasting impact on Western civili...'
  Prompt: 'Environmental conservation efforts aim to...' -> Completion: 'protect and preserve the natural world for future generations. These efforts are crucial for maintai...'
Saved outputs for

In [18]:
# CELL 13: Evaluate Sentence Complexity Outputs (Full Layer vC)

print("\\n--- Phase 3: Evaluating Full Layer-Steered Sentence Complexity Outputs ---")
evaluation_summary_p3_full_layer = {}

for run_name, run_data in iteration_results_p3_full_layer.items():
    texts_file_to_eval = run_data["outputs_file"]

    if not os.path.exists(texts_file_to_eval) or os.path.getsize(texts_file_to_eval) == 0:
        print(f"Skipping {run_name} as texts file is missing or empty: {texts_file_to_eval}")
        evaluation_summary_p3_full_layer[run_name] = {"error": "Missing or empty input file"}
        continue
    with open(texts_file_to_eval, 'r') as f_check:
        try:
            if not json.load(f_check):
                 print(f"Skipping {run_name} as texts file contains an empty list: {texts_file_to_eval}")
                 evaluation_summary_p3_full_layer[run_name] = {"error": "Input file contains an empty list"}
                 continue
        except json.JSONDecodeError:
            print(f"Skipping {run_name} due to JSON decode error in: {texts_file_to_eval}")
            evaluation_summary_p3_full_layer[run_name] = {"error": "JSON decode error"}
            continue

    print(f"\\nEvaluating metrics for: {run_name}")
    metrics_output_file = os.path.join(OUTPUT_DIR_P3, f"{run_name}_metrics.json")

    command_eval_style = [
        "python", "steer_llm.py", "evaluate-style",
        "--texts_json", texts_file_to_eval,
        "--output_metrics_file", metrics_output_file,
        "--ppl_model_name", PPL_MODEL_FOR_SCRIPT
    ]
    try:
        process = subprocess.Popen(command_eval_style, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        print("Stdout (evaluate-style):\n", stdout.decode())
        if stderr: print("Stderr (evaluate-style):\n", stderr.decode())

        if process.returncode == 0 and os.path.exists(metrics_output_file):
            with open(metrics_output_file, 'r') as mf:
                metrics = json.load(mf)
            evaluation_summary_p3_full_layer[run_name] = metrics
            print(f"Successfully evaluated {run_name}. Metrics: {metrics}")
        else:
            evaluation_summary_p3_full_layer[run_name] = {"error": f"Style evaluation script failed. Stderr: {stderr.decode()}"}
            print(f"Error evaluating {run_name}. Stderr: {stderr.decode()}")
    except Exception as e:
        evaluation_summary_p3_full_layer[run_name] = {"error": str(e)}
        print(f"Exception during evaluation of {run_name}: {e}")


print("\\n--- Phase 3 Full Layer Sentence Complexity Evaluation Summary ---")
for run_name, metrics in evaluation_summary_p3_full_layer.items():
    original_run_data = iteration_results_p3_full_layer.get(run_name, {})
    print(f"Run: {run_name} (Alpha: {original_run_data.get('alpha', 'N/A'):.2f}, Clip: {original_run_data.get('clipping', 'N/A')})")
    if "error" in metrics:
        print(f"  Error: {metrics['error']}")
    else:
        print(f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A'):.2f}" if isinstance(metrics.get('perplexity'), float) else f"  Perplexity (CLI): {metrics.get('perplexity', 'N/A')}")
        print(f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A'):.2f}" if isinstance(metrics.get('flesch_reading_ease'), float) else f"  Flesch Reading Ease: {metrics.get('flesch_reading_ease', 'N/A')}")
        print(f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A'):.2f}" if isinstance(metrics.get('flesch_kincaid_grade'), float) else f"  Flesch-Kincaid Grade: {metrics.get('flesch_kincaid_grade', 'N/A')}")
        print(f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A'):.2f}" if isinstance(metrics.get('avg_sentence_length'), float) else f"  Avg Sentence Length: {metrics.get('avg_sentence_length', 'N/A')}")

\n--- Phase 3: Evaluating Full Layer-Steered Sentence Complexity Outputs ---
\nEvaluating metrics for: P3_FullLayer_Baseline_Complexity
Stdout (evaluate-style):
 
Stderr (evaluate-style):
 2025-05-24 21:32:42.204281: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748122362.226056   21315 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748122362.232708   21315 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-24 21:32:46 [INFO] steer_llm:436 Style Metrics (based on 5 texts): FRE=41.33, FKG=10.89, ASL=13.67
2025-05-24 21:32:46 [INFO] steer_llm:575 Style metrics saved to ./iteration_8_combined_outputs/sentence_complexity_wiki_auto_full_layer/P3_Fu

In [20]:
# CELL 14:

print("\\n--- Overall Comparative Analysis ---")
print("This section is for your analysis and documentation.")

print("\\nPhase 1: Sentence Complexity (MLP vC) Summary:")
for run_name, metrics in evaluation_summary_p1_mlp.items():
    print(f"  {run_name}: PPL={metrics.get('perplexity', 'N/A')}, FRE={metrics.get('flesch_reading_ease', 'N/A')}, FKG={metrics.get('flesch_kincaid_grade', 'N/A')}, ASL={metrics.get('avg_sentence_length', 'N/A')}")

print("\\nPhase 2: Degree of Certainty (MLP vC) Summary:")
for run_name, metrics in evaluation_summary_p2_mlp.items():
    print(f"  {run_name}: PPL={metrics.get('perplexity', 'N/A')}, FRE={metrics.get('flesch_reading_ease', 'N/A')}, FKG={metrics.get('flesch_kincaid_grade', 'N/A')}, ASL={metrics.get('avg_sentence_length', 'N/A')}")
    # Add any custom certainty metrics here if you log them

print("\\nPhase 3: Sentence Complexity (Full Layer vC) Summary:")
for run_name, metrics in evaluation_summary_p3_full_layer.items():
    print(f"  {run_name}: PPL={metrics.get('perplexity', 'N/A')}, FRE={metrics.get('flesch_reading_ease', 'N/A')}, FKG={metrics.get('flesch_kincaid_grade', 'N/A')}, ASL={metrics.get('avg_sentence_length', 'N/A')}")

# Example of how to access specific outputs for closer inspection:
print("\nExample Baseline Output (Complexity MLP):")
if "P1_MLP_Baseline_Complexity" in iteration_results_p1_mlp and iteration_results_p1_mlp["P1_MLP_Baseline_Complexity"]["outputs"]:
    print(iteration_results_p1_mlp["P1_MLP_Baseline_Complexity"]["outputs"][0])

print("\nExample Steered Complex Output (Complexity MLP, alpha=1.5):")
complex_run_key_p1 = f"P1_MLP_Complex_L{STEERING_LAYER_IDX_STR_P1_MLP}_alpha_1.50_clip_ON" # Adjust if name changes
if complex_run_key_p1 in iteration_results_p1_mlp and iteration_results_p1_mlp[complex_run_key_p1]["outputs"]:
    print(iteration_results_p1_mlp[complex_run_key_p1]["outputs"][0])

\n--- Overall Comparative Analysis ---
This section is for your analysis and documentation.
\nPhase 1: Sentence Complexity (MLP vC) Summary:
  P1_MLP_Baseline_Complexity: PPL=22.57196807861328, FRE=45.558067665556806, FKG=11.037075274386485, ASL=16.633333333333333
  P1_MLP_Complex_L-5_alpha_0.75_clip_ON: PPL=26.540578842163086, FRE=42.08004842767297, FKG=11.555312788259961, ASL=16.76666666666667
  P1_MLP_Complex_L-5_alpha_1.50_clip_ON: PPL=26.205060958862305, FRE=44.151866037735864, FKG=11.249774004192876, ASL=16.700000000000003
  P1_MLP_Complex_L-5_alpha_-0.75_clip_OFF: PPL=25.479598999023438, FRE=47.49794600443952, FKG=10.745798902454066, ASL=16.550000000000004
  P1_MLP_Complex_L-5_alpha_-1.50_clip_OFF: PPL=22.910484313964844, FRE=41.35285294117649, FKG=11.665022875816994, ASL=16.8
\nPhase 2: Degree of Certainty (MLP vC) Summary:
  P2_MLP_Baseline_Certainty: PPL=27.57182502746582, FRE=50.05087445887447, FKG=10.236519480519481, ASL=15.933333333333334
  P2_MLP_Certainty_L-15_alpha_1.00