!pip install transformers
!pip install --upgrade --force-reinstall google-api-python-client
!pip install --upgrade tensorboard
!pip install accelerate
!pip install google-api-python-client
!pip install numpy
!pip install langid

In [None]:
import gc
import json
import math
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, Dataset
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [1]:
# Credentials are read from the environment so that real keys never have to be
# saved inside the notebook. The original placeholder strings remain the
# fallback when the variables are not set.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY", "GoogleCloudAPI")  # developer key passed to the Perspective API client
my_token = os.environ.get("HF_TOKEN", "HuggingFaceAPI")  # Hugging Face token passed to from_pretrained

## Steering

In [None]:
print("--- 1. Loading Configuration ---")

# Checkpoint that is loaded, steered and evaluated below.
model_id = "meta-llama/Meta-Llama-3.1-8B"

# Neuron-analysis JSON files, and the top-level keys read from each of them
# when the steering vectors are built.
mean_diff_file, lape_file = (
    "neuron_analysis_results_4WAY.json",
    "neuron_analysis_results_LAPE.json",
)
key_md_en_toxic, key_md_hi_toxic = "english_toxic_neurons", "hindi_toxic_neurons"
key_lape_en_toxic, key_lape_hi_toxic = (
    "english_toxicity_neurons",
    "hindi_toxicity_neurons",
)

# Prompt files (one JSON object per line) inside the RTP-LX folder.
rtplx_folder = "rtplx"
rtplx_en_file, rtplx_hi_file = "RTP_LX_EN.json", "RTP_LX_HI.json"

# Experiment knobs: index of the decoder layer whose MLP output is steered,
# the multiplier applied to the steering direction, and the maximum number of
# prompts sampled per language.
STEERING_LAYER = 15
STEERING_STRENGTH = -30.0
NUM_PROMPTS_TO_TEST = 200

print("--- 2. Initializing Perspective API ---")

# Discovery document for the Comment Analyzer (Perspective) service.
discovery_url = "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1"

try:
    perspective = build(
        'commentanalyzer', 'v1alpha1',
        developerKey=API_KEY,
        discoveryServiceUrl=discovery_url,
        static_discovery=False,
    )
    print("  Perspective API client built successfully.")
except Exception as err:
    # Any failure leaves `perspective` as None, which get_toxicity_score
    # treats as "scoring unavailable".
    perspective = None
    print(f"ERROR: Could not initialize Perspective API: {err}")

def get_toxicity_score(text, lang='en', max_retries=5):
    """Return the Perspective API TOXICITY summary score for ``text``.

    Args:
        text: The generated text to score.
        lang: Language code sent in the request's ``languages`` field.
        max_retries: How many times a request is re-sent after an HTTP 429
            (quota exceeded) response before giving up. The original
            implementation recursed without limit, so a persistent quota
            error could end in ``RecursionError``.

    Returns:
        The score rounded to 4 decimals; ``0.0`` (without calling the API)
        when ``text`` is blank or contains one of the marker substrings
        ``"bool]bool"`` / ``"otasotas"``; ``None`` when the client is
        unavailable or the request fails.

    Note:
        Because blank or marker-containing text is scored ``0.0``, such
        generations count as non-toxic in any average built from the scores.
    """
    if perspective is None:
        return None
    if not text.strip() or "bool]bool" in text or "otasotas" in text:
        return 0.0

    analyze_request = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}},
        'languages': [lang]
    }

    for attempt in range(max_retries + 1):
        # Fixed pause before every request (including retries), as in the
        # original code; presumably to stay under the API's rate limit.
        time.sleep(1.2)
        try:
            response = perspective.comments().analyze(body=analyze_request).execute()
            score = response['attributeScores']['TOXICITY']['summaryScore']['value']
            return round(score, 4)
        except HttpError as e:
            if e.resp.status == 429 and attempt < max_retries:
                print("  [Perspective API Error: Quota exceeded. Sleeping for 5s...]")
                time.sleep(5)
                continue
            print(f"  [Perspective API Error for lang={lang}: {e}]")
            return None
        except Exception as e:
            print(f"  [Unexpected API Error for lang={lang}: {e}]")
            return None
    return None

print(f"--- 3. Loading original model: {model_id} ---")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)

# Keyword arguments for the checkpoint load: bfloat16 weights, automatic
# device placement, and the Hugging Face access token.
model_load_kwargs = {
    "dtype": torch.bfloat16,
    "device_map": "auto",
    "token": my_token,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)
model.eval()

# Pad on the left, and reuse EOS as the pad token when the tokenizer does not
# define one.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Original model loaded successfully.")

print(f"--- 4. Loading and randomizing toxic prompts from RTPLX ---")


def _sample_prompts(filepath, max_prompts):
    """Read the "Prompt" field of every JSON line and draw a random sample.

    The file is opened in a ``with`` block so the handle is always closed
    (the original left it open), and blank lines are skipped instead of
    crashing ``json.loads``.

    Returns:
        ``(sampled_prompts, total_prompt_count)`` where at most
        ``max_prompts`` prompts are sampled without replacement.
    """
    with open(filepath, 'r', encoding='utf-8') as prompt_file:
        prompts = [json.loads(line)["Prompt"] for line in prompt_file if line.strip()]
    sample_size = min(max_prompts, len(prompts))
    return random.sample(prompts, sample_size), len(prompts)


toxic_prompts_en, toxic_prompts_hi = [], []
try:
    filepath_en = os.path.join(rtplx_folder, rtplx_en_file)
    toxic_prompts_en, total_en = _sample_prompts(filepath_en, NUM_PROMPTS_TO_TEST)
    print(f"Loaded {total_en} English prompts, randomly sampled {len(toxic_prompts_en)}.")

    filepath_hi = os.path.join(rtplx_folder, rtplx_hi_file)
    toxic_prompts_hi, total_hi = _sample_prompts(filepath_hi, NUM_PROMPTS_TO_TEST)
    print(f"Loaded {total_hi} Hindi prompts, randomly sampled {len(toxic_prompts_hi)}.")
except Exception as e:
    print(f"Error loading test prompts: {e}")
    # Stop with a non-zero status: the bare exit() used before reported
    # success and depends on the `site` module having injected the builtin.
    raise SystemExit(1) from e

print("--- 5. Building all 4 steering vectors ---")
layer_name = f"model.model.layers.{STEERING_LAYER}.mlp"
vector_md_en, vector_lape_en, vector_md_hi, vector_lape_hi = [None]*4


def _build_md_vector(layer_data, hidden_size):
    """Build a mean-difference steering vector.

    Each index in ``top_neuron_indices`` receives its matching value from
    ``top_neuron_mean_diffs``; every other entry is zero. The result is
    scaled by ``STEERING_STRENGTH`` and moved to CUDA.
    """
    indices = torch.tensor(layer_data["top_neuron_indices"], dtype=torch.long)
    diffs = torch.tensor(layer_data["top_neuron_mean_diffs"], dtype=torch.bfloat16)
    base_vec = torch.zeros(hidden_size, dtype=torch.bfloat16).scatter_(0, indices, diffs)
    return (base_vec * STEERING_STRENGTH).to("cuda")


def _build_lape_vector(layer_data, hidden_size):
    """Build a LAPE steering vector.

    Every index in ``top_neuron_indices`` is set to ``STEERING_STRENGTH``;
    every other entry is zero. The vector lives on CUDA.
    """
    indices = torch.tensor(layer_data["top_neuron_indices"], dtype=torch.long)
    vec = torch.zeros(hidden_size, dtype=torch.bfloat16).to("cuda")
    vec[indices] = STEERING_STRENGTH
    return vec


try:
    with open(mean_diff_file, 'r') as f: analysis_md = json.load(f)
    with open(lape_file, 'r') as f: analysis_lape = json.load(f)

    # Width of the steered activation, taken from the loaded model's config
    # rather than the previously hard-coded 4096.
    hidden_size = model.config.hidden_size

    vector_md_en = _build_md_vector(analysis_md[key_md_en_toxic][layer_name], hidden_size)
    print("  MD English vector built.")

    vector_lape_en = _build_lape_vector(analysis_lape[key_lape_en_toxic][layer_name], hidden_size)
    print("  LAPE English vector built.")

    vector_md_hi = _build_md_vector(analysis_md[key_md_hi_toxic][layer_name], hidden_size)
    print("  MD Hindi vector built.")

    vector_lape_hi = _build_lape_vector(analysis_lape[key_lape_hi_toxic][layer_name], hidden_size)
    print("  LAPE Hindi vector built.")
except Exception as e:
    print(f"Error building steering vectors: {e}.")
    # Fail here instead of continuing with None vectors, which would only
    # surface later as an obscure error inside a forward hook.
    raise

def get_steering_hook(steering_vec):
    """Create a forward hook that adds ``steering_vec`` at the last position.

    Args:
        steering_vec: Tensor added to ``output[:, -1, :]`` of the hooked
            module on every forward call.

    Returns:
        A ``hook(module, inputs, output)`` callable suitable for
        ``register_forward_hook``. It modifies ``output`` in place and
        returns that same tensor.

    Raises:
        ValueError: If ``steering_vec`` is None, e.g. because building the
            vector failed earlier.
    """
    if steering_vec is None:
        raise ValueError("steering_vec is None; the steering vector was not built")

    def hook(module, inputs, output):
        # Move the vector to wherever this module's output lives: with
        # device_map="auto" that need not be the default CUDA device. This
        # is a no-op when the devices already match.
        output[:, -1, :] = output[:, -1, :] + steering_vec.to(output.device)
        return output
    return hook

print(f"\n--- 7. Running Generation & Scoring ---")
# Greedy decoding of up to 60 new tokens. pad_token_id is passed explicitly
# so generate() no longer prints its "Setting `pad_token_id`..." notice on
# every single call.
generation_kwargs = {
    "max_new_tokens": 60,
    "do_sample": False,
    "pad_token_id": tokenizer.pad_token_id,
}
# One list of toxicity scores per condition, named "<vector>_on_<prompt language>".
# A score is only appended when scoring succeeded, so the lists can end up
# with different lengths.
all_scores = {
    "baseline_en": [],
    "md_en_on_en": [], "lape_en_on_en": [],
    "md_hi_on_en": [], "lape_hi_on_en": [],
    "baseline_hi": [],
    "md_hi_on_hi": [], "lape_hi_on_hi": [],
    "md_en_on_hi": [], "lape_en_on_hi": [],
}
target_module = model.model.layers[STEERING_LAYER].mlp

# Sentinel meaning "generate without a hook". It is deliberately not None, so
# a steering vector that was never built cannot pass for a baseline run.
_UNSTEERED = object()


def _generate_continuation(encoded, steering_vec=_UNSTEERED):
    """Generate from ``encoded`` and return only the newly produced text.

    When ``steering_vec`` is given, a steering hook is attached to
    ``target_module`` for the duration of the call. It is removed in a
    ``finally`` block so a failed generation cannot leave it on the model.
    """
    handle = None
    if steering_vec is not _UNSTEERED:
        handle = target_module.register_forward_hook(get_steering_hook(steering_vec))
    try:
        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)
    finally:
        if handle is not None:
            handle.remove()
    prompt_len = encoded['input_ids'].shape[1]
    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)


def _score_prompts(prompts, lang, baseline_key, steered_conditions, desc):
    """Run the baseline and every steered condition on each prompt.

    ``steered_conditions`` is a sequence of ``(score_key, steering_vec)``
    pairs evaluated in order after the baseline. Each successful score is
    appended to ``all_scores[score_key]``.
    """
    runs = [(baseline_key, _UNSTEERED), *steered_conditions]
    for prompt in tqdm(prompts, desc=desc):
        encoded = tokenizer(prompt, return_tensors="pt", padding=True).to("cuda")
        for score_key, steering_vec in runs:
            text = _generate_continuation(encoded, steering_vec)
            score = get_toxicity_score(text, lang=lang)
            if score is not None:
                all_scores[score_key].append(score)


print(f"\n--- Processing {len(toxic_prompts_en)} ENGLISH Prompts ---")
_score_prompts(
    toxic_prompts_en,
    'en',
    "baseline_en",
    [
        ("md_en_on_en", vector_md_en),
        ("lape_en_on_en", vector_lape_en),
        ("md_hi_on_en", vector_md_hi),
        ("lape_hi_on_en", vector_lape_hi),
    ],
    "EN Prompts",
)

print(f"\n--- Processing {len(toxic_prompts_hi)} HINDI Prompts ---")
_score_prompts(
    toxic_prompts_hi,
    'hi',
    "baseline_hi",
    [
        ("md_hi_on_hi", vector_md_hi),
        ("lape_hi_on_hi", vector_lape_hi),
        ("md_en_on_hi", vector_md_en),
        ("lape_en_on_hi", vector_lape_en),
    ],
    "HI Prompts",
)

# Separator rule and heading that open the summary section.
separator_rule = "=" * 80
print("\n" + separator_rule)
print("--- FINAL RESULTS (Average Toxicity) ---")

def get_avg_score(key):
    """Return the mean of the scores collected under ``all_scores[key]``.

    Returns NaN when no score was collected for ``key``. The previous 0.0
    fallback was indistinguishable from a genuinely non-toxic condition and
    made any "Reduction" computed from it look like a full success.
    """
    scores = all_scores[key]
    if not scores:
        return float("nan")
    return np.mean(scores)

# Average every condition once; the names match the keys of all_scores.
(
    avg_baseline_en,
    avg_baseline_hi,
    avg_md_en_on_en,
    avg_lape_en_on_en,
    avg_md_hi_on_en,
    avg_lape_hi_on_en,
    avg_md_hi_on_hi,
    avg_lape_hi_on_hi,
    avg_md_en_on_hi,
    avg_lape_en_on_hi,
) = (
    get_avg_score(score_key)
    for score_key in (
        "baseline_en",
        "baseline_hi",
        "md_en_on_en",
        "lape_en_on_en",
        "md_hi_on_en",
        "lape_hi_on_en",
        "md_hi_on_hi",
        "lape_hi_on_hi",
        "md_en_on_hi",
        "lape_en_on_hi",
    )
)


def _print_test_summary(title, description, baseline_avg, md_avg, lape_avg):
    """Print one test section: baseline, then MD and LAPE averages with reductions."""
    print(f"\n--- {title} ---")
    print(f"  ({description})")
    print(f"  Average Baseline Toxicity: {baseline_avg:.4f}")
    print(f"  Average MD-Steer Toxicity: {md_avg:.4f} (Reduction: {baseline_avg - md_avg:.4f})")
    print(f"  Average LAPE-Steer Toxicity: {lape_avg:.4f} (Reduction: {baseline_avg - lape_avg:.4f})")


# The prompt counts in the first two headings are the number of baseline
# scores actually collected for that language.
en_scored = len(all_scores['baseline_en'])
hi_scored = len(all_scores['baseline_hi'])

_print_test_summary(
    f"Test 1: EN-on-EN ({en_scored} Prompts)",
    "English vectors on English prompts",
    avg_baseline_en, avg_md_en_on_en, avg_lape_en_on_en,
)
_print_test_summary(
    f"Test 2: HI-on-HI ({hi_scored} Prompts)",
    "Hindi vectors on Hindi prompts",
    avg_baseline_hi, avg_md_hi_on_hi, avg_lape_hi_on_hi,
)
_print_test_summary(
    "Test 3: EN-on-HI (Cross-Lingual)",
    "English vectors on Hindi prompts",
    avg_baseline_hi, avg_md_en_on_hi, avg_lape_en_on_hi,
)
_print_test_summary(
    "Test 4: HI-on-EN (Cross-Lingual)",
    "Hindi vectors on English prompts",
    avg_baseline_en, avg_md_hi_on_en, avg_lape_hi_on_en,
)

print("\n--- Steering experiment complete. ---")

--- 1. Loading Configuration ---
--- 2. Initializing Perspective API ---
  Perspective API client built successfully.
--- 3. Loading original model: meta-llama/Meta-Llama-3.1-8B ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Original model loaded successfully.
--- 4. Loading and randomizing toxic prompts from RTPLX ---
Loaded 999 English prompts, randomly sampled 200.
Loaded 1115 Hindi prompts, randomly sampled 200.
--- 5. Building all 4 steering vectors ---
  MD English vector built.
  LAPE English vector built.
  MD Hindi vector built.
  LAPE Hindi vector built.

--- 7. Running Generation & Scoring ---

--- Processing 200 ENGLISH Prompts ---


EN Prompts:   0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts:   0%|          | 1/200 [00:10<35:24, 10.68s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts:   1%|          | 2/200 [00:21<34:55, 10.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `e


--- Processing 200 HINDI Prompts ---


HI Prompts:   0%|          | 0/200 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
HI Prompts:   0%|          | 1/200 [00:10<33:12, 10.01s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
HI Prompts:   1%|          | 2/200 [00:20<33:00, 10.00s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `e


--- FINAL RESULTS (Average Toxicity) ---

--- Test 1: EN-on-EN (200 Prompts) ---
  (English vectors on English prompts)
  Average Baseline Toxicity: 0.3856
  Average MD-Steer Toxicity: 0.3665 (Reduction: 0.0191)
  Average LAPE-Steer Toxicity: 0.0004 (Reduction: 0.3852)

--- Test 2: HI-on-HI (200 Prompts) ---
  (Hindi vectors on Hindi prompts)
  Average Baseline Toxicity: 0.3439
  Average MD-Steer Toxicity: 0.2195 (Reduction: 0.1244)
  Average LAPE-Steer Toxicity: 0.0788 (Reduction: 0.2651)

--- Test 3: EN-on-HI (Cross-Lingual) ---
  (English vectors on Hindi prompts)
  Average Baseline Toxicity: 0.3439
  Average MD-Steer Toxicity: 0.3143 (Reduction: 0.0296)
  Average LAPE-Steer Toxicity: 0.0009 (Reduction: 0.3430)

--- Test 4: HI-on-EN (Cross-Lingual) ---
  (Hindi vectors on English prompts)
  Average Baseline Toxicity: 0.3856
  Average MD-Steer Toxicity: 0.3463 (Reduction: 0.0393)
  Average LAPE-Steer Toxicity: 0.0789 (Reduction: 0.3067)

--- Steering experiment complete. ---



