# Agnostic-Only Finetuning 

In [None]:
!pip install transformers
!pip install --upgrade --force-reinstall google-api-python-client
!pip install --upgrade tensorboard
!pip install accelerate
!pip install google-api-python-client
!pip install numpy
!pip install langid

In [None]:
import gc
import json
import math
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, Dataset
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

## API Keys

In [None]:
# Credentials are read from the environment so that real secrets never have to
# be written into the notebook. The literal fallbacks are the original
# placeholder strings, kept so the cell still defines both names when the
# environment variables are not set.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY", "GoogleCloudAPI")  # used as developerKey for the Perspective API client
my_token = os.environ.get("HF_TOKEN", "HuggingFaceToken")  # passed as token= to from_pretrained

In [None]:
# --- Input / output locations ---
HASOC_FOLDER = "."                           # directory the HASOC CSV paths are joined onto
TEMP_DATA_FOLDER = "temp_jsonl_data"         # directory for the intermediate JSONL files
RESULTS_FILE = "agnostic_neurons_LAPE.json"  # JSON file the analysis results are dumped to

# --- Model ---
model_id = "meta-llama/Meta-Llama-3.1-8B"

# --- Activation collection ---
# Prompts per forward pass, and the tokenizer max_length used for padding/truncation.
batch_size, max_length = 64, 400

# --- LAPE analysis ---
TOP_N_NEURONS = 100         # k passed to torch.topk for every layer
ACTIVATION_THRESHOLD = 0.0  # an activation counts as "fired" when strictly above this
EPSILON = 1e-9              # added inside log2 so that log2(0) is never evaluated

print(f"--- 1. Preprocessing HASOC CSVs (from {HASOC_FOLDER}) ---")
os.makedirs(TEMP_DATA_FOLDER, exist_ok=True)

# One CSV per language; the key is reused in the output file name and in the
# job_file_map key ("benign_<LANG>").
csv_files_to_process = {
    "EN": os.path.join(HASOC_FOLDER, "english_2021.csv"),
    "HI": os.path.join(HASOC_FOLDER, "hindi_2021.csv"),
    "MR": os.path.join(HASOC_FOLDER, "marathi_2021.csv")
}

# The same column names are used for every language (the former per-language
# branch assigned identical values on both sides).
text_col = 'text'
label_col = 'task_1'

# Maps "benign_<LANG>" -> (jsonl path, field name holding the text).
job_file_map = {}

for lang, filepath in csv_files_to_process.items():
    print(f"  Processing {filepath}...")
    try:
        df = pd.read_csv(filepath)

        # Keep only rows labelled 'NOT'. Rows without text are dropped so that
        # no NaN is serialised into the JSONL file and later handed to the
        # tokenizer as a float.
        benign_texts = df[df[label_col] == 'NOT'][text_col].dropna().tolist()

        benign_outfile = os.path.join(TEMP_DATA_FOLDER, f"hasoc_{lang}_benign.jsonl")

        with open(benign_outfile, 'w', encoding='utf-8') as f:
            for text in benign_texts:
                json.dump({"BenignCompletion": text}, f)
                f.write('\n')

        # Register the job only once the file has been written completely, so a
        # failed write never leaves an entry pointing at a partial file.
        job_file_map[f"benign_{lang}"] = (benign_outfile, "BenignCompletion")
        print(f"    Wrote {len(benign_texts)} benign samples to {benign_outfile}")

    except FileNotFoundError:
        print(f"    ERROR: File not found at {filepath}. Please check your paths.")
    except Exception as e:
        print(f"    An error occurred processing {filepath}: {e}")

print("Preprocessing complete.\n")


print("--- 2. Loading Model and Tokenizer ---")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)
# Batched tokenization below pads its inputs, so a pad token must exist;
# fall back to the EOS token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    token=my_token,
)
# Inference only: switch the module to evaluation mode.
model.eval()
print("Model loaded successfully.\n")


def get_hook(storage_dict, layer_name):
    """Build a forward hook that records a module's output under ``layer_name``.

    Every call of the returned hook appends a detached CPU copy of the module
    output to ``storage_dict[layer_name]``, which must already hold a list.
    """
    def _record_output(module, inputs, output):
        captured = output.detach().cpu()
        storage_dict[layer_name].append(captured)

    return _record_output

def run_activation_collection(input_filename, data_field, layer_names):
    """Collect the last-token MLP output of every hooked layer for each prompt.

    Args:
        input_filename: Path to a JSONL file; one JSON object per line.
        data_field: Key of each JSON object that holds the prompt text.
        layer_names: Names under which activations are stored. The i-th name is
            attached to ``model.model.layers[i].mlp``.

    Returns:
        A dict mapping layer name to a tensor with one row per prompt (batches
        concatenated along dim 0), or ``None`` if the file could not be read or
        parsed. Layers that collected nothing are omitted from the dict.
    """
    all_prompts = []
    try:
        with open(input_filename, 'r', encoding='utf-8') as f:
            for line in f:
                data = json.loads(line)
                all_prompts.append(data[data_field])
        print(f"    Loaded {len(all_prompts)} prompts.")
    except Exception as e:
        print(f"    Error loading {input_filename}: {e}")
        return None

    # Scratch storage filled by the hooks during one forward pass.
    activation_storage = {layer_name: [] for layer_name in layer_names}
    # Per-layer list of last-token activations, one entry per batch.
    all_collected_activations = {name: [] for name in layer_names}
    hook_handles = []

    try:
        # zip stops at the shorter sequence, so at most len(layer_names) layers
        # are hooked.
        for layer_name, layer in zip(layer_names, model.model.layers):
            hook_handles.append(
                layer.mlp.register_forward_hook(get_hook(activation_storage, layer_name))
            )

        with torch.no_grad():
            for i in tqdm(range(0, len(all_prompts), batch_size), desc="      Processing", ncols=100, leave=False):
                # Drop the previous batch's captures before the next forward pass.
                for captured in activation_storage.values():
                    captured.clear()

                batch_prompts = all_prompts[i:i+batch_size]

                # Follow the model's own device instead of assuming "cuda".
                inputs = tokenizer(
                    batch_prompts, return_tensors="pt", padding="max_length",
                    truncation=True, max_length=max_length
                ).to(model.device)

                model(**inputs)

                # Index of the last attended token in each row. Taking the
                # largest position whose mask is 1 works for both right- and
                # left-padded batches.
                attention_mask = inputs["attention_mask"]
                positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
                last_token_indices = (attention_mask * positions).argmax(dim=1).cpu()

                for layer_name, batch_activations_list in activation_storage.items():
                    # assumes the hooked output is (batch, seq, hidden) -- TODO confirm
                    full_batch_tensor = batch_activations_list[0]
                    row_indices = torch.arange(full_batch_tensor.size(0))
                    all_collected_activations[layer_name].append(
                        full_batch_tensor[row_indices, last_token_indices]
                    )
    finally:
        # Always detach the hooks, even if the forward pass raised; otherwise
        # they would keep firing on every later use of the model.
        for handle in hook_handles:
            handle.remove()

    final_activations = {}
    for layer_name, tensor_list in all_collected_activations.items():
        if tensor_list:
            final_activations[layer_name] = torch.cat(tensor_list, dim=0)

    del all_prompts, all_collected_activations, activation_storage, hook_handles
    gc.collect()

    return final_activations


def analyze_with_lape_3_lang(acts_en, acts_hi, acts_mr, layer_names,
                             top_n=None, activation_threshold=None, epsilon=None):
    """Rank neurons per layer by the entropy of their firing rates across three languages.

    For every layer, each activation is binarised (``> activation_threshold``),
    the per-neuron firing probability is computed per language as the mean over
    dim 0, the three probabilities are normalised to sum to 1, and the base-2
    entropy of that distribution is computed. The neurons with the highest
    entropy are returned.

    Args:
        acts_en, acts_hi, acts_mr: Dicts mapping layer name to an activation
            tensor whose dim 0 indexes samples.
        layer_names: Layers to analyse; each must be a key of all three dicts.
        top_n: Number of neurons to keep per layer. Defaults to
            ``TOP_N_NEURONS``. Clamped to the number of neurons available.
        activation_threshold: Firing threshold. Defaults to
            ``ACTIVATION_THRESHOLD``.
        epsilon: Constant added inside ``log2``. Defaults to ``EPSILON``.

    Returns:
        Dict mapping layer name to
        ``{"agnostic_neuron_indices": [...], "entropy_scores": [...]}``,
        both ordered by decreasing entropy.
    """
    # Fall back to the notebook-level constants only when no override is given.
    if top_n is None:
        top_n = TOP_N_NEURONS
    if activation_threshold is None:
        activation_threshold = ACTIVATION_THRESHOLD
    if epsilon is None:
        epsilon = EPSILON

    print("      Analyzing with LAPE (for High Entropy)...")
    layer_results = {}
    for layer_name in layer_names:
        # Steps 1-2: binarise, then firing probability per language.
        firing_probs = [
            (acts[layer_name] > activation_threshold).float().mean(dim=0)
            for acts in (acts_en, acts_hi, acts_mr)
        ]

        # Step 3: normalise. Neurons that never fire in any language would give
        # 0/0; dividing by 1 instead leaves all three shares at 0.
        total_prob = firing_probs[0] + firing_probs[1] + firing_probs[2]
        total_prob = torch.where(total_prob == 0, torch.ones_like(total_prob), total_prob)
        shares = [prob / total_prob for prob in firing_probs]

        # Step 4: entropy of the three-way distribution.
        terms = [share * torch.log2(share + epsilon) for share in shares]
        entropy = -(terms[0] + terms[1] + terms[2])

        # Step 5: highest-entropy neurons. topk raises when k exceeds the size
        # of the dimension, so clamp k to the number of neurons.
        k = min(top_n, entropy.size(-1))
        top_values, top_indices = torch.topk(entropy, k, largest=True)

        layer_results[layer_name] = {
            "agnostic_neuron_indices": top_indices.cpu().numpy().tolist(),
            "entropy_scores": top_values.cpu().float().numpy().tolist()
        }
    return layer_results

print("\n--- 4. Starting LAPE Collection & Analysis Loop ---")

layer_names = [f"model.model.layers.{i}.mlp" for i in range(len(model.model.layers))]
final_analysis_results = {}

print("\n  --- Language-Agnostic Analysis (EN vs HI vs MR) ---")

# Activations per language. A value of None means collection failed or no
# preprocessed file was registered for that language.
benign_activations = {}
for lang in ("EN", "HI", "MR"):
    print(f"    Running Benign {lang} collection...")
    # Preprocessing skips files it cannot read, so an entry may be missing;
    # treat that like a failed collection instead of raising KeyError.
    job = job_file_map.get(f"benign_{lang}")
    if job is None:
        print(f"    ERROR: No preprocessed benign file registered for {lang}.")
        benign_activations[lang] = None
        continue
    benign_file, benign_field = job
    benign_activations[lang] = run_activation_collection(benign_file, benign_field, layer_names)

if all(benign_activations[lang] for lang in ("EN", "HI", "MR")):
    print("    Analyzing (EN vs HI vs MR) for high-entropy neurons...")
    final_analysis_results["language_agnostic_neurons"] = analyze_with_lape_3_lang(
        benign_activations["EN"], benign_activations["HI"], benign_activations["MR"], layer_names
    )
    print("    Language-agnostic analysis complete.")
else:
    print("    ERROR: Failed to load activations for one or more languages. Skipping analysis.")

print("    Cleaning up all activations from memory.")
del benign_activations
gc.collect()

print("\n--- 5. All processing complete. Saving final results. ---")
try:
    with open(RESULTS_FILE, 'w') as f:
        json.dump(final_analysis_results, f, indent=2)
    print(f"LAPE (Agnostic) analysis successfully saved to '{RESULTS_FILE}'.")
except Exception as e:
    print(f"ERROR: Could not save final results: {e}")

print("\n--- All Done ---")

--- 1. Preprocessing HASOC CSVs (from .) ---
  Processing ./english_2021.csv...
    Wrote 1342 benign samples to temp_jsonl_data/hasoc_EN_benign.jsonl
  Processing ./hindi_2021.csv...
    Wrote 3161 benign samples to temp_jsonl_data/hasoc_HI_benign.jsonl
  Processing ./marathi_2021.csv...
    Wrote 1205 benign samples to temp_jsonl_data/hasoc_MR_benign.jsonl
Preprocessing complete.

--- 2. Loading Model and Tokenizer ---


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Tokenizer loaded.


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model loaded successfully.


--- 4. Starting LAPE Collection & Analysis Loop ---

  --- Language-Agnostic Analysis (EN vs HI vs MR) ---
    Running Benign EN collection...
    Loaded 1342 prompts.


                                                                                                    

    Running Benign HI collection...
    Loaded 3161 prompts.


                                                                                                    

    Running Benign MR collection...
    Loaded 1205 prompts.


                                                                                                    

    Analyzing (EN vs HI vs MR) for high-entropy neurons...
      Analyzing with LAPE (for High Entropy)...
    Language-agnostic analysis complete.
    Cleaning up all activations from memory.

--- 5. All processing complete. Saving final results. ---
LAPE (Agnostic) analysis successfully saved to 'agnostic_neurons_LAPE.json'.

--- All Done ---


## Fine-Tuning

In [None]:
import torch
import transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, Dataset
import pandas as pd
import json
import os
import gc

# Release whatever earlier cells left behind before a second model is loaded.
print("Clearing cache and collecting garbage...")
gc.collect()
torch.cuda.empty_cache()
print("VRAM cache cleared.")

print("--- 1. Loading Configuration ---")

# Base model, the analysis file read below, and the output directory name.
model_id = "meta-llama/Meta-Llama-3.1-8B"
analysis_file = "agnostic_neurons_LAPE.json"
new_model_name = "llama-3.1-8b-agnostic-finetuned-BENIGN-ONLY"

HASOC_FOLDER = "."


def _hasoc_path(filename):
    """Join ``filename`` onto the HASOC data folder."""
    return os.path.join(HASOC_FOLDER, filename)


# 2021 release (CSV)
hasoc_en_csv_2021 = _hasoc_path("english_2021.csv")
hasoc_hi_csv_2021 = _hasoc_path("hindi_2021.csv")
hasoc_mr_csv_2021 = _hasoc_path("marathi_2021.csv")
# 2019 release (TSV, two files per language)
hasoc_en_tsv_2019_1 = _hasoc_path("english_2019_1.tsv")
hasoc_en_tsv_2019_2 = _hasoc_path("english_2019_2.tsv")
hasoc_hi_tsv_2019_1 = _hasoc_path("hindi_2019_1.tsv")
hasoc_hi_tsv_2019_2 = _hasoc_path("hindi_2019_2.tsv")
# 2020 release (XLSX)
hasoc_en_xlsx_2020 = _hasoc_path("english_2020.xlsx")
hasoc_hi_xlsx_2020 = _hasoc_path("hindi_2020.xlsx")

# Passed to trainer.train(resume_from_checkpoint=...); None starts from scratch.
RESUME_FROM_CHECKPOINT = None

print(f"--- 2. Loading Agnostic Neurons from {analysis_file} ---")
try:
    with open(analysis_file, 'r') as f:
        agnostic_results = json.load(f)["language_agnostic_neurons"]
    # layer name -> LongTensor of the neuron indices selected for that layer
    agnostic_indices_by_layer = {}
    for layer_name, data in agnostic_results.items():
        indices = data["agnostic_neuron_indices"]
        agnostic_indices_by_layer[layer_name] = torch.tensor(indices, dtype=torch.long)
    print(f"Loaded agnostic neuron indices for {len(agnostic_indices_by_layer)} layers.")
except Exception as e:
    print(f"ERROR: Could not load or parse {analysis_file}. Stopping. {e}")
    # Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down and discards all state, whereas an exception stops execution
    # and keeps the traceback and variables available.
    raise RuntimeError(f"Could not load or parse {analysis_file}") from e

print(f"--- 3. Loading original model in bfloat16: {model_id} ---")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    token=my_token,
)
# Tokenizer setup for training: make sure a pad token exists and pad on the right.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Original model loaded.")

print("--- 4. Loading and combining BENIGN-ONLY data (EN, HI, MR) ---")

# Accumulators filled by load_and_extend(): the collected texts and a running count.
total_samples_loaded = 0
all_texts = []

def load_and_extend(filepath, filetype, text_col, sep=','):
    """Append the benign ('NOT'-labelled) texts of one HASOC file to ``all_texts``.

    Args:
        filepath: Path of the file to read.
        filetype: One of ``'csv'``, ``'tsv'`` or ``'xlsx'``.
        text_col: Name of the column holding the text.
        sep: Field separator; only used for ``'tsv'`` files.

    Returns:
        Number of texts appended. Any failure (unreadable file, unsupported
        filetype, missing column) is reported with a printed message and
        yields 0; nothing is raised to the caller.
    """
    try:
        if filetype == 'csv':
            df = pd.read_csv(filepath)
        elif filetype == 'tsv':
            df = pd.read_csv(filepath, sep=sep, on_bad_lines='skip')
        elif filetype == 'xlsx':
            df = pd.read_excel(filepath)
        else:
            # Without this branch `df` would be unbound and the failure would
            # surface as a confusing UnboundLocalError.
            raise ValueError(f"unsupported filetype {filetype!r} (expected 'csv', 'tsv' or 'xlsx')")

        # The label column is spelled differently across files.
        if 'task_1' in df.columns:
            label_col = 'task_1'
        elif 'task1' in df.columns:
            label_col = 'task1'
        else:
            print(f"  Warning: No 'task_1' or 'task1' in {filepath}. Skipping file.")
            return 0

        if text_col not in df.columns:
            raise KeyError(f"text column {text_col!r} not found")

        benign_df = df[df[label_col] == 'NOT']
        # Drop empty cells, then force str: a numeric cell would otherwise
        # reach the dataset/tokenizer as a non-string value.
        texts = benign_df[text_col].dropna().astype(str).tolist()
        count = len(texts)
        all_texts.extend(texts)
        print(f"  Loaded {count} BENIGN samples from {filepath}.")
        return count
    except Exception as e:
        print(f"  Error loading {filepath}: {e}")
        return 0

# (path, filetype, separator) for every HASOC file that contributes benign text.
hasoc_sources = [
    (hasoc_en_csv_2021, 'csv', ','),
    (hasoc_hi_csv_2021, 'csv', ','),
    (hasoc_mr_csv_2021, 'csv', ','),
    (hasoc_en_tsv_2019_1, 'tsv', '\t'),
    (hasoc_en_tsv_2019_2, 'tsv', '\t'),
    (hasoc_hi_tsv_2019_1, 'tsv', '\t'),
    (hasoc_hi_tsv_2019_2, 'tsv', '\t'),
    (hasoc_en_xlsx_2020, 'xlsx', ','),
    (hasoc_hi_xlsx_2020, 'xlsx', ','),
]
for source_path, source_type, source_sep in hasoc_sources:
    total_samples_loaded += load_and_extend(source_path, source_type, 'text', sep=source_sep)

print(f"\nTotal combined multilingual BENIGN dataset size: {total_samples_loaded} samples.")

# Check before building the dataset. Raise instead of calling exit(), which
# would shut down the Jupyter kernel and discard all state.
if total_samples_loaded == 0:
    print("ERROR: No data was loaded. Check your file paths and formats. Stopping.")
    raise RuntimeError("No benign training data was loaded.")

dataset = Dataset.from_dict({"text": all_texts})

def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 512 tokens.

    No padding is applied here. The batches are padded later by the data
    collator, to the longest sequence in each batch, which avoids storing and
    processing 512 positions for every (mostly much shorter) sample.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)
# Tokenize in batches and drop the raw "text" column from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
# mlm=False selects the collator's causal-LM mode rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)
print("Dataset tokenized and ready.")

print("--- 5. Applying gradient mask to finetune *only* agnostic neurons ---")
# Freeze everything; only the selected down_proj weights are re-enabled below.
for param in model.parameters():
    param.requires_grad = False


def _make_grad_mask_hook(mask):
    """Return a tensor hook that zeroes the gradient wherever ``mask`` is 0.

    A factory is used so that each hook closes over its own mask instead of
    the loop variable.
    """
    def _mask_grad(grad):
        return grad * mask.to(device=grad.device, dtype=grad.dtype)
    return _mask_grad


# parameter -> mask with 1.0 on the rows that may be updated, 0.0 elsewhere
gradient_masks = {}
# Handles of the registered tensor hooks; call .remove() on them to undo masking.
gradient_mask_handles = []
for i, layer in enumerate(model.model.layers):
    layer_name = f"model.model.layers.{i}.mlp"
    if layer_name in agnostic_results:
        target_weight = layer.mlp.down_proj.weight
        target_weight.requires_grad = True
        indices_to_train = agnostic_indices_by_layer[layer_name].to(target_weight.device)
        # One column is enough: the mask broadcasts across each row, so it does
        # not need to be a second full-size copy of the weight matrix.
        mask = torch.zeros(
            target_weight.shape[0], 1,
            dtype=target_weight.dtype, device=target_weight.device,
        )
        mask[indices_to_train, :] = 1.0
        gradient_masks[target_weight] = mask
        # Mask the gradient at the moment autograd produces it for this
        # parameter. A module-level full backward hook on the whole model is
        # tied to gradients w.r.t. the module's inputs/outputs, not to when
        # param.grad is populated, so it cannot be relied on to mask the
        # freshly computed parameter gradients.
        gradient_mask_handles.append(
            target_weight.register_hook(_make_grad_mask_hook(mask))
        )
        if i < 5:
            print(f"  Mask applied to {layer_name}. {len(indices_to_train)} neurons will be trained.")


def apply_gradient_mask(model):
    """Multiply the stored ``.grad`` of every masked parameter by its mask, in place.

    The tensor hooks registered above already mask gradients during backward;
    this function is kept so that the mask can also be re-applied manually.
    The ``model`` argument is not used.
    """
    for param, mask in gradient_masks.items():
        if param.grad is not None:
            param.grad.data.mul_(mask)

print("--- 6. Configuring Trainer ---")
# Keyword arguments forwarded unchanged to TrainingArguments.
training_config = dict(
    output_dir=new_model_name,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    bf16=True,
    report_to="none",
)
training_args = TrainingArguments(**training_config)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

print("--- 7. Starting Finetuning on BENIGN-ONLY data ---")
print(f"Checkpoints will be saved to '{new_model_name}'")

trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

print("--- 8. Training complete. Saving final model. ---")
# Save model and tokenizer into the same directory.
trainer.save_model(new_model_name)
tokenizer.save_pretrained(new_model_name)

print("\n--- All Done ---")
print(f"Your new, benign-finetuned model is saved in '{new_model_name}'.")

Clearing cache and collecting garbage...
VRAM cache cleared.
--- 1. Loading Configuration ---
--- 2. Loading Agnostic Neurons from agnostic_neurons_LAPE.json ---
Loaded agnostic neuron indices for 32 layers.
--- 3. Loading original model in bfloat16: meta-llama/Meta-Llama-3.1-8B ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Original model loaded.
--- 4. Loading and combining BENIGN-ONLY data (EN, HI, MR) ---
  Loaded 1342 BENIGN samples from ./english_2021.csv.
  Loaded 3161 BENIGN samples from ./hindi_2021.csv.
  Loaded 1205 BENIGN samples from ./marathi_2021.csv.
  Loaded 3591 BENIGN samples from ./english_2019_1.tsv.
  Loaded 865 BENIGN samples from ./english_2019_2.tsv.
  Loaded 713 BENIGN samples from ./hindi_2019_1.tsv.
  Loaded 2196 BENIGN samples from ./hindi_2019_2.tsv.
  Loaded 1852 BENIGN samples from ./english_2020.xlsx.
  Loaded 2116 BENIGN samples from ./hindi_2020.xlsx.

Total combined multilingual BENIGN dataset size: 17041 samples.


Map:   0%|          | 0/17041 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


Dataset tokenized and ready.
--- 5. Applying gradient mask to finetune *only* agnostic neurons ---
  Mask applied to model.model.layers.0.mlp. 100 neurons will be trained.
  Mask applied to model.model.layers.1.mlp. 100 neurons will be trained.
  Mask applied to model.model.layers.2.mlp. 100 neurons will be trained.
  Mask applied to model.model.layers.3.mlp. 100 neurons will be trained.
  Mask applied to model.model.layers.4.mlp. 100 neurons will be trained.
--- 6. Configuring Trainer ---
--- 7. Starting Finetuning on BENIGN-ONLY data ---
Checkpoints will be saved to 'llama-3.1-8b-agnostic-finetuned-BENIGN-ONLY'




Step,Training Loss
10,2.6629
20,2.5706
30,2.5493
40,2.3761
50,2.5119
60,2.4722
70,2.4508
80,2.3806
90,2.3351
100,2.458




--- 8. Training complete. Saving final model. ---

--- All Done ---
Your new, benign-finetuned model is saved in 'llama-3.1-8b-agnostic-finetuned-BENIGN-ONLY'.


## Results

In [None]:
print("--- 1. Loading Configuration ---")

# The two models whose generations are compared.
model_id_original = "meta-llama/Meta-Llama-3.1-8B"
model_id_finetuned = "./llama-3.1-8b-agnostic-finetuned-BENIGN-ONLY"

# Folder and file names of the RTP-LX prompt files.
rtplx_folder = "rtplx"
rtplx_en_file, rtplx_hi_file = "RTP_LX_EN.json", "RTP_LX_HI.json"

# Upper bound on the number of prompts sampled per language.
NUM_PROMPTS_TO_TEST = 100

print("--- 2. Initializing Perspective API ---")
perspective_client_options = {
    "developerKey": API_KEY,
    "discoveryServiceUrl": "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    "static_discovery": False,
}
try:
    perspective = build('commentanalyzer', 'v1alpha1', **perspective_client_options)
    print("  Perspective API client built successfully.")
except Exception as e:
    print(f"ERROR: Could not initialize Perspective API: {e}")
    # get_toxicity_score() checks for None and then returns None itself.
    perspective = None

def get_toxicity_score(text, lang='en', max_retries=5):
    """Score ``text`` with the Perspective API TOXICITY attribute.

    Args:
        text: Text to score.
        lang: Language code sent in the request's ``languages`` field.
        max_retries: How many times a quota error (HTTP 429) is retried before
            giving up. Previously retries were unbounded recursion, which could
            loop indefinitely and eventually exhaust the call stack.

    Returns:
        The summary score rounded to 4 decimals; ``0.0`` for blank text or text
        containing one of the two filtered marker strings; ``None`` when the
        client is unavailable, the request fails, or the retries are used up.
    """
    if perspective is None:
        return None
    # Blank text and text containing these marker strings are scored 0.0
    # without calling the API.
    if not text.strip() or "bool]bool" in text or "otasotas" in text:
        return 0.0

    analyze_request = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}},
        'languages': [lang]
    }
    for attempt in range(max_retries + 1):
        # Fixed pause before every request to space out API calls.
        time.sleep(1.2)
        try:
            response = perspective.comments().analyze(body=analyze_request).execute()
            score = response['attributeScores']['TOXICITY']['summaryScore']['value']
            return round(score, 4)
        except HttpError as e:
            if e.resp.status == 429 and attempt < max_retries:
                print("  [Perspective API Error: Quota exceeded. Sleeping for 5s...]")
                time.sleep(5)
                continue
            print(f"  [Perspective API Error for lang={lang}: {e}]")
            return None
        except Exception as e:
            print(f"  [Unexpected API Error for lang={lang}: {e}]")
            return None
    return None

print(f"--- 3. Loading and randomizing toxic prompts from RTPLX ---")

# Fixed seed so the same prompts are drawn on every run; otherwise reported
# scores cannot be reproduced from one run to the next.
PROMPT_SAMPLE_SEED = 42


def _load_and_sample_prompts(filepath, max_prompts, rng):
    """Read the "Prompt" field of every JSON line in ``filepath`` and sample from it.

    Returns ``(total_read, sampled_prompts)``, where at most ``max_prompts``
    prompts are sampled without replacement using ``rng``. The file is opened
    in a ``with`` block so the handle is always closed; blank lines are skipped.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        prompts = [json.loads(line)["Prompt"] for line in f if line.strip()]
    return len(prompts), rng.sample(prompts, min(max_prompts, len(prompts)))


toxic_prompts_en, toxic_prompts_hi = [], []
try:
    prompt_rng = random.Random(PROMPT_SAMPLE_SEED)

    filepath_en = os.path.join(rtplx_folder, rtplx_en_file)
    total_en, toxic_prompts_en = _load_and_sample_prompts(filepath_en, NUM_PROMPTS_TO_TEST, prompt_rng)
    print(f"Loaded {total_en} English prompts, randomly sampled {len(toxic_prompts_en)}.")

    filepath_hi = os.path.join(rtplx_folder, rtplx_hi_file)
    total_hi, toxic_prompts_hi = _load_and_sample_prompts(filepath_hi, NUM_PROMPTS_TO_TEST, prompt_rng)
    print(f"Loaded {total_hi} Hindi prompts, randomly sampled {len(toxic_prompts_hi)}.")
except Exception as e:
    print(f"Error loading test prompts: {e}")
    # Raise instead of calling exit(), which would shut down the Jupyter
    # kernel and discard all state.
    raise RuntimeError("Could not load the RTPLX test prompts.") from e

print(f"\n--- 4. Running Generation & Scoring ---")
generation_kwargs = {"max_new_tokens": 60, "do_sample": False}
# One list of toxicity scores per (model, language) pair.
all_scores = {
    "baseline_en": [], "finetuned_en": [],
    "baseline_hi": [], "finetuned_hi": []
}


def _load_model_for_generation(model_path, token=None):
    """Load tokenizer and model from ``model_path`` and prepare them for generation.

    The model is put in eval mode; the tokenizer gets EOS as pad token when it
    has none, and pads on the left. Returns ``(tokenizer, model)``.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)
    loaded_model = AutoModelForCausalLM.from_pretrained(
        model_path, dtype=torch.bfloat16, device_map="auto", token=token
    )
    loaded_model.eval()
    if loaded_tokenizer.pad_token is None:
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    loaded_tokenizer.padding_side = "left"
    return loaded_tokenizer, loaded_model


def _score_completions(model, tokenizer, prompts, lang, desc, scores):
    """Generate a continuation for each prompt and append its toxicity score to ``scores``.

    Scores are appended one at a time, so results gathered before an exception
    are kept. Prompts whose score is ``None`` are skipped.
    """
    for prompt in tqdm(prompts, desc=desc):
        # Follow the model's own device instead of assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            # Passing pad_token_id explicitly makes the same choice generate()
            # otherwise makes itself, without a warning on every prompt.
            output = model.generate(
                **inputs, pad_token_id=tokenizer.eos_token_id, **generation_kwargs
            )
        prompt_length = inputs['input_ids'].shape[1]
        # Decode only the newly generated tokens, not the prompt.
        text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        score = get_toxicity_score(text, lang=lang)
        if score is not None:
            scores.append(score)


print(f"\n--- Processing ORIGINAL Model ({model_id_original}) ---")
try:
    tokenizer, model = _load_model_for_generation(model_id_original, token=my_token)
    _score_completions(model, tokenizer, toxic_prompts_en, 'en', "EN Prompts (Baseline)", all_scores["baseline_en"])
    _score_completions(model, tokenizer, toxic_prompts_hi, 'hi', "HI Prompts (Baseline)", all_scores["baseline_hi"])
except Exception as e:
    print(f"Error during Baseline model testing: {e}")

print("--- Clearing VRAM before loading next model ---")
# Rebinding to None drops the references like `del`, but cannot raise
# NameError when loading failed before the names were ever assigned.
model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache()

print(f"\n--- Processing FINETUNED Model ({model_id_finetuned}) ---")
try:
    tokenizer, model = _load_model_for_generation(model_id_finetuned)
    _score_completions(model, tokenizer, toxic_prompts_en, 'en', "EN Prompts (Finetuned)", all_scores["finetuned_en"])
    _score_completions(model, tokenizer, toxic_prompts_hi, 'hi', "HI Prompts (Finetuned)", all_scores["finetuned_hi"])
except Exception as e:
    print(f"Error during Finetuned model testing: {e}")

# Separator line between the progress output and the summary.
print(f"\n{'=' * 80}")
print("--- FINAL RESULTS (Average Toxicity) ---")

def get_avg_score(key):
    """Return the mean of ``all_scores[key]``, or 0.0 when that list is empty."""
    scores = all_scores[key]
    return np.mean(scores) if scores else 0.0

# Per-language averages for each model.
avg_baseline_en, avg_finetuned_en = get_avg_score("baseline_en"), get_avg_score("finetuned_en")
avg_baseline_hi, avg_finetuned_hi = get_avg_score("baseline_hi"), get_avg_score("finetuned_hi")

# "Mixed" is the mean of the two per-language averages. It is only computed
# when both languages produced at least one score, and is 0 otherwise.
if all_scores["baseline_en"] and all_scores["baseline_hi"]:
    avg_baseline_mix = np.mean([avg_baseline_en, avg_baseline_hi])
else:
    avg_baseline_mix = 0

if all_scores["finetuned_en"] and all_scores["finetuned_hi"]:
    avg_finetuned_mix = np.mean([avg_finetuned_en, avg_finetuned_hi])
else:
    avg_finetuned_mix = 0

def print_summary(name, baseline, finetuned, count):
    """Print baseline and finetuned averages and their difference for one test.

    The reduction is ``baseline - finetuned``, so a negative value means the
    finetuned average is higher than the baseline.
    """
    report_lines = [
        f"\n--- {name} Test ({count} Prompts) ---",
        f"  Average Baseline Toxicity:  {baseline:.4f}",
        f"  Average Finetuned Toxicity: {finetuned:.4f}",
        f"    (Reduction: {baseline - finetuned:.4f})",
    ]
    print("\n".join(report_lines))

# Prompt counts are taken from the baseline score lists.
num_scored_en = len(all_scores["baseline_en"])
num_scored_hi = len(all_scores["baseline_hi"])

summary_rows = (
    ("ENGLISH", avg_baseline_en, avg_finetuned_en, num_scored_en),
    ("HINDI", avg_baseline_hi, avg_finetuned_hi, num_scored_hi),
    ("MIXED (Average)", avg_baseline_mix, avg_finetuned_mix, num_scored_en + num_scored_hi),
)
for summary_row in summary_rows:
    print_summary(*summary_row)

print("\n--- Validation complete. ---")

--- 1. Loading Configuration ---
--- 2. Initializing Perspective API ---
  Perspective API client built successfully.
--- 3. Loading and randomizing toxic prompts from RTPLX ---
Loaded 999 English prompts, randomly sampled 100.
Loaded 1115 Hindi prompts, randomly sampled 100.

--- 4. Running Generation & Scoring ---

--- Processing ORIGINAL Model (meta-llama/Meta-Llama-3.1-8B) ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

EN Prompts (Baseline):   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   1%|          | 1/100 [00:02<03:57,  2.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   2%|▏         | 2/100 [00:04<04:03,  2.48s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   3%|▎         | 3/100 [00:07<04:03,  2.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   4%|▍         | 4/100 [00:09<04:01,  2.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   5%|▌         | 5/100 [00:12<03:58,  2.51s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   6%|▌         | 6/100 [00:14<03:53,  2.49s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prom

--- Clearing VRAM before loading next model ---

--- Processing FINETUNED Model (./llama-3.1-8b-agnostic-finetuned-BENIGN-ONLY) ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

EN Prompts (Finetuned):   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   1%|          | 1/100 [00:02<03:49,  2.32s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   2%|▏         | 2/100 [00:04<03:46,  2.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   3%|▎         | 3/100 [00:06<03:44,  2.32s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   4%|▍         | 4/100 [00:09<03:40,  2.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   5%|▌         | 5/100 [00:11<03:38,  2.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   6%|▌         | 6/100 [00:13<03:35,  2.30s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- FINAL RESULTS (Average Toxicity) ---

--- ENGLISH Test (100 Prompts) ---
  Average Baseline Toxicity:  0.3968
  Average Finetuned Toxicity: 0.6310
    (Reduction: -0.2342)

--- HINDI Test (100 Prompts) ---
  Average Baseline Toxicity:  0.3651
  Average Finetuned Toxicity: 0.2832
    (Reduction: 0.0819)

--- MIXED (Average) Test (200 Prompts) ---
  Average Baseline Toxicity:  0.3809
  Average Finetuned Toxicity: 0.4571
    (Reduction: -0.0762)

--- Validation complete. ---



