In [None]:
import pandas as pd
import json
import os
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import math
from tqdm import tqdm
import gc
import numpy as np

# Folder holding the HASOC CSV files, and folder for the intermediate JSONL
# files written by the preprocessing step.
HASOC_FOLDER = "."
TEMP_DATA_FOLDER = "temp_jsonl_data"

model_id = "meta-llama/Meta-Llama-3.1-8B"
# Read the Hugging Face access token from the environment rather than
# embedding a credential in the notebook source. When HF_TOKEN is unset this
# is None, which leaves credential lookup to the Hugging Face libraries.
my_token = os.environ.get("HF_TOKEN")

# Number of prompts per forward pass, and the token length every prompt is
# padded/truncated to by the tokenizer call in run_activation_collection.
batch_size = 64
max_length = 400

# How many neurons per layer are reported, and where the report is written.
TOP_N_NEURONS = 100
RESULTS_FILE = "neuron_analysis_results_4WAY.json"

# Step 1: split each HASOC CSV into a benign and a toxic JSONL file.
# Rows labelled 'NOT' in the task_1 column are written under the
# "BenignCompletion" key, rows labelled 'HOF' under the "Completion" key.
# job_file_map records, per split, the (jsonl path, field name) pair that the
# later collection cells look up.
print(f"--- 1. Preprocessing HASOC CSVs (from {HASOC_FOLDER}) ---")
os.makedirs(TEMP_DATA_FOLDER, exist_ok=True)

csv_files_to_process = {
    lang_code: os.path.join(HASOC_FOLDER, csv_name)
    for lang_code, csv_name in (("EN", "english_2021.csv"), ("HI", "hindi_2021.csv"))
}
job_file_map = {}

for lang, filepath in csv_files_to_process.items():
    print(f"  Processing {filepath}...")
    try:
        df = pd.read_csv(filepath)

        # Benign split.
        benign_rows = df[df['task_1'] == 'NOT']
        benign_texts = benign_rows['text'].tolist()
        benign_outfile = os.path.join(TEMP_DATA_FOLDER, f"hasoc_{lang}_benign.jsonl")
        job_file_map[f"benign_{lang}"] = (benign_outfile, "BenignCompletion")
        with open(benign_outfile, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps({"BenignCompletion": text}) + '\n' for text in benign_texts
            )
        print(f"    Wrote {len(benign_texts)} benign samples to {benign_outfile}")

        # Toxic split.
        toxic_rows = df[df['task_1'] == 'HOF']
        toxic_texts = toxic_rows['text'].tolist()
        toxic_outfile = os.path.join(TEMP_DATA_FOLDER, f"hasoc_{lang}_toxic.jsonl")
        job_file_map[f"toxic_{lang}"] = (toxic_outfile, "Completion")
        with open(toxic_outfile, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps({"Completion": text}) + '\n' for text in toxic_texts
            )
        print(f"    Wrote {len(toxic_texts)} toxic samples to {toxic_outfile}")
    except Exception as e:
        # Best effort: a failing CSV is reported and the loop moves on, so
        # job_file_map may end up without entries for that language.
        print(f"    An error occurred processing {filepath}: {e}")

print("Preprocessing complete.\n")

# Step 2: load the tokenizer and the causal LM named by model_id.
print("--- 2. Loading Model and Tokenizer ---")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)
if tokenizer.pad_token is None:
    # Batched tokenization with padding needs a pad token; reuse EOS when the
    # tokenizer does not define one.
    tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# Load in bfloat16 with automatic device placement and switch straight to
# inference mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=my_token,
    device_map="auto",
    dtype=torch.bfloat16,
).eval()
print("Model and tokenizer loaded successfully.\n")

def get_hook(storage_dict, layer_name):
    """Build a forward hook that records a module's output.

    Args:
        storage_dict: Mapping from layer name to a list; the list stored
            under ``layer_name`` receives one entry per hook invocation.
        layer_name: Key of ``storage_dict`` to append to.

    Returns:
        A callable with the ``(module, input, output)`` forward-hook
        signature. It appends a detached CPU copy of ``output`` and returns
        ``None``.
    """
    def _record_output(_module, _inputs, output):
        # The list is looked up on every call so the caller may clear it
        # between batches.
        bucket = storage_dict[layer_name]
        bucket.append(output.detach().cpu())

    return _record_output

def run_activation_collection(input_filename, data_field, layer_names):
    """Collect last-token MLP activations for every prompt in a JSONL file.

    Each non-blank line of ``input_filename`` is parsed as JSON and the value
    stored under ``data_field`` is used as a prompt. Forward hooks created by
    ``get_hook`` are attached to ``layer.mlp`` of the entries of
    ``model.model.layers``, paired positionally with ``layer_names``. The
    prompts are run through the module-level ``model`` in batches of
    ``batch_size``; for every prompt only the activation at its last attended
    token is kept.

    Args:
        input_filename: Path of the JSONL file to read.
        data_field: Key that holds the prompt text in each JSON record.
        layer_names: Names used as keys of the result, one per hooked layer.
            Must not contain more names than ``model.model.layers`` has
            entries, since a name without a hook captures nothing.

    Returns:
        A dict mapping each layer name to a tensor with one row per prompt
        (per-batch results concatenated along dim 0), an empty dict when the
        file contains no prompts, or ``None`` when the file cannot be read or
        parsed.
    """
    try:
        with open(input_filename, 'r', encoding='utf-8') as f:
            # Skip blank lines so a stray trailing newline does not abort the load.
            all_prompts = [
                json.loads(line)[data_field] for line in f if line.strip()
            ]
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"    Error loading {input_filename}: {e}")
        return None
    print(f"    Loaded {len(all_prompts)} prompts.")

    activation_storage = {layer_name: [] for layer_name in layer_names}
    all_collected_activations = {layer_name: [] for layer_name in layer_names}
    hook_handles = []

    try:
        for layer, layer_name in zip(model.model.layers, layer_names):
            hook_handles.append(
                layer.mlp.register_forward_hook(
                    get_hook(activation_storage, layer_name)
                )
            )

        # Send inputs to wherever the model lives instead of assuming CUDA.
        input_device = model.device

        with torch.no_grad():
            for start in tqdm(range(0, len(all_prompts), batch_size), desc="      Processing", ncols=100, leave=False):
                # Hooks append on every forward pass; start each batch empty.
                for captured in activation_storage.values():
                    captured.clear()

                inputs = tokenizer(
                    all_prompts[start:start + batch_size],
                    return_tensors="pt",
                    padding="max_length",
                    truncation=True,
                    max_length=max_length
                ).to(input_device)

                model(**inputs)

                # Index of the last attended token per row. Locating the last
                # 1 in the mask (rather than sum - 1) is correct for both
                # right- and left-padded batches.
                attention_mask = inputs["attention_mask"]
                last_token_indices = (
                    attention_mask.size(1) - 1
                    - attention_mask.flip(dims=[1]).argmax(dim=1)
                ).cpu()

                for layer_name, captured in activation_storage.items():
                    # assumes the hooked output is (batch, seq_len, hidden) — TODO confirm
                    full_batch_tensor = captured[0]
                    row_indices = torch.arange(full_batch_tensor.size(0))
                    all_collected_activations[layer_name].append(
                        full_batch_tensor[row_indices, last_token_indices]
                    )
    finally:
        # Always detach the hooks, even when a forward pass fails, so they do
        # not pile up on the shared model across calls.
        for handle in hook_handles:
            handle.remove()
        for captured in activation_storage.values():
            captured.clear()

    final_activations = {
        layer_name: torch.cat(tensor_list, dim=0)
        for layer_name, tensor_list in all_collected_activations.items()
        if tensor_list
    }

    # Drop the per-batch tensors before returning to keep peak memory down.
    del all_prompts, all_collected_activations, activation_storage, hook_handles
    gc.collect()

    return final_activations

def analyze_differences(base_acts, diff_acts, layer_names, top_n=None):
    """Rank neurons by how much their mean activation rises over a baseline.

    For every layer the per-neuron mean over dim 0 is taken for both
    activation sets, the baseline mean is subtracted from the comparison
    mean, and the neurons with the largest (most positive) difference are
    reported.

    Args:
        base_acts: Mapping from layer name to the baseline activation tensor.
        diff_acts: Mapping from layer name to the comparison activation
            tensor.
        layer_names: Layers to analyse; each must be a key of both mappings.
        top_n: Number of neurons to report per layer. Defaults to the
            module-level ``TOP_N_NEURONS``. Clamped to the number of neurons
            available.

    Returns:
        A dict mapping each layer name to a dict with
        ``"top_neuron_indices"`` (list of int) and
        ``"top_neuron_mean_diffs"`` (list of float), both ordered from the
        largest difference downwards.
    """
    requested = TOP_N_NEURONS if top_n is None else top_n

    layer_results = {}
    for layer_name in layer_names:
        # Accumulate in float32: means and differences taken in a
        # low-precision dtype such as bfloat16 round close values together
        # and make the ranking unreliable.
        mean_base = base_acts[layer_name].mean(dim=0, dtype=torch.float32)
        mean_diff = diff_acts[layer_name].mean(dim=0, dtype=torch.float32)
        mean_diff_values = mean_diff - mean_base

        # topk raises when k exceeds the dimension size.
        k = min(requested, mean_diff_values.size(-1))
        top_values, top_indices = torch.topk(mean_diff_values, k)

        layer_results[layer_name] = {
            "top_neuron_indices": top_indices.cpu().tolist(),
            "top_neuron_mean_diffs": top_values.cpu().tolist(),
        }
    return layer_results


--- 1. Preprocessing HASOC CSVs (from .) ---
  Processing ./english_2021.csv...
    Wrote 1342 benign samples to temp_jsonl_data/hasoc_EN_benign.jsonl
    Wrote 2501 toxic samples to temp_jsonl_data/hasoc_EN_toxic.jsonl
  Processing ./hindi_2021.csv...
    Wrote 3161 benign samples to temp_jsonl_data/hasoc_HI_benign.jsonl
    Wrote 1433 toxic samples to temp_jsonl_data/hasoc_HI_toxic.jsonl
Preprocessing complete.

--- 2. Loading Model and Tokenizer ---
Tokenizer loaded.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model and tokenizer loaded successfully.


--- 4. Starting 4-Way Collection & Analysis Loop ---


In [None]:
print("\n--- 4. Starting 4-Way Collection & Analysis Loop ---")

# One result key per decoder layer, in the order of model.model.layers.
layer_names = [f"model.model.layers.{i}.mlp" for i in range(len(model.model.layers))]
final_analysis_results = {}
benign_activations_en = None
benign_activations_hi = None
toxic_activations_en = None

print("\n  --- Comparison 1: English Toxicity (Toxic EN vs Benign EN) ---")
print("    Running Benign EN collection...")
# Preprocessing only reports CSV failures, so a job entry can be missing.
# Treat that like a failed collection (None) instead of raising KeyError, so
# the later "skipping comparison" path is reached.
if "benign_EN" in job_file_map:
    benign_file_en, benign_field_en = job_file_map["benign_EN"]
    benign_activations_en = run_activation_collection(benign_file_en, benign_field_en, layer_names)
else:
    print("    No 'benign_EN' job was prepared, skipping collection.")

print("    Running Toxic EN collection...")
if "toxic_EN" in job_file_map:
    toxic_file_en, toxic_field_en = job_file_map["toxic_EN"]
    toxic_activations_en = run_activation_collection(toxic_file_en, toxic_field_en, layer_names)
else:
    print("    No 'toxic_EN' job was prepared, skipping collection.")


  --- Comparison 1: English Toxicity (Toxic EN vs Benign EN) ---
    Running Benign EN collection...
    Loaded 1342 prompts.


                                                                                                    

    Running Toxic EN collection...
    Loaded 2501 prompts.


                                                                                                    

In [None]:
# Comparison 1 uses the EN activations collected by the previous cell.
if benign_activations_en and toxic_activations_en:
    print("    Analyzing (Toxic EN - Benign EN)...")
    final_analysis_results["english_toxic_neurons"] = analyze_differences(
        benign_activations_en, toxic_activations_en, layer_names
    )
    print("    Analysis complete. Cleaning toxic activations from memory.")
else:
    print("    Failed to get EN activations, skipping comparison.")
    benign_activations_en = None
# Release the toxic activations on both paths. Rebinding to None (rather than
# `del`) drops the reference just the same and keeps the cell re-runnable.
toxic_activations_en = None

print("\n  --- Comparison 2: Hindi Toxicity (Toxic HI vs Benign HI) ---")
print("    Running Benign HI collection...")
# Preprocessing only reports CSV failures, so a job entry can be missing.
# Treat that like a failed collection instead of raising KeyError.
benign_activations_hi = None
if "benign_HI" in job_file_map:
    benign_file_hi, benign_field_hi = job_file_map["benign_HI"]
    benign_activations_hi = run_activation_collection(benign_file_hi, benign_field_hi, layer_names)
else:
    print("    No 'benign_HI' job was prepared, skipping collection.")

print("    Running Toxic HI collection...")
toxic_activations_hi = None
if "toxic_HI" in job_file_map:
    toxic_file_hi, toxic_field_hi = job_file_map["toxic_HI"]
    toxic_activations_hi = run_activation_collection(toxic_file_hi, toxic_field_hi, layer_names)
else:
    print("    No 'toxic_HI' job was prepared, skipping collection.")

if benign_activations_hi and toxic_activations_hi:
    print("    Analyzing (Toxic HI - Benign HI)...")
    final_analysis_results["hindi_toxic_neurons"] = analyze_differences(
        benign_activations_hi, toxic_activations_hi, layer_names
    )
    print("    Analysis complete. Cleaning toxic activations from memory.")
else:
    print("    Failed to get HI activations, skipping comparison.")
    benign_activations_hi = None
toxic_activations_hi = None

print("\n  --- Comparison 3 & 4: Language (Benign EN vs Benign HI) ---")
if benign_activations_en and benign_activations_hi:
    # analyze_differences(base, diff, ...) ranks by (diff - base), so the
    # language being characterised is passed second.
    print("    Analyzing (Benign EN - Benign HI)...")
    final_analysis_results["english_language_neurons"] = analyze_differences(
        benign_activations_hi, benign_activations_en, layer_names
    )
    print("    Analyzing (Benign HI - Benign EN)...")
    final_analysis_results["hindi_language_neurons"] = analyze_differences(
        benign_activations_en, benign_activations_hi, layer_names
    )
    print("    Language analysis complete.")
else:
    print("    Missing benign EN or HI activations, skipping language comparison.")

print("    Cleaning up all remaining activations.")
benign_activations_en = None
benign_activations_hi = None
gc.collect()

print("\n--- 5. All processing complete. Saving final results. ---")
try:
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(final_analysis_results, f, indent=2)
    print(f"Successfully saved 4-way analysis results to '{RESULTS_FILE}'.")
except Exception as e:
    # Top-level boundary: report the failure rather than lose the notebook run.
    print(f"ERROR: Could not save final results: {e}")

print("\n--- All Done ---")

    Analyzing (Toxic EN - Benign EN)...
    Analysis complete. Cleaning toxic activations from memory.

  --- Comparison 2: Hindi Toxicity (Toxic HI vs Benign HI) ---
    Running Benign HI collection...
    Loaded 3161 prompts.


      Processing:   0%|                                                      | 0/50 [00:00<?, ?it/s]

                                                                                                    

    Running Toxic HI collection...
    Loaded 1433 prompts.


                                                                                                    

    Analyzing (Toxic HI - Benign HI)...
    Analysis complete. Cleaning toxic activations from memory.

  --- Comparison 3 & 4: Language (Benign EN vs Benign HI) ---
    Analyzing (Benign EN - Benign HI)...
    Analyzing (Benign HI - Benign EN)...
    Language analysis complete.
    Cleaning up all remaining activations.

--- 5. All processing complete. Saving final results. ---
Successfully saved 4-way analysis results to 'neuron_analysis_results_4WAY.json'.

--- All Done ---
