In [None]:
import pandas as pd
import json
import os
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import math
from tqdm import tqdm
import gc
import numpy as np

In [None]:
import pandas as pd
import json
import os
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import math
from tqdm import tqdm
import gc
import numpy as np

# --- Run configuration -------------------------------------------------------
# Folder containing the HASOC CSV files (english_2021.csv / hindi_2021.csv).
HASOC_FOLDER = "."
# Folder where the per-language, per-label JSONL files are written.
TEMP_DATA_FOLDER = "temp_jsonl_data"
# Hugging Face model identifier passed to `from_pretrained`.
model_id = "meta-llama/Meta-Llama-3.1-8B"
# Access token for the Hugging Face Hub. Prefer the HF_TOKEN environment
# variable so a real secret never has to be committed with the notebook; the
# original placeholder is kept as the fallback value.
my_token = os.environ.get("HF_TOKEN") or "HuggingFaceToken"
# Number of prompts per forward pass during activation collection.
batch_size = 64
# Tokenizer padding/truncation length (in tokens).
max_length = 400
# Number of lowest-entropy neurons reported per layer.
TOP_N_NEURONS = 100
# Output path for the JSON analysis results.
RESULTS_FILE = "neuron_analysis_results_LAPE.json"
# An activation counts as "fired" when it is strictly greater than this value.
ACTIVATION_THRESHOLD = 0.0
# Added inside log2() to avoid log(0).
EPSILON = 1e-9

# Split each HASOC CSV into one JSONL file per label (benign / toxic) and
# record, for every (label, language) job, the file path and the JSON field
# that holds the text.
print(f"--- 1. Preprocessing HASOC CSVs (from {HASOC_FOLDER}) ---")
os.makedirs(TEMP_DATA_FOLDER, exist_ok=True)
csv_files_to_process = {
    "EN": os.path.join(HASOC_FOLDER, "english_2021.csv"),
    "HI": os.path.join(HASOC_FOLDER, "hindi_2021.csv")
}
# (value of the `task_1` column, job-name prefix, JSON field name)
label_specs = (
    ("NOT", "benign", "BenignCompletion"),
    ("HOF", "toxic", "Completion"),
)
job_file_map = {}
for lang, filepath in csv_files_to_process.items():
    try:
        df = pd.read_csv(filepath)
        for label, kind, field in label_specs:
            # Drop missing texts: a NaN would be serialised as the JSON token
            # `NaN`, reload as a float, and break tokenization later on.
            texts = (
                df.loc[df['task_1'] == label, 'text'].dropna().astype(str).tolist()
            )
            outfile = os.path.join(TEMP_DATA_FOLDER, f"hasoc_{lang}_{kind}.jsonl")
            with open(outfile, 'w', encoding='utf-8') as f:
                for text in texts:
                    f.write(json.dumps({field: text}) + '\n')
            # Register the job only after its file was written completely, so
            # the map never points at a missing or partial file.
            job_file_map[f"{kind}_{lang}"] = (outfile, field)
        print(f"  Processed {filepath}")
    except (OSError, KeyError, ValueError) as e:
        # Best effort: report the problem and continue with the next CSV.
        print(f"    An error occurred processing {filepath}: {e}")
print("Preprocessing complete.\n")

# Load the tokenizer and the causal LM named by `model_id`; both are used as
# module-level globals by the activation-collection code.
print("--- 2. Loading Model and Tokenizer ---")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)
# Batched tokenization with padding needs a pad token; fall back to EOS when
# the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")
# bfloat16 weights, with device placement delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_id, dtype=torch.bfloat16, device_map="auto", token=my_token
)
# Switch to evaluation mode; the model is only used for forward passes here.
model.eval()
print("Model and tokenizer loaded successfully.\n")

def run_activation_collection(input_filename, data_field, layer_names):
    """Collect the last-token MLP output of every prompt in a JSONL file.

    Each non-blank line of ``input_filename`` must be a JSON object whose
    ``data_field`` entry holds the prompt text. Forward hooks are attached to
    ``layer.mlp`` of the global ``model``'s decoder layers (paired in order
    with ``layer_names``), the prompts are run through the model in batches of
    ``batch_size``, and for each prompt the captured output at its last
    non-padding token is kept.

    Args:
        input_filename: Path of the JSONL file to read.
        data_field: Key of the text inside each JSON object.
        layer_names: Names used as keys for the captured activations; the
            i-th name is associated with the i-th decoder layer.

    Returns:
        A dict mapping layer name to a CPU tensor with one row per prompt
        (captured tensors are assumed to be (batch, seq, hidden) -- TODO
        confirm for other architectures), or ``None`` if the file could not
        be loaded. The dict is empty when the file contains no prompts.
    """
    all_prompts = []
    try:
        with open(input_filename, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL file
                all_prompts.append(json.loads(line)[data_field])
        print(f"    Loaded {len(all_prompts)} prompts.")
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"    Error loading {input_filename}: {e}")
        return None

    # Per-batch scratch space filled by the hooks, and the running result.
    activation_storage = {name: [] for name in layer_names}
    all_collected_activations = {name: [] for name in layer_names}
    hook_handles = []
    try:
        # zip() stops at the shorter sequence, so surplus names get no hook.
        for layer_name, layer in zip(layer_names, model.model.layers):
            hook_handles.append(
                layer.mlp.register_forward_hook(
                    get_hook(activation_storage, layer_name)
                )
            )
        with torch.no_grad():
            for start in tqdm(range(0, len(all_prompts), batch_size),
                              desc="      Processing", ncols=100, leave=False):
                for captured in activation_storage.values():
                    captured.clear()
                batch_prompts = all_prompts[start:start + batch_size]
                inputs = tokenizer(
                    batch_prompts, return_tensors="pt", padding="max_length",
                    truncation=True, max_length=max_length
                ).to(model.device)
                model(**inputs)
                # Position of the last real token in each row. Taking the
                # largest index where the mask is 1 is correct for both left-
                # and right-padded batches, unlike `mask.sum() - 1`.
                attention_mask = inputs["attention_mask"].cpu()
                positions = torch.arange(attention_mask.size(1))
                last_token_indices = (attention_mask * positions).argmax(dim=1)
                batch_rows = torch.arange(attention_mask.size(0))
                for layer_name, captured in activation_storage.items():
                    if not captured:
                        continue  # no hook fired for this name
                    # Advanced indexing copies, so the full captured tensor
                    # can be released when the storage is cleared.
                    all_collected_activations[layer_name].append(
                        captured[0][batch_rows, last_token_indices]
                    )
    finally:
        # Always detach the hooks, even if the forward pass failed; otherwise
        # they would stay registered on the shared model for later calls.
        for handle in hook_handles:
            handle.remove()

    final_activations = {
        layer_name: torch.cat(tensor_list, dim=0)
        for layer_name, tensor_list in all_collected_activations.items()
        if tensor_list
    }
    del all_prompts, all_collected_activations, activation_storage, hook_handles
    gc.collect()
    return final_activations

def get_hook(storage_dict, layer_name):
    """Build a forward hook that records a module's output under a given key.

    Args:
        storage_dict: Dict mapping layer name to a list; the hook appends to
            ``storage_dict[layer_name]``, which must already exist.
        layer_name: Key under which captured outputs are stored.

    Returns:
        A function with the ``(module, inputs, output)`` forward-hook
        signature. It stores a detached CPU copy of the output and returns
        ``None``, so the module's output is left unchanged.
    """
    def hook_fn(module, inputs, output):
        # Some modules return a tuple/list; in that case keep its first
        # element (assumed to be the main tensor output).
        tensor = output[0] if isinstance(output, (tuple, list)) else output
        storage_dict[layer_name].append(tensor.detach().cpu())
    return hook_fn

def analyze_with_lape(acts_group_1, acts_group_2, layer_names,
                      top_n=None, threshold=None, epsilon=None):
    """Rank neurons by the entropy of their firing probability across two groups.

    For every layer, each neuron's firing probability (fraction of samples
    whose activation is strictly above ``threshold``) is computed per group,
    the two probabilities are normalised to sum to 1, and the base-2 entropy
    of that two-way distribution is taken. Low entropy means the neuron fires
    predominantly for one of the groups.

    Neurons that never fire in either group carry no information about the
    groups and are excluded from the ranking (they would otherwise get an
    entropy of 0 and crowd out the genuinely selective neurons).

    Args:
        acts_group_1: Dict mapping layer name to a (samples, neurons) tensor.
        acts_group_2: Same as ``acts_group_1`` for the second group.
        layer_names: Layers to analyse; each must be a key of both dicts.
        top_n: Maximum neurons reported per layer (default ``TOP_N_NEURONS``).
        threshold: Firing threshold (default ``ACTIVATION_THRESHOLD``).
        epsilon: Added inside ``log2`` to avoid ``log(0)`` (default
            ``EPSILON``).

    Returns:
        Dict mapping layer name to a dict with the keys
        ``top_neuron_indices``, ``top_neuron_entropy``, ``group_1_fire_prob``
        and ``group_2_fire_prob``; each is a list ordered by increasing
        entropy and holding at most ``top_n`` entries.

    Raises:
        KeyError: If a layer name is missing from either group.
    """
    # Fall back to the module-level configuration when not overridden.
    if top_n is None:
        top_n = TOP_N_NEURONS
    if threshold is None:
        threshold = ACTIVATION_THRESHOLD
    if epsilon is None:
        epsilon = EPSILON

    print("      Analyzing with LAPE (low entropy)...")
    layer_results = {}
    for layer_name in layer_names:
        # Per-neuron firing probability within each group.
        fire_prob_1 = (acts_group_1[layer_name] > threshold).float().mean(dim=0)
        fire_prob_2 = (acts_group_2[layer_name] > threshold).float().mean(dim=0)

        total_prob = fire_prob_1 + fire_prob_2
        fired = total_prob > 0
        # Avoid 0/0 for neurons that never fire; they are masked out below.
        safe_total = torch.where(fired, total_prob, torch.ones_like(total_prob))
        share_1 = fire_prob_1 / safe_total
        share_2 = fire_prob_2 / safe_total
        entropy = -(
            share_1 * torch.log2(share_1 + epsilon)
            + share_2 * torch.log2(share_2 + epsilon)
        )
        # Push never-firing neurons to the end of the ranking ...
        entropy = torch.where(fired, entropy, torch.full_like(entropy, float("inf")))
        # ... and never ask topk for more neurons than actually fired (this
        # also keeps k within the layer width).
        k = max(0, min(int(top_n), int(fired.sum().item())))
        top_values, top_indices = torch.topk(entropy, k, largest=False)

        layer_results[layer_name] = {
            "top_neuron_indices": top_indices.cpu().numpy().tolist(),
            "top_neuron_entropy": top_values.cpu().float().numpy().tolist(),
            "group_1_fire_prob": fire_prob_1[top_indices].cpu().float().numpy().tolist(),
            "group_2_fire_prob": fire_prob_2[top_indices].cpu().float().numpy().tolist(),
        }
    return layer_results

# Driver: collect activations for each (label, language) job, run the three
# pairwise entropy comparisons, and write the combined results to RESULTS_FILE.
print("\n--- 4. Starting LAPE Collection & Analysis Loop ---")
layer_names = [f"model.model.layers.{i}.mlp" for i in range(len(model.model.layers))]
final_analysis_results = {}


def _collect_job(job_key, label):
    """Run activation collection for one preprocessed job.

    Returns the activations dict, or None when the job is unknown (for
    example because preprocessing failed for that language) or collection
    itself failed.
    """
    job = job_file_map.get(job_key)
    if job is None:
        print(f"    No preprocessed data for '{job_key}', cannot run {label} collection.")
        return None
    print(f"    Running {label} collection...")
    input_file, data_field = job
    return run_activation_collection(input_file, data_field, layer_names)


print("\n  --- Comparison 1: Language Specificity (Benign EN vs Benign HI) ---")
benign_activations_en = _collect_job("benign_EN", "Benign EN")
benign_activations_hi = _collect_job("benign_HI", "Benign HI")
if benign_activations_en and benign_activations_hi:
    print("    Analyzing (Benign EN vs Benign HI)...")
    # group 1 = benign EN, group 2 = benign HI
    final_analysis_results["language_specific_neurons"] = analyze_with_lape(
        benign_activations_en, benign_activations_hi, layer_names
    )
    print("    Language analysis complete.")
else:
    print("    Failed to get benign EN or HI activations, skipping language comparison.")

# Comparisons 2 and 3 share the same shape: toxic vs benign within a language.
# (comparison number, language code, language name, result key, benign acts)
toxicity_comparisons = (
    (2, "EN", "English", "english_toxicity_neurons", benign_activations_en),
    (3, "HI", "Hindi", "hindi_toxicity_neurons", benign_activations_hi),
)
for number, lang, lang_name, result_key, benign_activations in toxicity_comparisons:
    print(f"\n  --- Comparison {number}: {lang_name} Toxicity Specificity (Toxic {lang} vs Benign {lang}) ---")
    if not benign_activations:
        print(f"    Missing Benign {lang} activations, skipping {lang_name} toxicity comparison.")
        continue
    toxic_activations = _collect_job(f"toxic_{lang}", f"Toxic {lang}")
    if not toxic_activations:
        print(f"    Failed to get Toxic {lang} activations, skipping.")
        continue
    print(f"    Analyzing (Toxic {lang} vs Benign {lang})...")
    # NOTE: in the saved results group 1 is the benign set and group 2 is the
    # toxic set, i.e. "group_1_fire_prob" is the benign firing probability.
    final_analysis_results[result_key] = analyze_with_lape(
        benign_activations, toxic_activations, layer_names
    )
    print("    Analysis complete. Cleaning toxic activations from memory.")
    del toxic_activations

print("    Cleaning up all remaining activations.")
# Drop every remaining reference (including the ones held by the comparison
# table and the loop variable) so the tensors can actually be freed.
del toxicity_comparisons, benign_activations
del benign_activations_en
del benign_activations_hi
gc.collect()

print("\n--- 5. All processing complete. Saving final results. ---")
try:
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(final_analysis_results, f, indent=2)
    print(f"Successfully saved LAPE analysis results to '{RESULTS_FILE}'.")
except (OSError, TypeError, ValueError) as e:
    print(f"ERROR: Could not save final results: {e}")

print("\n--- All Done ---")

--- 1. Preprocessing HASOC CSVs (from .) ---
  Processed ./english_2021.csv
  Processed ./hindi_2021.csv
Preprocessing complete.

--- 2. Loading Model and Tokenizer ---


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Tokenizer loaded.


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model and tokenizer loaded successfully.


--- 4. Starting LAPE Collection & Analysis Loop ---

  --- Comparison 1: Language Specificity (Benign EN vs Benign HI) ---
    Running Benign EN collection...
    Loaded 1342 prompts.


                                                                                                    

    Running Benign HI collection...
    Loaded 3161 prompts.


                                                                                                    

    Analyzing (Benign EN vs Benign HI)...
      Analyzing with LAPE (low entropy)...
    Language analysis complete.

  --- Comparison 2: English Toxicity Specificity (Toxic EN vs Benign EN) ---
    Running Toxic EN collection...
    Loaded 2501 prompts.


                                                                                                    

    Analyzing (Toxic EN vs Benign EN)...
      Analyzing with LAPE (low entropy)...
    Analysis complete. Cleaning toxic activations from memory.

  --- Comparison 3: Hindi Toxicity Specificity (Toxic HI vs Benign HI) ---
    Running Toxic HI collection...
    Loaded 1433 prompts.


                                                                                                    

    Analyzing (Toxic HI vs Benign HI)...
      Analyzing with LAPE (low entropy)...
    Analysis complete. Cleaning toxic activations from memory.
    Cleaning up all remaining activations.

--- 5. All processing complete. Saving final results. ---
Successfully saved LAPE analysis results to 'neuron_analysis_results_LAPE.json'.

--- All Done ---
