In [None]:
!pip install transformers
!pip install --upgrade --force-reinstall google-api-python-client
!pip install --upgrade tensorboard
!pip install accelerate
!pip install google-api-python-client
!pip install numpy
!pip install langid
!pip install peft openpyxl bitsandbytes datasets

In [None]:
import gc
import json
import math
import os
import random
import time

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, Dataset
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import BitsAndBytesConfig

## API Keys

In [None]:
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook. The original placeholder strings remain as
# fallbacks when the variables are unset.
API_KEY = os.environ.get("PERSPECTIVE_API_KEY", "GoogleCloudAPI")  # developer key for the Perspective API client
my_token = os.environ.get("HF_TOKEN", "HuggingFaceToken")  # token passed to from_pretrained for the base model

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import os
import gc

# --- Step 1: settings for the ablation run ---
print("--- 1. Loading Configuration ---")
model_id = "meta-llama/Meta-Llama-3.1-8B"            # base model to edit
analysis_file = "neuron_analysis_results_4WAY.json"  # JSON file holding the neuron lists
new_model_path = "./edited_model_TOXIC_ABLATED"      # output directory for the edited model
target_neurons_key = "english_toxic_neurons"         # which neuron list in the JSON to ablate

# --- Step 2: load tokenizer and bfloat16 weights, then switch to eval mode ---
print(f"--- 2. Loading original model: {model_id} ---")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=my_token)
model_load_kwargs = {"dtype": torch.bfloat16, "device_map": "auto", "token": my_token}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)
model.eval()
print("Original model loaded successfully.")

print(f"--- 3. Loading analysis results from {analysis_file} ---")
# Read the neuron-analysis JSON and pick out the neuron list to ablate.
# Failures are re-raised: in a notebook, exit() does not reliably halt the
# running cell, so the surgery step could otherwise start without `neurons_to_edit`.
try:
    with open(analysis_file, 'r') as f:
        analysis_results = json.load(f)
    neurons_to_edit = analysis_results[target_neurons_key]
except Exception as e:
    print(f"ERROR: Could not load {analysis_file}. Stopping. {e}")
    raise RuntimeError(
        f"Could not load '{target_neurons_key}' from {analysis_file}"
    ) from e
print(f"Successfully loaded neuron list for: {target_neurons_key}")

print(f"--- 4. Starting model surgery (ablating {target_neurons_key}) ---")
print("Zeroing out the 'down_proj' weights for target neurons...")
total_neurons_edited = 0
total_layers_edited = 0

with torch.no_grad():  # in-place weight edits must not be recorded by autograd
    for i, layer in enumerate(model.model.layers):
        # The analysis file is looked up with keys of exactly this form.
        layer_name = f"model.model.layers.{i}.mlp"
        if layer_name not in neurons_to_edit:
            continue
        neuron_indices = neurons_to_edit[layer_name]["top_neuron_indices"]
        down_proj = layer.mlp.down_proj
        # Zero the selected rows of the weight (and the matching bias entries, if any).
        # NOTE(review): this treats the indices as down_proj *output* units — confirm
        # that this matches how the analysis script defined a "neuron".
        down_proj.weight[neuron_indices, :] = 0.0
        if down_proj.bias is not None:
            down_proj.bias[neuron_indices] = 0.0
        total_neurons_edited += len(neuron_indices)
        total_layers_edited += 1

print("--- 5. Surgery Complete ---")
print(f"  Ablated {total_neurons_edited} neurons across {total_layers_edited} layers.")
if total_layers_edited == 0:
    # Without this check an unmodified model would be saved under the "ablated" name.
    print(f"  WARNING: no layer key in '{target_neurons_key}' matched the model; weights are unchanged.")

# --- Step 6: write the edited weights and the tokenizer to the same directory ---
print(f"--- 6. Saving new, ablated model to {new_model_path} ---")
os.makedirs(new_model_path, exist_ok=True)
model.save_pretrained(new_model_path)
tokenizer.save_pretrained(new_model_path)

print("\n--- Ablation Complete ---")
print(f"'ablated' model is saved in '{new_model_path}'.")

# Drop both references, then run the garbage collector and empty the CUDA cache
# so the memory is available to the following cells.
del model, tokenizer
gc.collect()
torch.cuda.empty_cache()

## Fine-Tuning

In [None]:
print("--- 1. Loading Configuration ---")

# Weights to fine-tune (the directory written by the ablation cell) and the
# directory name used for checkpoints and the final adapters.
model_id = "./edited_model_TOXIC_ABLATED"
new_model_name = "llama-3.1-8b-ablated-healed-BENIGN-ONLY"

# Every dataset file is resolved relative to HASOC_FOLDER.
HASOC_FOLDER = "."
_hasoc_file_names = (
    "english_2021.csv",
    "hindi_2021.csv",
    "marathi_2021.csv",
    "english_2019_1.tsv",
    "english_2019_2.tsv",
    "hindi_2019_1.tsv",
    "hindi_2019_2.tsv",
    "english_2020.xlsx",
    "hindi_2020.xlsx",
)
dataset_files = [os.path.join(HASOC_FOLDER, file_name) for file_name in _hasoc_file_names]

# Forwarded to trainer.train(resume_from_checkpoint=...).
RESUME_FROM_CHECKPOINT = None

print(f"--- 2. Loading and combining BENIGN-ONLY data (EN, HI, MR) ---")

all_texts = []            # extended in place by load_and_extend
total_samples_loaded = 0  # running count of texts added to all_texts

def load_and_extend(filepath, target=None):
    """Load one labelled file and append its benign texts to a list.

    Rows whose label column (``task_1`` or ``task1``) equals ``'NOT'`` are kept,
    and their non-null ``text`` values are appended to ``target``.

    Args:
        filepath: Path to a ``.csv``, ``.tsv`` or ``.xlsx`` file. The extension
            is matched case-insensitively.
        target: List to extend. Defaults to the module-level ``all_texts`` so
            existing one-argument calls behave as before.

    Returns:
        Number of texts appended; 0 when the file is skipped or cannot be read.
    """
    if target is None:
        target = all_texts
    extension = os.path.splitext(filepath)[1].lower()
    try:
        if extension == '.csv':
            df = pd.read_csv(filepath)
        elif extension == '.tsv':
            # Malformed rows are skipped instead of aborting the whole file.
            df = pd.read_csv(filepath, sep='\t', on_bad_lines='skip')
        elif extension == '.xlsx':
            df = pd.read_excel(filepath)
        else:
            # Previously this case fell through to an unbound `df`.
            print(f"  Warning: Unsupported file type for {filepath}. Skipping file.")
            return 0
        if 'text' not in df.columns:
            print(f"  Warning: No 'text' column in {filepath}. Skipping file.")
            return 0
        label_col = next((col for col in ('task_1', 'task1') if col in df.columns), None)
        if label_col is None:
            print(f"  Warning: No 'task_1' or 'task1' in {filepath}. Skipping file.")
            return 0
        benign_df = df[df[label_col] == 'NOT']
        texts = benign_df['text'].dropna().tolist()
        target.extend(texts)
        print(f"  Loaded {len(texts)} BENIGN samples from {filepath}.")
        return len(texts)
    except Exception as e:
        # Best effort: one unreadable file must not stop the remaining files.
        print(f"  Error loading {filepath}: {e}")
        return 0

# Load every file; load_and_extend appends the texts to all_texts and returns how many it added.
total_samples_loaded = sum(load_and_extend(path) for path in dataset_files)

print(f"\nTotal combined multilingual BENIGN dataset size: {total_samples_loaded} samples.")
if total_samples_loaded == 0:
    print("ERROR: No data was loaded. Check your file paths and formats. Stopping.")
    # Raise instead of exit(): exit() does not reliably halt the running notebook
    # cell, which would let model loading and training start on an empty dataset.
    raise RuntimeError("No benign training samples were loaded.")

dataset = Dataset.from_dict({"text": all_texts})
del all_texts  # the list is no longer needed once the Dataset has been built

print(f"--- 3. Loading ABLATED model in 4-bit (QLoRA): {model_id} ---")

# 4-bit NF4 quantization with double quantization; bfloat16 is the compute dtype.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": True,
}
quantization_config = BitsAndBytesConfig(**bnb_options)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=quantization_config, device_map="auto"
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print("Ablated model loaded in 4-bit.")

def tokenize_function(examples):
    """Tokenize ``examples["text"]``, truncating and padding every entry to 512 tokens."""
    tokenizer_options = {"truncation": True, "max_length": 512, "padding": "max_length"}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize in batches and drop the raw "text" column; the collator is built with
# mlm=False (no masked-language-model masking).
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
print("Dataset tokenized and ready.")

print("--- 5. Configuring QLoRA ---")
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Module names that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the quantized model with the adapters and report the trainable-parameter count.
model = get_peft_model(model, lora_config)
print("QLoRA adapters applied to model.")
model.print_trainable_parameters()

print("--- 6. Configuring Trainer ---")

trainer_settings = dict(
    output_dir=new_model_name,
    # 32 sequences per device, accumulated over 2 steps
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=10,
    # checkpoint every 200 steps, keeping at most 3 checkpoints
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    bf16=True,
    report_to="none",
)
training_args = TrainingArguments(**trainer_settings)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

print(f"--- 7. Starting 'Healing' Finetuning on BENIGN-ONLY data ---")
print(f"Checkpoints will be saved to '{new_model_name}'")

# RESUME_FROM_CHECKPOINT is set in the configuration section of this cell.
trainer.train(resume_from_checkpoint=RESUME_FROM_CHECKPOINT)

# Save the trained model and the tokenizer side by side in the output directory.
print("--- 8. Training complete. Saving final model. ---")
trainer.save_model(new_model_name)
tokenizer.save_pretrained(new_model_name)

print("\n--- All Done ---")

Clearing cache and collecting garbage...
VRAM cache cleared.
--- 1. Loading Configuration ---
--- 2. Loading and combining BENIGN-ONLY data (EN, HI, MR) ---
  Loaded 1342 BENIGN samples from ./english_2021.csv.
  Loaded 3161 BENIGN samples from ./hindi_2021.csv.
  Loaded 1205 BENIGN samples from ./marathi_2021.csv.
  Loaded 3591 BENIGN samples from ./english_2019_1.tsv.
  Loaded 865 BENIGN samples from ./english_2019_2.tsv.
  Loaded 713 BENIGN samples from ./hindi_2019_1.tsv.
  Loaded 2196 BENIGN samples from ./hindi_2019_2.tsv.


  Loaded 1852 BENIGN samples from ./english_2020.xlsx.
  Loaded 2116 BENIGN samples from ./hindi_2020.xlsx.

Total combined multilingual BENIGN dataset size: 17041 samples.
--- 3. Loading ABLATED model in 4-bit (QLoRA): ./edited_model_TOXIC_ABLATED ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Ablated model loaded in 4-bit.


Map:   0%|          | 0/17041 [00:00<?, ? examples/s]

Dataset tokenized and ready.
--- 5. Configuring QLoRA ---
QLoRA adapters applied to model.
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
--- 6. Configuring Trainer ---
--- 7. Starting 'Healing' Finetuning on BENIGN-ONLY data ---
Checkpoints will be saved to 'llama-3.1-8b-ablated-healed-BENIGN-ONLY'


  return fn(*args, **kwargs)


Step,Training Loss
10,3.1874
20,2.7265
30,2.6968
40,2.5974
50,2.551
60,2.4787
70,2.446
80,2.4549
90,2.5128
100,2.4303


  return fn(*args, **kwargs)


--- 8. Training complete. Saving final model. ---

--- All Done ---
Your new, 'ablated-and-healed' model (LoRA adapters) is saved in 'llama-3.1-8b-ablated-healed-BENIGN-ONLY'.


## Analysis

In [None]:
print("--- 1. Loading Configuration ---")

# Models under comparison: the unmodified base, the ablated base, and the
# adapters that are loaded on top of the ablated base.
model_id_original = "meta-llama/Meta-Llama-3.1-8B"
model_id_ablated_base = "./edited_model_TOXIC_ABLATED"
model_id_finetuned_adapters = "./llama-3.1-8b-ablated-healed-BENIGN-ONLY"

# Prompt files (English and Hindi) inside rtplx_folder.
rtplx_folder, rtplx_en_file, rtplx_hi_file = "rtplx", "RTP_LX_EN.json", "RTP_LX_HI.json"

NUM_PROMPTS_TO_TEST = 100  # upper bound on prompts sampled per language

print("--- 2. Initializing Perspective API ---")
_perspective_discovery_url = "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1"
try:
    perspective = build(
        'commentanalyzer',
        'v1alpha1',
        developerKey=API_KEY,
        discoveryServiceUrl=_perspective_discovery_url,
        static_discovery=False,
    )
    print("  Perspective API client built successfully.")
except Exception as e:
    # Best effort: with perspective set to None, get_toxicity_score returns None for every text.
    print(f"ERROR: Could not initialize Perspective API: {e}")
    perspective = None

def get_toxicity_score(text, lang='en', max_retries=5):
    """Score ``text`` with the Perspective API TOXICITY attribute.

    Args:
        text: Text to score.
        lang: Language code sent in the request's ``languages`` field.
        max_retries: How many times a quota response (HTTP 429) is retried
            before giving up. The previous implementation recursed without a limit.

    Returns:
        The summary TOXICITY value rounded to 4 decimals; ``0.0`` for blank
        text or text containing one of the hard-coded marker strings; ``None``
        when no client is available, the request fails, or retries run out.
    """
    if perspective is None:
        return None
    if not text.strip() or "bool]bool" in text or "otasotas" in text:
        return 0.0
    analyze_request = {
        'comment': {'text': text},
        'requestedAttributes': {'TOXICITY': {}},
        'languages': [lang]
    }
    for attempt in range(max_retries + 1):
        time.sleep(1.2)  # throttle before every request, including retries
        try:
            response = perspective.comments().analyze(body=analyze_request).execute()
            score = response['attributeScores']['TOXICITY']['summaryScore']['value']
            return round(score, 4)
        except HttpError as e:
            if e.resp.status == 429 and attempt < max_retries:
                print("  [Perspective API Error: Quota exceeded. Sleeping for 5s...]")
                time.sleep(5)
                continue
            print(f"  [Perspective API Error for lang={lang}: {e}]")
            return None
        except Exception as e:
            print(f"  [Unexpected API Error for lang={lang}: {e}]")
            return None
    return None

print(f"--- 3. Loading and randomizing toxic prompts from RTPLX ---")

PROMPT_SAMPLE_SEED = 42  # fixed seed so the same prompts are drawn on every run


def _read_prompts(path):
    """Return the "Prompt" field of every non-blank JSON line in ``path``."""
    # The with-block closes the file; blank lines are skipped so they cannot break json.loads.
    with open(path, 'r', encoding='utf-8') as handle:
        return [json.loads(line)["Prompt"] for line in handle if line.strip()]


toxic_prompts_en, toxic_prompts_hi = [], []
_prompt_rng = random.Random(PROMPT_SAMPLE_SEED)  # local generator; global random state is untouched
try:
    filepath_en = os.path.join(rtplx_folder, rtplx_en_file)
    all_en_prompts = _read_prompts(filepath_en)
    num_to_sample_en = min(NUM_PROMPTS_TO_TEST, len(all_en_prompts))
    toxic_prompts_en = _prompt_rng.sample(all_en_prompts, num_to_sample_en)
    print(f"Loaded {len(all_en_prompts)} English prompts, randomly sampled {len(toxic_prompts_en)}.")
    del all_en_prompts

    filepath_hi = os.path.join(rtplx_folder, rtplx_hi_file)
    all_hi_prompts = _read_prompts(filepath_hi)
    num_to_sample_hi = min(NUM_PROMPTS_TO_TEST, len(all_hi_prompts))
    toxic_prompts_hi = _prompt_rng.sample(all_hi_prompts, num_to_sample_hi)
    print(f"Loaded {len(all_hi_prompts)} Hindi prompts, randomly sampled {len(toxic_prompts_hi)}.")
    del all_hi_prompts
except Exception as e:
    print(f"Error loading test prompts: {e}")
    # Raise instead of exit(): exit() does not reliably halt the running notebook
    # cell, which would let the evaluation proceed with empty prompt lists.
    raise RuntimeError("Could not load the test prompts.") from e

print("\n--- 4. Running Generation & Scoring ---")
# Sampling is disabled and each continuation is capped at 60 new tokens.
generation_kwargs = dict(max_new_tokens=60, do_sample=False)
# One independent score list per model/language combination.
all_scores = {
    key: []
    for key in ("baseline_en", "finetuned_en", "baseline_hi", "finetuned_hi")
}

def _score_prompts(model, tokenizer, prompts, lang, desc, scores):
    """Generate a continuation for each prompt and append its toxicity score to ``scores``.

    Scores are appended as they arrive, so values collected before an error are
    kept. Prompts whose score comes back as ``None`` are skipped.
    """
    # Passing pad_token_id explicitly avoids the "Setting `pad_token_id`" notice on
    # every call; any entry in generation_kwargs takes precedence.
    gen_kwargs = {"pad_token_id": tokenizer.pad_token_id, **generation_kwargs}
    for prompt in tqdm(prompts, desc=desc):
        # Send the inputs to the model's own device instead of assuming "cuda".
        inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)
        prompt_length = inputs['input_ids'].shape[1]
        # Decode only the tokens that follow the prompt.
        text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        score = get_toxicity_score(text, lang=lang)
        if score is not None:
            scores.append(score)


print(f"\n--- Processing ORIGINAL Model ({model_id_original}) ---")
# Start from None so the cleanup below works even if loading fails part-way,
# and so any model left over from an earlier cell is released first.
model = tokenizer = None
try:
    tokenizer = AutoTokenizer.from_pretrained(model_id_original, token=my_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id_original, dtype=torch.bfloat16, device_map="auto", token=my_token
    )
    model.eval()
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    _score_prompts(model, tokenizer, toxic_prompts_en, 'en', "EN Prompts (Baseline)", all_scores["baseline_en"])
    _score_prompts(model, tokenizer, toxic_prompts_hi, 'hi', "HI Prompts (Baseline)", all_scores["baseline_hi"])

except Exception as e:
    print(f"Error during Baseline model testing: {e}")

print("--- Clearing VRAM before loading next model ---")
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

print(f"\n--- Processing FINETUNED QLoRA Model ---")
# NOTE(review): the baseline above runs in bfloat16 while this model runs on a
# 4-bit base, so part of any score difference may come from quantization — confirm
# whether a 4-bit baseline is needed for a like-for-like comparison.
try:
    print(f"  Loading ablated 4-bit base model from: {model_id_ablated_base}")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        # Same double-quantization setting as the fine-tuning cell, so the adapters
        # are evaluated on the same quantized base they were trained against.
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id_ablated_base,
        quantization_config=quantization_config,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id_ablated_base)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    print(f"  Loading and merging adapters from: {model_id_finetuned_adapters}")
    model = PeftModel.from_pretrained(model, model_id_finetuned_adapters)
    model = model.merge_and_unload()
    print("  Finetuned model loaded and merged.")
    model.eval()

    _score_prompts(model, tokenizer, toxic_prompts_en, 'en', "EN Prompts (Finetuned)", all_scores["finetuned_en"])
    _score_prompts(model, tokenizer, toxic_prompts_hi, 'hi', "HI Prompts (Finetuned)", all_scores["finetuned_hi"])

except Exception as e:
    print(f"Error during Finetuned model testing: {e}")

# Banner that introduces the results section of the output.
print(f"\n{'=' * 80}")
print("--- FINAL RESULTS (Average Toxicity) ---")

def get_avg_score(key, scores=None):
    """Return the mean of the scores stored under ``key``.

    Args:
        key: Name of the score list.
        scores: Mapping from key to list of scores. Defaults to the module-level
            ``all_scores`` so existing one-argument calls behave as before.

    Returns:
        The mean of the list, or NaN when the list is empty. NaN (rather than
        0.0) keeps a run that produced no scores from being reported as a
        model with zero toxicity.
    """
    if scores is None:
        scores = all_scores
    values = scores[key]
    if not values:
        return float("nan")
    return np.mean(values)

# Per-language means for each model.
avg_baseline_en, avg_finetuned_en = get_avg_score("baseline_en"), get_avg_score("finetuned_en")
avg_baseline_hi, avg_finetuned_hi = get_avg_score("baseline_hi"), get_avg_score("finetuned_hi")

# "Mixed" is the unweighted mean of the two language means; it falls back to 0
# unless both languages have at least one score.
if all_scores["baseline_en"] and all_scores["baseline_hi"]:
    avg_baseline_mix = np.mean([avg_baseline_en, avg_baseline_hi])
else:
    avg_baseline_mix = 0

if all_scores["finetuned_en"] and all_scores["finetuned_hi"]:
    avg_finetuned_mix = np.mean([avg_finetuned_en, avg_finetuned_hi])
else:
    avg_finetuned_mix = 0

def print_summary(name, baseline, finetuned, count):
    """Print a labelled baseline-vs-finetuned toxicity comparison and their difference."""
    report_lines = [
        f"\n--- {name} Test ({count} Prompts) ---",
        f"  Average Baseline Toxicity:  {baseline:.4f}",
        f"  Average Finetuned Toxicity: {finetuned:.4f}",
        f"    (Reduction: {baseline - finetuned:.4f})",
    ]
    print("\n".join(report_lines))

# Each row: label, baseline mean, finetuned mean, number of baseline scores behind it.
n_scored_en = len(all_scores["baseline_en"])
n_scored_hi = len(all_scores["baseline_hi"])
summary_rows = [
    ("ENGLISH", avg_baseline_en, avg_finetuned_en, n_scored_en),
    ("HINDI", avg_baseline_hi, avg_finetuned_hi, n_scored_hi),
    ("MIXED (Average)", avg_baseline_mix, avg_finetuned_mix, n_scored_en + n_scored_hi),
]
for summary_row in summary_rows:
    print_summary(*summary_row)

print("\n--- Validation complete. ---")


--- 1. Loading Configuration ---
--- 2. Initializing Perspective API ---
  Perspective API client built successfully.
--- 3. Loading and randomizing toxic prompts from RTPLX ---
Loaded 999 English prompts, randomly sampled 100.
Loaded 1115 Hindi prompts, randomly sampled 100.

--- 4. Running Generation & Scoring ---

--- Processing ORIGINAL Model (meta-llama/Meta-Llama-3.1-8B) ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

EN Prompts (Baseline):   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   1%|          | 1/100 [00:02<03:59,  2.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   2%|▏         | 2/100 [00:04<03:55,  2.41s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   3%|▎         | 3/100 [00:07<03:52,  2.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   4%|▍         | 4/100 [00:09<03:50,  2.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   5%|▌         | 5/100 [00:11<03:47,  2.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Baseline):   6%|▌         | 6/100 [00:14<03:45,  2.39s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prom

--- Clearing VRAM before loading next model ---

--- Processing FINETUNED QLoRA Model ---
  Loading ablated 4-bit base model from: ./edited_model_TOXIC_ABLATED


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  Loading and merging adapters from: ./llama-3.1-8b-ablated-healed-BENIGN-ONLY




  Finetuned model loaded and merged.


EN Prompts (Finetuned):   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   1%|          | 1/100 [00:03<04:59,  3.02s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   2%|▏         | 2/100 [00:05<04:52,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   3%|▎         | 3/100 [00:08<04:49,  2.99s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   4%|▍         | 4/100 [00:11<04:46,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   5%|▌         | 5/100 [00:14<04:42,  2.97s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
EN Prompts (Finetuned):   6%|▌         | 6/100 [00:17<04:37,  2.96s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- FINAL RESULTS (Average Toxicity) ---

--- ENGLISH Test (100 Prompts) ---
  Average Baseline Toxicity:  0.4247
  Average Finetuned Toxicity: 0.4164
    (Reduction: 0.0082)

--- HINDI Test (100 Prompts) ---
  Average Baseline Toxicity:  0.3957
  Average Finetuned Toxicity: 0.2787
    (Reduction: 0.1171)

--- MIXED (Average) Test (200 Prompts) ---
  Average Baseline Toxicity:  0.4102
  Average Finetuned Toxicity: 0.3475
    (Reduction: 0.0627)

--- Validation complete. ---



