# Fine-tuning Toxicity - Evaluation


This notebook contains code used to run toxicity evaluations for the paper associated with: https://github.com/WillHawkins3/finetuningtoxicity

In [None]:
# Log in to the Hugging Face Hub so that models which require authentication
# can be accessed later in the notebook (only required for some models).
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# install
!pip install datasets transformers evaluate -q torch tqdm pandas unidecode trl peft accelerate bitsandbytes triton
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
# imports

import random
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
from google.colab import drive
import evaluate
import torch
from tqdm import tqdm
import requests
from io import StringIO
import os
import json

# Attach Google Drive to the Colab filesystem at the mount point below
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


In [None]:
# load model

# Single source of truth for the checkpoint, so the model and the tokenizer
# can never be loaded from two different locations by mistake.
MODEL_PATH = "MODEL_PATH" # update to include model path from Hugging Face library

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
# trust_remote_code mirrors the model load above: a checkpoint that ships
# custom code may ship a custom tokenizer as well, which would otherwise not
# load non-interactively.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Pad on the left so that, in a batch, every prompt ends at the last input
# position and generation continues directly from the prompt text.
tokenizer.padding_side = 'left'

# Set the padding token to the eos_token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# Number of prompts sent to model.generate() per step in the evaluation cell.
batch_size = 16


In [None]:
# Toxicity evaluation
#
# Pipeline: load the prompt dataset -> greedily generate a continuation for
# every prompt -> score each continuation with the `toxicity` measurement ->
# save per-row results to CSV -> print the dataset-level toxicity ratio.
# Relies on `model`, `tokenizer` and `batch_size` from the model-loading cell.

# The CSV must provide a 'prompt' column and a 'toxicity_score' column
# (both are read below).
df = pd.read_csv('DATA_SET') # specify to load prompt dataset

# Load toxicity metric
toxicity = evaluate.load("toxicity", module_type="measurement")

# Generate model continuations
model_continuations = []

for i in tqdm(range(0, len(df), batch_size)):
    # .iloc makes the slice explicitly positional, whatever the index is.
    batch_prompts = df['prompt'].iloc[i:i+batch_size].tolist()

    # Tokenize the input batch
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Generate the continuations (greedy decoding, at most 50 new tokens)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Keep only the newly generated tokens. generate() returns the (padded)
    # prompt tokens followed by the new tokens, so everything after the input
    # width is the continuation. Slicing at token level is robust, whereas
    # cutting `len(prompt)` characters off the decoded text breaks whenever
    # the decoded prompt differs from the raw string (e.g. the prompt was
    # truncated, or decoding does not reproduce its exact characters).
    prompt_width = inputs["input_ids"].shape[1]
    continuation_ids = outputs[:, prompt_width:]

    # Decode the continuations
    decoded = tokenizer.batch_decode(continuation_ids, skip_special_tokens=True)
    model_continuations.extend(text.strip() for text in decoded)

print(f'Generated {len(model_continuations)} continuations')


# Compute toxicity scores for each continuation
continuation_toxicity_scores = toxicity.compute(predictions=model_continuations)['toxicity']

# Save outputs to a dataframe, and then save to Drive
results_df = pd.DataFrame({
    "prompt": df['prompt'],
    "prompt_toxicity_score": df['toxicity_score'],
    "generation": model_continuations,
    "generation_toxicity_score": continuation_toxicity_scores
})

# Calculate toxicity ratio for each row: generation score relative to the
# prompt score. Note this per-row ratio is a different quantity from the
# dataset-level "toxicity ratio" printed at the end of the cell.
results_df['toxicity_ratio'] = results_df['generation_toxicity_score'] / results_df['prompt_toxicity_score']

# SAVE OUTPUTS TO FILE - remember to change the end point to a unique file!
output_path = 'SAVE_LOCATION' # update to specify location to save the file
results_df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")

# Compute overall toxicity metrics using the measurement's own "ratio"
# aggregation over all continuations.
overall_toxicity_ratio = toxicity.compute(predictions=model_continuations, aggregation="ratio")
print("Overall Toxicity Ratio:", overall_toxicity_ratio)
