##### Run the generations of the abliterated models through a hate-speech (HS) classifier.

In [None]:
import json
import os

import nltk
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import pipeline

# Sentence-splitting resources for nltk.sent_tokenize. Recent NLTK releases
# (3.9+) load 'punkt_tab' instead of the pickled 'punkt' models, so fetch both
# to keep the notebook working on old and new NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')

hf_model_id = 'Hate-speech-CNERG/english-abusive-MuRIL'
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
# Hand the already-loaded tokenizer to the pipeline so that token counts
# computed with `tokenizer` match what the classifier actually sees.
pipe = pipeline("text-classification", model=hf_model_id, tokenizer=tokenizer)
# Upper bound on tokens per sentence, used when checking sentence length.
max_token_length = 512

In [None]:
# Select whose generations get scored (swap the commented line to switch).
# generation_model_id = 'Mistral-7B-Instruct-v0.3-abliterated'
generation_model_id = 'Meta-Llama-3.1-8B-Instruct-abliterated'

# Output files: classifier results and the sentence -> generation index map.
output_path_classifier = f'generations/{generation_model_id}/results.json'
output_path_generation_to_sentence_mappings = f'generations/{generation_model_id}/sentence_mappings.json'

# Input: CSV whose 'generated_answer' column holds the generated texts.
input_path = f"../generation_processed/{generation_model_id}.csv"
data = pd.read_csv(input_path)
generations = data['generated_answer'].tolist()

In [None]:
def split_into_sentences(text):
    """Split *text* into sentences with NLTK's sentence tokenizer.

    Args:
        text: The text to split. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as having no sentences.

    Returns:
        The list produced by ``nltk.sent_tokenize(text)``, or an empty list
        when *text* is not a string.
    """
    return nltk.sent_tokenize(text) if isinstance(text, str) else []

In [None]:
# Flatten all generations into one list of sentences and remember, for every
# sentence, which generation it came from.
all_sentences = []
sentence_to_generation_map = {}  # sent_idx (position in all_sentences) -> text_idx (position in generations)

for text_idx, text in enumerate(generations):
    sentences = split_into_sentences(text)
    # Whitespace-only sentences are dropped and do not consume an index.
    non_empty = [s for s in sentences if s.strip()]
    first_idx = len(all_sentences)
    all_sentences.extend(non_empty)
    sentence_to_generation_map.update(
        (first_idx + offset, text_idx) for offset in range(len(non_empty))
    )

In [None]:
# Make every sentence fit within `max_token_length` tokens.
# Over-long sentences are first cut to their first 1000 characters (a cheap
# heuristic that is normally sufficient) and then shortened further until the
# tokenizer confirms that the result fits.
n_sentences_longer = 0
all_sentences_truncated = []
for sent_idx, sentence in enumerate(all_sentences):
    sentence_num_tokens = len(tokenizer(sentence).input_ids)
    if sentence_num_tokens > max_token_length:
        n_sentences_longer += 1
        sentence_truncated = sentence[0:1000]
        # A character slice does not bound the token count, so keep dropping
        # the last ~10% of characters until the limit holds. The length
        # strictly decreases, so this stops at the empty string at the latest.
        while len(tokenizer(sentence_truncated).input_ids) > max_token_length:
            sentence_truncated = sentence_truncated[: len(sentence_truncated) * 9 // 10]
        all_sentences_truncated.append(sentence_truncated)
    else:
        all_sentences_truncated.append(sentence)

    # Periodic progress report.
    if sent_idx % 5000 == 0:
        print(f"Processed {sent_idx} / {len(all_sentences)} sentences")

print(f"Number of sentences that had to be truncated: {n_sentences_longer}")

In [None]:
# Wrap the truncated sentences in a single-column ("text") Hugging Face dataset.
# Note: this rebinds `data`, which previously held the loaded CSV.
data = dict(text=all_sentences_truncated)
hf_dataset = Dataset.from_dict(data)

In [None]:
# Run the text-classification pipeline over every sentence, in batches.
batch_size = 16
sentence_texts = hf_dataset["text"]
results = pipe(sentence_texts, batch_size=batch_size)

In [None]:
# Save results
# The per-model output directory may not exist yet (e.g. on the first run for
# a new model), so create it before opening the files for writing.
for output_path in (output_path_generation_to_sentence_mappings, output_path_classifier):
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

# Note: JSON object keys are strings, so the integer sentence indices are
# written as strings and must be converted back with int() after loading.
with open(output_path_generation_to_sentence_mappings, "w") as f:
    json.dump(sentence_to_generation_map, f)
print(f"Sentence mappings saved to {output_path_generation_to_sentence_mappings}")

with open(output_path_classifier, "w") as f:
    json.dump(results, f)
print(f"Results saved to {output_path_classifier}")