In [1]:
!pip install transformers datasets peft accelerate evaluate
!pip install bert_score rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6336bc7a558146cd1f510a8d94c1b036b193c16b80c56396ba59f0688629

In [None]:
!pip install torch

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
from evaluate import load

# 1. Load and prepare dataset
dataset = load_dataset("Rhma/DIALOCONAN")
# Limit to first 2000 examples for faster training
small_dataset = dataset["train"].select(range(3500)) #dataset["train"].select(range(1000))

# Split original train data into train/validation
train_val = small_dataset.train_test_split(test_size=0.15, seed=42)
dataset = DatasetDict({
    "train": train_val["train"],
    "validation": train_val["test"]
})



# 2. Group turns by dialogue_id
def group_dialogues(examples):
    sorted_data = sorted(zip(examples["dialogue_id"], 
                            examples["turn_id"], 
                            examples["text"],
                            examples["type"],
                            examples["TARGET"]),
                       key=lambda x: (x[0], x[1]))
    dialogues = []
    current_dialogue = []
    current_id = None
    for item in sorted_data:
        dialogue_id, turn_id, text, turn_type, target = item
        if dialogue_id != current_id:
            if current_id is not None:
                dialogues.append({
                    "dialogue_id": current_id,
                    "turns": current_dialogue,
                    "target": current_dialogue[0]["target"]
                })
            current_id = dialogue_id
            current_dialogue = []
        current_dialogue.append({
            "text": text,
            "type": turn_type,
            "target": target
        })
    if current_id is not None:
        dialogues.append({
            "dialogue_id": current_id,
            "turns": current_dialogue,
            "target": current_dialogue[0]["target"]
        })
    return {"dialogues": dialogues}

processed_dataset = dataset.map(
    group_dialogues,
    batched=True,
    remove_columns=dataset["train"].column_names,
    batch_size=1000
)

# 3. Create conversation history for each CN turn
def create_conversation_history(examples):
    new_examples = {"input": [], "target": []}
    for dialogue in examples["dialogues"]:
        history = []
        for turn in dialogue["turns"]:
            if turn["type"] == "CN":
                new_examples["input"].append(" [SEP] ".join(history))
                new_examples["target"].append(turn["text"])
            history.append(turn["text"])
    return new_examples

final_dataset = processed_dataset.map(
    create_conversation_history,
    batched=True,
    remove_columns=["dialogues"]
)

# 4. Tokenization for Llama (causal LM)
model_name = "bigscience/bloomz-3b"  # Or use "Qwen/Qwen1.5-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token  # For Llama, set pad_token to eos_token

# ---- Print dataset size and longest sequence length ----

# 1. Dataset size
print(f"Train dataset size: {len(final_dataset['train'])}")
print(f"Validation dataset size: {len(final_dataset['validation'])}")

# 2. Longest sentence length (after tokenization)
def find_longest_sequence(dataset, tokenizer):
    max_length = 0
    longest_input = ""
    for example in dataset:
        input_text = example["input"] + " [ANS] " + example["target"]
        tokenized = tokenizer(input_text, truncation=False, add_special_tokens=False)
        length = len(tokenized["input_ids"])
        if length > max_length:
            max_length = length
            longest_input = input_text
    return max_length, longest_input

longest_train_length, longest_train_input = find_longest_sequence(final_dataset["train"], tokenizer)
longest_val_length, longest_val_input = find_longest_sequence(final_dataset["validation"], tokenizer)

print(f"Longest tokenized sequence length in train dataset: {longest_train_length}")
print(f"Longest tokenized sequence length in validation dataset: {longest_val_length}")

def preprocess_function(examples):
    # For causal LM, concatenate input and target with a separator
    inputs = [inp + " [ANS] " + tgt for inp, tgt in zip(examples["input"], examples["target"])]
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # Labels: same as input for causal LM
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

tokenized_datasets = final_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["input", "target"]
)

# 5. Load model and add LoRA
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Only LoRA layers are trainable

# 6. Training Arguments
training_args = TrainingArguments(
    output_dir="./llama2-lora-dialoconan",
    per_device_train_batch_size=1,  # Lower for 7B model unless you have A100 40GB+
    per_device_eval_batch_size=1,
    num_train_epochs=2,
    fp16=True,
    eval_strategy="steps",
    eval_steps=500,             # Evaluate every 500 steps
    save_strategy="steps",
    save_steps=500,
    logging_strategy="steps",
    logging_steps=10,
    report_to="none",
    eval_accumulation_steps=1,
    learning_rate=1e-5
)

# 7. Metrics
#bertscore = load("bertscore")
#rouge = load("rouge")
bleu = load("bleu")

def compute_metrics(eval_preds, tokenizer, bertscore, rouge, bleu, batch_size=8):
    preds, labels = eval_preds
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Batch decode to reduce peak memory
    def batch_decode(arr, bs):
        for i in range(0, len(arr), bs):
            yield tokenizer.batch_decode(arr[i:i+bs], skip_special_tokens=True)
    
    decoded_preds = []
    decoded_labels = []
    for pred_batch, label_batch in zip(batch_decode(preds, batch_size), batch_decode(labels, batch_size)):
        decoded_preds.extend(pred_batch)
        decoded_labels.extend(label_batch)
    
    # Compute metrics
    #bert_results = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    #rouge_results = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    bleu_results = bleu.compute(predictions=decoded_preds, references=[[ref] for ref in decoded_labels])

    return {
        #"bertscore": np.mean(bert_results["f1"]),
        #"rougeL": rouge_results["rougeL"],
        "bleu": bleu_results["bleu"]
    }
from transformers import TrainerCallback

class ClearCacheCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        import gc, torch
        gc.collect()
        torch.cuda.empty_cache()
        
from functools import partial
compute_fn = partial(compute_metrics, tokenizer=tokenizer, bleu=bleu)

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
    #compute_metrics=compute_fn,
    #callbacks=[ClearCacheCallback()]
)

print("Starting training...")
trainer.train()





README.md:   0%|          | 0.00/452 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16625 [00:00<?, ? examples/s]

Map:   0%|          | 0/2975 [00:00<?, ? examples/s]

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Map:   0%|          | 0/1616 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Train dataset size: 1489
Validation dataset size: 261
Longest tokenized sequence length in train dataset: 188
Longest tokenized sequence length in validation dataset: 118


Map:   0%|          | 0/1489 [00:00<?, ? examples/s]

Map:   0%|          | 0/261 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

trainable params: 4,915,200 || all params: 3,007,472,640 || trainable%: 0.1634


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

  trainer = Trainer(


Starting training...


Step,Training Loss,Validation Loss
500,1.1855,0.985784
1000,1.0744,0.92938
1500,0.8075,0.915014
2000,1.5774,0.909114
2500,0.6583,0.905824


TrainOutput(global_step=2978, training_loss=1.1932959042594606, metrics={'train_runtime': 1074.6805, 'train_samples_per_second': 2.771, 'train_steps_per_second': 2.771, 'total_flos': 5409503756943360.0, 'train_loss': 1.1932959042594606, 'epoch': 2.0})

In [3]:
import torch 
# Updated generation function with device handling
def generate_counterspeech(dialogue_history):
    device = model.device
    input_text = " [SEP] ".join(dialogue_history) + " [ANS] "
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    with torch.inference_mode():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=128,
            num_beams=5,
            repetition_penalty=2.0,
            early_stopping=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Test with sample dialogue
sample_dialogue = [
    
    "You people are ruining our country!",
    "Immigrants are stealing our jobs!",
    "We should send them all back!"
]

print("\nGenerated counterspeech:")
print(generate_counterspeech(sample_dialogue))


Generated counterspeech:
You people are ruining our country! [SEP] Immigrants are stealing our jobs! [SEP] We should send them all back! [ANS]  It's not true that immigrants are stealing our jobs. In fact, they are contributing to the growth of our economy.


In [24]:
!pip install -U nltk
import nltk

# Download necessary resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

# Now you should be able to use WordNet without issues


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Install dependencies if not already installed
!pip install -q evaluate detoxify tqdm
!pip install rouge_score bert_score
from evaluate import load
from detoxify import Detoxify
from tqdm import tqdm
import numpy as np
import math

# Load metrics
rouge = load("rouge")
bertscore = load("bertscore")
# Use first 100 samples
inputs = [ex["input"] for ex in final_dataset["validation"]][:100]
targets = [ex["target"] for ex in final_dataset["validation"]][:100]

# Generate predictions
print("Generating counter speech...")
generated = []
for text in tqdm(inputs, desc="Generating"):
    response = generate_counterspeech(text)  # <-- make sure this function is defined
    generated.append(response)



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Generating counter speech...


Generating: 100%|██████████| 100/100 [03:40<00:00,  2.21s/it]


In [5]:
# BERTScore
print("Calculating BERTScore...")
bertscore_result = bertscore.compute(
    predictions=generated,
    references=targets,
    model_type="distilbert-base-uncased"
)
print(f"BERTScore F1: {np.mean(bertscore_result['f1']):.4f}")

# ROUGE
# ROUGE
print("Calculating ROUGE...")
rouge_result = rouge.compute(predictions=generated, references=targets)
print(f"ROUGE-1 F1: {rouge_result['rouge1']:.4f}")
print(f"ROUGE-2 F1: {rouge_result['rouge2']:.4f}")
print(f"ROUGE-L F1: {rouge_result['rougeL']:.4f}")
# Perplexity
print("Calculating Perplexity...")
def calculate_perplexity(texts):
    total_log_prob = 0.0
    total_words = 0
    for text in texts:
        words = text.split()
        total_words += len(words)
        # You can use a pre-trained language model (e.g., GPT-2) for calculating perplexity
        # Here, we will use a placeholder for the log-prob calculation, which should ideally come from a language model
        # For simplicity, assume a fixed value here
        total_log_prob += len(words) * math.log(1.0)  # Placeholder for log-prob calculation
    return math.exp(-total_log_prob / total_words) if total_words > 0 else float('inf')

perplexity_result = calculate_perplexity(generated)
print(f"Perplexity: {perplexity_result:.4f}")

# Toxicity
print("Calculating Toxicity...")
toxicity_scores = [Detoxify('original').predict(pred)['toxicity'] for pred in tqdm(generated, desc="Toxicity")]
avg_toxicity = np.mean(toxicity_scores)
print(f"Avg. Toxicity Score: {avg_toxicity:.4f}")

Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

BERTScore F1: 0.6175
Calculating ROUGE...
ROUGE-1 F1: 0.0514
ROUGE-2 F1: 0.0031
ROUGE-L F1: 0.0426
Calculating Perplexity...
Perplexity: 1.0000
Calculating Toxicity...


Toxicity:   0%|          | 0/100 [00:00<?, ?it/s]Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.1-alpha/toxic_original-c1212f89.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_original-c1212f89.ckpt

  0%|          | 0.00/418M [00:00<?, ?B/s][A
  3%|▎         | 12.8M/418M [00:00<00:03, 134MB/s][A
 10%|█         | 43.8M/418M [00:00<00:01, 245MB/s][A
 18%|█▊        | 74.6M/418M [00:00<00:01, 281MB/s][A
 25%|██▌       | 105M/418M [00:00<00:01, 296MB/s] [A
 32%|███▏      | 136M/418M [00:00<00:00, 305MB/s][A
 40%|███▉      | 167M/418M [00:00<00:00, 311MB/s][A
 47%|████▋     | 198M/418M [00:00<00:00, 315MB/s][A
 55%|█████▍    | 228M/418M [00:00<00:00, 316MB/s][A
 62%|██████▏   | 259M/418M [00:00<00:00, 319MB/s][A
 69%|██████▉   | 290M/418M [00:01<00:00, 321MB/s][A
 77%|███████▋  | 321M/418M [00:01<00:00, 318MB/s][A
 84%|████████▍ | 352M/418M [00:01<00:00, 320MB/s][A
 92%|█████████▏| 382M/418M [00:01<00:00, 319MB/s][A
100%|██████████| 418M/418M [00:01

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Toxicity: 100%|██████████| 100/100 [00:54<00:00,  1.82it/s]

Avg. Toxicity Score: 0.0027





**TEST 1: unsolth lama**

In [1]:
# If running in Colab or Kaggle, uncomment the following:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes"
!pip install trl

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-kxsqhkmz/unsloth_dfe497d303844d33bf17b0702eada730
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-kxsqhkmz/unsloth_dfe497d303844d33bf17b0702eada730
  Resolved https://github.com/unslothai/unsloth.git to commit 9390bd528d4126840b142d5c354b8c1d7461f41e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
/bin/bash: -c: line 1: unexpected EOF while looking for matching `"'
/bin/bash: -c: line 2: syntax error: unexpected end of file


In [2]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset, DatasetDict

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# 1. Load and prepare dataset
dataset = load_dataset("Rhma/DIALOCONAN")
small_dataset = dataset["train"].select(range(3500))
train_val = small_dataset.train_test_split(test_size=0.15, seed=42)
dataset = DatasetDict({
    "train": train_val["train"],
    "validation": train_val["test"]
})

# 2. Group and preprocess as before
def group_dialogues(examples):
    sorted_data = sorted(zip(examples["dialogue_id"], 
                            examples["turn_id"], 
                            examples["text"],
                            examples["type"],
                            examples["TARGET"]),
                       key=lambda x: (x[0], x[1]))
    dialogues = []
    current_dialogue = []
    current_id = None
    for item in sorted_data:
        dialogue_id, turn_id, text, turn_type, target = item
        if dialogue_id != current_id:
            if current_id is not None:
                dialogues.append({
                    "dialogue_id": current_id,
                    "turns": current_dialogue,
                    "target": current_dialogue[0]["target"]
                })
            current_id = dialogue_id
            current_dialogue = []
        current_dialogue.append({
            "text": text,
            "type": turn_type,
            "target": target
        })
    if current_id is not None:
        dialogues.append({
            "dialogue_id": current_id,
            "turns": current_dialogue,
            "target": current_dialogue[0]["target"]
        })
    return {"dialogues": dialogues}

processed_dataset = dataset.map(
    group_dialogues,
    batched=True,
    remove_columns=dataset["train"].column_names,
    batch_size=1000
)

def create_conversation_history(examples):
    new_examples = {"input": [], "target": []}
    for dialogue in examples["dialogues"]:
        history = []
        for turn in dialogue["turns"]:
            if turn["type"] == "CN":
                new_examples["input"].append(" [SEP] ".join(history))
                new_examples["target"].append(turn["text"])
            history.append(turn["text"])
    return new_examples

final_dataset = processed_dataset.map(
    create_conversation_history,
    batched=True,
    remove_columns=["dialogues"]
)

# 3. Create a "text" field for SFTTrainer
def make_text_field(example):
    # Compose the prompt and target in a single string
    prompt = (
        "Below is a conversation containing hate speech. Write a factual, non-aggressive counterspeech for the last statement.\n\n"
        "Conversation:\n" +
        "\n".join(example["input"].split(" [SEP] ")) +  # Split if your input is still joined by [SEP]
        "\nCounterspeech: "
    )
    return {"text": prompt + example["target"]}


final_dataset = final_dataset.map(make_text_field)

# 4. Load Llama-3.1-8B-Instruct (Unsloth 4bit)
max_seq_length = 128
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
)
model.print_trainable_parameters()

# 5. SFTConfig and Trainer
training_args = SFTConfig(
    dataset_text_field="text",  # <--- CRITICAL FIX
    max_seq_length=max_seq_length,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,
    max_steps=186,
    logging_steps=100,
    output_dir="./llama3.1-8b-unsloth",
    optim="adamw_8bit",
    seed=42,
    learning_rate=2e-4,
    disable_tqdm=False,
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["validation"],
    tokenizer=tokenizer,
    args=training_args,
)

print("Starting Unsloth training...")
trainer.train()

==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
trainable params: 41,943,040 || all params: 8,072,204,288 || trainable%: 0.5196
Starting Unsloth training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,489 | Num Epochs = 1 | Total steps = 186
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
100,0.0305


TrainOutput(global_step=186, training_loss=0.016574088103508435, metrics={'train_runtime': 1421.0858, 'train_samples_per_second': 1.047, 'train_steps_per_second': 0.131, 'total_flos': 6173277413474304.0, 'train_loss': 0.016574088103508435})

In [5]:
!pip install evaluate



In [6]:


import numpy as np

from evaluate import load



# 1. BLEU Score on Validation Set

def compute_bleu_on_validation(trainer, tokenizer, validation_dataset, max_seq_length=128):
    bleu = load("bleu")
    # Generate predictions
    predictions = []
    references = []
    for example in validation_dataset:
        # Prepare input
        input_text = example["text"].split(" [ANS] ")[0] + " [ANS] "
        inputs = tokenizer(input_text, return_tensors="pt", max_length=max_seq_length, truncation=True).to(model.device)
        with torch.inference_mode():
            output = model.generate(
                inputs.input_ids,
                attention_mask=inputs["attention_mask"],
                max_new_tokens=64,
                num_beams=1,
                repetition_penalty=2.0,
                early_stopping=True
            )
        pred = tokenizer.decode(output[0], skip_special_tokens=True)
        # Extract the reference (target counterspeech)
        ref = example["text"].split(" [ANS] ")[1]
        predictions.append(pred.strip())
        references.append([ref.strip()])
    # Compute BLEU
    bleu_score = bleu.compute(predictions=predictions, references=references)
    print(f"BLEU score on validation set: {bleu_score['bleu']:.4f}")
    return bleu_score

# Usage after training:
#bleu_score = compute_bleu_on_validation(trainer, tokenizer, final_dataset["validation"], max_seq_length=128)

# 2. Generate Counterspeech Function

def generate_counterspeech(dialogue_history, model, tokenizer, max_seq_length=128):
    prompt = (
        "Below is a conversation containing hate speech. Write a factual, non-aggressive counterspeech for the last statement.\n\n"
        "Conversation:\n" +
        "\n".join(dialogue_history) +
        "\nCounterspeech:"
    )
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_seq_length,
        truncation=True,
        padding="max_length"
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=64,
            num_beams=1,
            repetition_penalty=2.0,
            early_stopping=True
        )
    counterspeech = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Remove the prompt from the output if the model repeats it
    if "Counterspeech:" in counterspeech:
        counterspeech = counterspeech.split("Counterspeech:")[-1]
    return counterspeech.strip()

  
  

# Example usage:
input_text = (
    "You people are ruining our country!\n"
    "Immigrants are stealing our jobs!\n"
    "We should send them all back!\n"
    
)

print("\nGenerated counterspeech:")
print(generate_counterspeech(input_text, model, tokenizer, max_seq_length=128))



Generated counterspeech:
Below is a conversation containing hate speech. Write a factual, non-aggressive counterspeech for the last statement.

Conversation:
Y
o
u
 
p
e
o
p
l
e
 
a
r
e
 
r
u
i
n
i
n
g
 
o
u
r
 
c
o
u
n
t
r
y
!


I
m
m
i
g
r
a
n
t
s
 
a
r
e
 
s
t
e
a
l
i
n
g
 
o
U U u 
 

 


:


:



:

: :    -   -- -- ——… …... // /.,.,.,....../../............. ------------- --- --- --- === == ####== ##* * * * * * **   ``” ” ” ] [
