In [1]:
!pip install datasets evaluate transformers peft accelerate sentencepiece

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py

In [2]:
# -----------------------------------------
# Imports
# -----------------------------------------
import numpy as np
import torch
from datasets import load_from_disk, DatasetDict
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    MarianMTModel, MarianTokenizer,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType
import evaluate


2025-04-14 20:54:10.266053: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744664050.450752      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744664050.498220      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# -----------------------------------------
# 1. Load Dataset
# -----------------------------------------
train_path = "/kaggle/input/convu-dataset/conv_data/therapy_train"
val_path = "/kaggle/input/convu-dataset/conv_data/therapy_val"
test_path = "/kaggle/input/convu-dataset/conv_data/therapy_test"


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
dataset = DatasetDict({
    "train": load_from_disk(train_path),
    "validation": load_from_disk(val_path),
    "test": load_from_disk(test_path)
})

In [6]:
# -----------------------------------------
# 4. Back Translation (Batch & Efficient)
# -----------------------------------------
en_to_fr = "Helsinki-NLP/opus-mt-en-fr"
fr_to_en = "Helsinki-NLP/opus-mt-fr-en"

en2fr_tok = MarianTokenizer.from_pretrained(en_to_fr)
fr2en_tok = MarianTokenizer.from_pretrained(fr_to_en)

en2fr_model = MarianMTModel.from_pretrained(en_to_fr).to(device)
fr2en_model = MarianMTModel.from_pretrained(fr_to_en).to(device)



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
def batch_back_translate(dataset, batch_size=16):
    augmented = []
    for i in tqdm(range(0, len(dataset), batch_size), desc="Back Translating"):
        batch = dataset[i:i+batch_size]
        texts = batch["input_text"]

        # EN → FR
        inputs = en2fr_tok(texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            fr = en2fr_model.generate(**inputs, max_length=128)
        fr_texts = en2fr_tok.batch_decode(fr, skip_special_tokens=True)

        # FR → EN
        inputs = fr2en_tok(fr_texts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            en = fr2en_model.generate(**inputs, max_length=128)
        en_texts = fr2en_tok.batch_decode(en, skip_special_tokens=True)

        for j, new_input in enumerate(en_texts):
            augmented.append({
                "input_text": new_input,
                "target_text": batch["target_text"][j]
            })
    return augmented

In [8]:
from tqdm import tqdm
from datasets import Dataset,concatenate_datasets
bt_augmented = batch_back_translate(dataset["train"], batch_size=16)
bt_dataset = Dataset.from_list(bt_augmented)
combined_train = concatenate_datasets([dataset["train"], bt_dataset])

Back Translating:   0%|          | 0/251 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

Back Translating: 100%|██████████| 251/251 [10:45<00:00,  2.57s/it]


In [9]:
# -----------------------------------------
# 5. Add Difficulty & Curriculum Split
# -----------------------------------------
def add_difficulty(example):
    example["difficulty"] = len(example["target_text"].split())
    return example

combined_train = combined_train.map(add_difficulty, load_from_cache_file=False,keep_in_memory=True)
combined_train = combined_train.sort("difficulty")

Map:   0%|          | 0/8016 [00:00<?, ? examples/s]

In [10]:
n = len(combined_train)
stage1 = combined_train.select(range(int(0.33 * n)))
stage2 = combined_train.select(range(int(0.33 * n), int(0.66 * n)))
stage3 = combined_train.select(range(int(0.66 * n), n))
stages = [stage1, stage2, stage3]

In [11]:
# -----------------------------------------
# 6. Tokenization
# -----------------------------------------
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = [f"Respond appropriately: {x}" for x in examples["input_text"]]
    targets = examples["target_text"]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=32, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:
tokenized_stages = [stage.map(preprocess_function, batched=True, load_from_cache_file=False,keep_in_memory=True) for stage in stages]
val_dataset = dataset["validation"].map(preprocess_function, batched=True, load_from_cache_file=False,keep_in_memory=True)
test_dataset = dataset["test"].map(preprocess_function, batched=True, load_from_cache_file=False,keep_in_memory=True)

Map:   0%|          | 0/2645 [00:00<?, ? examples/s]

Map:   0%|          | 0/2645 [00:00<?, ? examples/s]

Map:   0%|          | 0/2726 [00:00<?, ? examples/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/968 [00:00<?, ? examples/s]

In [13]:
# -----------------------------------------
# 7. Load Model with LoRA
# -----------------------------------------
base_model = T5ForConditionalGeneration.from_pretrained(model_name)
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(base_model, lora_config)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # longer, since early stopping will handle exit
    predict_with_generate=True,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=50,
    logging_dir="./logs",
    report_to=[],
                       
)


In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
    data_collator=data_collator, 
)


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
for i, stage in enumerate(tokenized_stages):
    print(f"\n🚀 Training on Curriculum Stage {i+1}")
    trainer.train_dataset = stage
    trainer.train(resume_from_checkpoint=False)
    trainer.save_model(f"./checkpoint_stage_{i+1}")

# Final model save
trainer.save_model("./curriculum_trained_model")


🚀 Training on Curriculum Stage 1


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,22.7525,11.288176
2,5.0623,4.368904
3,4.2589,3.880708
4,3.428,3.249779
5,2.6961,2.879253
6,2.264,2.656487
7,1.9644,2.569405
8,1.8027,2.511531
9,1.6689,2.49269
10,1.6669,2.482574



🚀 Training on Curriculum Stage 2


Epoch,Training Loss,Validation Loss
1,2.4488,2.238115
2,2.2562,2.172983
3,2.1386,2.140065
4,2.1411,2.119948
5,2.0791,2.110487
6,2.0782,2.101434
7,2.0503,2.096441
8,2.0627,2.092739
9,2.0119,2.090856
10,2.027,2.090077



🚀 Training on Curriculum Stage 3


Epoch,Training Loss,Validation Loss
1,3.4207,2.092771
2,3.3882,2.09063
3,3.3784,2.088145
4,3.3429,2.090895
5,3.3271,2.096545
6,3.2886,2.096918
7,3.3002,2.097763
8,3.3341,2.098172
9,3.311,2.099103
10,3.3317,2.098271


In [27]:
!pip install bert_score



In [42]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")
model = T5ForConditionalGeneration.from_pretrained("./curriculum_trained_model").to(device)
tokenizer = T5Tokenizer.from_pretrained("./curriculum_trained_model")
model.eval()

dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=default_data_collator)

predictions, references = [], []

for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,  # <- This helps prevent repeating tokens
        repetition_penalty=1.2   # <- Optional, to penalize word reuse
    )
        

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

    for pred, label in zip(preds, labels):
        if label.strip():
            predictions.append(pred)
            references.append([label])




In [43]:
final_bertscore = bertscore.compute(
    predictions=predictions,
    references=[ref[0] for ref in references],
    lang="en"
)

bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=references)

print(f"\n📊 Filtered BERTScore:")
print(f"F1:        {np.mean(final_bertscore['f1']):.4f}")
print(f"Precision: {np.mean(final_bertscore['precision']):.4f}")
print(f"Recall:    {np.mean(final_bertscore['recall']):.4f}")

print(f"\n📊 BLEU Score: {bleu_score['bleu']:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📊 Filtered BERTScore:
F1:        0.8482
Precision: 0.8499
Recall:    0.8469

📊 BLEU Score: 0.0080


In [52]:
predictions, references, input_texts = [], [], []

for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=3,
            repetition_penalty=1.2
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    inputs = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)

    for inp, pred, label in zip(inputs, preds, labels):
        if label.strip():
            input_texts.append(inp)
            predictions.append(pred)
            references.append(label)

# Save to CSV
df = pd.DataFrame({
    "input_text": input_texts,
    "reference": references,
    "prediction": predictions
})
df.to_csv("sample_predictions.csv", index=False)
print("✅ Saved all predictions to 'all_predictions.csv'")

✅ Saved all predictions to 'all_predictions.csv'


In [63]:
from transformers import pipeline
emotion_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", device=0 if torch.cuda.is_available() else -1)

# Load emotion rewriter
rewriter_model_name = "mrm8488/t5-base-finetuned-emotion"
rewriter_tokenizer = T5Tokenizer.from_pretrained(rewriter_model_name)
rewriter_model = T5ForConditionalGeneration.from_pretrained(rewriter_model_name).to(device)

# Function to rewrite prediction based on emotion
def rewrite_with_emotion(pred, target_emotion):
    prompt = f"rewrite with {target_emotion} emotion: {pred}"
    inputs = rewriter_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = rewriter_model.generate(**inputs, max_length=64, num_beams=5)
    return rewriter_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Evaluate with emotion alignment
dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=default_data_collator)

predictions, references = [], []

for batch in dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=5 , # Beam search
            no_repeat_ngram_size=3,  # <- This helps prevent repeating tokens
            repetition_penalty=1.2   
            
        )

    preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

    for pred, label in zip(preds, labels):
        if not label.strip():
            continue

        # Emotion detection
        pred_emotion = emotion_classifier(pred)[0]['label']
        label_emotion = emotion_classifier(label)[0]['label']

        # Rewrite if mismatch
        if pred_emotion != label_emotion:
            pred = rewrite_with_emotion(pred, label_emotion)

        predictions.append(pred)
        references.append([label])

Device set to use cuda:0


In [64]:
final_bertscore = bertscore.compute(
    predictions=predictions,
    references=[ref[0] for ref in references],
    lang="en"
)

bleu = evaluate.load("bleu")
bleu_score = bleu.compute(predictions=predictions, references=references)

print(f"\n📊 Filtered BERTScore:")
print(f"F1:        {np.mean(final_bertscore['f1']):.4f}")
print(f"Precision: {np.mean(final_bertscore['precision']):.4f}")
print(f"Recall:    {np.mean(final_bertscore['recall']):.4f}")

print(f"\n📊 BLEU Score: {bleu_score['bleu']:.4f}")


📊 Filtered BERTScore:
F1:        0.8332
Precision: 0.8409
Recall:    0.8261

📊 BLEU Score: 0.0087
