In [1]:
from datasets import load_from_disk
import os

In [2]:
dataset_path = "/kaggle/input/datasettvt/conv_data"

In [3]:
# Load datasets
train_dataset = load_from_disk(os.path.join(dataset_path, "therapy_train"))
val_dataset = load_from_disk(os.path.join(dataset_path, "therapy_val"))
test_dataset = load_from_disk(os.path.join(dataset_path, "therapy_test"))

In [4]:
# Function to print 5 examples from a dataset
def print_examples(dataset, name):
    print(f"\n{name} Dataset Samples:")
    for i in range(5):
        print(dataset[i])  # Prints the first 5 examples

# Print examples
print_examples(train_dataset, "Train")
print_examples(val_dataset, "Validation")
print_examples(test_dataset, "Test")



Train Dataset Samples:
{'input_text': 'T: Hi you how to do it today? [SEP] P: Great. How are you?', 'target_text': "I'm doing well. Thanks for asking."}
{'input_text': "T: Hi you how to do it today? [SEP] P: Great. How are you? [SEP] T: I'm doing well. Thanks for asking.", 'target_text': "So you're doing great."}
{'input_text': "T: Hi you how to do it today? [SEP] P: Great. How are you? [SEP] T: I'm doing well. Thanks for asking. [SEP] T: So you're doing great. [SEP] P: I'm doing awesome.", 'target_text': "I know your brother brought you in today and he had expressed some concerns about your mood. Do you know what that's about?"}
{'input_text': "T: Hi you how to do it today? [SEP] P: Great. How are you? [SEP] T: I'm doing well. Thanks for asking. [SEP] T: So you're doing great. [SEP] P: I'm doing awesome. [SEP] T: I know your brother brought you in today and he had expressed some concerns about your mood. Do you know what that's about? [SEP] P: I, I think he's worrying for no reason. 

In [7]:
!pip install transformers datasets bleu bert-score torch


Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting efficiency (from bleu)
  Downloading efficiency-2.0-py3-none-any.whl.metadata (2.5 kB)
Downloading efficiency-2.0-py3-none-any.whl (32 kB)
Building wheels for collected packages: bleu
  Building wheel for bleu (setup.py) ... [?25l[?25hdone
  Created wheel for bleu: filename=bleu-0.3-py3-none-any.whl size=5781 sha256=f91a0c3f74f588838b9ff3737ce849c9e990eed6c5612dd03bb5070cf272630e
  Stored in directory: /root/.cache/pip/wheels/c6/d8/d1/009abe01b8b2c6a14c62d197b510b3cc1076014c22d712c5ce
Successfully built bleu
Installing collected packages: efficiency, bleu
Successfully installed bleu-0.3 efficiency-2.0


In [8]:
import torch
from datasets import load_from_disk
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
import os



In [9]:
# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Dataset class for tokenizing input & target text
class ConversationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        input_text = "predict next utterance: " + sample["input_text"]
        target_text = sample["target_text"]

        # Tokenize input and target
        inputs = self.tokenizer(
            input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt"
        )
        targets = self.tokenizer(
            target_text, max_length=50, padding="max_length", truncation=True, return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze()
        }

# Convert datasets to PyTorch format
train_data = ConversationDataset(train_dataset, tokenizer)
val_data = ConversationDataset(val_dataset, tokenizer)
test_data = ConversationDataset(test_dataset, tokenizer)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Load pre-trained model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Define training parameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=[],

)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


In [11]:
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.4938,1.467256
2,1.4548,1.446475
3,1.4371,1.438808
4,1.4224,1.433112
5,1.4231,1.429076
6,1.3788,1.428678
7,1.3825,1.427278
8,1.321,1.42662
9,1.3678,1.42688
10,1.339,1.427133


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=5010, training_loss=1.6354563094422727, metrics={'train_runtime': 3673.5397, 'train_samples_per_second': 10.91, 'train_steps_per_second': 1.364, 'total_flos': 2.744507468611584e+16, 'train_loss': 1.6354563094422727, 'epoch': 10.0})

In [10]:
model.save_pretrained("/kaggle/working/t5_therapy_model")
tokenizer.save_pretrained("/kaggle/working/t5_therapy_model")


('/kaggle/working/t5_therapy_model/tokenizer_config.json',
 '/kaggle/working/t5_therapy_model/special_tokens_map.json',
 '/kaggle/working/t5_therapy_model/spiece.model',
 '/kaggle/working/t5_therapy_model/added_tokens.json')

In [13]:
def generate_response(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    input_text = "predict next utterance: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
    
    output_ids = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Test predictions
for i in range(5):
    sample_input = test_dataset[i]["input_text"]
    actual_output = test_dataset[i]["target_text"]
    generated_output = generate_response(model, tokenizer, sample_input)

    print(f"\nInput: {sample_input}")
    print(f"Actual: {actual_output}")
    print(f"Generated: {generated_output}")



Input: T: So, alright, let's take a step back and talk about competency. What education Did you need for the job you have now? [SEP] P: I have my bachelor's degree. And we got a little bit of training. You know every so often we get some training at work and stuff like that.
Actual: Are you evaluated at work by anybody to see if you're in a job you should be?
Generated: You have a bachelor's degree?

Input: T: So, alright, let's take a step back and talk about competency. What education Did you need for the job you have now? [SEP] P: I have my bachelor's degree. And we got a little bit of training. You know every so often we get some training at work and stuff like that. [SEP] T: Are you evaluated at work by anybody to see if you're in a job you should be? [SEP] P: Yeah, I have a supervisor so they check up on stuff and also like if I feel like I have questions and stuff like that, I can go to them as well.
Actual: Have you been Found in that system to be somebody who's satisfying the

In [12]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m241.6 kB/s[0m eta [36m0:00:00[0m0:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
import evaluate
from bert_score import score
from tqdm import tqdm

# Load metrics
bleu = evaluate.load("bleu")

# Optional: also load BERTScore from Hugging Face if you prefer
# bertscore = evaluate.load("bertscore")

# Helper function to generate model predictions
def generate_response(model, tokenizer, input_text, max_length=32):
    model.eval()
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Prepare predictions and references
references = []
predictions = []

for i in tqdm(range(len(test_dataset))):
    input_text = test_dataset[i]["input_text"]
    target_text = test_dataset[i]["target_text"]

    prediction = generate_response(model, tokenizer, input_text)
    
    references.append(target_text)
    predictions.append(prediction)

# BLEU Score
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
print("\nBLEU Score:", bleu_score["bleu"])

# BERTScore
P, R, F1 = score(predictions, references, lang="en", verbose=True)
print("\nBERTScore:")
print("F1 mean:", F1.mean().item())
print("Precision mean:", P.mean().item())
print("Recall mean:", R.mean().item())


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

100%|██████████| 968/968 [04:38<00:00,  3.48it/s]



BLEU Score: 0.003317928729847959


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/19 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 5.15 seconds, 187.81 sentences/sec

BERTScore:
F1 mean: 0.8580193519592285
Precision mean: 0.8706005215644836
Recall mean: 0.846301257610321


In [20]:

# Extract BERT F1 score
bert_f1 = F1.mean().item()

# Extract BLEU Score correctly
bleu_value = bleu_score["score"] if isinstance(bleu_score, dict) else bleu_score

# Save scores to a text file
with open("/kaggle/working/evaluation_scores.txt", "w") as file:
    file.write(f"BLEU Score: {bleu_value:.2f}\n")
    file.write(f"BERT Score (F1): {bert_f1:.4f}\n")

print("\nScores saved to evaluation_scores.txt")


Scores saved to evaluation_scores.txt
