In [None]:
# imports
import tensorflow as tf
from transformers import TFT5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

2025-06-02 14:33:45.215155: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-02 14:33:48.635278: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-02 14:33:48.640962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-02 14:33:49.279433: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-02 14:33:50.551057: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# loading the dataset
dataset = load_dataset("empathetic_dialogues", cache_dir="./cache")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
print("Training samples:", len(train_dataset))
print("Validation samples:", len(val_dataset))

Training samples: 76673
Validation samples: 12030


In [12]:
# loading the T5 model and tokenizer
model_name = "google-t5/t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = TFT5ForConditionalGeneration.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [13]:
def preprocess_data(examples):
    inputs = []
    targets = []
    
    # Group utterances by conversation ID
    conv_id_to_dialogue = {}
    for example in zip(examples["conv_id"], examples["context"], examples["utterance"], examples["speaker_idx"]):
        conv_id, emotion, utterance, speaker_idx = example
        if not isinstance(utterance, str) or not utterance.strip():  # Skip invalid utterances
            continue
        if conv_id not in conv_id_to_dialogue:
            conv_id_to_dialogue[conv_id] = {"emotion": emotion, "utterances": [], "speaker_indices": []}
        conv_id_to_dialogue[conv_id]["utterances"].append(utterance)
        conv_id_to_dialogue[conv_id]["speaker_indices"].append(speaker_idx)
    
    # Create input-target pairs
    for conv_id, data in conv_id_to_dialogue.items():
        emotion = data["emotion"]
        utterances = data["utterances"]
        speaker_indices = data["speaker_indices"]
        for i in range(len(utterances)):
            if speaker_indices[i] == 1:  # Responder's turn
                context = " ".join(utterances[max(0, i-2):i]) if i > 0 else ""
                input_text = f"emotion: {emotion} context: {context}"
                inputs.append(input_text)
                targets.append(utterances[i])
    
    if not inputs or not targets:  # Check for empty lists
        raise ValueError("No valid input-target pairs found in the dataset")
    
    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="np"
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        model_targets = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding="max_length",
            return_tensors="np"
        )
    
    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": model_targets["input_ids"]
    }

train_encodings = preprocess_data(train_dataset)
val_encodings = preprocess_data(val_dataset)

In [14]:
def convert_to_tf_dataset(encodings, batch_size=4):
    dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": tf.cast(encodings["input_ids"], tf.int32),
        "attention_mask": tf.cast(encodings["attention_mask"], tf.int32),
        "labels": tf.cast(encodings["labels"], tf.int32)
    })
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_tf_dataset = convert_to_tf_dataset(train_encodings)
val_tf_dataset = convert_to_tf_dataset(val_encodings)

In [15]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss= tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

In [None]:
history = model.fit(
    train_tf_dataset,
    validation_data=val_tf_dataset,
    epochs=3,
    verbose=1
)
model.save_pretrained("ai_model/fine_tuned_t5_empathetic_base")
tokenizer.save_pretrained("ai_model/fine_tuned_t5_empathetic_base")

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
# Test on a sample
sample = val_dataset[0]
conv_id = sample["conv_id"]
emotion = sample["context"]
context = sample["utterance"] if sample["speaker_idx"] == 0 else ""
input_text = f"emotion: {emotion} context: {context}"
inputs = tokenizer(input_text, return_tensors="tf", max_length=128, truncation=True, padding=True)
outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=128)
predicted_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Input:", input_text)
print("Predicted Response:", predicted_response)
print("True Response:", sample["utterance"] if sample["speaker_idx"] == 1 else "N/A (Speaker turn)")

Input: emotion: terrified context: 
Predicted Response: I'm going to be going to the gym next weekend.
True Response: N/A (Speaker turn)


In [9]:
# Find a responder turn in the validation set
for sample in val_dataset:
    if sample["speaker_idx"] == 1:  # Responder turn
        conv_id = sample["conv_id"]
        emotion = sample["context"]
        # Get prior context from the same conversation
        context = ""
        for ex in val_dataset:
            if ex["conv_id"] == conv_id and ex["speaker_idx"] == 0 and ex["utterance_idx"] < sample["utterance_idx"]:
                context = ex["utterance"]
                break
        input_text = f"emotion: {emotion} context: {context}"
        inputs = tokenizer(input_text, return_tensors="tf", max_length=128, truncation=True, padding=True)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=5,
            num_beams=4,
            do_sample=True,
            top_k=50,
            top_p=0.9
        )
        predicted_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Input:", input_text)
        print("Predicted Response:", predicted_response)
        print("True Response:", sample["utterance"])
        break  # Stop after one sample

Input: emotion: joyful context: 
Predicted Response: I'm going to be going to college next weekend. I'm going to be going to college next weekend.
True Response: That's wonderful. How long have you guys been dating?


In [10]:
# Find a responder turn with context
for sample in val_dataset:
    if sample["speaker_idx"] == 1:  # Responder turn
        conv_id = sample["conv_id"]
        emotion = sample["context"]
        # Get the most recent prior speaker utterance
        context = ""
        for ex in val_dataset:
            if ex["conv_id"] == conv_id and ex["speaker_idx"] == 0 and ex["utterance_idx"] < sample["utterance_idx"]:
                context = ex["utterance"]  # Use the latest speaker utterance
        if not context:  # Skip if no context found
            context = f"Iâ€™m feeling {emotion}."
        input_text = f"emotion: {emotion} context: {context}"
        inputs = tokenizer(input_text, return_tensors="tf", max_length=128, truncation=True, padding=True)
        print("Tokenized Input IDs:", inputs["input_ids"].numpy())  # Debug input
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=5,
            num_beams=4,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            no_repeat_ngram_size=2
        )
        predicted_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Input:", input_text)
        print("Predicted Response:", predicted_response)
        print("True Response:", sample["utterance"])
        break  # Stop after one sample

Tokenized Input IDs: [[13868    10 26128  2625    10    27    22    51  1829 26128     5     1]]
Input: emotion: joyful context: Iâ€™m feeling joyful.
Predicted Response: Oh no_comma_ how come?
True Response: That's wonderful. How long have you guys been dating?


In [15]:
# Evaluate a responder turn with valid context
found_valid_sample = False
for sample in val_dataset:
    if sample["speaker_idx"] == 1:  # Responder turn
        conv_id = sample["conv_id"]
        emotion = sample["context"]
        utterance_idx = sample["utterance_idx"]
        # Get the most recent prior speaker utterance
        context = ""
        prior_utterances = [
            ex["utterance"]
            for ex in val_dataset
            if ex["conv_id"] == conv_id
            and ex["speaker_idx"] == 0
            and ex["utterance_idx"] < utterance_idx
        ]
        if prior_utterances:
            context = prior_utterances[-1]  # Use the latest speaker utterance
        else:
            context = f"Iâ€™m celebrating a happy moment." if emotion in ["joyful", "happy"] else f"Iâ€™m feeling {emotion}."
        input_text = f"emotion: {emotion} context: {context}"
        inputs = tokenizer(input_text, return_tensors="tf", max_length=128, truncation=True, padding=True)
        print("Tokenized Input IDs:", inputs["input_ids"].numpy())  # Debug input
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=10,
            num_beams=6,
            do_sample=False,  # Prioritize coherence
            no_repeat_ngram_size=2
        )
        predicted_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("Input:", input_text)
        print("Predicted Response:", predicted_response)
        print("True Response:", sample["utterance"])
        found_valid_sample = True
        break

if not found_valid_sample:
    print("No valid responder turn with context found in the validation set.")

Tokenized Input IDs: [[13868    10 26128  2625    10    27    22    51 11646     3     9  1095
    798     5     1]]
Input: emotion: joyful context: Iâ€™m celebrating a happy moment.
Predicted Response: Iâ€™m happy to have a happy moment.
True Response: That's wonderful. How long have you guys been dating?


In [11]:
import tensorflow as tf
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

def generate_empathetic_response(emotion, context, model_path="ai_model/fine_tuned_t5_empathetic"):
    """
    Generate an empathetic response using the fine-tuned T5 model.
    
    Args:
        emotion (str): The emotion label (e.g., "joyful", "sad", "terrified").
        context (str): The conversational context (e.g., "I just got a promotion!").
        model_path (str): Path to the saved model and tokenizer.
    
    Returns:
        str: The generated empathetic response.
    """
    try:
        # Load the tokenizer and model
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        model = TFT5ForConditionalGeneration.from_pretrained(model_path)
        
        # Format input to match training data
        input_text = f"emotion: {emotion} context: {context}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="tf",
            max_length=128,
            truncation=True,
            padding=True
        )
        
        # Generate response
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=10,
            num_beams=6,
            do_sample=False,  # Deterministic for consistency
            no_repeat_ngram_size=2
        )
        
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Print input and output
        print("Input:", input_text)
        print("Generated Response:", response)
        
        return response
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Example usage
if __name__ == "__main__":
    # Test cases
    test_cases = [
        {"emotion": "joyful", "context": "I just got accepted into my dream college!"},
        {"emotion": "sad", "context": "I lost my job today and feel really down."},
        {"emotion": "terrified", "context": "I heard strange noises in my house last night."}
    ]
    
    for case in test_cases:
        print("\nTest Case:")
        generate_empathetic_response(case["emotion"], case["context"])
        print("-" * 50)


Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Input: emotion: joyful context: I just got accepted into my dream college!
Generated Response: I'm so glad I got a job!
--------------------------------------------------

Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Input: emotion: sad context: I lost my job today and feel really down.
Generated Response: I haven't been able to work for a long time.
--------------------------------------------------

Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Input: emotion: terrified context: I heard strange noises in my house last night.
Generated Response: I heard strange noises in my house last night.
--------------------------------------------------


In [17]:
import tensorflow as tf
from transformers import TFT5ForConditionalGeneration, T5Tokenizer

def generate_empathetic_response(emotion, context, model_path="ai_model/fine_tuned_t5_empathetic"):
    """
    Generate an empathetic response using the fine-tuned T5 model.
    
    Args:
        emotion (str): The emotion label (e.g., "joyful", "sad", "terrified").
        context (str): The conversational context (e.g., "I just got a promotion!").
        model_path (str): Path to the saved model and tokenizer.
    
    Returns:
        str: The generated empathetic response.
    """
    try:
        # Clear GPU memory
        tf.keras.backend.clear_session()
        
        # Load the tokenizer and model
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        model = TFT5ForConditionalGeneration.from_pretrained(model_path)
        
        # Format input with task prefix to match training objective
        input_text = f"generate empathetic response: emotion: {emotion} context: {context}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="tf",
            max_length=128,
            truncation=True,
            padding=True
        )
        
        # Debug: Print tokenized input
        print("Tokenized Input IDs:", inputs["input_ids"].numpy())
        
        # Generate response
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=128,
            min_length=10,
            num_beams=8,  # Increase for coherence
            do_sample=True,  # Enable sampling for creativity
            top_k=50,
            top_p=0.9,
            temperature=0.7,  # Balance creativity and coherence
            no_repeat_ngram_size=3  # Allow some repetition for naturalness
        )
        
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Print input and output
        print("Input:", input_text)
        print("Generated Response:", response)
        
        return response
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Example usage with your test cases
if __name__ == "__main__":
    test_cases = [
        {"emotion": "joyful", "context": "I just got accepted into my dream college!"},
        {"emotion": "sad", "context": "I lost my job today and feel really down."},
        {"emotion": "terrified", "context": "I heard strange noises in my house last night."}
    ]
    
    for case in test_cases:
        print("\nTest Case:")
        generate_empathetic_response(case["emotion"], case["context"])
        print("-" * 50)


Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Tokenized Input IDs: [[ 3806     3    15    51 27826  1773    10 13868    10 26128  2625    10
     27   131   530  4307   139    82  2461  1900    55     1]]
Input: generate empathetic response: emotion: joyful context: I just got accepted into my dream college!
Generated Response: I've been accepted into my dream college. I'm going to have a dream college!
--------------------------------------------------

Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Tokenized Input IDs: [[ 3806     3    15    51 27826  1773    10 13868    10  6819  2625    10
     27  1513    82   613   469    11   473   310   323     5     1]]
Input: generate empathetic response: emotion: sad context: I lost my job today and feel really down.
Generated Response: I don't have a job. It's a good job. I'm sorry I didn't work for a long time.
--------------------------------------------------

Test Case:


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at ai_model/fine_tuned_t5_empathetic.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Tokenized Input IDs: [[ 3806     3    15    51 27826  1773    10 13868    10 31539  2625    10
     27  1943  6765  4661     7    16    82   629   336   706     5     1]]
Input: generate empathetic response: emotion: terrified context: I heard strange noises in my house last night.
Generated Response: I heard strange noises in my house last night. I heard a lot of noises from my house.
--------------------------------------------------


In [11]:
print (val_dataset[0])

{'conv_id': 'hit:3_conv:6', 'utterance_idx': 1, 'context': 'terrified', 'prompt': 'Today_comma_as i was leaving for work in the morning_comma_i had a tire burst in the middle of a busy road. That scared the hell out of me!', 'speaker_idx': 6, 'utterance': 'Today_comma_as i was leaving for work in the morning_comma_i had a tire burst in the middle of a busy road. That scared the hell out of me!', 'selfeval': '4|5|5_5|5|5', 'tags': ''}


In [12]:
from datasets import load_metric
bleu = load_metric("bleu")

predictions = []
references = []
for example in val_dataset.select(range(50)):
    conv_id = example["conv_id"]
    emotion = example["context"]
    context = example["utterance"] if example["speaker_idx"] == 0 else ""
    input_text = f"emotion: {emotion} context: {context}"
    inputs = tokenizer(input_text, return_tensors="tf", max_length=128, truncation=True, padding=True)
    outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=128)
    pred_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if example["speaker_idx"] == 1:
        predictions.append(pred_response.split())
        references.append([example["utterance"].split()])

results = bleu.compute(predictions=predictions, references=references)
print("BLEU Score:", results["bleu"])

  bleu = load_metric("bleu")
Downloading builder script: 6.06kB [00:00, 7.95MB/s]                   
Downloading extra modules: 4.07kB [00:00, 10.8MB/s]                   


KeyboardInterrupt: 

In [1]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from difflib import SequenceMatcher

2025-06-02 16:22:49.363347: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-02 16:22:52.230844: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-02 16:22:52.235678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-02 16:22:52.836749: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-02 16:22:53.965243: I tensorflow/core/platform/cpu_feature_guar

In [2]:
dataset = load_dataset("empathetic_dialogues", cache_dir="./cache")
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
print("Training samples:", len(train_dataset))
print("Validation samples:", len(val_dataset))

Training samples: 76673
Validation samples: 12030


In [3]:
model_name = "sshleifer/distilbart-cnn-6-6"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      

In [4]:
def preprocess_data(examples, task_prefix="generate empathetic response: "):
    inputs = []
    targets = []
    
    # Group utterances by conversation ID
    conv_id_to_dialogue = {}
    for example in zip(examples["conv_id"], examples["context"], examples["utterance"], examples["speaker_idx"]):
        conv_id, emotion, utterance, speaker_idx = example
        if not isinstance(utterance, str) or not utterance.strip() or len(utterance.split()) < 3:
            continue
        if conv_id not in conv_id_to_dialogue:
            conv_id_to_dialogue[conv_id] = {"emotion": emotion, "utterances": [], "speaker_indices": []}
        conv_id_to_dialogue[conv_id]["utterances"].append(utterance)
        conv_id_to_dialogue[conv_id]["speaker_indices"].append(speaker_idx)
    
    # Create input-target pairs
    for conv_id, data in conv_id_to_dialogue.items():
        emotion = data["emotion"]
        utterances = data["utterances"]
        speaker_indices = data["speaker_indices"]
        for i in range(len(utterances)):
            if speaker_indices[i] == 1:  # Responder turn
                context = " ".join(utterances[max(0, i-3):i]) if i > 0 else f"Iâ€™m feeling {emotion}."
                input_text = f"{task_prefix}emotion: {emotion} context: {context}"
                target = utterances[i]
                # Filter out targets too similar to context or too generic
                similarity = SequenceMatcher(None, context.lower(), target.lower()).ratio()
                if similarity < 0.7 and len(target.split()) >= 5 and target.lower() not in ["thatâ€™s nice.", "okay.", "cool."]:
                    inputs.append(input_text)
                    targets.append(target)
    
    if not inputs or not targets:
        raise ValueError("No valid input-target pairs found in the dataset")
    
    # Tokenize
    encodings = tokenizer(
        inputs,
        max_length=64,  # Reduced for GPU
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    target_encodings = tokenizer(
        targets,
        max_length=64,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": target_encodings["input_ids"]
    }

train_encodings = preprocess_data(train_dataset)
val_encodings = preprocess_data(val_dataset)

# Create PyTorch dataset
from torch.utils.data import Dataset
class EmpatheticDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.encodings["labels"][idx]
        }
    
    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = EmpatheticDataset(train_encodings)
val_dataset = EmpatheticDataset(val_encodings)

In [5]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=8,
    per_device_train_batch_size=2,  # Small batch size for GPU
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 8
    fp16=True,  # Mixed precision
    learning_rate=3e-5,
    logging_steps=100,
    save_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs",
    report_to="none"
)

In [6]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)
trainer.train()
model.save_pretrained("ai_model/fine_tuned_distilbart_empathetic")
tokenizer.save_pretrained("ai_model/fine_tuned_distilbart_empathetic")

Epoch,Training Loss,Validation Loss
1,0.7661,0.620654
2,0.6136,0.60592
3,0.4776,0.624932
4,0.37,0.668107
5,0.2902,0.734318
6,0.2292,0.777126
7,0.187,0.807861
8,0.1575,0.824522


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('ai_model/fine_tuned_distilbart_empathetic/tokenizer_config.json',
 'ai_model/fine_tuned_distilbart_empathetic/special_tokens_map.json',
 'ai_model/fine_tuned_distilbart_empathetic/vocab.json',
 'ai_model/fine_tuned_distilbart_empathetic/merges.txt',
 'ai_model/fine_tuned_distilbart_empathetic/added_tokens.json')

In [8]:
def generate_empathetic_response(emotion, context, model_path="ai_model/fine_tuned_distilbart_empathetic"):
    """
    Generate an empathetic response using the fine-tuned DistilBART model, replacing '_comma_' with ','.
    
    Args:
        emotion (str): The emotion label (e.g., "joyful", "sad", "terrified").
        context (str): The conversational context (e.g., "I just got a promotion!").
        model_path (str): Path to the saved model and tokenizer.
    
    Returns:
        str: The generated empathetic response with '_comma_' replaced by ','.
    """
    try:
        # Load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        # Format input
        input_text = f"generate empathetic response: emotion: {emotion} context: {context}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=64,
            truncation=True,
            padding=True
        ).to(device)
        
        # Debug: Print tokenized input
        print("Tokenized Input IDs:", inputs["input_ids"].cpu().numpy())
        
        # Generate response
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=64,
            min_length=10,
            num_beams=8,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            temperature=0.7,
            no_repeat_ngram_size=3
        )
        
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Replace '_comma_' with ','
        clean_response = response.replace("_comma_", ",")
        
        # Print input and output
        print("Input:", input_text)
        print("Generated Response:", clean_response)
        
        return clean_response
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Test cases
if __name__ == "__main__":
    test_cases = [
        {"emotion": "joyful", "context": "I just got accepted into my dream college!"},
        {"emotion": "sad", "context": "I lost my job today and feel really down."},
        {"emotion": "terrified", "context": "I heard strange noises in my house last night."},
        {"emotion": "excited", "context": "Iâ€™m starting a new job next week!"},
        {"emotion": "frustrated", "context": "My computer crashed and I lost my project."}
    ]
    for case in test_cases:
        print("\nTest Case:")
        generate_empathetic_response(case["emotion"], case["context"])
        print("-" * 50)


Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 32076  5377
     35    38    95   300  3903    88   127  3366  1564   328     2]]
Input: generate empathetic response: emotion: joyful context: I just got accepted into my dream college!
Generated Response: Yea, it is. It's a lot of fun!
--------------------------------------------------

Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35  5074  5377
     35    38   685   127   633   452     8   619   269   159     4     2]]
Input: generate empathetic response: emotion: sad context: I lost my job today and feel really down.
Generated Response: I'm sorry to hear that. How long have you been without your job?
--------------------------------------------------

Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 19419  5377
     35    38  1317  7782 27903    11   127   790    94   363     4     2]]
Input: genera

In [9]:
def generate_empathetic_response(emotion, context, model_path="ai_model/fine_tuned_distilbart_empathetic"):
    """
    Generate an empathetic response using the fine-tuned DistilBART model, replacing '_comma_' with ','.
    
    Args:
        emotion (str): The emotion label (e.g., "joyful", "sad", "terrified").
        context (str): The conversational context (e.g., "I just got a promotion!").
        model_path (str): Path to the saved model and tokenizer.
    
    Returns:
        str: The generated empathetic response with '_comma_' replaced by ','.
    """
    try:
        # Load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        # Format input
        input_text = f"generate empathetic response: emotion: {emotion} context: {context}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=64,
            truncation=True,
            padding=True
        ).to(device)
        
        # Debug: Print tokenized input
        print("Tokenized Input IDs:", inputs["input_ids"].cpu().numpy())
        
        # Generate response
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=64,
            min_length=10,
            num_beams=8,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.8,
            no_repeat_ngram_size=3
        )
        
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Replace '_comma_' with ','
        clean_response = response.replace("_comma_", ",")
        
        # Print input and output
        print("Input:", input_text)
        print("Generated Response:", clean_response)
        
        return clean_response
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Test cases
if __name__ == "__main__":
    test_cases = [
        {"emotion": "joyful", "context": "I just got accepted into my dream college!"},
        {"emotion": "sad", "context": "I lost my job today and feel really down."},
        {"emotion": "terrified", "context": "I heard strange noises in my house last night."},
        {"emotion": "excited", "context": "Iâ€™m starting a new job next week!"},
        {"emotion": "frustrated", "context": "My computer crashed and I lost my project."}
    ]
    for case in test_cases:
        print("\nTest Case:")
        generate_empathetic_response(case["emotion"], case["context"])
        print("-" * 50)


Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 32076  5377
     35    38    95   300  3903    88   127  3366  1564   328     2]]
Input: generate empathetic response: emotion: joyful context: I just got accepted into my dream college!
Generated Response: Yea, it is. It's a lot of fun!
--------------------------------------------------

Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35  5074  5377
     35    38   685   127   633   452     8   619   269   159     4     2]]
Input: generate empathetic response: emotion: sad context: I lost my job today and feel really down.
Generated Response: Oh no, why did you lose your job?
--------------------------------------------------

Test Case:
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 19419  5377
     35    38  1317  7782 27903    11   127   790    94   363     4     2]]
Input: generate empathetic response: emotion

In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, pipeline

def chat_with_bot(prompt, model_path="ai_model/fine_tuned_distilbart_empathetic"):
    """
    Generate an empathetic response from a single user prompt by inferring emotion.
    
    Args:
        prompt (str): User input text (e.g., "I just got accepted into my dream college!").
        model_path (str): Path to the fine-tuned DistilBART model.
    
    Returns:
        str: Empathetic response with '_comma_' replaced by ','.
    """
    try:
        # Clear GPU memory
        torch.cuda.empty_cache()
        
        # Load emotion classifier
        emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None,
            device=0 if torch.cuda.is_available() else -1
        )
        
        # Infer emotion from prompt
        emotion_scores = emotion_classifier(prompt)[0]
        # Map to Empathetic Dialogues emotions
        emotion_map = {
            "joy": "joyful",
            "sadness": "sad",
            "fear": "terrified",
            "anger": "angry",
            "surprise": "excited",
            "disgust": "frustrated",
            "neutral": "neutral"
        }
        # Select highest-scoring emotion
        top_emotion = max(emotion_scores, key=lambda x: x["score"])["label"]
        emotion = emotion_map.get(top_emotion, "neutral")
        
        # Load DistilBART model and tokenizer
        tokenizer = BartTokenizer.from_pretrained(model_path)
        model = BartForConditionalGeneration.from_pretrained(model_path)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        
        # Format input
        input_text = f"generate empathetic response: emotion: {emotion} context: {prompt}"
        
        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=64,
            truncation=True,
            padding=True
        ).to(device)
        
        # Debug: Print tokenized input and inferred emotion
        print("Inferred Emotion:", emotion)
        print("Tokenized Input IDs:", inputs["input_ids"].cpu().numpy())
        
        # Generate response
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=64,
            min_length=10,
            num_beams=8,
            do_sample=True,
            top_k=50,
            top_p=0.9,
            temperature=0.8,  # Increased for diversity
            no_repeat_ngram_size=3
        )
        
        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Replace '_comma_' with ','
        clean_response = response.replace("_comma_", ",")
        
        # Print input and output
        print("User Prompt:", prompt)
        print("Generated Response:", clean_response)
        
        return clean_response
    
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

# Interactive chatbot loop
def run_chatbot(model_path="ai_model/fine_tuned_distilbart_empathetic"):
    """
    Run an interactive chatbot session.
    
    Args:
        model_path (str): Path to the fine-tuned DistilBART model.
    """
    print("Welcome to the Empathetic Chatbot! Type your message or 'quit' to exit.")
    conversation_log = []
    
    while True:
        prompt = input("You: ").strip()
        if prompt.lower() == "quit":
            break
        if not prompt:
            print("Please enter a message.")
            continue
        
        response = chat_with_bot(prompt, model_path)
        if response:
            conversation_log.append({"prompt": prompt, "response": response})
    
    # Save conversation log
    import json
    with open("conversation_log.json", "w") as f:
        json.dump(conversation_log, f, indent=2)
    print("Conversation saved to conversation_log.json")
    print("Goodbye!")

# Test cases
if __name__ == "__main__":
    test_prompts = [
        "I just got accepted into my dream college!",
        "I lost my job today and feel really down.",
        "I heard strange noises in my house last night.",
        "Iâ€™m starting a new job next week!",
        "My computer crashed and I lost my project."
    ]
    print("Running test cases:")
    for prompt in test_prompts:
        print("\nTest Case:")
        chat_with_bot(prompt)
        print("-" * 50)
    
    # Start interactive chatbot
    print("\nStarting interactive chatbot...")
    run_chatbot()

Running test cases:

Test Case:


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Inferred Emotion: joyful
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 32076  5377
     35    38    95   300  3903    88   127  3366  1564   328     2]]
User Prompt: I just got accepted into my dream college!
Generated Response: Yea, it is. It's a lot of fun!
--------------------------------------------------

Test Case:


Device set to use cuda:0


Inferred Emotion: sad
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35  5074  5377
     35    38   685   127   633   452     8   619   269   159     4     2]]
User Prompt: I lost my job today and feel really down.
Generated Response: Oh no, why did you lose your job?
--------------------------------------------------

Test Case:


Device set to use cuda:0


Inferred Emotion: terrified
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 19419  5377
     35    38  1317  7782 27903    11   127   790    94   363     4     2]]
User Prompt: I heard strange noises in my house last night.
Generated Response: Oh no, what happened last night?
--------------------------------------------------

Test Case:


Device set to use cuda:0


Inferred Emotion: joyful
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 32076  5377
     35    38    17    27   119  1158    10    92   633   220   186   328
      2]]
User Prompt: Iâ€™m starting a new job next week!
Generated Response: That's awesome! What's going on next week?
--------------------------------------------------

Test Case:


Device set to use cuda:0


Inferred Emotion: sad
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35  5074  5377
     35  1308  3034  6050     8    38   685   127   695     4     2]]
User Prompt: My computer crashed and I lost my project.
Generated Response: Oh no, why did you lose your project?
--------------------------------------------------

Starting interactive chatbot...
Welcome to the Empathetic Chatbot! Type your message or 'quit' to exit.


Device set to use cuda:0


Inferred Emotion: neutral
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35  7974  5377
     35 12289     2]]
User Prompt: Hi
Generated Response: Oh no, what happened?
Please enter a message.


Device set to use cuda:0


Inferred Emotion: joyful
Tokenized Input IDs: [[    0 20557   877  2841 11632 18667  1263    35 11926    35 32076  5377
     35    38  1550   127   695 32376     2]]
User Prompt: I finished my project!!!!
Generated Response: That's awesome! What project was it?
