In [1]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:0

## Load Dataset

In [3]:
from datasets import load_dataset

# Load the DailyDialog dataset
dataset = load_dataset("daily_dialog")

# Explore a sample
print(dataset['train'][0])

The repository for daily_dialog contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/daily_dialog.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ', ' You know that is tempting but is really not good for our fitness . ', ' What do you mean ? It will help us to relax . ', " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ", " I guess you are right.But what shall we do ? I don't feel like sitting at home . ", ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ', " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ", ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ', " Good.Let ' s go now . ", ' All right . '], 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4], 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}


## Prepare the Seq2Seq Model and Tokenizer

In [5]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5-small model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

## Preprocess the Data for Training

In [9]:
def preprocess_data(examples):
    # Prepare input and target sequences
    inputs = [f"chatbot: {dialog[0]}" for dialog in examples['dialog'][:-1]]
    targets = [dialog[1] for dialog in examples['dialog'][1:]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs, max_length=32, truncation=True, padding=True
    )

    # Tokenize targets using `text_target`
    labels = tokenizer(
        text_target=targets, max_length=32, truncation=True, padding=True
    )

    # Add labels to model inputs and ensure consistent lengths
    model_inputs["labels"] = labels["input_ids"]

    # Debug: Print out lengths to ensure consistency
    print(f"Input Length: {len(model_inputs['input_ids'])}, Label Length: {len(model_inputs['labels'])}")

    return model_inputs

# Apply preprocessing with debug information
train_data = dataset['train'].map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 999, Label Length: 999
Input Length: 117, Label Length: 117


## Set Up DataLoader

In [10]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)

##  Define Optimizer and Scheduler

In [11]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



## Training Loop

In [13]:
from transformers import DataCollatorForSeq2Seq

# Initialize the data collator (handles padding and tensor conversion)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create DataLoader with the correct collate function
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_data, batch_size=8, shuffle=True, collate_fn=data_collator
)

In [None]:
for epoch in range(3):  # Run for 3 epochs
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        # Move tensors to the appropriate device (GPU/CPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_dataloader):.4f}")

## Save the Model

In [None]:
model.save_pretrained("chatbot_model")
tokenizer.save_pretrained("chatbot_model")

## Chatbot Inference: Interactive Chat

In [None]:
def generate_response(input_text):
    # Tokenize the input
    input_ids = tokenizer(f"chatbot: {input_text}", return_tensors="pt").input_ids.to(device)

    # Generate response
    output_ids = model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)

    # Decode and return the response
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Chat with the bot
while True:
    user_input = input("You: ")
    if user_input.lower() in ["quit", "exit"]:
        break

    response = generate_response(user_input)
    print(f"Bot: {response}")