In [1]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer,
    TrainingArguments, DataCollatorForLanguageModeling
)
from datasets import load_dataset
import torch

In [2]:
model_name = "distilgpt2"  

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Fix for GPT-2 which has no pad token

model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id

In [3]:
# Load the dataset
dataset = load_dataset("empathetic_dialogues", trust_remote_code=True)

split_data = dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_data["train"]
val_dataset = split_data["test"]



In [4]:
#quick testing
train_dataset = train_dataset.select(range(500)) 
val_dataset = val_dataset.select(range(100))      


In [5]:
def tokenize(example):
    text = example["context"] + " </s> " + example["utterance"]
    return tokenizer(text, truncation=True, padding="max_length", max_length=128)

tokenized_train = train_dataset.map(tokenize, batched=False)
tokenized_val = val_dataset.map(tokenize, batched=False)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT is an auto-regressive model
)

In [7]:
training_args = TrainingArguments(
    output_dir="./chatbot_model",
    eval_strategy="steps",    
    eval_steps=100,
    logging_steps=10,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_steps=100,
    push_to_hub=False
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Trainer(


In [9]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
100,3.1103,3.321325


TrainOutput(global_step=125, training_loss=3.3829935455322264, metrics={'train_runtime': 389.2858, 'train_samples_per_second': 1.284, 'train_steps_per_second': 0.321, 'total_flos': 16331046912000.0, 'train_loss': 3.3829935455322264, 'epoch': 1.0})

In [None]:
trainer.save_model("./chatbot_model")
tokenizer.save_pretrained("./chatbot_model")


In [13]:
def respond(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True)
    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=70,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)





In [14]:
# Example
print(respond("I'm feeling very anxious about everything lately."))

I'm feeling very anxious about everything lately. I am so excited about my house. I am so excited about it. My husband is also very happy. I am excited about my house. I hope my house is more like a home. It will be more like my house. I hope I can find someone who will be more happy than my wife
