In [8]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [9]:
from datasets import load_dataset

# This will download & cache the train/validation/test splits for you
dataset = load_dataset("daily_dialog")
train_texts = ["\n".join(d) for d in dataset["train"]["dialog"]]  # join each dialogue into one text


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

daily_dialog.py:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

The repository for daily_dialog contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/daily_dialog.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [12]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad, so use EOS as pad_token

def tokenize_fn(examples):
    # Tokenize
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # For causal LM, labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Wrap train_texts in a Dataset dict to use `.map()`
from datasets import Dataset
train_ds = Dataset.from_dict({"text": train_texts})
tokenized_train = train_ds.map(tokenize_fn, batched=True)
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask"])


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

In [13]:
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

# 1. Load dataset & join dialogues
dataset = load_dataset("daily_dialog", split="train")
train_texts = ["\n".join(d) for d in dataset["dialog"]]
train_ds = Dataset.from_dict({"text": train_texts})

# 2. Tokenize + add labels
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token
tokenized_train = train_ds.map(tokenize_fn, batched=True)
tokenized_train.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

# 3. Fine-tune
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="gpt2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=100,
    fp16=True,                    # if you have a GPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("gpt2_finetuned")


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.2184
200,0.6548
300,0.6425
400,0.6621
500,0.6569
600,0.6416
700,0.675
800,0.6507
900,0.6602
1000,0.6351


In [14]:
import torch
torch.save(model.state_dict(), "gpt2_finetuned.pth")

In [15]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2_finetuned")
model = GPT2LMHeadModel.from_pretrained("gpt2_finetuned").to(device)
model.eval()

def chat(prompt, max_new_tokens=50):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)[len(prompt):].strip()

if __name__ == "__main__":
    print("Chatbot (type 'exit')\n")
    while True:
        u = input("You: ")
        if u.lower() == "exit":
            break
        print("Bot:", chat(u), "\n")


Chatbot (type 'exit')

You: hello
Bot: of the museum , where can I find my library card ? 
 In the library . 
 Could you take a look at this one ? It's a picture book . 
 I don't want to spoil it for you . I think it 

You: thy
Bot: to be of service to you , sir . 
 We are not in business anymore . We can go back to our old ways . 
 Is this right ? 
 Yes , sir . We can go to another house . 

You: where were you going to go this weekend
Bot: ? 
 I ’ m not sure . I ’ Ve been thinking about going to London . 
 Oh , I know , I went to the World Cup in Brazil . 
 Really ? Oh , really ? I went too . I 

You: exit


In [16]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer (same as training)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # important for batching

# Load model architecture
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Load your fine-tuned weights
model.load_state_dict(torch.load("gpt2_finetuned.pth", map_location=device))
model.to(device)
model.eval()

# === Chatbot generation function ===
def generate_response(prompt, max_length=150):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.9,
            num_return_sequences=1
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === Interactive Chat ===
print("🤖 DailyDialog Chatbot Ready! Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() in {"exit", "quit"}:
        break
    prompt = f"{user_input}"
    response = generate_response(prompt)
    print(f"Bot: {response[len(user_input):].strip()}")


🤖 DailyDialog Chatbot Ready! Type 'exit' to quit.
You: hello good sir how is your day going


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Bot: ? 
 great ! I ’ m not feeling well . 
 well , I just feel so tired . How about tomorrow morning ? 
 great ! I ’ m going to sleep at eight.Good night then !
You: why you always try to dodge questions
Bot: . 
 How about you ? 
 I sometimes feel awkward .
You: nah i am good lets go to london
Bot: ! 
 I am the new owner of this new office . 
 How nice ! 
 I like it . 
 I want to go there once or twice a year . 
 I am not sure where you want to go . 
 You can go to London or wherever you want . 
 Are we going to London ? 
 I'm not sure yet . 
 I hope so . 
 Let's go to a cafe . 
 Oh my god ! We have so many people !
You: exit


In [17]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Save both model and tokenizer
save_dict = {
    "model_state_dict": model.state_dict(),
    "tokenizer_config": tokenizer.init_kwargs,      # includes special tokens, etc.
    "tokenizer_vocab": tokenizer.get_vocab(),       # vocab dictionary
}

torch.save(save_dict, "gpt2_all_in_one.pth")
