**Import Libraries and Setup**

In [None]:
!pip install transformers datasets accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch
import math

**Load Dataset (DailyDialog)**

In [None]:
from datasets import load_dataset

# Load the DeepPavlov copy of DailyDialog (all of its splits at once).
dataset = load_dataset("DeepPavlov/daily_dialog")

# Keep handles on the two splits used below; this copy of the dataset
# exposes a dedicated "validation" split.
train_dataset, valid_dataset = dataset["train"], dataset["validation"]

**Initialize Tokenizer and Model (DistilGPT-2)**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "distilgpt2"

# Build the tokenizer and reuse its end-of-text token as the padding token,
# so that padding to a fixed length is possible later on.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Pre-trained causal language model matching the tokenizer above.
model = AutoModelForCausalLM.from_pretrained(model_name)

**Model Initialization, Optimizer, and DataLoader Setup**

In [None]:
from transformers import AutoModelForCausalLM

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reuse the DistilGPT-2 instance loaded in the previous cell instead of
# loading the same checkpoint a second time; only move it to the device.
model = model.to(device)

# Create DataLoaders
# NOTE(review): at this point the splits are still untokenized, so these
# loaders are placeholders only; the training cell rebuilds both loaders
# (with a collator) after tokenization. Do not iterate over them here.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = DataLoader(valid_dataset, batch_size=8)

# Optimizer over all parameters of the model that now lives on `device`.
optimizer = AdamW(model.parameters(), lr=5e-5)

**Preprocess Dataset: Flatten Dialogs into Text Column**

In [None]:
def preprocess(example):
    """Add a ``text`` field holding every utterance of one dialog as one string.

    Args:
        example: Mapping with a ``dialog`` entry that is a sequence of strings.

    Returns:
        The same ``example`` object, updated in place with ``text`` set to the
        utterances joined by single spaces.
    """
    utterances = example["dialog"]
    example["text"] = " ".join(utterances)
    return example

# Attach the flattened "text" column to both splits.
train_dataset, valid_dataset = [
    split.map(preprocess) for split in (train_dataset, valid_dataset)
]

**Dataset Tokenization and Label Preparation**

In [None]:
def tokenize_function(example):
    """Tokenize one flattened dialog and attach causal-LM labels.

    Args:
        example: A single dataset row; only ``example["text"]`` is read.

    Returns:
        The tokenizer output (truncated or padded to 128 tokens) with an extra
        ``labels`` list: a copy of ``input_ids`` at attended positions and
        ``-100`` at padded positions.
    """
    tokens = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # Mask by attention_mask rather than by token id: this notebook sets
    # pad_token = eos_token, so comparing ids against pad_token_id would also
    # exclude any genuine end-of-text token from the loss.
    tokens["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Per-example tokenization (batched=False). Every original column is dropped,
# so only what tokenize_function returns remains in the new datasets.
tokenized_train = train_dataset.map(
    tokenize_function, batched=False, remove_columns=train_dataset.column_names
)
tokenized_valid = valid_dataset.map(
    tokenize_function, batched=False, remove_columns=valid_dataset.column_names
)

# Serve the three model inputs as torch tensors on both tokenized splits.
for tokenized_split in (tokenized_train, tokenized_valid):
    tokenized_split.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"]
    )

**Preprocess Dataset (flatten dialogs into text)**

In [None]:
def preprocess(example):
    """Flatten one dialog into a single space-separated string.

    Reads ``example["dialog"]`` (DailyDialog keeps the utterances there as a
    list of strings), writes the joined result to ``example["text"]`` and
    returns the same, mutated ``example``.
    """
    joined_dialog = str.join(" ", example["dialog"])
    example["text"] = joined_dialog
    return example

# (Re)compute the flattened "text" column on each split.
train_dataset = train_dataset.map(function=preprocess)
valid_dataset = valid_dataset.map(function=preprocess)


**Tokenize Dataset (input_ids, attention_mask, labels)**

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of flattened dialogs and attach causal-LM labels.

    Args:
        examples: A batch of dataset rows in columnar form; only
            ``examples["text"]`` (a list of strings) is read.

    Returns:
        The tokenizer output (each sequence truncated or padded to 128 tokens)
        with an extra ``labels`` entry: one list per sequence that copies
        ``input_ids`` at attended positions and holds ``-100`` on padding.
    """
    # Tokenize a batch of texts
    tokens = tokenizer(
        examples["text"],              # list of strings
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # Mask by attention_mask rather than by token id: this notebook sets
    # pad_token = eos_token, so comparing ids against pad_token_id would also
    # exclude any genuine end-of-text token from the loss.
    tokens["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Batched tokenization of both splits. The column lists are captured first so
# that every pre-existing column is dropped from the tokenized result.
train_columns_to_drop = train_dataset.column_names
valid_columns_to_drop = valid_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=train_columns_to_drop
)
valid_dataset = valid_dataset.map(
    tokenize_function, batched=True, remove_columns=valid_columns_to_drop
)

# Hand the three model inputs to PyTorch as tensors.
for tokenized_split in (train_dataset, valid_dataset):
    tokenized_split.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"]
    )

**Training Loop (with Progress Bars and Perplexity)**

In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CUDA-only conveniences (pinned host memory, gradient scaling) are switched
# on only when a GPU is actually in use, so the cell also runs on CPU.
use_cuda = device.type == "cuda"

# Make sure the model lives on the device the batches will be sent to.
model.to(device)

# mlm=False selects causal (next-token) language-model collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          collate_fn=data_collator, pin_memory=use_cuda, num_workers=4)
val_loader   = DataLoader(valid_dataset, batch_size=32,
                          collate_fn=data_collator, pin_memory=use_cuda, num_workers=4)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Mixed-precision loss scaling; with enabled=False the scaler is a
# pass-through, so the training loop can call it unconditionally.
scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

def evaluate(loader):
    """Return the mean per-batch loss of the global ``model`` over ``loader``.

    The model is switched to eval mode and left there; gradients are disabled
    while iterating. Reads the module-level ``model`` and ``device``.

    Args:
        loader: Sized iterable of batches; each batch maps ``input_ids``,
            ``attention_mask`` and ``labels`` to tensors.

    Returns:
        float: Sum of the batch losses divided by ``len(loader)``.

    Raises:
        ValueError: If ``loader`` contains no batches.
    """
    num_batches = len(loader)
    if num_batches == 0:
        # A clear error instead of a ZeroDivisionError at the final division.
        raise ValueError("evaluate() received an empty loader")

    # Autocast follows the active device and is only enabled on CUDA, so the
    # same code path works on CPU-only machines.
    amp_device = torch.device(device).type
    amp_enabled = amp_device == "cuda"

    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            with torch.autocast(device_type=amp_device, enabled=amp_enabled):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    return total_loss / num_batches

epochs = 3

# Autocast follows the active device and is only enabled on CUDA, so the loop
# also runs (in full precision) on CPU-only machines.
amp_device = torch.device(device).type
amp_enabled = amp_device == "cuda"

for epoch in range(epochs):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    total_train_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["labels"].to(device, non_blocking=True)

        optimizer.zero_grad()
        with torch.autocast(device_type=amp_device, enabled=amp_enabled):
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and refresh its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_train_loss += loss.item()

    # Both averages are means over batches; perplexity is exp(mean loss).
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = evaluate(val_loader)

    print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, "
          f"Train PPL={math.exp(avg_train_loss):.2f}, Val PPL={math.exp(avg_val_loss):.2f}")

**Save Fine-Tuned Model and Tokenizer**

In [None]:
# Store the model and its tokenizer in the same folder so that both can be
# reloaded from a single path.
output_dir = "my_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

**Reload Model and Run Interactive Chat Loop**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Use the GPU when available instead of assuming CUDA is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load your fine-tuned DistilGPT-2 model
model_name = "my_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Define a pad token (DistilGPT-2 has none by default)
tokenizer.pad_token = tokenizer.eos_token

# Keep replies short, and reserve room for them inside the model's positional
# window so that a long conversation can never overflow it.
max_new_tokens = 20
max_prompt_tokens = getattr(model.config, "max_position_embeddings", 1024) - max_new_tokens

# Conversation history scaffold
history = """You: Hello
Bot: Hi there! How can I help you today?
"""

while True:
    user_input = input("You: ")
    # Surrounding whitespace is ignored when checking for the exit commands.
    if user_input.strip().lower() in ["quit", "exit"]:
        break

    # Add the new turn to history
    history += f"You: {user_input}\nBot:"

    # Encode history, keeping only the most recent tokens that still fit.
    encoded = tokenizer(history, return_tensors="pt")
    inputs = {
        name: tensor[:, -max_prompt_tokens:].to(device)
        for name, tensor in encoded.items()
    }

    # Generate only the new reply
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.65,                 # balance creativity & focus
        top_k=30,
        top_p=0.85,
        repetition_penalty=1.2            # discourage repeating phrases
    )

    # Extract only the newly generated tokens (everything after the prompt)
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    bot_reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Stop at first newline or "You:" to avoid spillover
    for stop in ["\n", "You:"]:
        if stop in bot_reply:
            bot_reply = bot_reply.split(stop)[0].strip()

    print("Bot:", bot_reply)

    # Append reply to history
    history += f" {bot_reply}\n"

**Interactive Chat Loop Using Pre-trained DistilGPT-2 (no fine-tuning)**

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Use the GPU when available instead of assuming CUDA is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained DistilGPT-2 directly from Hugging Face
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Define a pad token (DistilGPT-2 has none by default)
tokenizer.pad_token = tokenizer.eos_token

# Keep replies short, and reserve room for them inside the model's positional
# window so that a long conversation can never overflow it.
max_new_tokens = 25
max_prompt_tokens = getattr(model.config, "max_position_embeddings", 1024) - max_new_tokens

# Conversation history scaffold
history = """You: Hello
Bot: Hi there! How can I help you today?
"""

while True:
    user_input = input("You: ")
    # Surrounding whitespace is ignored when checking for the exit commands.
    if user_input.strip().lower() in ["quit", "exit"]:
        break

    # Add the new turn to history
    history += f"You: {user_input}\nBot:"

    # Encode history, keeping only the most recent tokens that still fit.
    encoded = tokenizer(history, return_tensors="pt")
    inputs = {
        name: tensor[:, -max_prompt_tokens:].to(device)
        for name, tensor in encoded.items()
    }

    # Generate only the new reply
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,                  # balance creativity & coherence
        top_k=40,
        top_p=0.9,
        repetition_penalty=1.2            # discourage loops
    )

    # Extract only the newly generated tokens (everything after the prompt)
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    bot_reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Stop at first newline or "You:" to avoid spillover
    for stop in ["\n", "You:"]:
        if stop in bot_reply:
            bot_reply = bot_reply.split(stop)[0].strip()

    print("Bot:", bot_reply)

    # Append reply to history
    history += f" {bot_reply}\n"