In [None]:
# ============================================
# DialoGPT Chatbot - Google Colab (Single Cell)
# ============================================

# Install dependencies
!pip install -q transformers torch accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# --- Configuration -----------------------------------------------------
# Hugging Face Hub identifier of the checkpoint to load.
MODEL_NAME = "microsoft/DialoGPT-medium"
# Upper bound on how many prompt tokens are kept when the history grows.
MAX_HISTORY_TOKENS = 1000

# --- Device selection --------------------------------------------------
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# --- Tokenizer and model -----------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Move the weights to the chosen device and switch to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# --- Conversation state ------------------------------------------------
# Token ids of the conversation so far; None until the first reply exists.
chat_history_ids = None

print("\nChatbot is ready! Type 'exit' to stop.\n")

# Interactive chat loop.
#
# Each turn reads one line from the user, appends it (terminated by EOS) to
# the running token history, samples a reply from the model, and prints only
# the newly generated part. Typing 'exit' (any case, surrounding whitespace
# ignored) or sending EOF / an interrupt at the prompt ends the session.

# Maximum number of tokens sampled for a single reply.
MAX_NEW_TOKENS = 100

# The prompt plus the generated reply must fit inside the model's positional
# context window; otherwise generation would index past the position
# embeddings. Cap the prompt so MAX_NEW_TOKENS always has room.
_context_window = getattr(model.config, "max_position_embeddings", None)
if _context_window is None:
    MAX_PROMPT_TOKENS = MAX_HISTORY_TOKENS
else:
    MAX_PROMPT_TOKENS = max(
        1, min(MAX_HISTORY_TOKENS, _context_window - MAX_NEW_TOKENS)
    )

while True:
    try:
        user_input = input("You: ")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupted prompt: leave without a traceback.
        print("\nChatbot: Goodbye!")
        break

    message = user_input.strip()

    if message.lower() == "exit":
        print("Chatbot: Goodbye!")
        break

    if not message:
        # Nothing to say: do not feed a lone EOS token to the model.
        continue

    # Encode user input + EOS
    new_input = tokenizer(
        message + tokenizer.eos_token,
        return_tensors="pt"
    ).to(device)

    # Append to chat history
    if chat_history_ids is not None:
        input_ids = torch.cat([chat_history_ids, new_input["input_ids"]], dim=-1)
        # No padding is involved, so every position is attended to.
        attention_mask = torch.ones_like(input_ids)
    else:
        input_ids = new_input["input_ids"]
        attention_mask = new_input["attention_mask"]

    # Keep only the most recent tokens when the prompt exceeds the budget.
    if input_ids.shape[-1] > MAX_PROMPT_TOKENS:
        input_ids = input_ids[:, -MAX_PROMPT_TOKENS:]
        attention_mask = attention_mask[:, -MAX_PROMPT_TOKENS:]

    # Generate response; the returned tensor holds prompt + reply and becomes
    # the history for the next turn.
    chat_history_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only newly generated tokens (everything after the prompt).
    response = tokenizer.decode(
        chat_history_ids[:, input_ids.shape[-1]:][0],
        skip_special_tokens=True
    )

    print("Chatbot:", response.strip())


Using device: cuda


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/863M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/293 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: microsoft/DialoGPT-medium
Key                              | Status     |  | 
---------------------------------+------------+--+-
transformer.h.{0...23}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Chatbot is ready! Type 'exit' to stop.

You: hello
Chatbot: Hello! How are you? : 3
You: tell me about apples
Chatbot: They're the best. I love them and they're so easy to make yourself! :D
You: what does engineering mean 
Chatbot: It's a science that involves lots of math and stuff, but it doesn't involve all that much maths. It's more like a math class with an actual physics component.
You: exit
Chatbot: Goodbye!
