In [2]:
import json

# Load chit-chat dataset
with open("dataset.json", "r") as file:
    raw_data = json.load(file)

# Preprocess into input-response pairs
def preprocess_chitchat_dataset(raw_data):
    pairs = []
    for conversation_id, conversation in raw_data.items():
        messages = conversation["messages"]
        for i in range(len(messages) - 1):
            # Extract consecutive message pairs
            current_message = messages[i][-1]["text"]  # Input
            next_message = messages[i + 1][0]["text"]  # Response
            pairs.append({"input": current_message, "response": next_message})
    return pairs

# Generate the input-response pairs
chitchat_pairs = preprocess_chitchat_dataset(raw_data)

# Save as JSON for fine-tuning
with open("chitchat_pairs.json", "w") as file:
    json.dump(chitchat_pairs, file, indent=4)


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

# Load DialoGPT model and tokenizer
model_name = "microsoft/DialoGPT-small"  # Options: small, medium, large
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Load preprocessed chit-chat pairs
dataset = load_dataset("json", data_files={"train": "chitchat_pairs.json"})

# Add a padding token if not already defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Adjust model embedding size

# Tokenize the dataset
def preprocess_data(examples):
    inputs = tokenizer(examples["input"], truncation=True, padding="max_length", max_length=128)
    outputs = tokenizer(examples["response"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = outputs["input_ids"]  # Set labels for causal LM training
    return inputs

# Tokenize the full dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Split the dataset into train and eval sets (using train_test_split)
train_texts, eval_texts, train_labels, eval_labels = train_test_split(
    tokenized_dataset["train"]["input"], tokenized_dataset["train"]["response"], test_size=0.1, random_state=42
)

# Tokenize the eval dataset as well
def tokenize_data_in_batches(texts, responses, tokenizer, batch_size=512):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",

        return_attention_mask=True
    )

    response_encodings = tokenizer(
        responses,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",

        return_attention_mask=True
    )

    encodings["labels"] = response_encodings["input_ids"]  # Set response as labels for LM training

    return TextDataset(encodings, response_encodings["input_ids"])

# Define the TextDataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }

# Create train and eval datasets
train_dataset = tokenize_data_in_batches(train_texts, train_labels, tokenizer, batch_size=512)
eval_dataset = tokenize_data_in_batches(eval_texts, eval_labels, tokenizer, batch_size=512)

# Data collator with dynamic padding
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal language model, not masked
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate every epoch
    save_strategy="epoch",  # Save the model every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=4,  # Simulate larger batch size
    dataloader_num_workers=4,  # Use multiple workers for data loading
)

# Define the Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Add eval_dataset here for evaluation
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Early stopping callback
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./chitchat_dialogpt")
tokenizer.save_pretrained("./chitchat_dialogpt")
print("Model saved to ./chitchat_dialogpt")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/131569 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss
0,3.8028,3.651124
1,3.6428,3.588856
2,3.5401,3.58037


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Model saved to ./chitchat_dialogpt


In [25]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the saved model and tokenizer
model_name = "./chitchat_dialogpt"  # Path to your saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def generate_response(model, tokenizer, prompt, max_length=30):
    # Tokenize the input prompt
    inputs = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors="pt").to(model.device)

    # Generate a response using the model with controlled parameters to avoid unnecessary long responses
    outputs = model.generate(
        inputs,
        max_length=max_length,  # Set a cap for maximum length of response
        num_return_sequences=1,  # Generate only one response
        pad_token_id=tokenizer.eos_token_id,  # Set pad token to EOS token for early stopping
        no_repeat_ngram_size=3,  # Prevent repeating 3-grams (three consecutive tokens)
        top_k=30,  # Limit to top 30 most likely tokens
        top_p=0.85,  # Nucleus sampling, limit tokens to top 85% probability
        temperature=0.6,  # Lower temperature for more deterministic and concise responses
        repetition_penalty=2.0,  # Penalize repetition of tokens
        eos_token_id=tokenizer.eos_token_id,  # Ensure EOS token is used to end generation
        early_stopping=True,  # Ensure generation stops at a natural stopping point
    )

    # Decode the generated tokens back to text, skip special tokens like EOS
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Ensure that the response doesn't start with the input question (if it repeats)
    if response.lower().startswith(prompt.lower()):
        response = response[len(prompt):].strip()

    return response

# Test with a sample prompt
prompt = "Good morning Sam, how are you?"
response = generate_response(model, tokenizer, prompt)
print(f"Input: {prompt}")
print(f"Generated Response: {response}")


Input: Good morning Sam, how are you?
Generated Response: I am doing great! How about yourself. Have a good day Adam and have fun with your evening :)
