# Fine-tuning DialoGPT with Clean Separation of Training and Inference

This notebook demonstrates fine-tuning DialoGPT on a comments dataset with a clean separation between:
1. Training phase
2. Saving the model
3. Loading the model from disk
4. Inference phase

In [15]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.optim import AdamW
import os
import gc

In [16]:
# Define device as MPS if available (for Mac with Apple Silicon)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: mps


In [17]:
# Verify MPS is built and available
if device.type == 'mps':
    print(f"MPS is built: {torch.backends.mps.is_built()}")

MPS is built: True


## 1. Load and Prepare Data

In [18]:
# Load dataset from CSV file
data = pd.read_csv('comments_dataset.csv')
comments = data['comment'].tolist()
labels = data['label'].tolist()
replies = data['reply'].tolist()

# Display the first few rows
data.head(10)

Unnamed: 0,comment,label,reply
0,Fantastic effort!,appreciation,"Thank you, that means the world to me!"
1,Fantastic effort!,appreciation,I appreciate your kind words!
2,This is the worst thing I've seen.,troll,Apologies if this wasn't up to the mark.
3,I really appreciate your hard work!,appreciation,"Thank you, that means the world to me!"
4,Pathetic!,troll,I'm always open to constructive criticism.
5,You should just quit.,troll,"I respect your opinion, and I'll keep improving."
6,I've seen better work from a 5-year-old.,troll,I apologize if this didn't meet your expectati...
7,Brilliant execution!,appreciation,Thanks a lot!
8,Terrible effort!,troll,"I respect your opinion, and I'll keep improving."
9,This is a complete waste of time.,troll,I apologize if this didn't meet your expectati...


In [19]:
# Custom dataset class                                                                                               
class CommentsDataset(Dataset):                                                                                      
    def __init__(self, data, comments, replies, labels, tokenizer, max_length=128):                                  
        self.comments = comments                                                                                     
        self.replies = replies                                                                                       
        self.labels = labels                                                                                         
        self.tokenizer = tokenizer                                                                                   
        self.max_length = max_length                                                                                 
                                                                                                                    
    def __len__(self):                                                                                               
        return len(self.comments)                                                                                    
                                                                                                                    
    def __getitem__(self, idx):                                                                                      
        # Format as a conversation with a clear separator                                                            
        conversation = f"User: {self.comments[idx]}\nAssistant: {self.replies[idx]}"                                 
                                                                                                                    
        # Tokenize the entire conversation                                                                           
        encoding = self.tokenizer(                                                                                   
            conversation,                                                                                            
            padding="max_length",                                                                                    
            truncation=True,                                                                                         
            max_length=self.max_length,                                                                              
            return_tensors='pt'                                                                                      
        )                                                                                                            
                                                                                                                    
        # For training, we need the labels to be the same as input_ids                                               
        # This is for the causal language modeling objective                                                         
        input_ids = encoding['input_ids'].squeeze()                                                                  
        attention_mask = encoding['attention_mask'].squeeze()                                                        
        labels = input_ids.clone()                                                                                   
                                                                                                                    
        return {                                                                                                     
            'input_ids': input_ids,                                                                                  
            'attention_mask': attention_mask,                                                                        
            'labels': labels                                                                                         
        } 

## 2. Load Base Model for Training

In [20]:
# Load the base model and tokenizer
print("Loading base DialoGPT-small model...")
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-small').to(device)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

Loading base DialoGPT-small model...


In [21]:
# Create dataset and dataloader
dataset = CommentsDataset(data, comments, replies, labels, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

## 3. Training Phase

In [22]:
# Training parameters
num_epochs = 1
learning_rate = 2e-5

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
print("Starting training...")
for epoch in range(num_epochs):
    for batch in train_loader:
        # Move batch data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {loss.item()}")

Starting training...
Epoch 1/1 Loss: 0.04108741134405136


## 4. Save the Fine-tuned Model

In [23]:
# Create directory for the model if it doesn't exist
save_dir = "./fine_tuned_dialoGPT"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Save the fine-tuned model
print(f"Saving fine-tuned model to {save_dir}...")
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print("Model and tokenizer saved successfully.")

Saving fine-tuned model to ./fine_tuned_dialoGPT...
Model and tokenizer saved successfully.


## 5. Clean Separation: Clear Model from Memory

In [24]:
# Clear the model from memory to ensure clean separation
print("Clearing model from memory...")
del model
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None
print("Model cleared from memory.")

Clearing model from memory...
Model cleared from memory.


## 6. Load the Fine-tuned Model from Disk for Inference

In [25]:
# Load the fine-tuned model from disk
print(f"Loading fine-tuned model from {save_dir}...")
model = AutoModelForCausalLM.from_pretrained(save_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_dir)
print("Model and tokenizer loaded successfully.")

Loading fine-tuned model from ./fine_tuned_dialoGPT...
Model and tokenizer loaded successfully.


## 7. Inference Function

In [26]:
def generate_reply(comment):                                                                                         
     """                                                                                                              
     Generate a reply using the fine-tuned model loaded from disk.                                                    
     This function uses the model that was explicitly loaded from ./fine_tuned_dialoGPT                               
     """                                                                                                              
     # Format the input as a conversation prompt                                                                      
     prompt = f"User: {comment}\nAssistant:"                                                                          
                                                                                                                      
     inputs = tokenizer(prompt, return_tensors="pt", padding=True)                                                    
     input_ids = inputs['input_ids'].to(device)                                                                       
     attention_mask = inputs['attention_mask'].to(device)                                                             
                                                                                                                      
     # Generate reply                                                                                                 
     reply_ids = model.generate(                                                                                      
         input_ids=input_ids,                                                                                         
         attention_mask=attention_mask,                                                                               
         max_length=len(input_ids[0]) + 50,  # Allow for a reasonable response length                                 
         pad_token_id=tokenizer.eos_token_id,                                                                         
         temperature=0.7,                                                                                             
         top_k=50,                                                                                                    
         top_p=0.9,                                                                                                   
         do_sample=True                                                                                               
     )                                                                                                                
                                                                                                                      
     # Decode only the generated part, not the input prompt                                                           
     generated_text = tokenizer.decode(reply_ids[0], skip_special_tokens=True)                                        
                                                                                                                      
     # Extract only the Assistant's reply                                                                             
     reply = generated_text.split("Assistant:")[-1].strip()                                                           
                                                                                                                      
     # If the reply is empty, return a default response                                                               
     if not reply:                                                                                                    
         reply = "I appreciate your comment."                                                                         
                                                                                                                      
     return reply 

## 8. Test the Model with Sample Comments

In [31]:
# Test comments
test_comments = [
    "Your work is amazing!",
    "I really appreciate your effort",
    "This is terrible",
    "Nice work!",
    "your work is good?"
]

# Generate replies for each test comment
for comment in test_comments:
    reply = generate_reply(comment)
    print(f"Comment: {comment}")
    print(f"Reply: {reply}\n")

Comment: Your work is amazing!
Reply: Thank you, that means the world to me!

Comment: I really appreciate your effort
Reply: I appreciate your kind words!

Comment: This is terrible
Reply: I'm sorry you feel that way.

Comment: Nice work!
Reply: Really appreciate it!

Comment: your work is good?
Reply: Thanks a ton!



In [None]:
from torch.utils.data import Dataset, DataLoader
"""Data Loader : While training a model, we typically want to pass samples in “minibatches”, 
reshuffle the data at every epoch to reduce model overfitting, 
xand use Python’s multiprocessing to speed up data retrieval."""

# Custom dataset class
class CommentsDataset(Dataset):
    def __init__(self, data, comments, replies, label, tokenizer, max_length=128):
        self.comments = comments
        self.replies = replies
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        # Tokenize input (comment) and target (reply)
        inputs = self.tokenizer(
            self.comments[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        label = self.tokenizer(
            self.label[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        labels = self.tokenizer(
            self.replies[idx], #Using iloc because we are using pandas DataFrame
            padding="max_length", #Using padding to makesure all the input sequences have the same length
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        #Many PyTorch models, especially transformer-based models, 
        # expect input tensors to have specific shapes. If the tensors have unnecessary dimensions, it can lead to errors.
        # Correct names: `input_ids` and `attention_mask`
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_s = label['input_ids'].squeeze()
        labels = labels['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label__s': label_s,
            'labels': labels
        }

# Sample data for testing
comments = ["This tutorial was helpful!", "Can you explain step 3?"]
replies = ["Glad it helped!", "Sure, let me explain step 3 in detail."]

# Instantiate dataset and dataloader
dataset = CommentsDataset(data, comments, label, replies, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)


In [None]:
from torch.utils.data import Dataset, DataLoader
"""Data Loader : While training a model, we typically want to pass samples in “minibatches”, 
reshuffle the data at every epoch to reduce model overfitting, 
xand use Python’s multiprocessing to speed up data retrieval."""

# Custom dataset class
class CommentsDataset(Dataset):
    def __init__(self, data, comments, replies, label, tokenizer, max_length=128):
        self.comments = comments
        self.replies = replies
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        # Tokenize input (comment) and target (reply)
        inputs = self.tokenizer(
            self.comments[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        label = self.tokenizer(
            self.label[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        labels = self.tokenizer(
            self.replies[idx], #Using iloc because we are using pandas DataFrame
            padding="max_length", #Using padding to makesure all the input sequences have the same length
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        #Many PyTorch models, especially transformer-based models, 
        # expect input tensors to have specific shapes. If the tensors have unnecessary dimensions, it can lead to errors.
        # Correct names: `input_ids` and `attention_mask`
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_s = label['input_ids'].squeeze()
        labels = labels['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label__s': label_s,
            'labels': labels
        }

# Sample data for testing
comments = ["This tutorial was helpful!", "Can you explain step 3?"]
replies = ["Glad it helped!", "Sure, let me explain step 3 in detail."]

# Instantiate dataset and dataloader
dataset = CommentsDataset(data, comments, label, replies, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)


In [None]:
from torch.utils.data import Dataset, DataLoader
"""Data Loader : While training a model, we typically want to pass samples in “minibatches”, 
reshuffle the data at every epoch to reduce model overfitting, 
xand use Python’s multiprocessing to speed up data retrieval."""

# Custom dataset class
class CommentsDataset(Dataset):
    def __init__(self, data, comments, replies, label, tokenizer, max_length=128):
        self.comments = comments
        self.replies = replies
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        # Tokenize input (comment) and target (reply)
        inputs = self.tokenizer(
            self.comments[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        label = self.tokenizer(
            self.label[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        labels = self.tokenizer(
            self.replies[idx], #Using iloc because we are using pandas DataFrame
            padding="max_length", #Using padding to makesure all the input sequences have the same length
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        #Many PyTorch models, especially transformer-based models, 
        # expect input tensors to have specific shapes. If the tensors have unnecessary dimensions, it can lead to errors.
        # Correct names: `input_ids` and `attention_mask`
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_s = label['input_ids'].squeeze()
        labels = labels['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label__s': label_s,
            'labels': labels
        }

# Sample data for testing
comments = ["This tutorial was helpful!", "Can you explain step 3?"]
replies = ["Glad it helped!", "Sure, let me explain step 3 in detail."]

# Instantiate dataset and dataloader
dataset = CommentsDataset(data, comments, label, replies, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)


In [None]:
from torch.utils.data import Dataset, DataLoader
"""Data Loader : While training a model, we typically want to pass samples in “minibatches”, 
reshuffle the data at every epoch to reduce model overfitting, 
xand use Python’s multiprocessing to speed up data retrieval."""

# Custom dataset class
class CommentsDataset(Dataset):
    def __init__(self, data, comments, replies, label, tokenizer, max_length=128):
        self.comments = comments
        self.replies = replies
        self.label = label
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        # Tokenize input (comment) and target (reply)
        inputs = self.tokenizer(
            self.comments[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        label = self.tokenizer(
            self.label[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        labels = self.tokenizer(
            self.replies[idx], #Using iloc because we are using pandas DataFrame
            padding="max_length", #Using padding to makesure all the input sequences have the same length
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        #Many PyTorch models, especially transformer-based models, 
        # expect input tensors to have specific shapes. If the tensors have unnecessary dimensions, it can lead to errors.
        # Correct names: `input_ids` and `attention_mask`
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        label_s = label['input_ids'].squeeze()
        labels = labels['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label__s': label_s,
            'labels': labels
        }

# Sample data for testing
comments = ["This tutorial was helpful!", "Can you explain step 3?"]
replies = ["Glad it helped!", "Sure, let me explain step 3 in detail."]

# Instantiate dataset and dataloader
dataset = CommentsDataset(data, comments, label, replies, tokenizer)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)
