In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Dataset class built on the 'input_text' and 'output_text' CSV columns.
class ConversationDataset(Dataset):
    """Causal-LM fine-tuning dataset built from a CSV of prompt/response pairs.

    Each row is rendered as ``User: <input_text>``, a newline, then
    ``Bot: <output_text>``, and tokenized to exactly ``max_length`` tokens
    (truncated or padded as needed).

    Args:
        csv_file: Path to a CSV file with ``input_text`` and ``output_text``
            columns.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; it must have a pad token configured
            because ``padding="max_length"`` is requested.
        max_length: Fixed sequence length of every returned example.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default
    # ``ignore_index``), used to exclude padding from the training signal.
    IGNORE_INDEX = -100

    def __init__(self, csv_file, tokenizer, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (dialogue pairs) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return model-ready tensors.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is a copy of ``input_ids`` in which every position
            with ``attention_mask == 0`` is replaced by ``IGNORE_INDEX``.
        """
        row = self.data.iloc[idx]
        # Create a dialogue string combining input and output texts.
        text = f"User: {row['input_text']}\nBot: {row['output_text']}"
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
        )
        # Drop only the leading batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        input_ids = encoded["input_ids"].squeeze(0)
        attention_mask = encoded["attention_mask"].squeeze(0)

        # Clone so labels do not alias input_ids, then mask out padding.
        # Without the mask the loss is dominated by trivially predictable
        # pad tokens, giving misleadingly small loss values.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

# Load the pretrained checkpoint: tokenizer and causal-LM weights come from
# the same model identifier.
MODEL_NAME = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the pad token so padded encodings can be built.
tokenizer.pad_token = tokenizer.eos_token

# Create full dataset from CSV.
full_dataset = ConversationDataset("/content/Diverse_10K_Subset.csv", tokenizer)

# Split dataset: 80% training, 20% validation.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
# Seed the split explicitly so the same rows land in the validation set on
# every run; otherwise validation losses are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

# Set up training arguments with mixed precision to reduce training time.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="chatbot_finetuning_v1",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Empty list disables reporting to external experiment trackers.
    report_to=[],
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the script still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=1000,
)

# Build the Trainer from the model, the training configuration, and both
# dataset splits.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**trainer_kwargs)

# Run the fine-tuning loop.
trainer.train()

# Write the fine-tuned weights and the tokenizer files to one directory so
# they can be reloaded together.
export_dir = "./fine_tuned_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
1000,0.0134,0.012005
2000,0.0121,0.011659
3000,0.0116,0.011165
4000,0.0112,0.011108
5000,0.0112,0.010984
6000,0.0111,0.010866
7000,0.0109,0.010776
8000,0.0108,0.010751
9000,0.0109,0.010619
10000,0.0107,0.010595


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')