In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Updated dataset class using 'input_text' and 'output_text'
class ConversationDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=512):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        # Create a dialogue string combining input and output texts.
        text = f"User: {row['input_text']}\nBot: {row['output_text']}"
        inputs = self.tokenizer(text, return_tensors="pt", max_length=self.max_length,
                                  truncation=True, padding="max_length")
        input_ids = inputs["input_ids"].squeeze()
        attention_mask = inputs["attention_mask"].squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

# Load tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
# Set pad token to eos_token for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

# Create full dataset from CSV.
full_dataset = ConversationDataset("/content/Diverse_10K_Subset.csv", tokenizer)

# Split dataset: 80% training, 20% validation.
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Set up training arguments with mixed precision to reduce training time.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="chatbot_finetuning_v1",
    num_train_epochs=10,                # 10 epochs as specified.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to=[],                       # Disable wandb logging if not needed.
    fp16=True,                          # Enable mixed precision.
    evaluation_strategy="steps",
    eval_steps=1000,                     # Evaluate every 100 steps.
)

# Initialize the Trainer with training and validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start fine-tuning.
trainer.train()

# Save the fine-tuned model and tokenizer.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss
1000,0.0134,0.012005
2000,0.0121,0.011659
3000,0.0116,0.011165
4000,0.0112,0.011108
5000,0.0112,0.010984
6000,0.0111,0.010866
7000,0.0109,0.010776
8000,0.0108,0.010751
9000,0.0109,0.010619
10000,0.0107,0.010595


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [3]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8ea357a5f3f7670e6359ba8e333bdbea8149a43c1ea4f407306ab80f39bfcb58
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [9]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [10]:
import math
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer.
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = []
references = []

# Iterate over the validation dataset (assumed to be available as val_dataset).
# It is assumed that each sample in val_dataset has keys:
# "input_ids", "attention_mask", and that the original ground truth text is present within
# the tokenized sequence after "Bot:".
for i in range(len(val_dataset)):
    sample = val_dataset[i]
    input_ids = sample["input_ids"].unsqueeze(0).to(device)
    attention_mask = sample["attention_mask"].unsqueeze(0).to(device)
    # Generate response using max_new_tokens.
    generated_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=64,
        pad_token_id=tokenizer.eos_token_id
    )
    pred_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    predictions.append(pred_text)

    # Extract reference text.
    # Our dataset was originally constructed as: "User: {input_text}\nBot: {output_text}"
    original_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
    if "Bot:" in original_text:
        ref_text = original_text.split("Bot:")[-1].strip()
    else:
        ref_text = original_text
    references.append(ref_text)

# Load evaluation metrics from Hugging Face evaluate library.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")
bertscore_metric = evaluate.load("bertscore")

# Compute metrics.
results_bleu = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
results_rouge = rouge_metric.compute(predictions=predictions, references=references)
results_meteor = meteor_metric.compute(predictions=predictions, references=references)
results_bertscore = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

# Print out the evaluation metrics.
print("Evaluation Metrics:")
print("BLEU score:", results_bleu["bleu"])
print("ROUGE-L score:", results_rouge["rougeL"])
print("METEOR score:", results_meteor["meteor"])
avg_bertscore_f1 = sum(results_bertscore["f1"]) / len(results_bertscore["f1"])
print("BERTScore F1:", avg_bertscore_f1)

# Optionally, if you have evaluation loss from trainer.evaluate(), compute perplexity:
# eval_loss = <your eval loss>
# perplexity = math.exp(eval_loss)
# print("Perplexity:", perplexity)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation Metrics:
BLEU score: 0.38195039542356096
ROUGE-L score: 0.6557958129995847
METEOR score: 0.8803038664338615
BERTScore F1: 0.8970270980298519


In [11]:
!zip -r fine_tuned_model.zip fine_tuned_model
from google.colab import files
files.download("fine_tuned_model.zip")


  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/merges.txt (deflated 53%)
  adding: fine_tuned_model/config.json (deflated 50%)
  adding: fine_tuned_model/tokenizer.json (deflated 82%)
  adding: fine_tuned_model/tokenizer_config.json (deflated 55%)
  adding: fine_tuned_model/vocab.json (deflated 59%)
  adding: fine_tuned_model/generation_config.json (deflated 24%)
  adding: fine_tuned_model/model.safetensors (deflated 7%)
  adding: fine_tuned_model/special_tokens_map.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>