In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM

In [None]:
# Placeholder for notebook-wide configuration; it starts out unset.
global_config = None

# Logger named after the module this code runs in (``__name__``).
logger = logging.getLogger(__name__)

In [None]:
# File name and absolute path of a JSONL dataset file.
# NOTE(review): neither name is referenced by the later cells visible here — confirm they are still needed.
dataset_name, dataset_path = "lamini_docs.jsonl", "/content/lamini_docs.jsonl"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face Hub identifier of the base checkpoint that gets fine-tuned below.
model_id = "google/gemma-3-270m"

# Read the Hub access token from the environment instead of hard-coding it in
# the notebook, so the secret is never saved or shared with the file.
# When HF_TOKEN is unset this is None and `from_pretrained` falls back to any
# locally cached Hub login.
token = os.environ.get("HF_TOKEN")

# `token=` replaces the deprecated `use_auth_token=` keyword.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=token)

In [None]:
from datasets import load_dataset

# Load the "Cynaptics/persona-chat" dataset by its identifier; later cells read
# the "train" split from the returned object.
ds = load_dataset(path="Cynaptics/persona-chat")

In [None]:
# Print the first five training records, each followed by a 50-dash separator,
# to eyeball the raw schema before any processing.
train_split = ds["train"]
for row_index in range(5):
    print(train_split[row_index])
    print("-" * 50)

In [None]:
def flatten_dataset(ds):
    """Copy a fixed set of fields from every record into a list of plain dicts.

    Args:
        ds: An iterable of mapping-like records. Each record must contain the
            keys ``conv_id``, ``persona_b``, ``dialogue``, ``reference`` and
            ``__index_level_0__``; any other keys are dropped.

    Returns:
        A list with one dict per record, holding exactly those five keys
        (in that order) with the values taken unchanged from the record.

    Raises:
        KeyError: If a record is missing one of the expected keys.
    """
    kept_columns = (
        "conv_id",
        "persona_b",
        "dialogue",
        "reference",
        "__index_level_0__",
    )
    return [{column: record[column] for column in kept_columns} for record in ds]

from datasets import Dataset

# Reduce the training split to the selected columns (as a list of dicts) and
# wrap the result straight back into a Hugging Face Dataset, so the name is
# only ever bound to a Dataset object.
flattened_train_ds = Dataset.from_list(flatten_dataset(ds["train"]))

# Show the first record as a quick sanity check.
print("Flattened dataset example:")
print(flattened_train_ds[0])

In [None]:
def combine_persona_dialogue(entry):
    """Add a ``text`` field that merges the persona and dialogue of one record.

    The items of ``entry["persona_b"]`` and of ``entry["dialogue"]`` are each
    joined with single spaces and placed into the template
    ``"persona: <persona> dialogue: <dialogue>"``.

    Args:
        entry: A mutable mapping with ``persona_b`` and ``dialogue`` keys, each
            holding an iterable of strings.

    Returns:
        The same ``entry`` object, modified in place with the new ``text`` key.
    """
    joined = {field: " ".join(entry[field]) for field in ("persona_b", "dialogue")}
    entry["text"] = f"persona: {joined['persona_b']} dialogue: {joined['dialogue']}"
    return entry

# Run combine_persona_dialogue over every record so each one gains a "text" column.
combined_train_ds = flattened_train_ds.map(function=combine_persona_dialogue)

# Show the merged text of the first record as a sanity check.
print("\nCombined dataset example:")
first_combined_text = combined_train_ds[0]["text"]
print(first_combined_text)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field with the notebook-level ``tokenizer``.

    Args:
        examples: A mapping whose ``text`` entry holds the text to tokenize.

    Returns:
        Whatever ``tokenizer`` returns for that text, with truncation enabled
        and a maximum length of 512 tokens.
    """
    tokenizer_options = {"truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize the combined text; batched=True hands tokenize_function a batch of
# records per call instead of a single record.
tokenized_train_ds = combined_train_ds.map(function=tokenize_function, batched=True)

# Show the token ids of the first record as a sanity check.
print("\nTokenized dataset example (input_ids):")
first_input_ids = tokenized_train_ds[0]["input_ids"]
print(first_input_ids)

In [None]:
# Switch the dataset's output format in place: indexing now yields PyTorch
# tensors and exposes only the two columns listed here.
model_input_columns = ["input_ids", "attention_mask"]
tokenized_train_ds.set_format("torch", columns=model_input_columns)

# Show the first record in its new format.
print("\nFormatted dataset example:")
print(tokenized_train_ds[0])

In [None]:
# Fresh temporary directories are created every time this cell runs:
# one for checkpoints/outputs and one for logs.
checkpoint_dir = tempfile.mkdtemp()
log_dir = tempfile.mkdtemp()

# Training hyperparameters. With a per-device batch of 2 and gradients
# accumulated over 2 steps, each optimizer update sees 4 examples per device.
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    logging_dir=log_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    report_to="none",  # do not report to any external experiment tracker
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modeling. mlm=False turns off masked-language-model
# masking, which is the setting used for causal (next-token) training.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer

# Bundle the model, the tokenized training data, the batch collator and the
# training arguments into a Trainer.
trainer = Trainer(
    train_dataset=tokenized_train_ds,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

# Print the Trainer object to confirm it was constructed.
print(trainer)

In [None]:
# Run fine-tuning with the Trainer configured above. As the last expression in
# the cell, the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Write the model and its tokenizer into the same directory so both can be
# reloaded from one path.
save_dir = "./my_finetuned_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)