In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, get_scheduler
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

In [None]:
# Load the FLAN 2021 submix dataset and the Pythia-410m tokenizer by their hub identifiers.
dataset = load_dataset("conceptofmind/flan2021_submix_original")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m")

# System turn that is prepended to every example, wrapped in the tokenizer's BOS/EOS tokens.
system_prompt = (
    tokenizer.bos_token
    + "system\n The following is a conversation between user and an AI assistant. "
    + "The assistant is helpful, creative, clever, and very friendly.\n"
    + tokenizer.eos_token
)

In [None]:
def tokenize_function(example):
    """Tokenize one example as a system / user / assistant conversation.

    The text is laid out as ``system_prompt``, then a BOS/EOS-wrapped ``"user: "``
    turn holding ``example["inputs"]``, then a BOS/EOS-wrapped ``"assistant: "``
    turn holding ``example["targets"]``.

    Args:
        example: Mapping with ``"inputs"`` and ``"targets"`` entries that are
            concatenated with strings (so they are expected to be ``str``).

    Returns:
        The tokenizer output for the full conversation (tokenized with
        ``truncation=True``) with an extra ``"k"`` entry: the number of tokens in
        the prompt-only text (everything up to and including ``"assistant: "``)
        minus one, i.e. the index of the last prompt token in that tokenization.
    """
    bos = tokenizer.bos_token
    eos = tokenizer.eos_token

    # Everything the model is conditioned on, ending right before the target text.
    prompt_text = system_prompt + bos + "user: " + example["inputs"] + eos + bos + "assistant: "
    full_text = prompt_text + example["targets"] + eos

    encoded = tokenizer(full_text, truncation=True)
    # The prompt is tokenized on its own (without truncation) purely to measure its length.
    # NOTE(review): this presumes the prompt-only tokens are a prefix of the full
    # tokenization and that truncation never cuts inside the prompt - confirm.
    prompt_ids = tokenizer(prompt_text)["input_ids"]
    encoded["k"] = len(prompt_ids) - 1
    return encoded

In [None]:
# Take a reproducible 20,000-example subset of the training split.
train_dataset = dataset["train"].shuffle(seed=42)
train_dataset = train_dataset.select(range(20000))

# Tokenize every example, then drop the raw text / metadata columns so that only the
# tokenizer outputs and the prompt-boundary index "k" remain.
tokenized_dataset = train_dataset.map(tokenize_function).remove_columns(
    ["inputs", "targets", "task_source", "task_name", "template_type"]
)
tokenized_dataset.set_format("torch")

# One example per batch: no padding collator is configured, so sequences of
# different lengths are never stacked together.
train_dataloader = DataLoader(dataset=tokenized_dataset, batch_size=1, shuffle=True)

In [None]:
# Load the pretrained causal LM and place it on the GPU when one is available.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-410m")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# One scheduler step is taken per batch, so the schedule length is epochs * batches.
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs

optimizer = AdamW(model.parameters(), lr=1e-5)
# Linear learning-rate schedule with no warmup over the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
loss_fct = torch.nn.CrossEntropyLoss()

In [None]:
# Switch the model to training mode before the optimisation loop runs.
model.train()

# File that the training loop writes the model's state_dict to.
model_save_path = 'flan-pythia.pt'
# History of the mean losses reported by the training loop.
training_loss = []
# Sum of the per-step losses since the last report.
running_loss = 0.0
# Step counter advanced by the training loop.
i = 0

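To perform supervised fine-tuning, we slice the logits and labels so that the loss is computed only on the target (assistant) tokens, ensuring that the task is not plain self-supervised next-token prediction over the whole sequence. The objective is to predict the target tokens given the input tokens, rather than to predict the next input tokens.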

In [None]:
# Supervised fine-tuning loop: one optimizer step per example, with the loss restricted
# to the tokens that follow the prompt. Relies on `i`, `running_loss`, `training_loss`
# and `model_save_path` having been initialised before this cell runs.

# Number of optimizer steps between loss reports / checkpoints.
log_every = 100
# Examples that contained no target token to train on (see the guard below).
skipped_examples = 0

for batch in train_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # "k" is the index of the last prompt token; it is bookkeeping, not a model input.
    # int() on the tensor requires exactly one element, i.e. a batch size of 1.
    k = int(batch.pop("k"))

    # The logit at position t predicts token t + 1, so the targets are tokens k + 1 .. end.
    # When no token exists after position k (e.g. the sequence was truncated inside the
    # prompt) both slices below would be empty and the mean-reduced CrossEntropyLoss
    # would be NaN, corrupting the running average. Skip such examples instead.
    if k + 1 >= batch["input_ids"].size(1):
        skipped_examples += 1
        continue

    i += 1
    outputs = model(**batch)
    labels = batch["input_ids"].to(outputs.logits.device)
    # Logits from the last prompt token up to (but excluding) the final position:
    # the final position would predict a token past the end of the sequence,
    # for which there is no label.
    shift_logits = outputs.logits[:, k:-1, :].contiguous()
    # Labels run from the first target token through the final token of the sequence.
    shift_labels = labels[:, k + 1:].contiguous()
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    running_loss += loss.item()

    # Every `log_every` optimizer steps: report and record the mean loss, then checkpoint.
    # `i` is intentionally not reset so that it keeps counting optimizer steps.
    if i % log_every == 0:
        mean_loss = running_loss / log_every
        print(f"step {i}: mean loss {mean_loss}")
        training_loss.append(mean_loss)
        running_loss = 0.0
        torch.save(model.state_dict(), model_save_path)

if skipped_examples:
    print(f"skipped {skipped_examples} example(s) with no target tokens")
# Final checkpoint once the pass over the data is complete.
torch.save(model.state_dict(), model_save_path)