In [1]:
import os

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
from datasets import load_dataset, Dataset

In [4]:
from post_training import get_lora_config, get_training_arguments
from dataset import get_baseline_dataset
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, DataCollatorWithPadding



In [5]:
tiny_textbooks = load_dataset("nampdn-ai/tiny-textbooks")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

In [6]:
bookcorpus = load_dataset("bookcorpus")

In [67]:
wikitext = load_dataset("wikitext", "wikitext-2-raw-v1")

In [8]:
c4 = load_dataset("c4", "en", streaming=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [9]:
minipile = load_dataset("JeanKaddour/minipile", streaming=True)

In [10]:
alpaca = load_dataset("tatsu-lab/alpaca")

## Dataset helpers

In [11]:
from loguru import logger
import numpy as np
from torch.utils.data import Dataset
import random

class TokenizedDataset(Dataset):
    def __init__(self, list_of_strings, tokenizer, max_length=2048):
        self.data = []
        self.tokenizer = tokenizer
        self.total_calls = 0
        self.total_length = 0
        tokenizer.padding_side = "right"
        pad = "do_not_pad"
        self.max_length = max_length
        for s in list_of_strings:
            encoded = tokenizer(
                text=s + tokenizer.eos_token,
                return_tensors="np",
                truncation=True,
                max_length=self.max_length,
                padding=pad,
            )
            self.total_length += encoded['input_ids'].shape[1]
            self.data.append({
                'input_ids': encoded['input_ids'].squeeze(0),
                'labels': encoded['input_ids'].squeeze(0),
                'attention_mask': encoded['attention_mask'].squeeze(0)
            })
        self.mean_length = self.total_length / len(list_of_strings)
        self.packed_data = self.data.copy()
        #self.pack(64)

    def __len__(self):
        return len(self.packed_data)

    def __getitem__(self, idx):
        return self.packed_data[idx]

class TokenizedQADataset(TokenizedDataset):
    """
    Same as the tokenized dataset, but designed to "mask out" the labels of
    the prompts such that they don't affect the model's loss.
    
    Question and answer pairs are concatenated as they are given. Make sure
    to include the right separator between them (" ", "\n", ". ", etc.)
    """
    def __init__(self, list_of_question_answers, tokenizer, max_length=2048):
        self.data = []
        self.tokenizer = tokenizer
        self.total_calls = 0
        self.total_length = 0
        tokenizer.padding_side = "right"
        self.max_length = max_length
        for question, answer in list_of_question_answers:
            encoded_question = tokenizer(
                text=question,
                return_tensors="np",
                truncation=True,
                max_length=self.max_length,
                padding="do_not_pad",
            )
            encoded_answer = tokenizer(
                text=answer+tokenizer.eos_token,
                return_tensors="np",
                truncation=True,
                max_length=self.max_length,
                padding="max_length",
            )
            encoded_inputs = np.concatenate([encoded_question["input_ids"],
                                            encoded_answer["input_ids"]], axis=-1)
            # labels have to be -100 so that the question does not affect the model's loss
            encoded_labels = np.concatenate([np.array([-100] * encoded_question["input_ids"].shape[-1])[None, :],
                                            encoded_answer["input_ids"]], axis=-1)
            encoded_attention_mask = np.concatenate([encoded_question["attention_mask"],
                                            encoded_answer["attention_mask"]], axis=-1)
            encoded_labels[encoded_labels == tokenizer.eos_token_id] = -100

            self.total_length += encoded_inputs.shape[1]
            self.data.append({
                'input_ids': encoded_inputs.squeeze(0)[:max_length],
                'labels': encoded_labels.squeeze(0)[:max_length],
                'attention_mask': encoded_attention_mask.squeeze(0)[:max_length]
            })
        self.mean_length = self.total_length / len(list_of_question_answers)
        self.packed_data = self.data.copy()

In [12]:
# alpaca = TokenizedQADataset(alpaca_list_ds, tokenizer, max_length=256)

In [13]:
# from trl import SFTTrainer

# def fn(x, y, *args):
#     return y

# SFTTrainer._prepare_dataset = fn

# from transformers import DataCollatorWithPadding

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=alpaca,
#     eval_dataset=alpaca,
#     # tokenizer=tokenizer,
#     args=training_arguments,
#     # packing=False,
#     dataset_text_field="text",
#     max_seq_length=256, # tweak this,
#     data_collator=DataCollatorWithPadding(tokenizer)
# )

## post processing

In [17]:
from datasets import load_dataset, Dataset
def to_dataset(iterable_dataset):
    data_list = [item for item in iterable_dataset]
    dataset = Dataset.from_dict({key: [dic[key] for dic in data_list] for key in data_list[0]})
    return dataset

In [19]:
c4 = c4["train"].take(2000)

In [20]:
c4 = to_dataset(c4)

In [21]:
next(iter(c4))

{'text': 'Beginners BBQ Class Taking Place in Missoula!\nDo you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.\nHe will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.\nThe cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.',
 'timestamp': '2019-04-25T12:57:54Z',
 'url': 'https://klyq.com/beginners-bbq-class-taking-place-in-missoula/'}

In [22]:
minipile = minipile["train"].take(2000)

In [23]:
next(iter(minipile))

{'text': "HTC's Vive Pro headset is available to pre-order for $799\n\nWe've seen plenty of Beats-focused KIRFs in our time, some better than others. Few, however, play quite so directly on the name as OrigAudio's Beets. For $25, adopters get a set of headphones that bear little direct resemblance to Dr. Dre's audio gear of choice, but are no doubt bound to impress friends -- at least, up until they see a root vegetable logo instead of a lower-case B. Thankfully, there's more to it than just amusing and confusing peers. Every purchase will lead to a donation of canned beets (what else?) to the Second Harvest Food Bank of Orange County. For us, that's reason enough to hope that Beats doesn't put the kibosh on OrigAudio's effort. Besides, we could use some accompaniment for our BeetBox."}

In [24]:
minipile = to_dataset(minipile)

In [25]:
tiny_textbooks = tiny_textbooks["train"].shuffle().select(range(0, 2000))

In [26]:
next(iter(tiny_textbooks))["text"]

"Manu Korihi News for 29 November 2011. One of the new Green Party MPs, who's Maori, says there's still a long way to go to have the Treaty of Waitangi recognised properly; A lawyer representing a whanau member, who took the body of James Takamore, says a series of meetings will be held with the wider hapu to talk about how to proceed following a court ruling; The senior Maori advisor at Massey University says this year's Nga Kupu Ora Maori Book Awards is a chance to celebrate a milestone in Maori language publishing; Waikato University's Te Piringa Faculty of Law, launched a new research centre today, which will tackle a variety of environmental law issues, including Maori and indigenous governance."

In [27]:
bookcorpus = bookcorpus["train"].shuffle().select(range(0, 2000))

In [28]:
next(iter(bookcorpus))["text"]

'outside , the storm continued to rage , beating against the windows , but inside they were locked in their own world .'

In [75]:
wikitext = wikitext["train"].filter(lambda example: len(example["text"]) >= 200).shuffle().select(range(0, 2000))

Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [82]:
wikitext[0]["text"]

' " In the second stanza , the tree is a sucking babe drawing nourishment from Mother Earth ; in the third it is a supplicant reaching its leafy arms to the sky in prayer ... In the fourth stanza , the tree is a girl with jewels ( a nest of robins ) in her hair ; and in the fifth , it is a chaste woman living alone with nature and with God . There is no warrant in the poem to say that it is different trees that remind the poet of these different types of people . " \n'

In [31]:
alpaca = alpaca["train"].shuffle().select(range(0, 2000))

In [32]:
alpaca_template = {
    "description": "Template used by Alpaca-LoRA.",
    "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
    "response_split": "### Response:"    
}

In [33]:
alpaca_list_ds = []
for sample in alpaca.to_list():
    if sample["input"] != "":
        prompt = alpaca_template["prompt_input"].format(instruction=sample["instruction"], input=sample["input"])
    else:
        prompt = alpaca_template["prompt_no_input"].format(instruction=sample["instruction"])
    response = sample["output"]
    alpaca_list_ds.append((prompt, response))

In [35]:
model_id = "microsoft/phi-1_5"
model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [36]:
alpaca = TokenizedQADataset(alpaca_list_ds, tokenizer, max_length=256)

## Evaluating Crossentropy

In [None]:
# model_id = "microsoft/phi-1_5"
# model_revision = "349cf8b5e81fd5f791d1740da5de1313a0419bbd" # latest as of feb 1st

In [37]:
vocab = tokenizer.get_vocab()
len(vocab)

50295

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=model_revision,
    trust_remote_code=True,
    # be careful with this?
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",
)

In [39]:
training_arguments = get_training_arguments("./tmp")

In [40]:
model = model.cuda()

In [41]:
def evaluate_dataset(dataset, custom_dataset=False):
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        dataset_text_field="text",
        max_seq_length=256, # tweak this,
        data_collator= DataCollatorWithPadding(tokenizer) if custom_dataset else None
    )
    return trainer.evaluate()

In [42]:
eval_res = {}
for ds in [c4, minipile, tiny_textbooks, wikitext, alpaca]:
    eval_res[ds] = evaluate_dataset(ds, custom_dataset=ds==alpaca)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [43]:
# eval_res[alpaca] = evaluate_dataset(alpaca, custom_dataset=True)

In [44]:
for ds, res in eval_res.items():
    print(ds)
    print(res)
    print("=============")

Dataset({
    features: ['text', 'timestamp', 'url'],
    num_rows: 2000
})
{'eval_loss': 3.2249398231506348, 'eval_runtime': 53.4456, 'eval_samples_per_second': 37.421, 'eval_steps_per_second': 4.678}
Dataset({
    features: ['text'],
    num_rows: 2000
})
{'eval_loss': 2.792614698410034, 'eval_runtime': 53.4744, 'eval_samples_per_second': 37.401, 'eval_steps_per_second': 4.675}
Dataset({
    features: ['text', 'source', 's', 'len', 'idx', 'textbook'],
    num_rows: 2000
})
{'eval_loss': 3.3270151615142822, 'eval_runtime': 52.5336, 'eval_samples_per_second': 38.071, 'eval_steps_per_second': 4.759}
Dataset({
    features: ['text'],
    num_rows: 2000
})
{'eval_loss': 4.042848110198975, 'eval_runtime': 43.792, 'eval_samples_per_second': 45.67, 'eval_steps_per_second': 5.709}
<__main__.TokenizedQADataset object at 0x7fb22d267b10>
{'eval_loss': 1.5633244514465332, 'eval_runtime': 53.5137, 'eval_samples_per_second': 37.374, 'eval_steps_per_second': 4.672}


In [84]:
# ignore the result from wikitext above, consider instead this one:
evaluate_dataset(wikitext)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'eval_loss': 3.7951738834381104,
 'eval_runtime': 50.3011,
 'eval_samples_per_second': 39.761,
 'eval_steps_per_second': 4.97}