In [2]:
from transformers import AutoTokenizer

In [3]:
## loading dataset
from datasets import load_dataset
from random import randrange

# Load the train split of the databricks-dolly-15k instruction dataset
dolly_repo = "databricks/databricks-dolly-15k"
dataset = load_dataset(dolly_repo, split="train")

# Report the number of records and show one picked at random
num_records = len(dataset)
print(f"Dataset size : {num_records}")
print(dataset[randrange(num_records)])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Dataset size : 15011
{'instruction': 'Was the Pixie Lott song "Mama Do" ever the #1 single?', 'context': '"Mama Do (Uh Oh, Uh Oh)" debuted at number one on the UK Singles Chart, selling 58,840 downloads in its first week. Lott described her reaction upon learning the song reached number one: "I was in bed at my mum and dad\'s, where I still live, in Brentwood, Essex. I was on the laptop, and I got a phone call from my A&R man. He really dragged it out. I said, \'Please put me out of my misery.\' And he said, \'I\'m really, really sorry but... you\'re No 1!\' I spent the rest of the day running around the house, screaming."\n\nThe single was certified silver by the British Phonographic Industry (BPI) on 28 August 2009 for sales in excess of 200,000 copies. Exactly 11 years later, on 28 August 2020, the certification was upgraded to gold for sales and streams of over 400,000. As a result of the single\'s popularity at the time, the single\'s iTunes B-side, "Use Somebody", debuted and pea

In [4]:
def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    The prompt is built from up to three sections separated by blank lines:
    ``### Instruction``, an optional ``### Context`` and ``### Answer``.

    Args:
        sample: Mapping with the keys ``"instruction"``, ``"context"`` and
            ``"response"``.

    Returns:
        The formatted prompt. The context section is omitted when the
        record's context is empty (or ``None``).
    """
    # Single quotes are used inside the replacement fields: reusing the outer
    # quote character in an f-string is a SyntaxError before Python 3.12.
    sections = [f"### Instruction\n{sample['instruction']}"]

    # Truthiness check skips both "" and None instead of failing on len(None).
    if sample["context"]:
        sections.append(f"### Context\n{sample['context']}")

    sections.append(f"### Answer\n{sample['response']}")
    return "\n\n".join(sections)


In [5]:
# Preview the formatted prompt for one randomly chosen record
preview_index = randrange(len(dataset))
format_dolly(dataset[preview_index])

"### Instruction\nWhat is the New Administrative Capital of Egypt?\n\n### Context\nThe New Administrative Capital (NAC) (Arabic: العاصمة الإدارية الجديدة, romanized: al-ʿĀṣima al-ʾIdārīya al-Gadīda) is a new urban community in Cairo Governorate, Egypt and a satellite of Cairo City. It is planned to be Egypt's new capital and has been under construction since 2015. It was announced by the then Egyptian housing minister Mostafa Madbouly at the Egypt Economic Development Conference on 13 March 2015. The capital city is considered one of the projects for economic development, and is part of a larger initiative called Egypt Vision 2030.\n\n### Answer\nThe New Administrative Capital is planned city that will replace Cairo as the capital or Egypt.  Construction has been underway since 2015 and is planned to be ready by 2030."

In [6]:
# Hugging Face Hub id of the model whose tokenizer is loaded below
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id) # load the tokenizer for this model
tokenizer.pad_token = tokenizer.eos_token # use the EOS token as the padding token

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [7]:
dataset.features # column names of the dataset and their feature types

{'instruction': Value('string'),
 'context': Value('string'),
 'response': Value('string'),
 'category': Value('string')}

In [8]:
 from random import randint
 from itertools import chain
 from functools import partial

 def template_dataset(sample):
     """Attach a ``text`` field to ``sample`` and return it.

     The field holds the prompt produced by ``format_dolly`` immediately
     followed by the tokenizer's EOS token. The sample is modified in place.
     """
     prompt = format_dolly(sample)
     eos = tokenizer.eos_token
     sample["text"] = f"{prompt}{eos}"
     return sample

# Replace every original column with the single templated "text" column
original_columns = list(dataset.features)
dataset = dataset.map(template_dataset, remove_columns=original_columns)

print(dataset[0])

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

{'text': "### Instruction\nWhen did Virgin Australia start operating?\n\n### Context\nVirgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.\n\n### Answer\nVirgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.</s>"}


In [12]:
# The original columns were removed by the map call; check which columns remain
dataset.features

{'text': Value('string')}

In [14]:
# Tokens left over from the previous batch because they did not fill a whole
# chunk; `chunk` prepends them to the next batch it receives.
remainder = {
    "input_ids": [],
    "attention_mask": [],  # marks which tokens are real and which are padding
    "token_type_ids": [],
}


# chunking
def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized examples into chunks of ``chunk_length`` tokens.

    Every column of the batch is flattened into one long token list, the
    tokens carried over from the previous call are put in front of it, and the
    result is cut into consecutive pieces of ``chunk_length`` tokens. Tokens
    that do not fill a complete chunk are stored in the module-level
    ``remainder`` and used by the next call.

    Args:
        sample: Batch mapping each column name to a list of per-example token
            lists. It must contain an ``"input_ids"`` column.
        chunk_length: Number of tokens per chunk.

    Returns:
        A dict with the same columns as ``sample``, each a list of chunks,
        plus a ``"labels"`` column that is a copy of the ``"input_ids"``
        chunk list.

    Note:
        If the whole batch (including carried-over tokens) is shorter than
        ``chunk_length``, it is emitted as a single shorter chunk and nothing
        is carried over.
    """
    global remainder

    # Flatten each column and prepend what the previous batch left behind.
    # `.get` lets columns that are absent from the initial `remainder` through.
    concat_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k]))
        for k in sample.keys()
    }

    batch_total_length = len(concat_examples[list(sample.keys())[0]])

    # Round down to a whole number of chunks; the tail becomes the remainder.
    if batch_total_length >= chunk_length:
        batch_total_length = (batch_total_length // chunk_length) * chunk_length

    # Slice (not index) so every entry is a list of up to `chunk_length` tokens.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_total_length, chunk_length)]
        for k, t in concat_examples.items()
    }

    remainder = {k: t[batch_total_length:] for k, t in concat_examples.items()}

    result["labels"] = result["input_ids"].copy()

    return result


# Step 1: tokenize the "text" column, dropping the raw text afterwards
tokenized_dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,  # map passes batches of examples (1000 by default)
    remove_columns=list(dataset.features),
)

# Step 2: pack the tokenized examples into chunks of 2048 tokens
chunk_fn = partial(chunk, chunk_length=2048)
lm_dataset = tokenized_dataset.map(chunk_fn, batched=True)

print(f"Total Number of samples : {len(lm_dataset)}")


Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Total Number of samples : 1528
