In [None]:
# !pip install jsonlines
!pip install datasets
!pip install transformers

Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.32.0


In [None]:
import pandas as pd
import datasets

from pprint import pprint
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer published for the 'EleutherAI/pythia-70m' checkpoint.
# The tokenizer files are downloaded on first use (see the download log below).
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-70m')

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Sample sentence used for the encode/decode round trip in the next cells.
text = "Hello bro what are you doing"

In [37]:
# The tokenizer returns a mapping; keep only its 'input_ids' entry (the token ids).
encoded_text = tokenizer(text)['input_ids']

In [None]:
# Display the token ids produced for the sample sentence.
encoded_text

[12092, 1795, 752, 403, 368, 2509]

In [None]:
# Turn the token ids back into a string.
decoded_text = tokenizer.decode(encoded_text)

In [None]:
# Display the decoded string; it should match the original sample sentence.
decoded_text

'Hello bro what are you doing'

# Tokenize multiple texts at once

In [None]:
# Passing a list of strings tokenizes every entry in a single call.
list_text = [
    'i love you',
    'i hate you',
    'will you marry me?',
]
encoded_texts = tokenizer(list_text)

In [None]:
# One list of token ids per input string; the lists have different lengths.
print('Encoded several texts : ',encoded_texts['input_ids'])

Encoded several texts :  [[74, 2389, 368], [74, 9239, 368], [9846, 368, 17129, 479, 32]]


# Padding and Truncation

In [None]:
# Reuse the end-of-sequence token as the padding token before the padding
# examples that follow (it shows up as id 0 in the padded output below).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# padding=True pads the shorter sequences up to the longest one in the batch.
encoded_text_longest = tokenizer(
    list_text,
    padding=True,
)
print('Using padding : ', encoded_text_longest['input_ids'])

Using padding :  [[74, 2389, 368, 0, 0], [74, 9239, 368, 0, 0], [9846, 368, 17129, 479, 32]]


In [None]:
# truncation=True together with max_length=3 cuts every sequence to at most 3 tokens.
encoded_text_truncation = tokenizer(
    list_text,
    truncation=True,
    max_length=3,
)
print('Using truncation : ', encoded_text_truncation['input_ids'])

Using truncation :  [[74, 2389, 368], [74, 9239, 368], [9846, 368, 17129]]


In [None]:
# Drop tokens from the start of an over-long sequence instead of from its end.
# NOTE: truncation_side is set on the shared tokenizer object, so it stays
# 'left' for every later cell that truncates.
tokenizer.truncation_side = 'left'
encoded_text_truncation_left = tokenizer(
    list_text,
    truncation=True,
    max_length=3,
)
print('Using left-side truncation : ', encoded_text_truncation_left['input_ids'])

Using left-side truncation :  [[74, 2389, 368], [74, 9239, 368], [17129, 479, 32]]


In [None]:
# Truncate to at most 3 tokens and pad anything shorter up to the batch's longest.
# In this example every sequence has exactly 3 tokens, so no padding is added.
encoded_text_both = tokenizer(
    list_text,
    padding=True,
    truncation=True,
    max_length=3,
)
print('Using both padding and truncation : ', encoded_text_both['input_ids'])

Using both padding and truncation :  [[74, 2389, 368], [74, 9239, 368], [17129, 479, 32]]


### Prepare instruction dataset

In [None]:
from datasets import load_dataset
# Load the "lamini/lamini_docs" dataset; the log below reports a 1260-row
# train split and a 140-row test split.
dataset = load_dataset("lamini/lamini_docs")

Downloading readme:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1260 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/140 [00:00<?, ? examples/s]

In [49]:
# Peek at the first training question. Indexing the row directly avoids
# converting every column of the whole split into Python lists just to read
# a single value.
dataset['train'][0]['question']

'How can I evaluate the performance and quality of the generated text from Lamini models?'

In [39]:
import pandas as pd
from pprint import pprint

# Alternative source: a local JSON Lines copy of the same data.
# filename = "lamini_docs.jsonl"
# instruction_dataset_df = pd.read_json(filename, lines=True)
# examples = instruction_dataset_df.to_dict()
examples = dataset['train'].to_dict()

# Concatenate the first prompt/completion pair, using the first column schema
# that is present; fall back to a plain "text" column when none matches.
for prompt_key, completion_key in (
    ("question", "answer"),
    ("instruction", "response"),
    ("input", "output"),
):
    if prompt_key in examples and completion_key in examples:
        text = examples[prompt_key][0] + examples[completion_key][0]
        break
else:
    text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

# Wrap every question in the prompt template; answers are kept verbatim.
num_examples = len(examples["question"])
finetuning_dataset = []
for question, answer in zip(examples["question"], examples["answer"]):
    finetuning_dataset.append(
        {"question": prompt_template.format(question=question), "answer": answer}
    )

print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'There are several metrics that can be used to evaluate the '
           'performance and quality of generated text from Lamini models, '
           'including perplexity, BLEU score, and human evaluation. Perplexity '
           'measures how well the model predicts the next word in a sequence, '
           'while BLEU score measures the similarity between the generated '
           'text and a reference text. Human evaluation involves having human '
           'judges rate the quality of the generated text based on factors '
           'such as coherence, fluency, and relevance. It is recommended to '
           'use a combination of these metrics for a comprehensive evaluation '
           "of the model's performance.",
 'question': '### Question:\n'
             'How can I evaluate the performance and quality of the generated '
             'text from Lamini models?\n'
             '\n'
             '### Answer:'}


# Tokenize a single example

In [40]:
# Tokenize the first templated example (prompt immediately followed by its answer).
first_example = finetuning_dataset[0]
text = first_example["question"] + first_example["answer"]
tokenized_inputs = tokenizer(text, padding=True, return_tensors="np")
print(tokenized_inputs["input_ids"])

[[ 4118 19782    27   187  2347   476   309  7472   253  3045   285  3290
    273   253  4561  2505   432   418  4988    74  3210    32   187   187
   4118 37741    27  2512   403  2067 17082   326   476   320   908   281
   7472   253  3045   285  3290   273  4561  2505   432   418  4988    74
   3210    13  1690 44229   414    13   378  1843    54  4868    13   285
   1966  7103    15  3545 12813   414  5593   849   973   253  1566 26295
    253  1735  3159   275   247  3425    13  1223   378  1843    54  4868
   5593   253 14259   875   253  4561  2505   285   247  3806  2505    15
   8801  7103  8687  1907  1966 16006  2281   253  3290   273   253  4561
   2505  1754   327  2616   824   347 25253    13  2938  1371    13   285
  17200    15   733   310  8521   281   897   247  5019   273   841 17082
    323   247 11088  7103   273   253  1566   434  3045    15]]


In [41]:
# Cap the sequence length at 2048 tokens, or at the example's own token count
# when that is smaller.
token_count = tokenized_inputs["input_ids"].shape[1]
max_length = min(token_count, 2048)

In [42]:
# Tokenize again, this time truncating so the result never exceeds max_length.
tokenized_inputs = tokenizer(
    text,
    max_length=max_length,
    truncation=True,
    return_tensors="np",
)

In [43]:
# Display the token ids of the (possibly truncated) example.
tokenized_inputs["input_ids"]

array([[ 4118, 19782,    27,   187,  2347,   476,   309,  7472,   253,
         3045,   285,  3290,   273,   253,  4561,  2505,   432,   418,
         4988,    74,  3210,    32,   187,   187,  4118, 37741,    27,
         2512,   403,  2067, 17082,   326,   476,   320,   908,   281,
         7472,   253,  3045,   285,  3290,   273,  4561,  2505,   432,
          418,  4988,    74,  3210,    13,  1690, 44229,   414,    13,
          378,  1843,    54,  4868,    13,   285,  1966,  7103,    15,
         3545, 12813,   414,  5593,   849,   973,   253,  1566, 26295,
          253,  1735,  3159,   275,   247,  3425,    13,  1223,   378,
         1843,    54,  4868,  5593,   253, 14259,   875,   253,  4561,
         2505,   285,   247,  3806,  2505,    15,  8801,  7103,  8687,
         1907,  1966, 16006,  2281,   253,  3290,   273,   253,  4561,
         2505,  1754,   327,  2616,   824,   347, 25253,    13,  2938,
         1371,    13,   285, 17200,    15,   733,   310,  8521,   281,
      

# Tokenize the instruction dataset

In [57]:
def tokenize_function(examples):
    """Tokenize every row of a batch as prompt text followed by completion text.

    Written for ``Dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists of values, one entry per row of the batch. All rows of the
    batch are tokenized (previously only the first row was read, so the
    function was only usable with ``batch_size=1``).

    The text of each row is built from the first column schema found, in this
    order: ``question`` + ``answer``, ``instruction`` + ``response``,
    ``input`` + ``output``; otherwise the ``text`` column is used as is.

    Args:
        examples: Mapping of column name to a list of strings.

    Returns:
        The tokenizer output (NumPy arrays, one row per example). Sequences
        longer than 2048 tokens are truncated from the left; within a batch,
        shorter sequences are padded to the longest one. With a single-row
        batch no padding is added.

    Raises:
        KeyError: propagated from the column lookup when none of the supported
            columns is present.
    """
    max_tokens = 2048

    # Pick the first matching schema and build one string per row.
    for prompt_column, completion_column in (
        ("question", "answer"),
        ("instruction", "response"),
        ("input", "output"),
    ):
        if prompt_column in examples and completion_column in examples:
            texts = [
                prompt + completion
                for prompt, completion in zip(
                    examples[prompt_column], examples[completion_column]
                )
            ]
            break
    else:
        texts = list(examples["text"])

    # Configure the shared tokenizer here so the function does not depend on
    # earlier cells having been run: padding needs a pad token, and truncation
    # should drop tokens from the start of over-long sequences.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"

    # A single pass is enough: truncation to max_tokens leaves shorter
    # sequences untouched, so no preliminary length measurement is needed.
    return tokenizer(
        texts,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=max_tokens,
    )

In [60]:
# finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")
from datasets import Dataset

# Rebuild the train split as a standalone Dataset and tokenize it.
finetuning_dataset_loaded = Dataset.from_dict(dataset['train'].to_dict())
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,          # each call to tokenize_function receives exactly one row
    drop_last_batch=True,  # no effect with batch_size=1: every batch is full
)

# Show only a small sample; printing the whole column would dump every
# tokenized row into the notebook output.
print(tokenized_dataset[:2]['input_ids'])

Map:   0%|          | 0/1260 [00:00<?, ? examples/s]

[[2347, 476, 309, 7472, 253, 3045, 285, 3290, 273, 253, 4561, 2505, 432, 418, 4988, 74, 3210, 32, 2512, 403, 2067, 17082, 326, 476, 320, 908, 281, 7472, 253, 3045, 285, 3290, 273, 4561, 2505, 432, 418, 4988, 74, 3210, 13, 1690, 44229, 414, 13, 378, 1843, 54, 4868, 13, 285, 1966, 7103, 15, 3545, 12813, 414, 5593, 849, 973, 253, 1566, 26295, 253, 1735, 3159, 275, 247, 3425, 13, 1223, 378, 1843, 54, 4868, 5593, 253, 14259, 875, 253, 4561, 2505, 285, 247, 3806, 2505, 15, 8801, 7103, 8687, 1907, 1966, 16006, 2281, 253, 3290, 273, 253, 4561, 2505, 1754, 327, 2616, 824, 347, 25253, 13, 2938, 1371, 13, 285, 17200, 15, 733, 310, 8521, 281, 897, 247, 5019, 273, 841, 17082, 323, 247, 11088, 7103, 273, 253, 1566, 434, 3045, 15], [5804, 309, 1089, 1491, 670, 253, 2127, 434, 2746, 281, 10885, 1048, 14, 24220, 8892, 285, 4114, 7375, 32, 4374, 13, 253, 2127, 3797, 3082, 323, 29315, 7375, 13, 12669, 2628, 3708, 13, 285, 48484, 2628, 1543, 15, 733, 671, 3797, 247, 1332, 323, 14002, 272, 7375, 15, 9157, 

# Prepare Train/Test Split

In [61]:
import numpy as np

In [62]:
# Hold out 10% of the tokenized rows as a test set; the fixed seed keeps the
# shuffled split repeatable.
split_dataset = tokenized_dataset.train_test_split(
    test_size=0.1,
    seed=123,
    shuffle=True,
)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1134
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 126
    })
})


In [63]:
# Identifiers of other Lamini datasets — presumably usable in place of
# "lamini/lamini_docs" with load_dataset (TODO confirm); none of them is used
# in the cells visible here.
taylor_swift_dataset = "lamini/taylor_swift"
bts_dataset = "lamini/bts"
open_llms = "lamini/open_llms"