# Quantized LLMs for Synthetic Dataset Generation

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Quantized checkpoint to load (an alternative checkpoint is left commented out).
#model_name_or_path = "TheBloke/SOLAR-10.7B-Instruct-v1.0-GPTQ"
model_name_or_path = "TheBloke/OpenZephyrChat-v0.2-GPTQ"
revision = "main"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision=revision,
    device_map="auto",
    trust_remote_code=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

prompt = "Write a story about llamas"

# The prompt is forwarded verbatim; no chat/instruction wrapper is added around it.
prompt_template = prompt

In [2]:
print("\n\n*** Generate:")

# Keep the full encoding so the attention mask is passed to `generate` together with
# the ids, and move it to the model's own device rather than a hard-coded `.cuda()`
# (the model was loaded with device_map="auto").
encoded_prompt = tokenizer(prompt_template, return_tensors='pt').to(model.device)
input_ids = encoded_prompt.input_ids
output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_prompt.attention_mask,
    temperature=0.7,
    do_sample=True,
    top_p=0.95,
    top_k=40,
    max_new_tokens=256,
)
# Special tokens are intentionally left in the decoded text, as before.
print(tokenizer.decode(output[0]))



*** Generate:




<s> Write a story about llamas.

Once upon a time, in a far-off land, there was a herd of llamas who lived in a lush green valley. These llamas were not ordinary llamas; they were the wisest and most intelligent creatures in the land.

Their leader was a majestic llama named Llama Llama, who was known for his wisdom and kindness. Llama Llama would often gather the herd to share his thoughts and teachings. He believed that by working together, they could achieve great things.

One day, Llama Llama received a vision in his dreams. In the dream, he saw a great darkness approaching their valley. It was a force of evil that threatened to destroy their home and everything they held dear.

Determined to protect his herd, Llama Llama called a meeting to discuss their next steps. The llamas listened intently as Llama Llama explained his vision. They knew that they had to act quickly to defend their home.

The llamas came up with a plan. They would create a magical barrier around their valley, u

In [6]:
print("Testing parallelism...")
# Batch of 8 identical prompts: every row has the same length, so no padding is needed.
# The attention mask is still forwarded explicitly, and the tensors follow the model's
# device instead of assuming `.cuda()`.
encoded_batch = tokenizer(8*[prompt_template], return_tensors='pt').to(model.device)
input_ids = encoded_batch.input_ids
output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_batch.attention_mask,
    temperature=0.7,
    do_sample=True,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

Testing parallelism...


In [5]:
# Show only the first sequence of the generated batch.
first_sequence = output[0]
print(tokenizer.decode(first_sequence))

<s> Write a story about llamas and space.

In the year 3015, humans had finally conquered space travel. They had explored far and wide, discovering new planets and galaxies. But one day, something strange happened. As they were exploring a new planet, they stumbled upon a group of llamas.

The llamas were not like the llamas they knew back on Earth. These llamas had evolved and adapted to the harsh conditions of space. They had long, slender legs that allowed them to move quickly through the vast expanse of space. Their fur was thick and warm, protecting them from the cold temperatures of space. And they had developed the ability to breathe in the vacuum of space, thanks to a special oxygen tank attached to their backs.

The humans were amazed by these space llamas. They were unlike anything they had ever seen before. They named the llamas "Cosmic Camelids" and began to study them closely. They discovered that the llamas had developed a unique way of communicating with each other. They

# Other Tests

In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Seq2seq checkpoint used for the rest of the notebook; alternatives are left commented out.
#checkpoint = "t5-base"
checkpoint = "t5-small"
#checkpoint = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [2]:
# Load the seq2seq model for `checkpoint`; the trailing expression displays its module tree.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [3]:
# Sample passage used as the summarization input in the cells below.
text = """
The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. 
It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. 
It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. 
And no one making under $400,000 per year will pay a penny more in taxes.
"""

In [4]:
# Prepend the task prefix to the passage and encode it as a PyTorch tensor.
summarize_prompt = "summarize: " + text
input_ids = tokenizer(summarize_prompt, return_tensors="pt").input_ids
# NOTE(review): `decoder_ids` is not referenced by any cell visible here.
decoder_ids = tokenizer("", return_tensors="pt").input_ids

In [5]:
# Generate a summary of at most 100 new tokens from the encoded prompt.
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=100,
)

In [6]:
# Display the raw generated token ids.
outputs

tensor([[    0,     8,    16,    89,  6105,  4709,  1810,  1364,     7,  7744,
          2672,  1358,     6,   533,   124,  1358,     6,    11,   827,  1358,
             3,     5,    34,    31,     7,     8,   167,  8299,  1041,    30,
             3, 26074,     8,  3298,  5362,    16, 10211,   892,     3,     5,
            34,    56,   987,     8,  6173,    18,  1123,   138,   189,    63,
            11, 11711,    12,   726,    70,  2725,   698,     3,     5,     1]])

In [7]:
# Turn the first (and only) generated sequence back into text, dropping special tokens.
summary_ids = outputs[0]
tokenizer.decode(summary_ids, skip_special_tokens=True)

"the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share."

# Summarization Fine-tuning

In [8]:
from datasets import load_dataset

# Only the "ca_test" split of the billsum dataset is loaded.
billsum = load_dataset("billsum", split="ca_test")

In [9]:
# Hold out 20% of the examples for evaluation. The seed is fixed so the same rows
# land in train/test on every run, keeping training and metrics reproducible.
billsum = billsum.train_test_split(test_size=0.2, seed=42)
billsum["train"][0]

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nChapter 2.92 (commencing with Section 1001.85) is added to Title 6 of Part 2 of the Penal Code, to read:\nCHAPTER  2.92. Law Enforcement Assisted Diversion (LEAD) Pilot Program\n1001.85.\n(a) The Law Enforcement Assisted Diversion (LEAD) pilot program is hereby established. The purpose of the LEAD program is to improve public safety and reduce recidivism by increasing the availability and use of social service resources while reducing costs to law enforcement agencies and courts stemming from repeated incarceration.\n(b) LEAD pilot programs shall be consistent with the following principles, implemented to address and reflect the priorities of the community in which the program exists:\n(1) Providing intensive case management services and an individually tailored intervention plan that acts as a blueprint for assisting LEAD participants.\n(2) Prioritizing temporary and permanent housing that includes i

In [10]:
from transformers import AutoTokenizer

# Recreate the tokenizer for `checkpoint` so the fine-tuning cells use a matching one.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [11]:
from transformers import BatchEncoding

# Task prefix prepended to every document before tokenization.
prefix = "summarize: "


def preprocess_function(examples) -> BatchEncoding:
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping that provides a ``"text"`` sequence of documents and a
            ``"summary"`` sequence of reference summaries.

    Returns:
        The tokenized documents (truncated to 1024 tokens) with an added ``"labels"``
        entry holding the token ids of the summaries (truncated to 128 tokens).
    """
    prefixed_docs = [prefix + document for document in examples["text"]]

    encoded_docs = tokenizer(prefixed_docs, truncation=True, max_length=1024)
    encoded_summaries = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=128,
    )

    encoded_docs["labels"] = encoded_summaries["input_ids"]
    return encoded_docs

In [12]:
# Apply the preprocessing to both splits, feeding examples in batches.
tokenized_billsum = billsum.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object itself rather than the checkpoint name: the collator only uses
# `model` to call its `prepare_decoder_input_ids_from_labels` hook, which a string
# does not have.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [14]:
import evaluate

# ROUGE metric used by `compute_metrics` during evaluation.
rouge = evaluate.load(path="rouge")

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple, in which case its first element is used.

    Returns:
        dict with every ROUGE entry plus ``"gen_len"`` (mean number of non-pad tokens
        per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking sentinel, not a vocabulary id, and cannot be decoded. It is
    # replaced by the pad id in the predictions as well as the labels, because
    # generated sequences can also be padded with -100 when batches are gathered.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [16]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Reuse the model from an earlier cell when there is one. Looking the name up through
# globals() avoids the NameError a bare `model is None` raises when `model` was never
# bound, which is exactly the case in which the fallback load is needed.
if globals().get("model") is None:
    print("new model...")
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [17]:
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention.
    output_dir="local",
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    fp16=True,
    # Evaluate once per epoch, using generation to produce the predictions.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # No external experiment-tracking integration.
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 