In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch

model_name = "EleutherAI/pythia-410m"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="lamini_docs.jsonl")["train"]
dataset[0]  


Generating train split: 0 examples [00:00, ? examples/s]

{'question': 'How can I evaluate the performance and quality of the generated text from Lamini models?',
 'answer': "There are several metrics that can be used to evaluate the performance and quality of generated text from Lamini models, including perplexity, BLEU score, and human evaluation. Perplexity measures how well the model predicts the next word in a sequence, while BLEU score measures the similarity between the generated text and a reference text. Human evaluation involves having human judges rate the quality of the generated text based on factors such as coherence, fluency, and relevance. It is recommended to use a combination of these metrics for a comprehensive evaluation of the model's performance."}

# Prepare data for supervised fine-tuning (SFT) with loss only on the answer

In [5]:
def format_example(example):
    question = example["question"]
    answer = example["answer"]

    prompt = f"Question: {question}\n\nAnswer:"
    answer_text = " " + answer  # leading space helps tokenization

    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    answer_ids = tokenizer(answer_text, add_special_tokens=False)["input_ids"]

    input_ids = prompt_ids + answer_ids
    labels = [-100] * len(prompt_ids) + answer_ids  # loss only on answer

    # Truncate if too long
    max_length = 512
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length]
        labels = labels[:max_length]

    return {
        "input_ids": input_ids,
        "labels": labels,
    }

processed_dataset = dataset.map(format_example, remove_columns=dataset.column_names)


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

In [6]:
def pad_to_max_length(batch, max_length=512):
    input_ids = batch["input_ids"]
    labels = batch["labels"]

    pad_id = tokenizer.pad_token_id

    padded_input_ids = []
    padded_labels = []

    for ids, lbls in zip(input_ids, labels):
        pad_len = max_length - len(ids)
        padded_input_ids.append(ids + [pad_id] * pad_len)
        padded_labels.append(lbls + [-100] * pad_len)

    return {
        "input_ids": padded_input_ids,
        "labels": padded_labels,
    }

processed_dataset = processed_dataset.map(
    pad_to_max_length,
    batched=True,
)


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

# Add LoRA to the model

In [7]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()  # should show only a small number of params trainable


trainable params: 786,432 || all params: 406,120,448 || trainable%: 0.1936


# Define Trainer & train

In [8]:
from transformers import TrainingArguments, Trainer
import os

output_dir = "pythia_lamini_lora"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    weight_decay=0.0,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
)

trainer.train()


The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
50,1.9905
100,1.8659
150,1.7932
200,1.687
250,1.7015


TrainOutput(global_step=264, training_loss=1.799481254635435, metrics={'train_runtime': 1441.4769, 'train_samples_per_second': 2.914, 'train_steps_per_second': 0.183, 'total_flos': 4575309122764800.0, 'train_loss': 1.799481254635435, 'epoch': 3.0})

# Save the LoRA adapter

In [9]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('pythia_lamini_lora/tokenizer_config.json',
 'pythia_lamini_lora/special_tokens_map.json',
 'pythia_lamini_lora/tokenizer.json')

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

base_model_name = "EleutherAI/pythia-410m"
adapter_dir = "pythia_lamini_lora"

tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(50304, 1024)
        (emb_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-23): 24 x GPTNeoXLayer(
            (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (post_attention_dropout): Dropout(p=0.0, inplace=False)
            (post_mlp_dropout): Dropout(p=0.0, inplace=False)
            (attention): GPTNeoXAttention(
              (query_key_value): lora.Linear(
                (base_layer): Linear(in_features=1024, out_features=3072, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=Fal

In [11]:
def inference(question, model, tokenizer, max_new_tokens=100):
    prompt = f"Question: {question}\n\nAnswer:"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )

    output = tokenizer.decode(generated[0], skip_special_tokens=True)
    # Optionally strip the prompt part:
    return output[len(prompt):].strip()

print(inference("What is lamini?", model, tokenizer))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  test_elements = torch.tensor(test_elements)
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Lamini is a language model that can be used to create natural language text models. Lamini’s goal is to make it easier for developers and researchers to create and test language models. Lamini’s open source project provides a Python API for developers to integrate Lamini into their existing projects. Lamini’s documentation provides detailed information on how to use the model, and it is recommended that developers follow the instructions provided. Lamini also provides a community of users
