In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import json
import os

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [2]:
# !pip install peft

In [3]:
with open("../keys.json", "r") as file:
    token_data = json.load(file)
HUGGINGFACE_TOKEN = token_data["huggingface_access_token"]

os.environ["HF_TOKEN"] = HUGGINGFACE_TOKEN 

In [4]:
# Define the instruct-tuned model checkpoint
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
scratch_dir = "../models"

# Load the tokenizer (force using the slow tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=scratch_dir)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=scratch_dir,
    device_map="auto",
    torch_dtype=torch.float16
)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Define the LoRA configuration.
lora_config = LoraConfig(
    r=8,                      # rank of update matrices
    lora_alpha=32,            # scaling factor
    target_modules=["q_proj", "v_proj"],  # which modules to adapt (example modules)
    lora_dropout=0.1,         # dropout probability for lora layers
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Load and preprocess a dataset (here we use Wikitext-2 as an example)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

In [6]:
sample_dataset = dataset.select(range(100))

In [7]:
# Set the pad_token if it isn't already defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
def tokenize_function(example):
    outputs = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512  # adjust max_length as required
    )
    outputs["labels"] = outputs["input_ids"].copy()
    return outputs

tokenized_dataset = sample_dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:

# Define training arguments.
training_args = TrainingArguments(
    output_dir="./lora_mistral",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # simulate larger batch sizes
    learning_rate=1e-4,
    fp16=True,                     # mixed precision training
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="no"
)





In [9]:
# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

# Save the LoRA adapter weights (or the entire model if desired)
model.save_pretrained("./lora_mistral_adapter")

Step,Training Loss


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

## Predict pipeline for Mistral-7b

In [12]:
# Define the instruct-tuned model checkpoint
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
scratch_dir = "../models"

# Load the tokenizer (force using the slow tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=scratch_dir)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=scratch_dir,
    device_map="auto",
    torch_dtype=torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:

# Example prompt for inference
prompt = "Once upon a time"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# Generate output (adjust parameters like max_new_tokens as desired)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,   # maximum number of new tokens to generate
        do_sample=True,      # use sampling; set to False for greedy decoding
        temperature=0.7      # adjust temperature for randomness
    )

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Once upon a time, there was a man named David. He was a shepherd in the land of Israel. He was tall, strong and handsome. But what really set him apart was his heart for God.

David's family was part of the


## Prediction pipeline (LORA)

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

In [16]:
# Define the instruct-tuned model checkpoint
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
scratch_dir = "../models"

# Load the tokenizer (force using the slow tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=scratch_dir)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=scratch_dir,
    device_map="auto",
    torch_dtype=torch.float16
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [17]:
# Load the LoRA adapter weights into the base model
model = PeftModel.from_pretrained(model, "./lora_mistral_adapter")
model.eval()  # set model to evaluation mode

# Example prompt for inference
prompt = "Once upon a time"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

# Generate output (adjust parameters like max_new_tokens as desired)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,   # maximum number of new tokens to generate
        do_sample=True,      # use sampling; set to False for greedy decoding
        temperature=0.7      # adjust temperature for randomness
    )

# Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Once upon a time, a friend gave me a book with a simple title, The Bible. I had heard of it before, but I had never read it. I had no idea what I was in for. I never imagined it would become an essential part of my
