In [None]:
!pip install -q accelerate
!pip install -q bitsandbytes
!pip install -q trl
!pip install -q peft
!pip install -q transformers

In [None]:
# Hugging Face Hub id of the base model that is loaded and fine-tuned below.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Module names that receive LoRA adapters. This must be defined: the LoraConfig
# cell reads `target_modules`, and with the line commented out a fresh
# "Restart & Run All" fails there with a NameError.
target_modules = ["q_proj", "v_proj"]

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import torch

In [None]:

# Quantization settings handed to `from_pretrained` when the models are loaded:
# 4-bit weights of type "nf4", double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:

from google.colab import userdata
from getpass import getpass

# Colab secrets are read-only from code: `userdata` exposes `get` but no `set`,
# and `get` raises (instead of returning a falsy value) when the secret does not
# exist or the notebook has not been granted access to it.
try:
    hf_token = userdata.get('HF')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    hf_token = None

# Fall back to an interactive prompt; getpass keeps the token out of the cell output.
if not hf_token:
    hf_token = getpass("Enter your Hugging Face token: ")

In [None]:

from huggingface_hub import login, notebook_login

# Authenticate with the token obtained in the previous cell so the notebook can
# run unattended; only show the interactive login widget when no token is set.
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:

# Map the root module ("") to device index 0.
device_map = {"": 0}

# Load the base model with the quantization settings from `bnb_config`.
# `use_cache=False` is forwarded to the model configuration.
foundation_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
    use_cache=False,
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:

# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
def generate_text(reference_text, model, tokenizer, max_length=200, do_sample=True, top_p=0.95):
    """
    Continue `reference_text` with a preloaded causal language model.

    Parameters:
    - reference_text: str, the prompt the model continues.
    - model: preloaded model exposing `.device` and `.generate(...)`.
    - tokenizer: preloaded tokenizer matching the model.
    - max_length: int, forwarded to `model.generate` as `max_length`
      (in transformers this limit counts the prompt tokens as well).
    - do_sample: bool, sample when True, decode deterministically when False.
    - top_p: float, nucleus-sampling threshold; only forwarded when
      `do_sample` is True because it has no effect otherwise.

    Returns:
    - generated_text: str, the decoded first returned sequence with special
      tokens stripped (the output is decoded in full, prompt included).
    """
    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(reference_text, return_tensors="pt").to(model.device)

    generation_kwargs = {"max_length": max_length, "do_sample": do_sample}
    if do_sample:
        # Sampling-only option: passing it in greedy mode just triggers a warning.
        generation_kwargs["top_p"] = top_p

    # Pass the padding id explicitly so `generate` does not have to fall back to
    # the EOS id (and log a message about it) on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is not None:
        generation_kwargs["pad_token_id"] = pad_token_id

    output = model.generate(**inputs, **generation_kwargs)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Baseline: let the untouched foundation model continue a short story prompt.
import logging

reference_text = "In the midst of the forest, there was a hidden treasure."

# Only show transformers log records of level ERROR and above while generating.
logging.getLogger("transformers").setLevel(logging.ERROR)

generated_text = generate_text(reference_text, foundation_model, tokenizer)

In [None]:
# Display the text produced by the previous cell (the last expression is rendered).
generated_text

'In the midst of the forest, there was a hidden treasure. This treasure was known only to a few, and it was guarded by a powerful beast. This beast, a massive and fearsome T-Rex, was the last guardian of the treasure, a treasure known as the Crystal of Life. It was said that this Crystal held the power to heal any ailment, restore any decay, and bring back the dead. It was a treasure of unimaginable value, and it was sought after by many.\n\nNow, there were two groups of treasure hunters who had learned of the Crystal of Life. One group was led by a cunning and ruthless mercenary named Captain Coldheart, and the other was led by a noble and just knight named Sir Dauntless. Both groups had set out on their journey to find the Crystal and bring it back to their respective rulers.\n\nCaptain Coldheart'

In [None]:
from datasets import load_dataset
dataset = "fka/awesome-chatgpt-prompts"

# Build the training sample:
#   1. take the first 50 rows of the train split,
#   2. tokenize their "prompt" text (adds the tokenizer's output columns),
#   3. drop the "act" column, which is not used for training.
# Selecting before tokenizing yields the same rows as tokenizing everything
# first, without spending time on rows that are discarded, and no intermediate
# DatasetDict has to be kept around and deleted afterwards.
train_sample = (
    load_dataset(dataset)["train"]
    .select(range(50))
    .map(lambda samples: tokenizer(samples["prompt"]), batched=True)
    .remove_columns('act')
)

display(train_sample)


Downloading readme:   0%|          | 0.00/274 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [None]:

import peft
from peft import LoraConfig, get_peft_model

# LoRA adapter settings that are passed to the trainer as `peft_config`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,  # names of the modules that receive adapters
    r=16,                           # rank: a larger r means more trainable parameters
    lora_alpha=16,                  # scaling factor applied to the adapter update
    lora_dropout=0.05,              # dropout on the adapter layers, to limit overfitting
    bias="none",                    # bias parameters are not trained
)


In [None]:
import os

# Base folder for everything this notebook writes (the current directory).
working_dir = './'

# Folder used as the trainer's output_dir and as the parent of the saved adapter.
output_directory = os.path.join(working_dir, "peft_lab_forest")


In [None]:
import transformers
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_directory,  # where the trainer writes its outputs
    num_train_epochs=5,
    learning_rate=2e-4,           # higher than is typical for full fine-tuning
    auto_find_batch_size=True,    # let the trainer look for a batch size that fits in memory
)


In [None]:
# Make sure padding uses the end-of-sequence token before batches are built.
tokenizer.pad_token = tokenizer.eos_token

# Collator for causal language modeling (mlm=False disables masked-LM masking).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Supervised fine-tuning of the quantized base model with the LoRA settings;
# the training text is read from the "prompt" column of the sample.
trainer = SFTTrainer(
    model=foundation_model,
    args=training_args,
    train_dataset=train_sample,
    tokenizer=tokenizer,
    dataset_text_field="prompt",
    peft_config=lora_config,
    data_collator=causal_lm_collator,
)
trainer.train()



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


TrainOutput(global_step=65, training_loss=1.590706810584435, metrics={'train_runtime': 454.477, 'train_samples_per_second': 0.55, 'train_steps_per_second': 0.143, 'train_loss': 1.590706810584435, 'epoch': 5.0})

In [None]:
# Save the trained model under <output_directory>/lora_model_forest; the same
# path is used later to load it back for inference.
peft_model_path = os.path.join(output_directory, "lora_model_forest")
trainer.model.save_pretrained(peft_model_path)


In [None]:
import gc
import torch

# Drop the notebook-level references to the training objects. Popping from the
# namespace (instead of `del`) keeps this cell re-runnable: a second run does
# not fail with a NameError once the names are already gone.
for _name in ("foundation_model", "trainer", "train_sample"):
    globals().pop(_name, None)

# Collect garbage first so the tensors of the dropped objects are actually
# released, and only then ask PyTorch to hand its cached CUDA blocks back.
# In the reverse order the cache is emptied while those tensors still exist.
gc.collect()
torch.cuda.empty_cache()


47

In [None]:
from peft import AutoPeftModelForCausalLM, PeftConfig

# Settings for loading the saved adapter. These repeat the values assigned in
# earlier cells; `os` still has to be imported by an earlier cell.
working_dir = './'
device_map = {"": 0}

output_directory = os.path.join(working_dir, "peft_lab_forest")
peft_model_path = os.path.join(output_directory, "lora_model_forest")


In [None]:
# Load the saved adapter from `peft_model_path` for inference only
# (is_trainable=False), with the same quantization settings as the training run.
# The placement uses the notebook's `device_map` variable, which is defined
# right above but was ignored in favour of a hard-coded 'cuda' string, so the
# base and fine-tuned models are now placed by one and the same setting.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    is_trainable=False,
    quantization_config=bnb_config,
    device_map=device_map,
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Run the same prompt through the fine-tuned model so its continuation can be
# compared with the base model's output shown earlier.
reference_text = "In the midst of the forest, there was a hidden treasure."
generated_text = generate_text(reference_text, loaded_model, tokenizer)
# Last expression of the cell: rendered as the cell output.
generated_text

'In the midst of the forest, there was a hidden treasure. It had been hidden there for decades and only the bravest of explorers had ever been able to find it. The treasure belonged to an ancient king who had buried it deep within the ground to keep it protected. The treasure was known to have immense power and only one person at a time could wield its abilities. For centuries, many had attempted to search for the treasure but it had only been found twice in history. It was said that the treasure could make one immortal, but it came with great price as the one who wielded the treasure had to face a never ending battle. The forest was a difficult maze to navigate, but there were people who could help make the journey easier. There were wise men who could guide you through the forest and point you in the right direction. These wise men would only reveal the path to those who were truly brave and had a strong will. Many people'