In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [1]:
import gc
import torch

# Run a garbage-collection pass first, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [2]:
# Name of the local directory the fine-tuned model and tokenizer are saved to
# (and later reloaded from).
new_model = "DTT-unsloth-gemma-2b-it-v1"

In [3]:
# Settings shared by every FastLanguageModel.from_pretrained call in this notebook.
#   max_seq_length - sequence length handed to the loader and to the trainer.
#   dtype          - None asks for auto detection (float16 on Tesla T4 / V100,
#                    bfloat16 on Ampere and newer).
#   load_in_4bit   - True loads 4-bit quantized weights to reduce memory use; may be False.
max_seq_length = 2048
dtype = None
load_in_4bit = True


In [4]:
from unsloth import FastLanguageModel
import torch

# Load the base checkpoint and its tokenizer with the settings chosen above.
# Gated repositories (e.g. meta-llama/Llama-2-7b-hf) additionally need a `token` argument.
base_model_kwargs = dict(
    model_name = "unsloth/gemma-2b-it-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Gemma patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3050. Max memory: 7.779 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [5]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                                  # LoRA rank; any value > 0 (8, 16, 32, 64, 128 suggested)
    target_modules = lora_target_modules,
    lora_alpha = 16,
    lora_dropout = 0,                        # any value is supported, 0 is the optimized setting
    bias = "none",                           # any value is supported, "none" is the optimized setting
    use_gradient_checkpointing = "unsloth",  # True or "unsloth"; the latter targets very long context
    random_state = 3407,
    use_rslora = False,                      # rank-stabilized LoRA is available but disabled
    loftq_config = None,                     # LoftQ is available but disabled
)

Unsloth 2024.5 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [6]:
# Prompt template with two positional fields: the question, then the response.
alpaca_prompt = (
    "Below is a question that requires a detailed answer. "
    "Provide a response that fully addresses the question.\n"
    "\n"
    "### Question:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

In [7]:
# Baseline generation before fine-tuning.
# The question/response template has two fields, so exactly two arguments are
# passed (a third one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "Hi, I'm Sergie Who r you?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<bos>Below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n\n### Question:\nHi, I'm Sergie Who r you?\n\n### Response:\nHello! My name is Sergie. It's nice to meet you. What can I do for you today?<eos>"]

In [8]:
EOS_TOKEN = tokenizer.eos_token # End-of-sequence token; appended to every formatted training example
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs into training strings.

    Args:
        examples: mapping with parallel sequences under the keys "Question"
            and "Answer" (the batched form passed by `dataset.map`).

    Returns:
        A dict with a single key "text" holding one rendered prompt per pair.
        Each prompt is the template filled with the pair, followed by
        EOS_TOKEN so that generation learns where to stop.
    """
    rendered = [
        alpaca_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(examples["Question"], examples["Answer"])
    ]
    return {"text": rendered}

from datasets import load_dataset

# Fetch the train split of the question/answer dataset and show its raw schema.
dataset = load_dataset("SerchiBoi/DTT-Info", split = "train")
print(dataset)

# Add the rendered "text" column; with batched = True the formatter receives
# columns as lists rather than single rows.
dataset = dataset.map(
    formatting_prompts_func,
    batched = True,
)
dataset

Dataset({
    features: ['Question', 'Answer'],
    num_rows: 502
})


Dataset({
    features: ['Question', 'Answer', 'text'],
    num_rows: 502
})

In [9]:
# Sanity check: print the rendered training text of the second example.
print(dataset[1]["text"])

Below is a question that requires a detailed answer. Provide a response that fully addresses the question.

### Question:
What is DTT used for?

### Response:
The DTT project serves as the link between undergraduate students in the final Practical School of Engineering in Sciences and Systems and the professional world, research, entrepreneurship, and the government. This connection allows students to become acquainted with elements of the job market and technological needs in Guatemala. Additionally, it creates a communication channel for socializing and disseminating research and innovative projects developed in the School of Engineering in Sciences and Systems, through the publication of articles prepared by students on the website and on the designated Fan Page for this project.<eos>


In [10]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimizer / schedule settings for the supervised fine-tuning run.
# NOTE(review): max_steps = 140000 at an effective batch size of 8 over 502
# examples is reported by the training log as 2,259 epochs; the post-training
# generations suggest the model may be heavily overfitted - confirm this is intended.
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    max_steps = 140000,
    learning_rate = 2e-4,
    # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "output-DTT-unsloth-gemma-2b-it-v1",
)

# Trainer over the pre-rendered "text" column of the dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # packing can make training faster for short sequences; disabled here
    args = training_args,
)

In [11]:
#@title Show current memory stats
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

# Snapshot taken before training; the final-stats cell subtracts start_gpu_memory.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3050. Max memory = 7.779 GB.
2.91 GB of memory reserved.


In [12]:
import os
import re

# Resume only when an earlier run left a `checkpoint-<step>` directory in the
# trainer's output directory. Passing resume_from_checkpoint=True unconditionally
# makes a first run (or a run after the output directory was cleaned) fail,
# because there is nothing to resume from.
_output_dir = trainer.args.output_dir
_has_checkpoint = os.path.isdir(_output_dir) and any(
    re.fullmatch(r"checkpoint-\d+", entry) is not None
    and os.path.isdir(os.path.join(_output_dir, entry))
    for entry in os.listdir(_output_dir)
)
trainer_stats = trainer.train(resume_from_checkpoint=_has_checkpoint)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 502 | Num Epochs = 2,259
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 140,000
 "-____-"     Number of trainable parameters = 19,611,648


Step,Training Loss
129001,0.0804
129002,0.0824
129003,0.0816
129004,0.0722
129005,0.0778
129006,0.0788
129007,0.0709
129008,0.0728
129009,0.0808
129010,0.0764


In [13]:
#@title Show final memory and time stats
# Peak reserved memory overall, and the part above the pre-training snapshot.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# The same two figures as a percentage of the total device memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = [
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
print("\n".join(report_lines))

17384.4491 seconds used for training.
289.74 minutes used for training.
Peak reserved memory = 3.721 GB.
Peak reserved memory for training = 0.811 GB.
Peak reserved memory % of max memory = 47.834 %.
Peak reserved memory for training % of max memory = 10.426 %.


In [1]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard 
# Start TensorBoard on the directory given to --logdir (the trainer's output directory).
%tensorboard --logdir="output-DTT-unsloth-gemma-2b-it-v1"

In [15]:
# Ask the fine-tuned model the same greeting used for the pre-training baseline.
# The question/response template has two fields, so exactly two arguments are
# passed (a third one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "Hi, I'm Sergie Who r you?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<bos>Below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n\n### Question:\nHi, I'm Sergie Who r you?\n\n### Response:\nAccording to the final internship regulations and article 12, this Regulation is issued by the Board of Directors of the Faculty of Engineering.<eos>"]

In [16]:
# Ask the fine-tuned model a question about the dataset's subject.
# The question/response template has two fields, so exactly two arguments are
# passed (a third one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "What is DTT?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n\n### Question:\nWhat is DTT?\n\n### Response:\n DTT us the acronym for "Development of Technology Transfer"<eos>']

In [17]:
# Write the model and tokenizer files into the local directory named by new_model.
# The tokenizer call is last so its returned file paths are shown as the cell output.
model.save_pretrained(new_model) # Local saving
tokenizer.save_pretrained(new_model)

('DTT-unsloth-gemma-2b-it-v1/tokenizer_config.json',
 'DTT-unsloth-gemma-2b-it-v1/special_tokens_map.json',
 'DTT-unsloth-gemma-2b-it-v1/tokenizer.model',
 'DTT-unsloth-gemma-2b-it-v1/added_tokens.json',
 'DTT-unsloth-gemma-2b-it-v1/tokenizer.json')

In [18]:
# Toggle: set to False to keep using the model that is already in memory.
RELOAD_SAVED_ADAPTER = True

if RELOAD_SAVED_ADAPTER:
    from unsloth import FastLanguageModel

    # Reload from the local directory written by the save cell.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = new_model,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    # Switch the reloaded model to Unsloth's inference mode.
    FastLanguageModel.for_inference(model)

==((====))==  Unsloth: Fast Gemma patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 3050. Max memory: 7.779 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [19]:
# Query the reloaded model with an in-domain question.
# The question/response template has two fields, so exactly two arguments are
# passed (a third one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "What are the modalities for the final practice?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n\n### Question:\nWhat are the modalities for the final practice?\n\n### Response:\nAccording to the final internship regulations and article 10, the modalities for the final internship are through workshops.<eos>']

In [20]:
# Query the reloaded model with the out-of-domain greeting.
# The question/response template has two fields, so exactly two arguments are
# passed (a third one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "Hi, I'm Sergie Who r you?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["<bos>Below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n\n### Question:\nHi, I'm Sergie Who r you?\n\n### Response:\nAccording to the final internship regulations and article 12, this Regulation is issued by the Board of Directors of the Faculty of Engineering.<eos>"]

In [21]:
# Toggle: set to False to skip the upload.
if True:
    import os

    # Never embed the Hugging Face token in the notebook: read it from the
    # environment instead.
    # NOTE(review): the token that used to be hardcoded in this cell has been
    # exposed and should be revoked in the Hugging Face account settings.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError(
            "Set the HF_TOKEN environment variable to a Hugging Face write token before pushing."
        )

    # save_method = "lora" uploads the LoRA adapters rather than merged weights.
    model.push_to_hub_merged(
        "SerchiBoi/DTT-Chatbot-Piloto-v1",
        tokenizer,
        save_method = "lora",
        token = hf_token,
    )

Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/578 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/78.5M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Saved lora model to https://huggingface.co/SerchiBoi/DTT-Chatbot-Piloto-v1


In [22]:
# Instruction-style template with three positional fields: instruction, input, response.
# Rebinding alpaca_prompt here affects every later cell that formats a prompt.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

In [23]:
# Probe the fine-tuned model with a task outside its training data, using the
# three-field instruction/input/response template.
fibonacci_prompt = alpaca_prompt.format(
    "Continue the fibonnaci sequence.", # instruction
    "1, 1, 2, 3, 5, 8",                 # input
    "",                                 # response, left empty so the model writes it
)
inputs = tokenizer([fibonacci_prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\nAccording to the Behavior Manual document. The sequence begins with the recording of the laboratory, in which the tutor enters the session to answer the laboratory questions.<eos>']

In [24]:
# Question/response template (two positional fields) prefixed with a request to
# politely decline topics outside the model's focus.
alpaca_prompt = (
    "Keep in mind that if they ask you for an instruction or ask you a topic that you don't know about, "
    "politely respond that you cannot answer the topic because it is not within your focus.\n"
    "\n"
    "Having said the above, below is a question that requires a detailed answer. "
    "Provide a response that fully addresses the question.\n"
    "### Question:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)


In [25]:
# Repeat the out-of-domain probe with the guarded question/response template.
# That template has two fields, so exactly two arguments are passed (a third
# one would be silently ignored by str.format).
prompt = alpaca_prompt.format(
    "Can you Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8?", # question
    "", # response - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 1000, use_cache = True)
tokenizer.batch_decode(outputs)

["<bos>Keep in mind that if they ask you for an instruction or ask you a topic that you don't know about, politely respond that you cannot answer the topic because it is not within your focus.\n\nHaving said the above, below is a question that requires a detailed answer. Provide a response that fully addresses the question.\n### Question:\nCan you Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8?\n\n### Response:\nAccording to the Behavior Manual document. Yes, you can continue the sequence with the assumption that the initial question was answered with the response of the forum.<eos>"]