ref: https://github.com/unslothai/unsloth

In [None]:
%%capture
# Installs Unsloth plus the training libraries this notebook uses.
# The %%capture cell magic above suppresses the (long) pip output.
import torch
# CUDA compute capability of the current GPU as (major, minor); only the major
# number is used below to choose which package set to install.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    # Compared with the branch below, this one additionally installs
    # packaging, ninja, einops and flash-attn.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# `pass` is a no-op used here only as a visual end-of-block marker.
pass

In [None]:
# Mount Google Drive at /content/drive so that files can be written under it
# later in the notebook; force_remount=True is passed so that an existing mount
# is replaced rather than reused.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
from unsloth import FastLanguageModel
import torch
# ---- Loader settings (also reused by later cells) ----
max_seq_length = 8192  # Choose any!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# This list is informational only; the checkpoint actually loaded is the one
# named in `load_kwargs` below.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
]

# Collect the loader arguments in one place, then load model and tokenizer.
# For gated models like meta-llama/Llama-2-7b-hf, add: token = "hf_..."
load_kwargs = dict(
    model_name="unsloth/mistral-7b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

In [None]:
# LoRA adapter configuration, gathered in one dict so the settings can be read
# (and tweaked) at a glance before they are applied to the base model.
lora_settings = dict(
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,   # support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

# Rebind `model` to the PEFT-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

In [None]:
# Prompt template used for both training and inference. It has three NAMED
# placeholders -- {context}, {question} and {answer} -- so it must be filled
# with keyword arguments: alpaca_prompt.format(context=..., question=..., answer=...).
# For generation, pass answer="" so the prompt ends right after "### Response:".
# NOTE(review): unlike the other sections there is no blank line between the
# END QUESTION marker and "### Response:" -- confirm this is intentional.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the question based on context, and answer that you don't know the answer if not present in the context or there is no context. Never provide an answer that is not based on the context, even if it is a well known fact.

### Input:
<---- CONTEXT ---->
{context}
<---- END CONTEXT ---->

<---- QUESTION ---->
{question}
<---- END QUESTION ---->
### Response:
{answer}"""

# End-of-sequence marker taken from the tokenizer; it is appended to every
# training example by formatting_prompts_func below.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into complete training prompts.

    ``examples`` is a column-oriented batch with parallel sequences under the
    keys "question", "context" and "answer". Each row is substituted into the
    module-level ``alpaca_prompt`` template and terminated with ``EOS_TOKEN``.

    Returns a dict with a single "text" key holding one prompt per row.
    """
    rows = zip(examples["question"], examples["context"], examples["answer"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    return {
        "text": [
            alpaca_prompt.format(question=q, context=c, answer=a) + EOS_TOKEN
            for q, c, a in rows
        ]
    }

import json
from datasets import Dataset

file_path = '/content/sample_data/general_dataset.jsonl'
MAX_TRAIN_EXAMPLES = 1000  # upper bound on the number of rows used for training


def _read_jsonl(path):
    """Return the JSON objects stored one per line in `path`.

    Blank lines (e.g. a trailing newline at the end of the file) are skipped
    instead of being handed to json.loads, which would raise on them.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]


records = _read_jsonl(file_path)
if not records:
    # Fail with a clear message instead of an IndexError on records[0] below.
    raise ValueError(f"No records found in {file_path}")

# Convert the list of dictionaries to a Hugging Face datasets.Dataset object.
# Column names are taken from the first record; every record must provide them.
columns = {key: [record[key] for record in records] for key in records[0]}
dataset_hf = Dataset.from_dict(columns)
print(dataset_hf)

# Use at most MAX_TRAIN_EXAMPLES rows. select() rejects indices beyond the end
# of the dataset, so clamp the range for files with fewer rows.
subset_size = min(MAX_TRAIN_EXAMPLES, len(dataset_hf))
dataset = dataset_hf.select(range(subset_size)).map(formatting_prompts_func, batched = True)

In [None]:
# Canned "I don't know" answers; an example whose answer is exactly one of
# these strings is counted as non-answerable.
idk_list = ["I'm sorry, I don't have that information in the course documents",
            "I apologize, but that information isn't included in the course materials.",
            "Sorry, that information is not contained in the course materials.",
            "That information isn't included in the course materials, I'm sorry.",
            "I'm sorry, but I could not find that information in the course documents."]


def _bucket_for(example):
    """Return the name of the tally bucket that `example` falls into."""
    answer = example['answer']
    if answer in idk_list:
        # Non-answerable: distinguish examples with and without a context.
        return 'non_answer_notext' if example['context'] == '' else 'non_answer_text'
    if answer == "True":
        return 'boolq_true'
    if answer == "False":
        return 'boolq_false'
    return 'answer'


# Count how many examples land in each bucket.
tally = dict.fromkeys(
    ('answer', 'non_answer_text', 'non_answer_notext', 'boolq_true', 'boolq_false'),
    0,
)
for example in dataset:
    tally[_bucket_for(example)] += 1

# Expose the totals under the names the plotting code reads.
answer_num = tally['answer']
non_answer_num_text = tally['non_answer_text']
non_answer_num_notext = tally['non_answer_notext']
boolq_true_num = tally['boolq_true']
boolq_false_num = tally['boolq_false']

import matplotlib.pyplot as plt

# Bar chart of how many examples fall into each answer category.
categories = ['Answerable', 'Non-answerable-with-text', 'Non-answerable-without-text', 'True', 'False']
counts = [answer_num, non_answer_num_text, non_answer_num_notext, boolq_true_num, boolq_false_num]

# One colour per category. The previous list had four colours for five bars,
# so the last bar ('False') reused the first colour and looked like 'Answerable'.
bar_colors = ['blue', 'red', 'yellow', 'green', 'purple']

# NOTE(review): the title mentions only answerable/non-answerable questions in
# SQuAD 2.0, while the chart also shows True/False categories -- confirm the
# title still describes the data being plotted.
plt.figure(figsize=(10, 6))
plt.bar(categories, counts, color=bar_colors)
plt.title('Comparison of Answerable and Non-answerable Questions in SQuAD 2.0')
plt.xlabel('Category')
plt.ylabel('Number of Questions')
plt.show()

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the mixed-precision mode once: bf16 when the GPU supports it,
# otherwise fp16 (exactly one of the two flags ends up True).
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation / logging hyper-parameters for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer over the "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [None]:
#@title Show current memory stats
# Snapshot GPU memory before training; `start_gpu_memory` and `max_memory`
# are read again by the final-stats cell.
bytes_per_gb = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

for stats_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(stats_line)

In [None]:
# Run fine-tuning. The returned object is kept because its `metrics`
# (e.g. 'train_runtime') are reported in the final-stats cell.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
# Peak reserved GPU memory after training, compared with the pre-training
# snapshot (`start_gpu_memory`) and the card's total memory (`max_memory`).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
report_lines = (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

In [None]:
# Batch inference on a single prompt built from the `alpaca_prompt` template
# defined earlier in the notebook.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# The template uses NAMED placeholders ({context}, {question}, {answer}), so
# the fields must be passed by keyword; positional arguments raise KeyError.
prompt = alpaca_prompt.format(
    context="",   # supporting context for the question
    question="",  # the question to answer
    answer="",    # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# Last expression of the cell: the decoded text is shown as the cell output.
tokenizer.batch_decode(outputs)

In [None]:
# Streaming inference: same prompt template as above (`alpaca_prompt`), but
# tokens are printed as they are generated instead of being decoded at the end.
from transformers import TextStreamer

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the template; the answer slot stays empty so the model generates it.
prompt_text = alpaca_prompt.format(context="", question="", answer="")
inputs = tokenizer([prompt_text], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

In [None]:
# Save the fine-tuned model into the "lora_model" directory of the current
# working directory.
# NOTE(review): presumably this writes only the LoRA adapter weights, since
# `model` is the PEFT-wrapped model -- confirm before relying on it.
model.save_pretrained("lora_model") # Local saving
# model.push_to_hub("lora_model", token = "...") # Online saving

In [None]:
# Save a second copy under the Google Drive mount point (/content/drive), so
# the output is not limited to the local "lora_model" directory.
model_path = "/content/drive/My Drive/lora_model"
model.save_pretrained(model_path) # Saving to the mounted Google Drive
