In [4]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate 
#bitsandbytes

In [None]:
!pip install C:/triton-2.1.0-cp310-cp310-win_amd64.whl
!pip install C:/bitsandbytes-0.43.0.dev0-cp310-cp310-win_amd64.whl
!pip install C:/deepspeed-0.13.1+unknown-py3-none-any.whl

In [None]:
!pip freeze

In [7]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import tqdm as notebook_tqdm

max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/tinyllama", # "unsloth/tinyllama-bnb-4bit" for 16bit loading
    model_name = "unsloth/tinyllama-bnb-4bit", #for 16bit loading
    # model_name = "unsloth/Qwen2-0.5B-Instruct",
    # model_name = "unsloth/Qwen2-1.5B-Instruct",

    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3060 Ti. Max memory: 7.999 GB. Platform = Windows.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: unsloth/tinyllama-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!
  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [8]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 2,#32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 4,#32,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = True, # @@@ IF YOU GET OUT OF MEMORY - set to True @@@
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


In [18]:
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
  full_text = []
  if not isinstance(examples['prompt'], list):
    # print("Non lista:", len(examples['prompt']))
    full_text = [f"{examples['prompt']} {examples['completion'].strip()}{EOS_TOKEN}"]
  else:
    # print(len(examples['prompt']))
    for i in range(len(examples['prompt'])):
      full_text.append(f"{examples['prompt'][i]} {examples['completion'][i].strip()}{EOS_TOKEN}")
  return full_text


from datasets import load_dataset
dataset = load_dataset("Paoloc99/dataset", split="train[:40000]")
eval_dataset = load_dataset("Paoloc99/dataset", split="train[-1000:]")


response_template_with_context = "\nAnswer:"  # We added context here: "\n". This is enough for this tokenizer
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[22550, 29901]`
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

In [4]:
import re
import os
import datetime

now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d")

checkpoint_dir = 'checkpoints'
output_dir = f'checkpoints/{timestamp}'

def get_latest_checkpoint_dir(base_dir):
    # Ottieni tutte le sottocartelle con formato di timestamp
    timestamp_dirs = [d for d in os.listdir(base_dir) if re.match(r'\d{4}-\d{2}-\d{2}', d)]

    # Ordina le directory per timestamp
    timestamp_dirs.sort(key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"), reverse=True)

    # Prendi la directory con il timestamp più recente
    latest_timestamp_dir = timestamp_dirs[0] if timestamp_dirs else None
    return os.path.join(base_dir, latest_timestamp_dir) if latest_timestamp_dir else None

latest_timestamp_dir = get_latest_checkpoint_dir(checkpoint_dir)

def get_latest_checkpoint(checkpoint_dir):
    if checkpoint_dir is None:
        return None

    # Ottieni tutte le sottocartelle con formato "checkpoint-{numero}"
    checkpoint_dirs = [d for d in os.listdir(checkpoint_dir) if re.match(r'checkpoint-\d+', d)]

    # Estrai i numeri dai nomi delle cartelle e ordina in base al numero
    checkpoint_dirs.sort(key=lambda x: int(re.search(r'\d+', x).group()), reverse=True)

    # Prendi la directory con il numero più grande
    latest_checkpoint_dir = checkpoint_dirs[0] if checkpoint_dirs else None
    return os.path.join(checkpoint_dir, latest_checkpoint_dir) if latest_checkpoint_dir else None

checkpoint_path = get_latest_checkpoint(latest_timestamp_dir)
# print(checkpoint_path)
if checkpoint_path and not os.path.exists(checkpoint_path):
    print(f"Checkpoint {checkpoint_path} does not exist.")
else:
    print(f"Checkpoint {checkpoint_path} found.")

Checkpoint None found.


In [2]:
!wandb login 372f5c298afc4be9b40dd7b97523d394c3d30d05

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\paolo\_netrc


In [5]:
import wandb
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="lost-in-the-middle"
# set the wandb project where this run will be logged
os.environ["WANDB_NOTEBOOK_NAME "]="lost-in-the-middle"
# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"
# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"
wandb.login()

True

In [22]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    # eval_dataset = val_dataset,
    #dataset_text_field = "text",
    max_seq_length = max_seq_length,
    # dataset_num_proc = 2,
    packing = False, # Packs short sequences together to save time!
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    args = TrainingArguments(
        report_to="wandb",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,
        learning_rate = 2e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.1,
        lr_scheduler_type = "cosine",#"linear",
        seed = 3407,
        output_dir = output_dir,
        # eval_steps=100,
        # eval_strategy="steps",
        logging_steps=5,
        log_level='debug', #'info',
        save_steps=100,
        save_total_limit=5,
        # resume_from_checkpoint=checkpoint_path
    ),
)

PyTorch: setting up devices

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 40000/40000 [00:49<00:00, 803.84 examples/s]
Setting `WANDB_LOG_MODEL` from true to `end` instead
Using auto half precision backend


In [23]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3060 Ti. Max memory = 7.999 GB.
7.273 GB of memory reserved.


In [24]:
trainer_stats = trainer.train(resume_from_checkpoint=checkpoint_path)

  0%|          | 0/468 [04:16<?, ?it/s]
Currently training with a batch size of: 8
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 1,250
 "-____-"     Number of trainable parameters = 1,576,960
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
                                                   
  0%|          | 5/1250 [03:30<13:46:28, 39.83s/it]

{'loss': 3.5834, 'grad_norm': 4.28305721282959, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.0}


                                                    
  1%|          | 10/1250 [06:13<12:28:03, 36.20s/it]

{'loss': 3.4092, 'grad_norm': 4.039161205291748, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.01}


  1%|          | 12/1250 [07:04<10:30:58, 30.58s/it]

KeyboardInterrupt: 

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
index = 3
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    eval_dataset[index]["prompt"]
], return_tensors = "pt").to("cuda")

# print(eval_dataset[2]["prompt"])
outputs = model.generate(**inputs, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])
print("-------------")
print("Risposta esatta: ", eval_dataset[index]["completion"])