This notebook is based on one of [unsloth' notebooks](https://colab.research.google.com/drive/1tEd1FrOXWMnCU9UIvdYhs61tkxdMuKZu?usp=sharing).

Please use A100.

### Install

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# login huggingface if you want to access models that need authorization like llama
# !huggingface-cli login --token YOUR_TOKEN # unsafe, you'd better load your token from your drive

In [None]:
%%capture
! pip install unsloth
! pip install evaluate
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel, FastVisionModel
import torch
import json
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset
from evaluate import load
import math
from tqdm import tqdm
import re
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
root_path = "/content/drive/MyDrive/Welding_LLM"  # the google drive path you store data and results

pretrain_dataset_root = os.path.join(root_path, "data/pretrain")
sft_dataset_root = os.path.join(root_path, "data/instruct")

pretrain_model_root = os.path.join(root_path, "model/pretrain")
sft_model_root = os.path.join(root_path, "model/sft")

eval_root = os.path.join(root_path, "data/eval")
result_root = os.path.join(root_path, "result")

### Continued Pretrianing

#### Load Model

In [None]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

train_local = False

if not train_local:
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = "unsloth/Mistral-Nemo-Base-2407", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
      # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
else:
  # load from local checkpoint
  local_model = "unsloth--Mistral-Nemo-Base-2407_4bit_pt/checkpoint-413"
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name = os.path.join(pretrain_model_root, local_model), # YOUR MODEL YOU USED FOR TRAINING
      max_seq_length = max_seq_length,
      dtype = dtype,
      load_in_4bit = load_in_4bit,
  )

==((====))==  Unsloth 2024.12.12: Fast Mistral patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/177k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

We also add `embed_tokens` and `lm_head` to allow the model to learn out of distribution data.

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",

                      "embed_tokens", "lm_head",], # Add for continual pretraining
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.12.12 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


<a name="Data"></a>
#### Data Prep


In [None]:
pretrain_dataset_name = "pretrain_books_all.json"
pretrain_dataset_path = os.path.join(pretrain_dataset_root, pretrain_dataset_name)

# Load the JSON file as a Hugging Face dataset
pretrain_dataset = datasets.load_dataset("json", data_files=pretrain_dataset_path)
pretrain_dataset = pretrain_dataset["train"]
# Access the dataset
print(pretrain_dataset)

# last_100_samples for checking correctness
last_100_samples = pretrain_dataset[-100:]
# Create a new Hugging Face dataset
last_100_dataset = Dataset.from_dict(last_100_samples)
print(last_100_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 66078
})
Dataset({
    features: ['text'],
    num_rows: 100
})


In [None]:
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
pretrain_dataset = pretrain_dataset.map(formatting_prompts_func, batched = True,)
last_100_dataset = last_100_dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/66078 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
print(pretrain_dataset)
print(len(pretrain_dataset[0]["text"]), pretrain_dataset[0])
print("=" * 20)
print(last_100_dataset)
print(len(last_100_dataset[1]["text"]), last_100_dataset[1])

Dataset({
    features: ['text'],
    num_rows: 66078
})
420 {'text': 'Electron beam welding produces a fusion weld with heat obtained by impinging a beam composed of high-energy electrons onto the workpiece. Electrons are fundamental particles of matter, characterized by a negative charge and a very small mass. As used in electron beam welding, the electrons are raised to a high-energy state by acceleration to velocities in the range of  $30\\%$   to  $70\\%$   of the speed of light.</s>'}
Dataset({
    features: ['text'],
    num_rows: 100
})
8026 {'text': 'Friction stir welding can be used to join a large variety of wrought and cast magnesium alloys, including AZ (AZ31, AZ61, AZ91), ZK (ZK60), and the AM (AM50, AM60) series. Even though the processing route of the base material during manufacture has a substantial influence on the resulting mechanical properties, the effect on weld ability is negligible.  \n\nPreweld cleaning of the joint and the vicinity of the joint is essential t

We only use 1% of the dataset to speed things up! Use more for longer runs!

<a name="Train"></a>
#### Pretraining
Now let's use Unsloth's `UnslothTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). If you don't want to run a full run, use "max_steps".

Also set `embedding_learning_rate` to be a learning rate at least 2x or 10x smaller than `learning_rate` to make continual pretraining work!

In [None]:
# save checkpoints into google drive, please make sure your drive has enough capacity
pretrain_output_dir = "unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2"
pretrain_output_dir = os.path.join(pretrain_model_root, pretrain_output_dir)

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = pretrain_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 10,
        gradient_accumulation_steps = 16,

        # Use warmup_ratio and num_train_epochs for longer runs!
        # max_steps = 120,
        # warmup_steps = 10,
        warmup_ratio = 0.1,
        num_train_epochs = 2,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = pretrain_output_dir,
        report_to = "wandb", # Use this for WandB etc

        # save checkpoints
        save_strategy = "steps",
        save_steps = 150,
    ),
)

Map (num_proc=8):   0%|          | 0/66078 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train(resume_from_checkpoint = False)
# trainer_stats = trainer.train(resume_from_checkpoint = True)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 66,078 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 10 | Gradient Accumulation steps = 16
\        /    Total batch size = 160 | Total steps = 826
 "-____-"     Number of trainable parameters = 1,798,307,840
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,1.9939
2,1.9652
3,1.9728
4,1.9857
5,1.9374
6,1.9783
7,1.9431
8,1.9488
9,1.9429
10,1.8907


#### Inference

In [None]:
if True:
    from unsloth import FastLanguageModel
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-600", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

if False:
  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Mistral-Nemo-Base-2407", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    # model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference



inputs = tokenizer(
[
        "Open-joint hot pressure welding machines must provide more accurate alignment and be ruggedly constructed to ", # instruction
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer,
                   max_new_tokens = 128,
                  #  repetition_penalty = 0.1,
                   )

==((====))==  Unsloth 2024.12.12: Fast Mistral patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<s>Open-joint hot pressure welding machines must provide more accurate alignment and be ruggedly constructed to  withstand the high thrust forces.  

The most common heating head is a flat, multipleflame burner, such as the one shown in Figure 15.12, which produces a uniform flame pattern conforming to the  

Figure 15.12—Torch and Typical Setup for Open-Joint Hot Pressure Welding  

proﬁle of the joint. The heating torch is equipped with a rack-mounted fuel valve and adjustable ﬂame strikers. A natural gas or propane mixture is the most commonly used fuel. However, ofﬁ-peak cylinder manifolds limited to


#### Evaluate

Evaluate performance of pretrained model using perplexity

In [None]:
perplexity = load("perplexity", module_type="metric")

In [None]:
# load pretraining data to eval PPL
pretrain_dataset_name = "pretrain_books_all.json"
pretrain_dataset_path = os.path.join(pretrain_dataset_root, pretrain_dataset_name)
with open(pretrain_dataset_path, "r") as f:
  ppl_data = json.load(f)
predictions = []
for text in ppl_data:
  if len(text["text"]) > 8000:
    predictions.append(text["text"])
print(len(predictions), len(predictions[0]))


7691 8074


In [None]:
# models to evaluate
model_ids = [
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-150",
    "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-300",
    "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-450",
    "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-600"
]

output_dir = os.path.join(result_root, "result")

results = []
for model_id in model_ids:
  result = perplexity.compute(predictions=predictions[:100], model_id=model_id)
  mean_ppl = result["mean_perplexity"]
  print(mean_ppl)
  results.append((model_id, mean_ppl))

  with open(output_dir, "a") as f:
    f.write(f"{model_id}: {mean_ppl}\n")

for result in results:
  print(result)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


7.7408984375


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 7.75 GiB. GPU 0 has a total capacity of 39.56 GiB of which 3.80 GiB is free. Process 114245 has 35.76 GiB memory in use. Of the allocated memory 35.20 GiB is allocated by PyTorch, and 53.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Instruction Finetuning

#### Load checkpoint (optional)

In [None]:
# train from local checkpoints
if True:
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Welding_LLM/model/pretrain/unsloth--Mistral-Nemo-Base-2407_4bit_pt_v2/checkpoint-600", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

==((====))==  Unsloth 2024.12.4: Fast Mistral patching. Transformers:4.47.0.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.31G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

Unsloth 2024.12.4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


#### Data Prep

In [None]:
from datasets import load_dataset
alpaca_dataset = load_dataset("yahma/alpaca-cleaned", split = "train")

custom_dataset_name = "sft_all_alpaca_format.json"
custom_dataset_path = os.path.join(sft_dataset_root, custom_dataset_name)

if False:
  # only use custom dataset
  # Load the JSON file as a Hugging Face dataset
  alpaca_dataset = datasets.load_dataset("json", data_files=custom_dataset_path)
  alpaca_dataset = alpaca_dataset["train"]
  # Access the dataset
  print(alpaca_dataset)

if True:
  # use custom & alpaca dataset
  # Load the JSON file as a Hugging Face dataset
  custom_dataset = datasets.load_dataset("json", data_files=custom_dataset_path)
  custom_dataset = custom_dataset["train"]

  alpaca_dataset = datasets.concatenate_datasets([alpaca_dataset, custom_dataset])
  print(alpaca_dataset)

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 76440
})


We print 1 example:

In [None]:
print(alpaca_dataset[-1])

{'output': 'GTAW is ideal for welding thin base metals, root passes in inaccessible joints, and avoiding flux residues.', 'input': '', 'instruction': 'What are suitable applications for gas tungsten arc welding?'}


In [None]:
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(qa_pairs):
    instruction = qa_pairs["instruction"]
    user_input = qa_pairs["input"] if qa_pairs["input"] != "" else "None"
    output = qa_pairs["output"]
    text = alpaca_prompt.format(instruction, user_input, output) + EOS_TOKEN
    return { "text" : text, }

alpaca_dataset = alpaca_dataset.map(formatting_prompts_func)
print(alpaca_dataset[0]["text"])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Give three tips for staying healthy.

### Input:
None

### Response:
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.</s>


#### Finetuning

We again employ `UnslothTrainer` and do instruction finetuning!

In [None]:
# save checkpoints into google drive, please make sure your drive has enough capacity
sft_output_dir = "unsloth--Mistral-Nemo-Base-2407_4bit_sft_wo_bookpretrain"
sft_output_dir = os.path.join(sft_model_root, sft_output_dir)

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = alpaca_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 8,

    args = UnslothTrainingArguments(
        per_device_train_batch_size = 12,
        gradient_accumulation_steps = 16,

        # Use num_train_epochs and warmup_ratio for longer runs!
        # max_steps = 120,
        # warmup_steps = 10,
        warmup_ratio = 0.1,
        num_train_epochs = 1,

        # Select a 2 to 10x smaller learning rate for the embedding matrices!
        learning_rate = 5e-5,
        embedding_learning_rate = 1e-5,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = sft_output_dir,
        report_to = "wandb", # Use this for WandB etc
        save_strategy = "steps",
        save_steps = 400,
    ),
)

In [None]:
trainer_stats = trainer.train(resume_from_checkpoint = False)
# trainer_stats = trainer.train(resume_from_checkpoint=True)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 76,440 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 12 | Gradient Accumulation steps = 16
\        /    Total batch size = 192 | Total steps = 398
 "-____-"     Number of trainable parameters = 1,798,307,840
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,1.8291
2,1.7997
3,1.7342
4,1.7871
5,1.6036
6,1.5223
7,1.3447
8,1.3
9,1.2115
10,1.1802


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'start_gpu_memory' is not defined

#### Inference

In [None]:

if False:
    from unsloth import FastLanguageModel
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Welding_LLM/model/sft/unsloth--Mistral-Nemo-Base-2407_4bit_sft_alpaca&bookqa_v2/checkpoint-400", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

if False:
  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "mistralai/Mistral-Nemo-Instruct-2407", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    # model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference



# alpaca_prompt = Copied from above
prompt = "Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"
prompt = "What is the advantage of using corner joints in laser beam welding? Explain it using one sentence."
prompt = "What effects does sulfur have on welds?"

# question = "When gas welding non-ferrous metals, () is not the function of the flux."
# options = "\n".join([
#             "A. Improving the fluidity of the molten metal",
#             "B. Removing oxides from the surface of the workpiece",
#             "C. Introducing alloying elements into the weld",
#             "D. Providing some protection to the molten pool metal"
#         ])
# prompt = f"Question: {question}\n Options: {options}\n Select one option and only output one letter like 'A', 'B', 'C' and etc. Don't output other text."
# prompt = f"Ture or False?\n{question}\n"
inputs = tokenizer(
[
    alpaca_prompt.format(
        prompt, # instruction
        "",
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)
tokenizer.batch_decode(outputs)

['<s>Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat effects does sulfur have on welds?\n\n### Input:\n\n\n### Response:\nSulfur can cause hot cracking in welds, particularly in the heat-affected zone, due to its segregation during solidification.</s>']

#### Evaluate

In [None]:
# templates and functions

alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

llama_template = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""


def build_prompt(question, options, multiple_choice=False, template="alpaca"):
    options_str = "\n".join(options)
    choices_prompt = f"There is only one correct option. Please select one option." \
                        if not multiple_choice else \
                    f"There may be one or multiple correct options. Please select all options you think right."
    constraint_prompt = f"Only output the letter of the correct option(s). For example, 'A', 'B', 'C', 'D', etc." \
                        f"If there are multiple correct options, please output all of them without separators. For example, 'AB', 'BCD', 'ACBE', etc."

    final_prompt = f"Question: {question}\nOptions:\n{options_str}\n\n{choices_prompt}\n{constraint_prompt}"

    if template == "llama":
      prompt = llama_template.format(final_prompt)
    else:
      prompt = alpaca_prompt.format(
          final_prompt, # instruction
          "",
          "", # output - leave this blank for generation!
      )


    return prompt

def batch_infer_model(model, tokenizer, prompts, max_new_tokens=250, template="aplaca"):
  inputs = tokenizer(prompts, return_tensors = "pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens = max_new_tokens)
  responses = tokenizer.batch_decode(outputs)
  if template == "llama":
    responses = [r.split("<|end_header_id|>")[-1].strip().rstrip(tokenizer.eos_token) for r in responses]
  else:
    responses = [r.split("### Response:")[1].strip().rstrip(tokenizer.eos_token) for r in responses]
  return responses


def extract_answer(responses: str):
  pattern = r"(?<=\\n)|(?<=\b)?[A-K](?:, *[A-K])*"
  matches = re.findall(pattern, responses)
  target = matches[0] if matches else "<NOT_FOUND>"
  pattern = r",? *"
  target = re.sub(pattern, "", target)
  target = target.upper()
  return target


def check_correctness(predicts, answers):
    res = {
        "correct": 0,
        "correct_type": {
            "single-choice": 0,
            "multi-choice": 0
        },
        "total": len(predicts),
        "accuracy": 0.0,
        "correct_indices": [],
        "wrong_format_indices": []
    }

    for i, (p, a) in enumerate(zip(predicts, answers)):
        _p = [c.upper() for c in p if c.isalpha()]
        if len(_p) != len(p):
            res["wrong_format_indices"].append(i)
            continue

        _p = set(_p)
        _a = set([c.upper() for c in a if c.isalpha()])
        if _p == _a:
                res["correct"] += 1
                res["correct_type"]["single-choice" if len(_a) == 1 else "multi-choice"] += 1
                res["correct_indices"].append(i)

    res["accuracy"] = res["correct"] / res["total"]
    return res

def eval(model, tokenizer, eval_data_path, eval_output_path, template="alpaca"):
    with open(eval_data_path, "r") as f:
        eval_data = json.load(f)

    N = len(eval_data)
    batch_size = 1
    batchs = math.ceil(N / batch_size)
    print(f"Total {N} questions, {batchs} batchs with batch size {batch_size}")

    output_data = {
        "eval_data": [],
        "eval_metrics": {
            "correct": 0,
            "correct_type": {
                "single-choice": 0,
                "multi-choice": 0
            },
            "total": N,
            "type_total": {
                "single-choice": 0,
                "multi-choice": 0
            },
            "accuracy": 0.0,
            "wrong_format_n": 0,
            "wrong_format_indices": []
        },
    }
    eval_metrics = output_data["eval_metrics"]

    for i in tqdm(range(batchs)):
        batched_data = eval_data[i * batch_size: (i + 1) * batch_size]

        questions = [d["question"] for d in batched_data]
        options = [d["options"] for d in batched_data]
        answers = [d["answer"] for d in batched_data]

        prompts = [build_prompt(q, o, multiple_choice=(True if len(answers[_i]) > 1 else False), template=template) for _i, (q, o) in enumerate(zip(questions, options))]

        predicts = batch_infer_model(model,tokenizer, prompts, max_new_tokens=250,template=template)
        predicts = [extract_answer(p) for p in predicts]
        for _i, p in enumerate(predicts):
          eval_data[i * batch_size + _i]["predict"] = p
        # predicts = answers

        eval_res = check_correctness(predicts, answers)

        eval_metrics["correct"] += eval_res["correct"]
        eval_metrics["correct_type"]["single-choice"] += eval_res["correct_type"]["single-choice"]
        eval_metrics["correct_type"]["multi-choice"] += eval_res["correct_type"]["multi-choice"]
        eval_metrics["type_total"]["single-choice"] += sum([1 if (len(_a) == 1) else 0 for _a in answers])
        eval_metrics["type_total"]["multi-choice"] += sum([1 if (len(_a) > 1) else 0 for _a in answers])
        eval_metrics["wrong_format_n"] += len(eval_res["wrong_format_indices"])
        eval_metrics["wrong_format_indices"].extend([_idx + i * batch_size for _idx in eval_res["wrong_format_indices"]])

    eval_metrics["accuracy"] = eval_metrics["correct"] / eval_metrics["total"]
    print(eval_metrics)

    output_data["eval_data"] = eval_data

    with open(eval_output_path, "w") as f:
        json.dump(output_data, f, indent=4, ensure_ascii=False)

In [None]:
# evaluate

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.
if False:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Welding_LLM/model/sft/unsloth--Mistral-Nemo-Base-2407_4bit_sft_alpaca&bookqa_v2/checkpoint-400", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    model_name = "mistral_nemo_ft_v2_400"
    template = "alpaca"

if False:
  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "mistralai/Mistral-Nemo-Instruct-2407", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    # model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  model_name = "mistral_nemo_instruct"
  template = "alpaca"

if False:
  model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Llama-3.1-8B-Instruct", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    # model_name = "unsloth/mistral-7b-v0.3",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
  )
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  model_name = "llama3.1_8B_instruct"
  template = "llama"

eval_name = "exams.json"
eval_data = os.path.join(eval_root, eval_name)
eval_output_path = os.path.join(result_root, f"eval_result_{model_name}_{eval_name}.json")  # output to result dir

eval(model, tokenizer, eval_data_path=eval_data, eval_output_path=eval_output_path, template=template)

==((====))==  Unsloth 2024.12.12: Fast Qwen2 patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/7.51k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

AttributeError: Qwen2TokenizerFast has no attribute tokenizer

<a name="Save"></a>
### Saving, loading finetuned models (optional)
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [None]:
if False:
    from unsloth import FastLanguageModel
    max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/Welding_LLM/model/pretrain/checkpoint-413", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "Describe the planet Earth extensively.", # instruction
        # "지구를 광범위하게 설명하세요.",
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")


from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   repetition_penalty = 0.1)

By using https://translate.google.com/ we get
```
Earth refers to all things including natural disasters such as local derailment

and local depletion that occur in one space along with the suppression of water, gases, and living things.

Most of the Earth's water comes from oceans, atmospheric water, underground water layers, and rivers and rivers.
```

Yikes the language model is a bit whacky! Change the temperature and using sampling will definitely make the output much better!

You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM (optional)

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

### GGUF / llama.cpp Conversion (optional)
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. Gemma 6 trillion tokens is 2.5x faster! [free Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>

In [None]:
from google.colab import runtime

runtime.unassign()

# Huggingface Code (optional)

Just a playground

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_id = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", load_in_4bit=False
)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [None]:
model_name = "llama3.1_8B_instruct"
template = "llama"

eval_data = "/content/drive/MyDrive/Welding_LLM/data/eval/exams.json"
# eval_data = "/content/drive/MyDrive/Welding_LLM/data/eval/vol1_eval_corrected_choice.json"
eval_name = os.path.splitext(os.path.split(eval_data)[1])[0]
eval_output_path = f"eval_result_{model_name}_{eval_name}.json"
eval(model, tokenizer, eval_data_path=eval_data, eval_output_path=eval_output_path, template=template)

Total 810 questions, 810 batchs with batch size 1


  1%|          | 5/810 [00:31<1:25:40,  6.39s/it]


KeyboardInterrupt: 