# Packages

In [1]:
%%capture
!pip install numpy==1.26.4
!pip install install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
!pip install transformer
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate
!pip install -U bitsandbytes
!pip install wandb


In [2]:
import torch

# Collect the toolchain versions this runtime provides, then print one line each.
pytorch_version = torch.__version__
cuda_version = torch.version.cuda
# Compute capability is returned as a (major, minor) pair.
major_version, minor_version = torch.cuda.get_device_capability()

for report_line in (
    f"CUDA version: {cuda_version}",
    f"PyTorch version: {pytorch_version}",
    f"CUDA Device compute capability: {major_version}.{minor_version}",
):
    print(report_line)

CUDA version: 12.1
PyTorch version: 2.3.1+cu121
CUDA Device compute capability: 8.0


In [3]:
import os

# Set TOKENIZERS_PARALLELISM to "false" up front, before any tokenizer is loaded.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [4]:
# wandb: authenticate so the training run can be logged.
import getpass
import os

import wandb

# Never store the API key in the notebook itself: read it from the
# WANDB_API_KEY environment variable and fall back to an interactive prompt
# (the prompt does not echo the key).
APIKEY = os.environ.get("WANDB_API_KEY") or getpass.getpass("wandb API key: ")
wandb.login(key=APIKEY, relogin=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc


True

# Load the model

In [5]:
# Model selection: the Hub repository to load and the directory used as its
# local download cache.
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct"
cache_dir = "cache_dir"

In [6]:
from unsloth import FastLanguageModel
import torch

# Loading options. They stay module-level names; max_seq_length is reused when
# the trainer is built.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.

# Gather the keyword arguments first so the call itself stays short.
load_options = dict(
    model_name = model_id,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    cache_dir = cache_dir,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Returns the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/915 [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.394 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

# Checks

In [7]:
# Sanity check: list every parameter whose device type is "meta".
# No output means no parameter is on the meta device.
meta_param_names = (
    name
    for name, param in model.named_parameters()
    if param.device.type == "meta"
)
for name in meta_param_names:
    print(f"{name} is on meta")

In [8]:
# Show which side the loaded tokenizer pads on.
print(tokenizer.padding_side)

left


In [9]:
# Print the tokenizer's special tokens in order: beginning-of-sequence,
# end-of-sequence, padding.
for token_attr in ("bos_token", "eos_token", "pad_token"):
    print(getattr(tokenizer, token_attr))

<|begin_of_text|>
<|eot_id|>
<|finetune_right_pad_id|>


In [10]:
# tokenizer.padding_side = 'right'

In [11]:
# Dump the loaded model's configuration for inspection.
print(model.config)

LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "unsloth_version": "2024.7",
  "use_cache": true,
  "vocab_size": 128256
}



In [12]:
# Dump the default generation settings attached to the model.
print(model.generation_config)

GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}



# LoRA

In [13]:
# Modules named as adapter targets. Besides the attention and MLP projections
# this also lists embed_tokens and lm_head.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "embed_tokens", "lm_head",
]

# NOTE: this rebinds `model` to the PEFT-wrapped model, so the cell is meant to
# run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = lora_target_modules,
    lora_alpha = 32,
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = True,  # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    # modules_to_save = [], # And LoftQ
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


In [14]:
# Report how many parameters are trainable after wrapping the model with PEFT.
model.print_trainable_parameters()

trainable params: 1,092,616,192 || all params: 9,122,877,440 || trainable%: 11.9767


# Load the data

In [15]:
import os

from datasets import load_dataset

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of writing it into the notebook. When the variable is unset, None is
# passed and the library falls back to its default credential lookup
# (presumably a cached `huggingface-cli login` — confirm for your datasets version).
dataset = load_dataset(
    "Rajeeb321/main_dataset_qwen-pretrain-2",
    split = "train",
    token = os.environ.get("HF_TOKEN"),
    cache_dir = "cache_dir_dataset",
)

Downloading readme:   0%|          | 0.00/290 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/182M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/182M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/182M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/182M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1202150 [00:00<?, ? examples/s]

# Train the model

In [16]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Mixed precision: bf16 when the GPU supports it, otherwise fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings. Samples per optimizer step on one device
# = per_device_train_batch_size * gradient_accumulation_steps = 2 * 16 = 32.
training_args = UnslothTrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 16,
    num_train_epochs = 1,
    warmup_ratio = 0.03,
    # max_steps = 60,
    learning_rate = 5e-5,
    embedding_learning_rate = 1e-5,  # separate learning rate passed for the embeddings
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_torch",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "llama3.1_pretrain_0",
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs = {"use_reentrant":True},
)

# The trainer reads the "text" column of the dataset, with packing enabled.
trainer = UnslothTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = True,  # Can make training 5x faster for short sequences.
    args = training_args,
    data_collator = None,
)

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

### GPU check

In [17]:
# Bytes in one GiB; dividing by it converts the raw byte counts below.
BYTES_PER_GIB = 1024 ** 3

# Properties of device 0 and the peak reserved memory so far, both in GiB
# rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.394 GB.
21.756 GB of memory reserved.


### Training

In [None]:
# Start the training run; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 620,912 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 16
\        /    Total batch size = 32 | Total steps = 19,403
 "-____-"     Number of trainable parameters = 1,092,616,192


Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for embed_tokens.
Unsloth: Setting lr = 1.00e-05 instead of 5.00e-05 for lm_head.


[34m[1mwandb[0m: Currently logged in as: [33mdonrajeep321[0m ([33mrajeeb[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.815
2,1.7725
3,1.7675
4,1.8227
5,1.7768
6,1.8028
7,1.8033
8,1.8221
9,1.7926
10,1.7613


# Save the model

In [23]:
# NOTE(review): looks like a leftover debug cell; it only prints a fixed string.
print("hello")

hello


In [None]:
# model.push_to_hub("Rajeeb321/pretrain_llama3.1_0_lora_model", token = "hf_") # Online saving
# tokenizer.push_to_hub("Rajeeb321/pretrain_llama3.1_0_lora_model", token = "hf_") # Online saving

### 16-bit merged

In [None]:
# Merge to 16bit
# model.push_to_hub_merged("Rajeeb321/pretrain_llama3.1_0", tokenizer, save_method = "merged_16bit", token = "hf_")

In [24]:
# Print the module tree of the PEFT-wrapped model to inspect where adapters were attached.
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): ModulesToSaveWrapper(
          (original_module): Embedding(128256, 4096)
          (modules_to_save): ModuleDict(
            (default): Embedding(128256, 4096)
          )
        )
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                

# Inference

In [57]:
# Alpaca-style prompt template with three positional `{}` slots, filled via
# str.format in this order: instruction, input, response. The response slot is
# left empty when the template is used for generation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response in Nepali language that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [59]:
from transformers import TextStreamer

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "Write a News article about Balen shah",  # instruction
    "",  # input
    "",  # output - leave this blank for generation!
)

# Tokenize a batch of one prompt and move the tensors to the GPU.
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Pass a TextStreamer so generated text is printed as it is produced; the
# return value of generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response in Nepali language that appropriately completes the request.

### Instruction:
Write a News article about Balen shah

### Input:


### Response:
बलेन शाह: काठमाडौंमा स्थानीय निर्वाचनको परिणाम

काठमाडौं, २०७८ मंसिर २६ गते

काठमाडौं महानगरपालिकाको प्रमुखमा बलेन शाह निर्वाचित भएका छन्। सोमबार भएको मतगणनाको अन्तिम परिणाम अनुसार शाहले २८ हजार ८ सय ७४ मत प्राप्त गर्दै विजयी भएका हुन्। उनका निकटतम प्रतिद्वन्द्वी कांग्रेसका राजुराज जोशीले २४ हजार ७ सय ८३ मत प्राप्त गरेका छन्।

नेकपा एमालेका तर्फबाट उम्मेदवार बनेका शाहले ४३.४६ प्रतिशत मत प्राप्त गरेका छन् भने कांग्रेसका जोशीले ३६.३९ प्रतिशत मत प्राप्त गरेका छन्। यसैगरी नेकपा माओवादी केन्द्रका तर्फबाट उम्मेदवार बनेका गजेन्द्र महर्जनले ११.४४ प्रतिशत मत प्राप्त गरेका छन्।

काठमाडौं महानगरपालिकामा प्रमुख र उपप्रमुखका लागि २३ जना उम्मेदवार चुनावी मैदानमा उत्रिएका थिए। निर्वाचन आयोगले २७ गते राति १२ बजे मतगणना सम्पन्न भएको घोषणा गरेको छ