# Fine tuning Llama-3.2-1B with tahrirchi/uz-crawl

## Installing the required libraries and packages

In [5]:
!pip install torch torchvision transformers datasets bitsandbytes peft



In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

## Configure torch with GPU

In [7]:
# Switch to GPU (cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Model Loading from Huggingface

In [8]:
model_name = "Vikhrmodels/Vikhr-Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  73%|#######2  | 3.62G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

## Data Loading & Preprocessing

In [9]:
from datasets import load_dataset

# Загружаем полный набор данных
full_dataset = load_dataset("tahrirchi/uz-crawl")

# Выбираем первый 15000 примеров из раздела 'news'
dataset = full_dataset['news'].select(range(15000))

# Разделяем данные на обучающий и тестовый наборы
train_test_split = dataset.train_test_split(test_size=0.2, shuffle=True)

# Получаем обучающий и тестовый наборы
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

README.md:   0%|          | 0.00/313 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/5.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38385 [00:00<?, ? examples/s]

In [10]:
print(full_dataset, full_dataset.keys())

DatasetDict({
    train: Dataset({
        features: ['Header', 'Content'],
        num_rows: 38385
    })
}) dict_keys(['train'])


In [11]:
# Let's have a look at our dataset and how a single data point looks like
print("Number of rows:", len(dataset))
print("Columns:", dataset.column_names)

Number of rows: 10000
Columns: ['Header', 'Content']


In [12]:
from transformers import AutoTokenizer

# Assuming tokenizer is already defined
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define prompt builder function
def prompt_builder(row):
    return {"text": f"Definition in Uzbek: {row['text']}"}

# Apply prompt builder
train_dataset = train_dataset.map(prompt_builder)
test_dataset = test_dataset.map(prompt_builder)

print("Train dataset example:", train_dataset[0])
print("Test dataset example:", test_dataset[0])

# Define tokenize function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Convert datasets to PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(['text'])
test_dataset = test_dataset.remove_columns(['text'])


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Train dataset example: {'Header': 'Uyaning «nafas olishi»', 'Content': 'Tarmoq yuklamasi, xalaqitli vaziyat va boshqa omillarga bog‘liq ravishda, yacheyka (uya) o‘lchamlarining o‘zgarishi.', 'text': 'Definition of Uyaning «nafas olishi» in Uzbek: Tarmoq yuklamasi, xalaqitli vaziyat va boshqa omillarga bog‘liq ravishda, yacheyka (uya) o‘lchamlarining o‘zgarishi.'}
Test dataset example: {'Header': 'MBBT-ilova', 'Content': 'Foydalanuvchining ma’lumotlar bazasi ma’lumotlaridan foydalana olishini ta’minlaydigan dastur. Ma’lumotlarni kiritish, so‘rovlar va hisobotlar shaklida amalga oshiriladi.', 'text': 'Definition of MBBT-ilova in Uzbek: Foydalanuvchining ma’lumotlar bazasi ma’lumotlaridan foydalana olishini ta’minlaydigan dastur. Ma’lumotlarni kiritish, so‘rovlar va hisobotlar shaklida amalga oshiriladi.'}


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Turn the model into LoRA model

In [13]:
from peft import prepare_model_for_kbit_training

peft_model = prepare_model_for_kbit_training(model)

peft_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNor

In [14]:
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    r=32, #the rank of the update matrices, expressed in int. Lower rank results in smaller update matrices with fewer trainable parameters
    lora_alpha=32, #LoRA scaling factor
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    #target_modules='all-linear' # The modules (for example, attention blocks) to apply the LoRA update matrices.
)

lora_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=32, target_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [15]:
lora_model = get_peft_model(peft_model, lora_config)
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pro

In [16]:
lora_model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


In [19]:
import numpy as np
from transformers import DataCollatorWithPadding,DataCollatorForLanguageModeling, Trainer, TrainingArguments

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="trained",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=1,
        report_to="none"
    ),
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
)

  trainer = Trainer(


## Let's start the training!

In [None]:
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,1.5469,1.666677
2,1.4782,1.630027


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


## Generate the first response

In [33]:
from peft import PeftModel

# Load the fine-tuned model with the adapter attached
model_with_adapter = PeftModel.from_pretrained(model, "Vikhr-Llama-3.2-1B-Instruct/checkpoint-8000").to("cuda")
model_with_adapter.eval()
inputs = tokenizer("Samarqandda ta’limni rivojlantirish", return_tensors="pt")

outputs = model_with_adapter.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=100)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Biometriya ma'noni anglatadi: biologik xususiyatlar va biometrik o‘lchamlar asosida shaxsni aniqlash va tasdiqlash. Biometriyaning asosiy prinsipi shaxsning biometrik xususiyatlari bo‘yicha olingan o‘lchovlar to‘plamiga asoslangan shaxsni tasdiqlash va aniqlash hisoblanadi. Biomet
