In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [5]:
messages = [
    {
        "role": "system",
        "content": "You are a Bulgarian Voice Assistant called Borko",
    },
    {"role": "user", "content": "Претрениран ли си на нещо?"},
]
input_tensor = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

attention_mask = input_tensor.ne(tokenizer.pad_token_id)  # Mask non-padding tokens

outputs = model.generate(
    input_tensor.to(model.device),
    attention_mask=attention_mask.to(model.device),  # Pass attention mask
    max_new_tokens=100
)


In [6]:
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
result

'Аз, Borko, не мисля. Можете ли да ми помогнете с нещо?'

In [7]:
model.device

device(type='cuda', index=0)

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("text", data_files={"train": "corpus.txt"})
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 14044
    })
})


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Step 1: Load the plain text dataset
dataset = load_dataset("text", data_files={"train": "corpus.txt"})

# Print the dataset structure to check the available splits
print(dataset)

# Step 2: Initialize the tokenizer for your model
model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Define the tokenization function with labels
def tokenize_function(examples):
    # Tokenize the text
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

    # The labels are the same as the input_ids for language modeling
    tokenized["labels"] = tokenized["input_ids"].copy()  # Add the labels (same as input_ids)
    
    return tokenized

# Step 4: Apply the tokenization function to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Print out the tokenized dataset to verify the changes
print(tokenized_datasets)

# Optionally, you can check the first example to ensure it was correctly tokenized and labels are added
print(tokenized_datasets["train"][0])


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 14044
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14044
    })
})
{'input_ids': [100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001, 100001

In [4]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14044
    })
})

In [5]:
tokenizer.decode(tokenized_datasets['train'][124]['input_ids'])

'<｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁sentence｜><｜end▁of▁se

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import bitsandbytes as bnb

# Model name
model_name = "deepseek-ai/deepseek-llm-7b-chat"



quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # ✅ Use 4-bit instead of 8-bit
    bnb_4bit_quant_type="nf4",  # Normalized Float 4
    bnb_4bit_use_double_quant=True,  # Use double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bfloat16 for efficiency
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

model = model.to("cuda")


`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  
    r=16,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.05,  
    target_modules=["q_proj", "v_proj"],  # Target attention layers
)

# Apply LoRA adapters to the model
model = get_peft_model(model, peft_config)

In [8]:
training_args = TrainingArguments(
    output_dir="./deepseek-bulgarian",
    per_device_train_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=8,  # Simulate larger batch
    learning_rate=1e-4,
    num_train_epochs=2,
    fp16=True,  # Enable mixed precision
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
    save_strategy="epoch"
)





In [9]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    
    
)

# Start training
trainer.train()

  0%|          | 0/3510 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'loss': 36.8678, 'grad_norm': 1.4664474725723267, 'learning_rate': 9.866096866096866e-05, 'epoch': 0.03}
{'loss': 3.9163, 'grad_norm': 1.1467255353927612, 'learning_rate': 9.723646723646724e-05, 'epoch': 0.06}
{'loss': 3.6592, 'grad_norm': 0.6449480652809143, 'learning_rate': 9.581196581196583e-05, 'epoch': 0.09}
{'loss': 3.7917, 'grad_norm': 1.2902960777282715, 'learning_rate': 9.43874643874644e-05, 'epoch': 0.11}
{'loss': 3.3139, 'grad_norm': 1.1000134944915771, 'learning_rate': 9.296296296296296e-05, 'epoch': 0.14}
{'loss': 3.3896, 'grad_norm': 1.2103475332260132, 'learning_rate': 9.153846153846155e-05, 'epoch': 0.17}
{'loss': 3.4854, 'grad_norm': 0.9885275959968567, 'learning_rate': 9.011396011396012e-05, 'epoch': 0.2}
{'loss': 3.4334, 'grad_norm': 0.6897383332252502, 'learning_rate': 8.86894586894587e-05, 'epoch': 0.23}
{'loss': 3.6897, 'grad_norm': 1.9146811962127686, 'learning_rate': 8.726495726495727e-05, 'epoch': 0.26}
{'loss': 3.3973, 'grad_norm': 1.381098985671997, 'learnin

TrainOutput(global_step=3510, training_loss=3.744822184788196, metrics={'train_runtime': 13782.7055, 'train_samples_per_second': 2.038, 'train_steps_per_second': 0.255, 'total_flos': 5.6051803551380275e+17, 'train_loss': 3.744822184788196, 'epoch': 1.9991455425804614})

In [136]:
messages = [
    {
        "role": "system",
        "content": "Ти си Български Гласов Асистент, говори само на български език, и твоето име е Борко. Твоят създател е великият Васил Василев",
    },
    {"role": "user", "content": "Кой е най-бележитият българин"},
]
input_tensor = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

attention_mask = input_tensor.ne(tokenizer.pad_token_id)  # Mask non-padding tokens

outputs = model.generate(
    input_tensor.to(model.device),
    attention_mask=attention_mask.to(model.device),  # Pass attention mask
    max_new_tokens=100,
    pad_token_id = 100001
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [137]:
generated_text

'Ти си Български Гласов Асистент, говори само на български език, и твоето име е Борко. Твоят създател е великият Васил Василев\n\nUser: Кой е най-бележитият българин\n\nAssistant:Най-бележитият българин е Васил Левски. Той е бил възрожденец, просветител и революционер, който е играл важна роля в българското възраждане. Той е бил един от най-важните дейци на българските въстания и е участвал в създаването на Българското въстание през 1876 год. и в основаването на Българското правителство. Левски е бил осъден на смърт и е бил погубен от турците в 1873 год. Той'

In [93]:
trainer.save_model("borko_1")

In [138]:
tokenizer.save_pretrained("borko_1_tok")

('borko_1_tok\\tokenizer_config.json',
 'borko_1_tok\\special_tokens_map.json',
 'borko_1_tok\\tokenizer.json')