In [None]:
%%capture
# Install Unsloth and the libraries this notebook imports. `%%capture` hides the
# pip output, so a failed install is silent -- drop the magic temporarily when debugging.
import os, re
# Colab is detected by looking for "COLAB_" anywhere in the environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the "major.minor" prefix of the installed torch version (e.g. "2.9").
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    # Pick the xformers release by torch version; anything other than 2.9 / 2.8
    # falls back to 0.0.29.post3.
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps: install these packages without letting pip resolve their dependencies
    # (presumably to leave the preinstalled torch stack untouched -- TODO confirm).
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Executed in both branches, after the installs above: pin transformers and trl
# to exact versions.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Sequence length passed to the model loader here and to the trainer later on.
max_seq_length = 2048

# Keyword arguments for loading the base checkpoint (4-bit, dtype left to the loader).
base_model_kwargs = dict(
    model_name="Tweeties/tweety-7b-tatar-v24a",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)

# Attach the "mistral" chat template so apply_chat_template() can be used below,
# then switch the model into training mode before wrapping it with LoRA.
tokenizer = get_chat_template(tokenizer, chat_template="mistral")
FastLanguageModel.for_training(model)

# LoRA configuration: rank-16 adapters on the attention and MLP projections.
lora_kwargs = dict(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(model, **lora_kwargs)


==((====))==  Unsloth 2025.11.4: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Unsloth: Will load Tweeties/tweety-7b-tatar-v24a as a legacy tokenizer.


Tweeties/tweety-7b-tatar-v24a does not have a padding token! Will use pad_token = <unk>.


In [None]:
# Load the local JSON file as a single "train" split, then carve out 10% for evaluation.
dataset = load_dataset("json", data_files="dataset.json", split="train")
# Fixed seed so the train/eval partition is identical on every Restart & Run All
# (3407 matches the seed used for LoRA init and for the trainer).
dataset = dataset.train_test_split(test_size=0.1, seed=3407)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# System prompt (in Tatar) prepended to every training conversation. The model is
# trained on this exact text, so inference must reuse it unchanged.
sys_prompt = """Син татар телендәге токсик сөйләмнәрне детоксификациялау вәкиле. Сөйләмнең төп мәгънысын саклап, агрессив, һөны, дискриминацион элементларны алып такач, нейтраль, нәзек вариант язы."""

def formatting_prompts_func(examples):
    """Render a batch of (input, output) pairs as chat-formatted training strings.

    Args:
        examples: A batched mapping with parallel "input" and "output" columns.

    Returns:
        A dict with one key, "text", holding one rendered conversation per pair.
        Each conversation is system prompt -> user input -> assistant output,
        formatted by the module-level `tokenizer`'s chat template (not tokenized,
        no trailing generation prompt).
    """
    def _conversation(user_text, assistant_text):
        # One complete three-turn exchange; the system turn uses the module-level sys_prompt.
        return [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]

    rendered = [
        tokenizer.apply_chat_template(
            _conversation(source, target),
            tokenize=False,
            add_generation_prompt=False,
        )
        for source, target in zip(examples["input"], examples["output"])
    ]
    return {"text": rendered}

# Add the rendered "text" column to both splits (train first, then eval).
train_dataset, eval_dataset = (
    split.map(formatting_prompts_func, batched=True)
    for split in (train_dataset, eval_dataset)
)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7492 [00:00<?, ? examples/s]

Map:   0%|          | 0/833 [00:00<?, ? examples/s]

In [None]:
# Supervised fine-tuning on the rendered "text" column.
# Experiment tracking is switched off via `report_to="none"` below; the
# WANDB_DISABLED environment variable used previously is deprecated and made
# transformers print a warning on every logging step.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # effective batch size = 2 * 4 = 8
        warmup_steps=10,
        num_train_epochs=1,  # a single pass over the training split
        learning_rate=2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="tatar-detox-outputs",
        # `eval_steps` is ignored unless an evaluation strategy is set; without
        # this line eval_dataset is never evaluated during training.
        eval_strategy="steps",
        eval_steps=50,
        # Small eval batch to stay within the memory budget of a single small GPU.
        per_device_eval_batch_size=2,
        save_steps=100,
        report_to="none",
    ),
)
trainer_stats = trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/7492 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/833 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,492 | Num Epochs = 1 | Total steps = 937
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 7,283,675,136 (0.58% trained)


Step,Training Loss
10,4.2098
20,1.2003
30,0.9767
40,0.8334
50,0.8632
60,0.8392
70,0.8311
80,0.7838
90,0.7873
100,0.8067


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

In [None]:

# Save the LoRA adapter and the tokenizer into the same directory.
model.save_pretrained("tatar-detox-lora")
tokenizer.save_pretrained("tatar-detox-lora")

# Export the adapter merged into the base weights as a 16-bit checkpoint.
model.save_pretrained_merged("tatar-detox-merged", tokenizer, save_method="merged_16bit")

# Smoke test. The prompt must be built with the same chat template that was used
# for training (system + user turns, then the generation prompt); concatenating
# the raw strings gives the model a format it never saw during fine-tuning.
FastLanguageModel.for_inference(model)
# Use a held-out example rather than one the model was trained on.
sample_input = dataset["test"][0]["input"]
messages = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": sample_input},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
# Decode only the newly generated tokens so the printed text is the model's answer,
# not an echo of the prompt.
prompt_length = inputs["input_ids"].shape[1]
print(tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)[0])


Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...


Unsloth: Copying 6 files from cache to `tatar-detox-merged`: 100%|██████████| 6/6 [11:51<00:00, 118.59s/it]


Successfully copied all 6 files from cache to `tatar-detox-merged`
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 6/6 [00:00<00:00, 24576.00it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 6/6 [23:39<00:00, 236.53s/it]


Unsloth: Merge process complete. Saved to `/content/tatar-detox-merged`
<s>Син татар телендәге токсик сөйләмнәрне детоксификациялау вәкиле. Сөйләмнең төп мәгънысын саклап, агрессив, һөны, дискриминацион элементларны алып такач, нейтраль, нәзек вариант язы. Үз эшеңне бел. [INST]Үз эшегезне башкарыгыз.</s>


In [None]:
from huggingface_hub import HfApi, login
# Destination repository on the Hugging Face Hub for the merged checkpoint.
repo_id = "Mochalka123/tatar-detox-merged"
api = HfApi()

# exist_ok=True lets this cell be re-run against an already created repository.
api.create_repo(exist_ok=True, repo_id=repo_id)

# Push the whole merged-model directory produced by save_pretrained_merged().
upload_kwargs = {"folder_path": "tatar-detox-merged", "repo_id": repo_id}
api.upload_folder(**upload_kwargs)

print("Загружено")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ox-merged/tokenizer.model: 100%|##########|  925kB /  925kB            

  ...0006-of-00006.safetensors:   0%|          | 8.03MB / 4.25GB            

  ...0001-of-00006.safetensors:   0%|          | 15.7MB / 4.99GB            

  ...0003-of-00006.safetensors:   0%|          |  555kB / 5.00GB            

  ...0005-of-00006.safetensors:   0%|          |  556kB / 4.83GB            

  ...0004-of-00006.safetensors:   0%|          |  556kB / 5.00GB            

  ...0002-of-00006.safetensors:   0%|          |  556kB / 4.90GB            

✅ Загружено в Mochalka123/tatar-detox-merged
