### News

### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
# Imports (modified for your use case)
from json import JSONDecodeError
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import pandas as pd
import random

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
def format_example(entry):
    prompt = """Generate all name variants including:
- Arabic script versions
- Latin transliterations
- Common alternative spellings
STRICT RULES:
-NO additionnal links
- NEVER modify name semantics (e.g., keep "ben"/"bin" as-is)
- NO translations or meanings
- NO titles/honorifics
- NO political/religious references
Primary name: {primary_name}
Type: {type}""".format(
        primary_name=entry["primary_name"],
        type=entry["type"]
    )

    variants = "\n".join([f"- {x['name']} ({x['script']})" for x in entry["names"]])

    return {
        "text": f"<|user|>\n{prompt}</s><|assistant|>\n{variants}</s>"
    }

In [6]:
import json
# Load your dataset
try:
      with open('cleaned_names.json','r') as f:
          data = json.load(f)
      print(f"✅ Validation passed! Found {len(data)} records")
except JSONDecodeError as e:
        print(f"Found error at line {e.lineno}, column {e.colno}: {e.msg}")
except Exception as e:
    print(f"❌ Validation failed: {str(e)}")


# Create training dataset
train_data = [format_example(entry) for entry in data]
df = pd.DataFrame(train_data)




✅ Validation passed! Found 16588 records


In [7]:
# Load 4bit model
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# Add special tokens for chat format
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|user|>", "<|assistant|>"],
})
model.resize_token_embeddings(len(tokenizer))

# ======== ADD HERE ======== #
# Add Arabic special tokens
tokenizer.add_tokens(["<ar>", "</ar>"])  # Arabic delimiters
tokenizer.add_special_tokens({
    "additional_special_tokens": [
        "<|user|>",
        "<|assistant|>",
        "<ar>", "</ar>"  # Optional: Also add here if you want them as special tokens
    ],
})
model.resize_token_embeddings(len(tokenizer))  # Crucial!

# Prepare for LoRA training
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = 2048,
)



==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:

train_data = [format_example(entry) for entry in data]

# Convert to HuggingFace Dataset
from datasets import Dataset
dataset = Dataset.from_list(train_data)

In [9]:
# Verify dataset
print("\n=== Dataset Sample ===")
print(dataset[0])  # Should show your formatted example
print(f"\nTotal examples: {len(dataset)}")

# Optimized based on your loss patterns
args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,              # Reduced from 100 (since best loss occurred early)
    max_steps=40,                 # Stopping before first major spike (step 28-30)
    learning_rate=5e-5,           # Between original 2e-4 and 5e-5
    weight_decay=0.01,            # Increased regularization to prevent late-stage spikes
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="adamw_8bit",
    save_strategy="steps",
    evaluation_strategy="no",
    lr_scheduler_type="cosine",   # Smoother decay observed in early steps
    save_steps=5                  # Frequent checkpoints
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=args,
    packing=False,
)

# Train
trainer.train()

# Save model
model.save_pretrained("multilingual_name_generator")
tokenizer.save_pretrained("multilingual_name_generator")



=== Dataset Sample ===
{'text': '<|user|>\nGenerate all name variants including:\n- Arabic script versions\n- Latin transliterations\n- Common alternative spellings\nSTRICT RULES:\n-NO additionnal links\n- NEVER modify name semantics (e.g., keep "ben"/"bin" as-is)\n- NO translations or meanings\n- NO titles/honorifics\n- NO political/religious references\nPrimary name: Lukashenka Dzmitry Aliaksandravich\nType: individual</s><|assistant|>\n- Lukashenka Dzmitry Aliaksandravich (Latin)\n- Lukashenko Dmitri Aleksandrovich (LATN)\n- ЛУКАШЭНКА Дзмітрый Аляксандравіч (CYRL)\n- ЛУКАШЕНКО Дмитрий Александрович (CYRL)</s>'}

Total examples: 16588




Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/16588 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,588 | Num Epochs = 1 | Total steps = 40
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mguerricheoussama[0m ([33mguerricheoussama-istic[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.2621
2,2.9301
3,3.3501
4,3.0194
5,3.0586
6,3.3231
7,2.7201
8,2.8442
9,2.8758
10,2.7146




('multilingual_name_generator/tokenizer_config.json',
 'multilingual_name_generator/special_tokens_map.json',
 'multilingual_name_generator/tokenizer.json')

In [39]:
def generate_variants(primary_name, person_type="individual"):
    # Use the same prompt structure as in training
    prompt = """Generate all name variants including:
- Arabic script versions
- Latin transliterations
- Common alternative spellings

STRICT TEXT-ONLY OUTPUT (ELIMINATE HTML, XML tags or MARKUP):
(1) Latin: (exact transliteration)
(2) Arabic: (script only)
(3) Alternatives: (comma-separated)

STRICT RULES:
- ELIMINATE  additional links or markup (< >, [ ])
- NO explanations or extra text
- NEVER modify name semantics (keep "ben"/"bin" as-is)
- NO translations or meanings
- NO titles/honorifics
- NO political/religious references
- OUTPUT MUST BE IN PLAIN TEXT FORMAT ONLY.
 WARNING: If any part of your output contains HTML, it is INVALID. You must regenerate the response in plain text only.
Primary name: {primary_name}
Type: {type}""".format(
        primary_name=primary_name,
        type=person_type
    )


    # Format the input exactly as during training
    formatted_input = f"<|user|>\n{prompt}</s><|assistant|>\n"

    inputs = tokenizer(formatted_input,
                     return_tensors="pt",
                     truncation=True,
                     max_length=1024).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        repetition_penalty=1.5,
        temperature=0.1,
        top_k=40,
        top_p=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,

    )

    # Decode and extract just the assistant's response
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = full_output.split("<|assistant|>")[-1].replace("</s>", "").strip()

    return response

# Test the function
print(generate_variants( "سيف الدين براهمي", "individual"))

Latin:</b>
سفيدين براهمي<br />
سيدين پرهامي <br/>
sefidin brahmi </p></div><span class="arabic">براهيمي، سياف الدّين بن محمد.</spanspan>

<div id='variants'>
Arabicsifidīn brahami< / p >
<p>Seyyedine Parhamiy<span style=font-family:'Times New Roman', Times, serif; font-size :12px ; color:#000;">,</ span>sayf al-d&iacute;n bin muhammad.< br />seyfeddin parahami<s pan s tyle = 't imes new roman' ></spa n >< spa ns ty le ='timesnewroman'>sayefeddînebrahmê,saydineddâwûddibrahmï,< sp an st yle =" times ne w rom a no "> say f ed d ienpar h ami. sa ye fe dd ine pa ra ha mi.s ayfe de din ba rha me.i/sa yi fa e di na barra ma ni./sa ya fi da enba rr ah m ee.sa yy efde dinebar ham iy.sayfae deddi neb ar
