### News

### Installation

In [24]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [25]:
def format_example(entry):
    prompt = """Generate all name variants including:
- Arabic script versions
- Latin transliterations
- Common alternative spellings
STRICT RULES:
-NO additionnal links
- NEVER modify name semantics (e.g., keep "ben"/"bin" as-is)
- NO translations or meanings
- NO titles/honorifics
- NO political/religious references
Primary name: {primary_name}
Type: {type}""".format(
        primary_name=entry["primary_name"],
        type=entry["type"]
    )

    variants = "\n".join([f"- {x['name']} ({x['script']})" for x in entry["names"]])

    return {
        "text": f"<|user|>\n{prompt}</s><|assistant|>\n{variants}</s>"
    }

In [26]:

# Load your dataset
try:
      with open('/content/cleaned_names.json','r') as f:
          data = json.load(f)
      print(f"✅ Validation passed! Found {len(data)} records")
except JSONDecodeError as e:
        print(f"Found error at line {e.lineno}, column {e.colno}: {e.msg}")
except Exception as e:
    print(f"❌ Validation failed: {str(e)}")


# Create training dataset
train_data = [format_example(entry) for entry in data]
df = pd.DataFrame(train_data)




✅ Validation passed! Found 16588 records


In [27]:
# Load 4bit model
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# Add special tokens for chat format
tokenizer.add_special_tokens({
    "additional_special_tokens": ["<|user|>", "<|assistant|>"],
})
model.resize_token_embeddings(len(tokenizer))

# ======== ADD HERE ======== #
# Add Arabic special tokens
tokenizer.add_tokens(["<ar>", "</ar>"])  # Arabic delimiters
tokenizer.add_special_tokens({
    "additional_special_tokens": [
        "<|user|>",
        "<|assistant|>",
        "<ar>", "</ar>"  # Optional: Also add here if you want them as special tokens
    ],
})
model.resize_token_embeddings(len(tokenizer))  # Crucial!

# Prepare for LoRA training
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                     "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = 2048,
)



==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [19]:

train_data = [format_example(entry) for entry in data]

# Convert to HuggingFace Dataset
from datasets import Dataset
dataset = Dataset.from_list(train_data)

In [21]:
# Verify dataset
print("\n=== Dataset Sample ===")
print(dataset[0])  # Should show your formatted example
print(f"\nTotal examples: {len(dataset)}")

# Optimized based on your loss patterns
args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=10,              # Reduced from 100 (since best loss occurred early)
    max_steps=40,                 # Stopping before first major spike (step 28-30)
    learning_rate=5e-5,           # Between original 2e-4 and 5e-5
    weight_decay=0.01,            # Increased regularization to prevent late-stage spikes
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="adamw_8bit",
    save_strategy="steps",
    evaluation_strategy="no",
    lr_scheduler_type="cosine",   # Smoother decay observed in early steps
    save_steps=5                  # Frequent checkpoints
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=args,
    packing=False,
)

# Train
trainer.train()

# Save model
model.save_pretrained("multilingual_name_generator")
tokenizer.save_pretrained("multilingual_name_generator")



=== Dataset Sample ===
{'text': '<|user|>\nGenerate all name variants including:\n- Arabic script versions\n- Latin transliterations\n- Common alternative spellings\nSTRICT RULES: \n-NO additionnal links \n- NEVER modify name semantics (e.g., keep "ben"/"bin" as-is)\n- NO translations or meanings\n- NO titles/honorifics\n- NO political/religious references\nPrimary name: Lukashenka Dzmitry Aliaksandravich\nType: individual</s><|assistant|>\n- Lukashenka Dzmitry Aliaksandravich (Latin)\n- Lukashenko Dmitri Aleksandrovich (LATN)\n- ЛУКАШЭНКА Дзмітрый Аляксандравіч (CYRL)\n- ЛУКАШЕНКО Дмитрий Александрович (CYRL)</s>'}

Total examples: 16588




Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/16588 [00:00<?, ? examples/s]

ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details

In [13]:
def generate_variants(primary_name, person_type="individual"):
    # Use the same prompt structure as in training
    prompt = """Generate all name variants including:
- Arabic script versions
- Latin transliterations
- Common alternative spellings
STRICT RULES:
-NO additionnal links
-NEVER modify name semantics (e.g., keep "ben"/"bin" as-is)
- NO translations or meanings
- NO titles/honorifics
- NO political/religious references
-NO additional text/links
Primary name: {primary_name}
Type: {type}""".format(
        primary_name=primary_name,
        type=person_type
    )

    # Format the input exactly as during training
    formatted_input = f"<|user|>\n{prompt}</s><|assistant|>\n"

    inputs = tokenizer(formatted_input,
                     return_tensors="pt",
                     truncation=True,
                     max_length=1024).to("cuda")

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        repetition_penalty=1.5,
        temperature=0.5,
        top_k=40,
        top_p=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and extract just the assistant's response
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
    response = full_output.split("<|assistant|>")[-1].replace("</s>", "").strip()

    return response

# Test the function
print(generate_variants("Heithem benmoussa", "individual"))

Arabic (script): هيثم بن موسى </p>
Latin transcription(s):
Heïthem Ben Moussâ, Héthém Bén Mousa,
Hetham Bin Musaa, Heytam bin moussi,
Heytem b. mûsa'în 1) - [transliteration]
Variant names in English:</b></font><ul class="list"><li>Hayther Mohammed Al-Musaibih / Haytheer Muhammad al-musaihibihi2)</u>, <br />Hayder Mohamed Elmoasawi3),<BR/>Al-Haidar Mohd Ali4)<span style='color:#FF0000'>5)
6).7).
8).</div>.9.)10.)
11.).12.
13.</h3>.
14.),15.,16.<sup>)17.,
18.).

19.)

20).

21)

22.

23).[24]

25.[26].27)[28]29

30].[31].

32)..33)>34>

35>.

36)]37][38]</strong>[39]<sub>]40[41](42)(43(44))45^46*47+48=49#50$51%52&53@54!55?56~
