### News

### Installation

In [None]:
%%capture
# Install Unsloth and its training dependencies. `%%capture` (which must stay
# the first line of the cell) hides the pip output.
import os
# Colab exports environment variables whose names contain "COLAB_"; their
# absence is used here as the "not running on Colab" signal.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages are installed with --no-deps and xformers is pinned,
    # presumably to avoid replacing Colab's preinstalled torch stack -- TODO confirm.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Imports (modified for your use case)
import json
import random
from json import JSONDecodeError

# unsloth is imported before trl/transformers, matching the original ordering.
from unsloth import FastLanguageModel

import pandas as pd
import torch
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
def format_example(entry, training_mode=True):
    """
    Build one chat-formatted training/inference example as ``{"text": ...}``.

    The text has three turns -- ``<|system|>``, ``<|user|>`` and
    ``<|assistant|>`` -- each closed with ``</s>``. The assistant turn is the
    JSON serialisation of the name variants grouped by script.

    Args:
        entry (dict): Record with ``primary_name``, ``type`` and ``names``.
            ``names`` is a list of ``{"name": ..., "script": ...}`` records;
            a missing or ``None`` list is treated as empty.
        training_mode (bool): When True the system prompt asks only for the
            scripts present in the input and empty groups are dropped from
            the JSON target. When False the prompt asks for every script and
            all five groups are kept, even if empty.

    Returns:
        dict: ``{"text": <formatted prompt + JSON target>}``.

    Raises:
        KeyError: If ``primary_name`` or ``type`` is missing from ``entry``,
            or a name record with a script has no ``name``.
    """

    # SYSTEM PROMPT - shared header, followed by a mode-specific rule list
    system_prompt = """You are a UNICODE script conversion expert. Rules:
1. Output JSON with these EXACT fields:
   - "Latn": [Latin transcriptions]
   - "Arab": [Arabic script variants]
   - "Cyrl": [Cyrillic script variants]
   - "Hani": [Chinese character variants]
   - "alternatives": [Other spellings]
2. Key Behaviors:"""

    if training_mode:
        # TRAINING MODE - Show only existing scripts
        system_prompt += """
   - Include ONLY scripts present in the input
   - Preserve original examples exactly"""
    else:
        # INFERENCE MODE - Force all script variants
        system_prompt += """
   - MUST generate variants for ALL scripts when possible:
     * Arab: Use common Arabic transliterations
     * Cyrl: Use standard Cyrillization rules
     * Hani: Use common Chinese transcriptions
   - Return empty arrays for impossible conversions"""

    # USER PROMPT
    user_prompt = f"""Generate complete script variants for:
Primary Name: {entry['primary_name']}
Type: {entry['type']}"""

    # Upper-cased script code -> output field. Any other (non-None) script
    # falls back to "alternatives".
    field_for_script = {
        "LATN": "Latn",
        "ARAB": "Arab",
        "CYRL": "Cyrl",
        "HANI": "Hani",
    }
    # Insertion order here is the key order of the emitted JSON.
    variants = {
        "Latn": [],
        "Arab": [],
        "Cyrl": [],
        "Hani": [],
        "alternatives": []
    }

    for record in entry.get("names") or []:
        script = record.get("script")
        if script is None:
            # Names without a script tag are not emitted in any group.
            continue
        field = field_for_script.get(script.upper(), "alternatives")
        variants[field].append(record["name"])

    # Clean empty arrays for training (keep all in inference)
    if training_mode:
        variants = {k: v for k, v in variants.items() if v}

    # ensure_ascii=False keeps non-Latin names as literal characters instead
    # of \uXXXX escapes.
    return {
        "text": f"<|system|>\n{system_prompt}</s>\n<|user|>\n{user_prompt}</s>\n<|assistant|>\n{json.dumps(variants, ensure_ascii=False)}</s>"
    }

In [None]:
# Load the raw name records. The file contains non-ASCII names, so the encoding
# is pinned rather than left to the platform default.
try:
    with open('cleaned_names1.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
except JSONDecodeError as e:
    # Malformed JSON: report where parsing stopped, then abort the cell.
    print(f"Found error at line {e.lineno}, column {e.colno}: {e.msg}")
    raise
except Exception as e:
    print(f"❌ Validation failed: {str(e)}")
    # Re-raise so the cell stops here instead of failing below with an
    # unrelated NameError on `data`.
    raise
else:
    print(f"✅ Validation passed! Found {len(data)} records")

# Create training dataset: one {"text": ...} example per record, using the
# training-mode prompt (the default of format_example).
train_data = [format_example(entry) for entry in data]
df = pd.DataFrame(train_data)




✅ Validation passed! Found 16588 records


In [None]:

# Wrap the formatted examples (`train_data`, a list of {"text": ...} dicts
# built in the loading cell) in a HuggingFace Dataset for the trainer.
from datasets import Dataset
dataset = Dataset.from_list(train_data)

In [None]:
# Load the 4-bit quantised Llama-3 8B base model and its tokenizer.
# NOTE(review): the captured output of this cell shows a ValueError about
# modules being dispatched to CPU/disk, i.e. the GPU did not have enough free
# memory for the quantised model on that run.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = 512,  # Reduced from 2048 to prevent OOM
    dtype = torch.float16,
    load_in_4bit = True,
)

# Remember the vocabulary size so we can tell whether tokens were really added
original_tokenizer_len = len(tokenizer)

# Extra special tokens to register with the tokenizer.
# NOTE(review): the prompt template built by format_example also uses
# "<|system|>" and "</s>", which are NOT registered here, while "<ar>" and
# "</ar>" do not appear in any prompt visible in this notebook -- confirm the
# intended token set.
special_tokens = {
    "additional_special_tokens": [
        "<|user|>",
        "<|assistant|>",
        "<ar>",
        "</ar>"
    ]
}

# Add all special tokens in a single operation
tokenizer.add_special_tokens(special_tokens)

# Grow the embedding matrix only if the vocabulary actually grew.
# NOTE(review): the LoRA target_modules below cover only attention/MLP
# projections, so the embedding rows created here are presumably never
# trained -- confirm whether that is acceptable.
if len(tokenizer) > original_tokenizer_len:
    model.resize_token_embeddings(len(tokenizer))
    print(f"Added {len(tokenizer) - original_tokenizer_len} new tokens")
else:
    print("No new tokens added")

# Wrap the model with LoRA adapters (rank 16) on the attention and MLP
# projection layers. max_seq_length repeats the value passed to from_pretrained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,
    max_seq_length=512,  # Must match the model's max_seq_length
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!




ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
from transformers import DataCollatorForLanguageModeling

# Verify dataset
print("\n=== Dataset Sample ===")
print(dataset[0])
print(f"\nTotal examples: {len(dataset)}")

# Training schedule. The step budget is fixed rather than derived from the
# dataset size; steps_per_epoch is computed only to report how much of the
# data that budget covers.
total_examples = len(dataset)
batch_size = 2
grad_accum = 4
examples_per_step = batch_size * grad_accum
steps_per_epoch = total_examples // examples_per_step
max_steps = 1000
warmup_steps = 100
print(
    f"Training for {max_steps} optimizer steps of {examples_per_step} examples "
    f"(~{max_steps / max(steps_per_epoch, 1):.2f} epochs), {warmup_steps} warmup steps"
)

# The collator pads batches, so make sure a pad token exists before building it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Important: Set to False for causal LM
)

# NOTE: report_to is left at its default; the captured output shows that with
# wandb installed this triggers an interactive API-key prompt.
args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=10,
    output_dir="outputs",
    optim="adamw_8bit",
    lr_scheduler_type="cosine",
    save_strategy="steps",
    save_steps=500,
    eval_strategy="no",  # Disabled evaluation since we don't have validation set
    load_best_model_at_end=False,  # Disabled since no eval set
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=args,
    packing=False,
    data_collator=data_collator,  # Explicitly set our collator
)

# Train
trainer.train()

# Save the LoRA adapter and the (extended) tokenizer side by side
model.save_pretrained("multilingual_name_generator")
tokenizer.save_pretrained("multilingual_name_generator")


=== Dataset Sample ===
{'text': '<|system|>\nYou are a UNICODE script conversion expert. Rules:\n1. Output JSON with these EXACT fields:\n   - "Latn": [Latin transcriptions]\n   - "Arab": [Arabic script variants]\n   - "Cyrl": [Cyrillic script variants]\n   - "Hani": [Chinese character variants]\n   - "alternatives": [Other spellings]\n2. Key Behaviors:\n   - Include ONLY scripts present in the input\n   - Preserve original examples exactly</s>\n<|user|>\nGenerate complete script variants for:\nPrimary Name: Lukashenka Dzmitry Aliaksandravich\nType: individual</s>\n<|assistant|>\n{"Latn": ["Lukashenka Dzmitry Aliaksandravich", "Lukashenko Dmitri Aleksandrovich"], "Cyrl": ["ЛУКАШЭНКА Дзмітрый Аляксандравіч", "ЛУКАШЕНКО Дмитрий Александрович"]}</s>'}

Total examples: 16588


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/16588 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 16,588 | Num Epochs = 1 | Total steps = 1,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mguerricheoussama[0m ([33mguerricheoussama-istic[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.7837
20,2.6497
30,2.4513
40,1.9455
50,1.1188
60,0.5811
70,0.5268
80,0.4875
90,0.4747
100,0.4376




('multilingual_name_generator/tokenizer_config.json',
 'multilingual_name_generator/special_tokens_map.json',
 'multilingual_name_generator/tokenizer.json')

In [None]:
def generate_variants(primary_name, person_type="individual", max_retries=3):
    """
    Generates name variants in all scripts (Latn/Arab/Cyrl/Hani) with strict JSON formatting.

    The prompt ends with an opening brace so the model continues a JSON
    object. The completion is cut at the first ``</s>``, parsed, and padded
    with empty lists for any of the five expected fields the model omitted.
    Keys the model invents are kept as-is.

    NOTE: a later cell in this notebook defines a function with the same name
    that returns the raw decoded text; whichever cell ran last is in effect.

    Args:
        primary_name (str): Input name to convert
        person_type (str): "individual" or other type
        max_retries (int): Generation attempts before giving up; a new attempt
            is made whenever generation raises or the output is not a JSON object

    Returns:
        dict: {
            "Latn": [str],       # Latin transcriptions
            "Arab": [str],       # Arabic script variants
            "Cyrl": [str],       # Cyrillic script variants
            "Hani": [str],       # Chinese character variants
            "alternatives": [str] # Other spellings
        }
        or {"error": str} if failed
    """
    # SYSTEM PROMPT - Forces all script generation
    system_prompt = """You are a UNICODE script conversion expert. Rules:
1. OUTPUT MUST BE VALID JSON with these EXACT fields:
   - "Latn": [Latin transcriptions]
   - "Arab": [Arabic script variants]
   - "Cyrl": [Cyrillic script variants]
   - "Hani": [Chinese character variants]
   - "alternatives": [Other spellings]
2. Generation Rules:
   - MUST attempt conversions for ALL scripts
   - Use common transliteration rules when needed
   - Return empty arrays [] for impossible conversions
3. Strict Formatting:
   - No explanations or non-JSON text
   - Preserve original name semantics"""

    # USER PROMPT - Clear task specification
    user_prompt = f"""Generate complete script variants for:
Primary Name: {primary_name}
Type: {person_type}"""

    # Structured prompt with JSON priming
    prompt = f"""<|system|>
{system_prompt}</s>
<|user|>
{user_prompt}</s>
<|assistant|>
{{"""  # Intentional opening brace to force JSON

    expected_fields = ("Latn", "Arab", "Cyrl", "Hani", "alternatives")
    last_error = "no generation attempts were made"

    for _ in range(max_retries):
        try:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to("cuda")

            # no_repeat_ngram_size is deliberately not set: it forbids any
            # token n-gram already present in the prompt or output, which
            # rules out the repeated JSON punctuation and the field names.
            outputs = model.generate(
                **inputs,
                max_new_tokens=500,  # Extra space for multiple script conversions
                temperature=0.1,     # Low temperature keeps sampling near-greedy
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

            # Decode only the newly generated tokens, not the echoed prompt.
            prompt_len = inputs["input_ids"].shape[1]
            completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

            # Re-attach the primed brace and drop everything from the
            # end-of-turn marker onwards.
            json_text = "{" + completion.split("</s>", 1)[0]
            variants = json.loads(json_text)
            if not isinstance(variants, dict):
                raise ValueError(f"expected a JSON object, got {type(variants).__name__}")
        except Exception as exc:
            # Sampling is stochastic, so a later attempt may still succeed.
            last_error = str(exc)
            continue

        for field in expected_fields:
            variants.setdefault(field, [])
        return variants

    return {"error": f"Failed after {max_retries} attempts: {last_error}"}


# Example Usage:
print(
    generate_variants("محمد علي  مسلماني"),
    )

NameError: name 'system_prompt' is not defined

In [None]:
def generate_variants(primary_name, person_type="individual", max_retries=3):
    """
    Generates name variants in all scripts (Latn/Arab/Cyrl/Hani) and returns raw model output.

    The prompt ends with an opening brace so the model continues a JSON
    object. The returned text is the full decoded sequence: the prompt
    followed by the completion, with special tokens left in.

    NOTE: this redefines ``generate_variants`` from the previous cell.

    Args:
        primary_name (str): Input name to convert
        person_type (str): "individual" or other type
        max_retries (int): Retry attempts if generation raises

    Returns:
        str: Raw model output (may include JSON and other text)
        or {"error": str} if failed -- callers must check the type
    """
    # SYSTEM PROMPT - Forces all script generation
    system_prompt = """You are a UNICODE script conversion expert. Rules:
1. OUTPUT MUST BE VALID JSON with these EXACT fields:
   - "Latn": [Latin transcriptions]
   - "Arab": [Arabic script variants]
   - "Cyrl": [Cyrillic script variants]
   - "Hani": [Chinese character variants]
   - "alternatives": [Other spellings]
2. Generation Rules:
   - MUST attempt conversions for ALL scripts
   - Use common transliteration rules when needed
   - Return empty arrays [] for impossible conversions
3. Strict Formatting:
   - No explanations or non-JSON text
   - Preserve original name semantics"""

    # USER PROMPT - Clear task specification
    user_prompt = f"""Generate complete script variants for:
Primary Name: {primary_name}
Type: {person_type}"""

    # Structured prompt with JSON priming
    prompt = f"""<|system|>
{system_prompt}</s>
<|user|>
{user_prompt}</s>
<|assistant|>
{{"""  # Intentional opening brace to force JSON

    last_error = None
    for attempt in range(max_retries):
        try:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to("cuda")

            # no_repeat_ngram_size is deliberately not set: it forbids any
            # token n-gram already present in the prompt or output, which
            # rules out the repeated JSON punctuation and the field names.
            outputs = model.generate(
                **inputs,
                max_new_tokens=500,
                temperature=0.1,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )
            # Return the complete raw output (prompt + completion)
            return tokenizer.decode(outputs[0], skip_special_tokens=False)

        except Exception as e:
            # Remember the failure and fall through to the next attempt.
            last_error = e

    if last_error is not None:
        return {"error": f"Failed after {max_retries} attempts: {str(last_error)}"}

    # Only reached when max_retries <= 0, i.e. no attempt was made.
    return {"error": "Unknown error occurred during generation"}

# Example Usage:
print(
generate_variants(" أسامة قريش")
)

<|begin_of_text|><|system|>
You are a UNICODE script conversion expert. Rules:
1. OUTPUT MUST BE VALID JSON with these EXACT fields:
   - "Latn": [Latin transcriptions]
   - "Arab": [Arabic script variants]
   - "Cyrl": [Cyrillic script variants]
   - "Hani": [Chinese character variants]
   - "alternatives": [Other spellings]
2. Generation Rules:
   - MUST attempt conversions for ALL scripts
   - Use common transliteration rules when needed
   - Return empty arrays [] for impossible conversions
3. Strict Formatting:
   - No explanations or non-JSON text
   - Preserve original name semantics</s>
<|user|>
Generate complete script variants for:
Primary Name:  أسامة قريش
Type: individual</s>
<|assistant|>
{ "Latin": ["OSAMA QARISH", "Osama Qarish", "'Osamah Qariş"]}</s><|end_of_text|>
