In [None]:
%%capture
# Dependency installation. `%%capture` is a cell magic and therefore has to stay on the
# first line of the cell; it keeps the pip output out of the notebook.
import os
# Colab is detected by looking for any environment variable whose name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install that lets pip resolve Unsloth's dependencies.
    !pip install unsloth
else:
    # Colab only: install the GPU stack with --no-deps (xformers pinned to 0.0.29.post3),
    # then the remaining pure-Python dependencies, then Unsloth itself with --no-deps.
    # NOTE(review): only xformers is version-pinned; the other packages float — confirm
    # this is intended if the run needs to be reproducible.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Imports (modified for your use case)
from json import JSONDecodeError
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import pandas as pd
import random

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
def format_example(entry, training_mode=True):
    """
    Turn one dataset record into a single chat-formatted training/inference string.

    The text is laid out as ``<|system|>…</s>\\n<|user|>…</s>\\n<|assistant|>…</s>`` where the
    assistant part is a JSON object mapping script fields ("Latn", "Arab", "Cyrl") to the
    list of names the record provides for that script.

    Args:
        entry (dict): Record with a ``"primary_name"`` string and a ``"names"`` list whose
            items are dicts carrying ``"script"`` (may be None) and ``"name"``.
        training_mode (bool): When True, the system prompt asks for only the scripts present
            in the input and empty script lists are dropped from the JSON answer. When
            False, the system prompt asks for all three scripts and empty lists are kept.

    Returns:
        dict: ``{"text": <formatted example>}``.

    Names whose script is None, missing, or not one of Latn/Arab/Cyrl are ignored.
    """
    # Local import: this function is defined in a cell that runs before the notebook's
    # `import json` cell, so do not rely on the later cell having been executed.
    import json

    # SYSTEM PROMPT - shared rules, extended below depending on the mode
    system_prompt = """You are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:

1. Output JSON with these EXACT fields:
   - "Latn": [Latin script variants]
   - "Arab": [Arabic script variants]
   - "Cyrl": [Cyrillic script variants]

2. Quality Guidelines:
   - Generate ONLY linguistically accurate and culturally appropriate variants in  latn, cyrl and arab
   - Provide up to 3-5 high-quality variants per script - NO MORE
   - NEVER include low-quality or "filler" variants to reach a quota
   - If fewer than 3 legitimate variants exist, return only those valid forms
   - Empty array [] if no valid transliteration is possible

3. Transliteration Principles:
   - Preserve phonetic integrity across writing systems
   - Include common regional spelling variations when appropriate
   - Apply proper diacritics and character mappings

4. Output Format:
   - RETURN ONLY VALID JSON with no explanations or commentary"""

    if training_mode:
        # TRAINING MODE - show only existing scripts
        system_prompt += """
   - Include ONLY scripts present in the input
   - Preserve original examples exactly"""
    else:
        # INFERENCE MODE - ask for all script variants
        system_prompt += """
   - - Generate variants for  latn, cyrl and arab  scripts when linguistically possible
   - Prioritize accuracy over completeness"""

    # USER PROMPT
    user_prompt = f"""Generate authentic name variants across  latn, cyrl and arab scripts:
Primary Name: {entry['primary_name']}
"""

    # Upper-cased script code -> JSON field. Insertion order fixes the key order of the
    # generated JSON (Latn, Arab, Cyrl), matching the field list in the system prompt.
    script_to_field = {"LATN": "Latn", "ARAB": "Arab", "CYRL": "Cyrl"}
    variants = {field: [] for field in script_to_field.values()}

    for x in entry["names"]:
        script = x.get("script")
        if script is None:
            continue
        # Previously only LATN and ARAB were handled, so Cyrillic names never reached the
        # target JSON even though the prompt promises a "Cyrl" field.
        field = script_to_field.get(script.upper())
        if field is not None:
            variants[field].append(x["name"])

    # Drop empty arrays for training (keep all of them in inference mode)
    if training_mode:
        variants = {k: v for k, v in variants.items() if v}

    return {
        "text": f"<|system|>\n{system_prompt}</s>\n<|user|>\n{user_prompt}</s>\n<|assistant|>\n{json.dumps(variants, ensure_ascii=False)}</s>"
    }

In [None]:
import json

# Load the raw fine-tuning records.
# The file holds Arabic/Cyrillic names, so it is read as UTF-8 explicitly instead of
# relying on the platform default encoding.
try:
    with open('dataset_train_fine_tuning.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    # Malformed JSON: report where the parser stopped, then abort the cell. Continuing
    # would only fail a few lines later with an unrelated NameError on `data`.
    print(f"Found error at line {e.lineno}, column {e.colno}: {e.msg}")
    raise
except Exception as e:
    # Anything else (missing file, permissions, decoding): report and abort as well.
    print(f"❌ Validation failed: {str(e)}")
    raise
else:
    print(f"✅ Validation passed! Found {len(data)} records")

# Create training dataset: one {"text": ...} dict per record, formatted in training mode.
train_data = [format_example(entry) for entry in data]
df = pd.DataFrame(train_data)




✅ Validation passed! Found 11562 records


In [None]:

# Wrap the formatted examples in a Hugging Face Dataset.
# `train_data` is the list of {"text": ...} dicts built by the loading cell with
# format_example(entry), i.e. in training mode; it is not rebuilt here.
from datasets import Dataset
dataset = Dataset.from_list(train_data)

In [None]:
# Context length shared by model loading and the LoRA wrapper (the two must agree).
MAX_SEQ_LENGTH = 512  # Reduced from 2048 to prevent OOM

# Load the 4-bit base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = torch.float16,
    # Load the weights 4-bit quantized to reduce memory use during fine-tuning
    load_in_4bit = True,
)

# Remember the vocabulary size so we can tell whether any token was really added.
original_tokenizer_len = len(tokenizer)

# Chat-role and markup tokens registered as special tokens.
# NOTE(review): the training text also uses "<|system|>" and "</s>", which are NOT
# registered here and are therefore tokenized as ordinary text — confirm this is intended.
special_tokens = {
    "additional_special_tokens": [
        "<|user|>",
        "<|assistant|>",
        "<ar>",
        "</ar>"
    ]
}

# Add all special tokens in a single operation
tokenizer.add_special_tokens(special_tokens)

# Only resize if new tokens were added
num_added_tokens = len(tokenizer) - original_tokenizer_len
if num_added_tokens > 0:
    model.resize_token_embeddings(len(tokenizer))
    print(f"Added {num_added_tokens} new tokens")
else:
    print("No new tokens added")

# LoRA adapters go on the attention and MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj"
]
if num_added_tokens > 0:
    # The rows created by resize_token_embeddings are freshly initialised. With adapters
    # on the projections only, those rows are frozen and the new tokens never learn a
    # meaning, so the embedding matrix and the output head are trained as well.
    lora_target_modules += ["embed_tokens", "lm_head"]

# Prepare for LoRA training with the same max_seq_length the model was loaded with
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,
    max_seq_length=MAX_SEQ_LENGTH,
)

# Make sure padding is possible: fall back to the EOS token when no pad token is defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.


Added 4 new tokens


Unsloth 2025.5.3 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback

# Sanity check before training: print the first formatted example and the dataset size.
print("\n=== Dataset Sample ===")
print(dataset[0])
print(f"\nTotal examples: {len(dataset)}")

# Training hyperparameters.
# Effective batch size = batch_size * grad_accum = 4 * 4 = 16 sequences per optimizer step.
total_examples = len(dataset)
batch_size = 4
grad_accum = 4
warmup_steps=150
num_train_epochs = 2 ## 1-2 epochs for LoRA
# Collator for causal language modelling (mlm=False disables masked-LM behaviour).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Important: Set to False for causal LM
)

args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum,
    warmup_steps=warmup_steps,
    # max_steps is intentionally not set; the run length is controlled by num_train_epochs.
    #max_steps=max_steps,
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5, # the initial value was 5e-5
    # weight_decay penalises large weights to limit overfitting; it is the decay term
    # used by AdamW-style optimizers and helps generalisation on small fine-tuning sets.
    weight_decay=0.01,
    # Mixed precision: part of the forward/backward computation runs in 16-bit floating
    # point instead of fp32 to reduce memory usage.
    fp16=True,
    # Log the training loss every 10 optimizer steps.
    # Reminder: optimizer steps per epoch are roughly
    # total_examples / (batch_size * gradient_accumulation_steps).
    logging_steps=10,
    output_dir="outputs",
    # AdamW variant whose optimizer states (momentum and variance) are kept in 8 bits
    # rather than fp32, which reduces optimizer memory for large models.
    optim="adamw_8bit",
    # Cosine learning-rate schedule with warmup:
    #   steps 0..warmup_steps (150): the rate rises linearly from 0 to learning_rate,
    #       lr(t) = learning_rate * (t / warmup_steps)
    #   remaining steps: the rate follows a cosine decay.
    # A linear schedule was tried/considered and rejected by the author:
    #lr_scheduler_type="linear", can't adapt with fine-tuning tasks
    lr_scheduler_type="cosine",
    save_strategy="epoch", # save a checkpoint at the end of each epoch
    # With save_strategy="steps" a checkpoint would instead be written every `save_steps`.
    #save_steps=500,
    eval_strategy="no",  # Disabled evaluation since we don't have validation set
    load_best_model_at_end=False,  # Disabled since no eval set
    # There is no validation set. NOTE(review): without one, the reported training loss
    # may mostly reflect how well the examples were memorised — a held-out split would
    # make the loss meaningful and enable early stopping.
)

# Ensure the tokenizer can pad: fall back to the EOS token when no pad token is defined.
if not hasattr(tokenizer, 'pad_token') or tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Supervised fine-tuning on the "text" column. Early stopping is not active: it needs a
# validation loss, and EarlyStoppingCallback is only referenced in the commented-out line.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=args,
    packing=False,
    # This can be enabled only once a validation set exists:
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    data_collator=data_collator,  # Explicitly set our collator
)

# Train
trainer.train()

# Save the trained model and the tokenizer (with the added tokens) side by side.
model.save_pretrained("multilingual_name_generator")
tokenizer.save_pretrained("multilingual_name_generator")


=== Dataset Sample ===
{'text': '<|system|>\nYou are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:\n\n1. Output JSON with these EXACT fields:\n   - "Latn": [Latin script variants]\n   - "Arab": [Arabic script variants]\n   - "Cyrl": [Cyrillic script variants]\n\n2. Quality Guidelines:\n   - Generate ONLY linguistically accurate and culturally appropriate variants in  latn, cyrl and arab \n   - Provide up to 3-5 high-quality variants per script - NO MORE\n   - NEVER include low-quality or "filler" variants to reach a quota\n   - If fewer than 3 legitimate variants exist, return only those valid forms\n   - Empty array [] if no valid transliteration is possible\n\n3. Transliteration Principles:\n   - Preserve phonetic integrity across writing systems\n   - Include common regional spelling variations when appropriate\n   - Apply proper diacritics and character mappings\n\n4. Output Format:\n   - RETURN ONLY VALID JSON with no e

Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/11562 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,562 | Num Epochs = 2 | Total steps = 1,444
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mguerricheoussama[0m ([33mguerricheoussama-istic[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.7081
20,2.6898
30,2.6521
40,2.5431
50,2.3948
60,2.1666
70,1.8269
80,1.3757
90,0.8493
100,0.4446




('multilingual_name_generator/tokenizer_config.json',
 'multilingual_name_generator/special_tokens_map.json',
 'multilingual_name_generator/tokenizer.json')

In [None]:
def generate_variants(primary_name, max_retries=3):
    """
    Prompt the fine-tuned model for Latn/Arab/Cyrl variants of a name and return the
    decoded sequence as-is (no JSON parsing is attempted here).

    Args:
        primary_name (str): Name to generate variants for.
        max_retries (int): Number of tokenization + generation attempts before giving up.

    Returns:
        str: Decoded output of ``model.generate`` with special tokens kept.
        dict: ``{"error": str}`` if the last attempt raised, or if ``max_retries``
        leaves no attempt to run.
    """
    # System instructions: fixed rule set describing the expected JSON answer.
    instructions = """You are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:

    1. Output JSON with these EXACT fields:
    - "Latn": [Latin script variants]
    - "Arab": [Arabic script variants]
    - "Cyrl": [Cyrillic script variants]

    2. Quality Guidelines:
    - Generate ONLY linguistically accurate and culturally appropriate variants in  latn, cyrl and arab
    - Provide up to 3-5 high-quality variants per script - NO MORE
    - NEVER include low-quality or "filler" variants to reach a quota
    - If fewer than 3 legitimate variants exist, return only those valid forms
    - Empty array [] if no valid transliteration is possible

    3. Transliteration Principles:
    - Preserve phonetic integrity across writing systems
    - Include common regional spelling variations when appropriate
    - Apply proper diacritics and character mappings

    4. Output Format:
    - RETURN ONLY VALID JSON with no explanations or commentary"""

    # User turn: the actual request for this name.
    request = f"""Generate authentic name variants across  latn, cyrl and arab scripts:
    Name: {primary_name}
    """

    # Chat layout; the trailing "{" primes the assistant turn to start a JSON object.
    full_prompt = f"""<|system|>
{instructions}</s>
<|user|>
{request}</s>
<|assistant|>
{{"""

    # Sampling settings that do not depend on the tokenizer.
    sampling_options = dict(
        max_new_tokens=500,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,   # discourages repetitive variants
        no_repeat_ngram_size=2,   # could be raised to 3 to avoid repetitive patterns
    )

    final_attempt = max_retries - 1
    for attempt_index in range(max_retries):
        try:
            encoded = tokenizer(
                full_prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
            ).to("cuda")
            generated = model.generate(
                **encoded,
                pad_token_id=tokenizer.eos_token_id,
                **sampling_options,
            )
            # First success wins: hand back the complete decoded sequence.
            return tokenizer.decode(generated[0], skip_special_tokens=False)
        except Exception as exc:
            # Only the last failure is reported; earlier ones simply trigger a retry.
            if attempt_index == final_attempt:
                return {"error": f"Failed after {max_retries} attempts: {str(exc)}"}

    return {"error": "Unknown error occurred during generation"}

# Example usage: print the raw output for one Arabic name.
print(generate_variants(" أسامة قريش"))

In [None]:
import json
import re

def generate_variants(primary_name, max_retries=3):
    """
    Generate name variants (Latn/Arab/Cyrl) and return only the parsed JSON object.

    The model is primed with an opening "{" in the assistant turn. The decoded text
    contains the prompt followed by the generation, so the JSON object is parsed starting
    at the first "{" after the last "<|assistant|>" marker. Only the first complete JSON
    object is returned; anything the model emits after it is ignored.

    Args:
        primary_name (str): Input name to convert.
        max_retries (int): Attempts to make before giving up. An attempt fails either when
            tokenization/generation raises or when no JSON object can be parsed.

    Returns:
        dict: The parsed JSON object, or ``{"error": str}`` if every attempt failed.
    """
    # SYSTEM PROMPT - rule set describing the expected JSON answer
    system_prompt = """You are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:

    1. Output JSON with these EXACT fields:
    - "Latn": [Latin script variants]
    - "Arab": [Arabic script variants]
    - "Cyrl": [Cyrillic script variants]

    2. Quality Guidelines:
    - Generate ONLY linguistically accurate and culturally appropriate variants
    - Provide up to 3-5 high-quality variants per script - NO MORE
    - NEVER include low-quality or "filler" variants to reach a quota
    - If fewer than 3 legitimate variants exist, return only those valid forms
    - Empty array [] if no valid transliteration is possible

    3. Transliteration Principles:
    - Preserve phonetic integrity across writing systems
    - Include common regional spelling variations when appropriate
    - Apply proper diacritics and character mappings

    4. Output Format:
    - RETURN ONLY VALID JSON with no explanations or commentary"""

    # USER PROMPT - Clear task specification
    user_prompt = f"""Generate authentic name variants across latn, cyrl and arab scripts:
    Name: {primary_name}
    """

    # Structured prompt with JSON priming
    prompt = f"""<|system|>
{system_prompt}</s>
<|user|>
{user_prompt}</s>
<|assistant|>
{{"""  # Intentional opening brace to force JSON

    assistant_marker = "<|assistant|>"
    decoder = json.JSONDecoder()

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to("cuda")

            outputs = model.generate(
                **inputs,
                max_new_tokens=500,
                temperature=0.1,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )

            raw_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
        except Exception as e:
            if is_last_attempt:
                return {"error": f"Failed after {max_retries} attempts: {str(e)}"}
            continue

        # Start at the assistant turn so braces in the echoed prompt (e.g. inside the
        # name) cannot be mistaken for the answer. If the marker is missing from the
        # decoded text, fall back to the first "{" anywhere.
        search_from = max(raw_output.rfind(assistant_marker), 0)
        json_start = raw_output.find("{", search_from)

        if json_start != -1:
            try:
                # raw_decode stops at the end of the first complete JSON value. A greedy
                # regex up to the LAST "}" made json.loads fail with "Extra data" when the
                # model kept generating after the object.
                json_data, _ = decoder.raw_decode(raw_output, json_start)
                return json_data
            except json.JSONDecodeError:
                # Truncated or malformed object: treat as a failed attempt and resample.
                pass

        if is_last_attempt:
            return {"error": f"Could not extract valid JSON after {max_retries} attempts"}

    return {"error": "Unknown error occurred during generation"}

def test_multiple_names():
    """Test the generator with multiple names and print formatted results."""
    test_names = [
        "محمد علي مسلماني",
        "أسامة قريش",
        "mohamed neji dridi",
        "نور مازني"
    ]

    print("Testing name generation:")
    for name in test_names:
        print(f"\nInput: {name}")
        result = generate_variants(name)
        print(json.dumps(result, ensure_ascii=False, indent=2))

# Example usage for a single name; the result dict is pretty-printed with non-ASCII
# characters preserved. Note that the input literal starts with a space.
result = generate_variants(" oussema guerriche")
print(json.dumps(result, ensure_ascii=False, indent=2))

# To run the batch of sample names instead, call:
# test_multiple_names()

{
  "error": "Failed after 3 attempts: Expecting ':' delimiter: line 1 column 55 (char 54)"
}


In [None]:
def generate_variants(primary_name, max_retries=3):
    """
    Prompt the fine-tuned model for Latn/Arab/Cyrl variants of a name and return the
    decoded sequence as-is (no JSON parsing is attempted here).

    Args:
        primary_name (str): Name to generate variants for.
        max_retries (int): Number of tokenization + generation attempts before giving up.

    Returns:
        str: Decoded output of ``model.generate`` with special tokens kept.
        dict: ``{"error": str}`` if the last attempt raised, or if ``max_retries``
        leaves no attempt to run.
    """
    # System instructions: fixed rule set describing the expected JSON answer.
    instructions = """You are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:

    1. Output JSON with these EXACT fields:
    - "Latn": [Latin script variants]
    - "Arab": [Arabic script variants]
    - "Cyrl": [Cyrillic script variants]

    2. Quality Guidelines:
    - Generate ONLY linguistically accurate and culturally appropriate variants
    - Provide up to 3-5 high-quality variants per script - NO MORE
    - NEVER include low-quality or "filler" variants to reach a quota
    - If fewer than 3 legitimate variants exist, return only those valid forms
    - Empty array [] if no valid transliteration is possible

    3. Transliteration Principles:
    - Preserve phonetic integrity across writing systems
    - Include common regional spelling variations when appropriate
    - Apply proper diacritics and character mappings

    4. Output Format:
    - RETURN ONLY VALID JSON with no explanations or commentary"""

    # User turn: the actual request for this name.
    request = f"""Generate authentic name variants across  latn, cyrl and arab scripts:
    Name: {primary_name}
    """

    # Chat layout; the trailing "{" primes the assistant turn to start a JSON object.
    full_prompt = f"""<|system|>
{instructions}</s>
<|user|>
{request}</s>
<|assistant|>
{{"""

    # Sampling settings that do not depend on the tokenizer.
    sampling_options = dict(
        max_new_tokens=500,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,   # discourages repetitive variants
        no_repeat_ngram_size=2,   # could be raised to 3 to avoid repetitive patterns
    )

    final_attempt = max_retries - 1
    for attempt_index in range(max_retries):
        try:
            encoded = tokenizer(
                full_prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
            ).to("cuda")
            generated = model.generate(
                **encoded,
                pad_token_id=tokenizer.eos_token_id,
                **sampling_options,
            )
            # First success wins: hand back the complete decoded sequence.
            return tokenizer.decode(generated[0], skip_special_tokens=False)
        except Exception as exc:
            # Only the last failure is reported; earlier ones simply trigger a retry.
            if attempt_index == final_attempt:
                return {"error": f"Failed after {max_retries} attempts: {str(exc)}"}

    return {"error": "Unknown error occurred during generation"}

# Example usage: print the raw output for one Latin-script name.
print(generate_variants("Qalis aboulahab"))

<|begin_of_text|><|system|>
You are a specialized Name Variant Generator with expertise in Arabic, Latin, and Cyrillic writing systems. Rules:

    1. Output JSON with these EXACT fields:
    - "Latn": [Latin script variants]
    - "Arab": [Arabic script variants]
    - "Cyrl": [Cyrillic script variants]

    2. Quality Guidelines:
    - Generate ONLY linguistically accurate and culturally appropriate variants
    - Provide up to 3-5 high-quality variants per script - NO MORE
    - NEVER include low-quality or "filler" variants to reach a quota
    - If fewer than 3 legitimate variants exist, return only those valid forms
    - Empty array [] if no valid transliteration is possible

    3. Transliteration Principles:
    - Preserve phonetic integrity across writing systems
    - Include common regional spelling variations when appropriate
    - Apply proper diacritics and character mappings

    4. Output Format:
    - RETURN ONLY VALID JSON with no explanations or commentary</s>
<|use

In [None]:
# Colab-only cell: mounts Google Drive under /content/drive via the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json
import re

def generate_variants(primary_name, max_retries=3):
    """
    Generate name variants (Latn/Arab/Cyrl) without a system prompt and return only the
    parsed JSON object.

    The model is primed with an opening "{" in the assistant turn. The decoded text
    contains the prompt followed by the generation, so the JSON object is parsed starting
    at the first "{" after the last "<|assistant|>" marker. Only the first complete JSON
    object is returned; anything the model emits after it is ignored.

    Args:
        primary_name (str): Input name to convert.
        max_retries (int): Attempts to make before giving up. An attempt fails either when
            tokenization/generation raises or when no JSON object can be parsed.

    Returns:
        dict: The parsed JSON object, or ``{"error": str}`` if every attempt failed.
    """
    # Simple user prompt without system instructions
    user_prompt = f"""Generate authentic name variants across Latin, Arabic, and Cyrillic scripts:
    Name: {primary_name}
    """

    # Structured prompt with JSON priming but no system prompt
    prompt = f"""<|user|>
{user_prompt}</s>
<|assistant|>
{{"""  # Intentional opening brace to force JSON

    assistant_marker = "<|assistant|>"
    decoder = json.JSONDecoder()

    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1

        try:
            inputs = tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to("cuda")

            outputs = model.generate(
                **inputs,
                max_new_tokens=500,
                temperature=0.1,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )

            raw_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
        except Exception as e:
            if is_last_attempt:
                return {"error": f"Failed after {max_retries} attempts: {str(e)}"}
            continue

        # Start at the assistant turn so braces in the echoed prompt (e.g. inside the
        # name) cannot be mistaken for the answer. If the marker is missing from the
        # decoded text, fall back to the first "{" anywhere.
        search_from = max(raw_output.rfind(assistant_marker), 0)
        json_start = raw_output.find("{", search_from)

        if json_start != -1:
            try:
                # raw_decode stops at the end of the first complete JSON value. A greedy
                # regex up to the LAST "}" made json.loads fail with "Extra data" when the
                # model kept generating after the object.
                json_data, _ = decoder.raw_decode(raw_output, json_start)
                return json_data
            except json.JSONDecodeError:
                # Truncated or malformed object: treat as a failed attempt and resample.
                pass

        if is_last_attempt:
            return {"error": f"Could not extract valid JSON after {max_retries} attempts"}

    return {"error": "Unknown error occurred during generation"}

# Example usage: pretty-print the parsed result for a single name.
result = generate_variants("oussema guerriche")
print(json.dumps(result, ensure_ascii=False, indent=2))

def test_multiple_names():
    """Test the generator with multiple names and print formatted results."""
    test_names = [
        "محمد علي مسلماني",
        "أسامة قريش",
        "mohamed neji dridi",
        "نور مازني"
    ]

    print("Testing name generation:")
    for name in test_names:
        print(f"\nInput: {name}")
        result = generate_variants(name)
        print(json.dumps(result, ensure_ascii=False, indent=2))

# Run the batch test over the sample names (comment this call out to skip it).

test_multiple_names()

{
  "name": "oussama guerriche",
  "nationality": [
    "tunisian"
  ],
  "aliases": [],
  "dob": "",
  "dod": ""
}
Testing name generation:

Input: محمد علي مسلماني
{
  "name": "محمد علی مسلمی",
  "variants": [
    "Mohammad Ali Muslimani"
  ]
}

Input: أسامة قريش
{
  "name": "أسامة بن محمد بن عبد الله بن غيث القريشي",
  "nationality": [
    "سعودي"
  ],
  "aliases": [],
  "dob": "",
  "gender": ""
}

Input: mohamed neji dridi
{
  "error": "Failed after 3 attempts: Extra data: line 2 column 1 (char 110)"
}

Input: نور مازني
{
  "error": "Failed after 3 attempts: Extra data: line 2 column 1 (char 50)"
}


In [None]:
import os

# Upload the trained model and tokenizer to the Hugging Face Hub.
#
# SECURITY: never paste an access token into the notebook, not even in a comment — it is
# saved with the file and leaks to anyone who can read it. Any token that has ever been
# written into this notebook must be treated as compromised and revoked in the
# Hugging Face account settings.
#
# The token is read from the HF_TOKEN environment variable. When it is not set, None is
# passed and the Hub client falls back to the credentials stored by a previous login.
HF_REPO_ID = "OussemaGuerriche/multilingual-name-generator-collab-pro"
hf_token = os.environ.get("HF_TOKEN")

model.push_to_hub(
    repo_id=HF_REPO_ID,
    token=hf_token,
)
tokenizer.push_to_hub(
    repo_id=HF_REPO_ID,
    token=hf_token,
)

README.md:   0%|          | 0.00/583 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Saved model to https://huggingface.co/OussemaGuerriche/multilingual-name-generator-collab-pro


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]