## Exercise: Persian Family Name Generator

Task:

    Build a small synthetic Persian (Iranian) names dataset generator.

This means:

    You are not scraping real people

    You are not trying to be perfectly exhaustive

    You are practicing controlled data generation, not cultural research

In [1]:
!pip install -q --upgrade bitsandbytes accelerate gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.0/23.0 MB[0m [31m111.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

## Version 1 - First Attempt

In [4]:
# Step 1 — Create a Tiny Seed Set (Human Work)
seed_names = [
    "Jamalzadeh",
    "Abedzadeh",
    "kazemzadeh",
    "Ghasemzadeh",
    "Akbarpour",
    "Ahmadpour",
    "Hosseinpour",
    "Mohammadnia",
    "Hoseeinnia",
    "sharifnia",
    "Fazlinia",
    "Ahmadi",
    "Jamali",
    "Hasani",
    "Mohammadi",
]

print(f"lenght of the seed_names list: {len(seed_names)}")

# Step 2 — System prompt
system_prompt = """
You are a helpful assistant that generates Persian last names based on given seed last names.
Your task is to create new last names that are ends with common Persian suffixes such as -zadeh, -pour, -nia, -i, -ani, -far, -mand, -nejad, etc.
The generated last names should be similar to the persian last names. Use Latin alphabet only and avoid
Avoid Arabic or Western names.
"""

# Step 3 — Call the LLM
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda")

# Step 4 — Generate New Last Names
def generate_last_names(seed_names, num_names=10):
    prompt = system_prompt + "\n\nSeed Last Names:\n" + "\n".join(seed_names) + "\n\nGenerated Last Names:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200, num_return_sequences=num_names, do_sample=True, top_p=0.9, temperature=0.3)
    results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [result.split("Generated Last Names:")[-1].strip() for result in results]

generated_names = generate_last_names(seed_names, num_names=5)
for i, names in enumerate(generated_names):
    print(f"Generated Set {i+1}:\n{names}\n")


lenght of the seed_names list: 15
Generated Set 1:
Jamalzadeh
Abedzadeh
kazemzadeh
Ghasemzadeh
Akbarpour
Ahmadpour
Hosseinpour
Mohammadnia
Hoseeinnia
sharifnia
Fazlinia
Ahmadi
Jamali
Hasani
Mohammadi


Seed Last Names:
Mohammadzadeh
Alizadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Mohammadzadeh
Moh

Generated Set 2:
Jamalzadeh
Abedzadeh
kazemzadeh
Ghasemzadeh
Akbarpour
Ahmadpour
Hosseinpour
Mohammadnia
Hoseeinnia
sharifnia
Fazlinia
Ahmadi
Jamali
Hasani
Mohammadi


Seed Last Names:
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadzadeh
Mohammadpour
Mohammadza

Generated Set 3:
Jamalzadeh
Abedzadeh
kazemzadeh
Ghasemzadeh
Akbarpour
Ahmadpour
Hosseinpour
Mohammadnia
Hoseeinnia
sharifnia
Fazlinia
Ahmadi
Jamali
Hasani
Mohammadi

Examples:
Jamalzadeh (Jamalzadeh)


## Version 2


    improved prompt
    modify temperature

In [None]:
# model is a chatmodel >> but you're speaking row

In [11]:
# Step 1 — Create a Tiny Seed Set (Human Work)
seed_names = [
    "Ahmadi",
    "Ahmadzadeh",
    "Ahmadnia",
    "Jamali",
    "Jamalzadeh",
    "Jamalnia",
    "kazemi",
    "kazemzadeh",
    "kazemnia",
    "hosseini",
    "hosseinzadeh",
    "hosseinnia",

]

seed_names = [name.title() for name in seed_names]


print(f"lenght of the seed_names list: {len(seed_names)}")



# Call the LLM
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda")

# Generate New Last Names
def generate_last_names(seed_names, num_names=10):


    prompt = (
    "You are a helpful assistant that generates Persian last names "
    "based on given seeds. Create new last names that end with common Persian suffixes such as "
    "-zadeh, -pour, -nia, -i.\n\n"
    "Seed last names:\n" + "\n".join(seed_names) +
    "\n\nGenerate 10 new Persian last names, one per line, using only Latin alphabet:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)


    outputs = model.generate(**inputs,
                             max_new_tokens=200,
                             do_sample=True,
                             top_p=0.9,
                             temperature=0.7)

    results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [result.split("Generated Last Names:")[-1].strip() for result in results]


generated_names = generate_last_names(seed_names, num_names=5)
for i, names in enumerate(generated_names):
    print(f"Generated Set {i+1}:\n{names}\n")


lenght of the seed_names list: 12
Generated Set 1:
You are a helpful assistant that generates Persian last names based on given seeds. Create new last names that end with common Persian suffixes such as -zadeh, -pour, -nia, -i.

Seed last names:
Ahmadi
Ahmadzadeh
Ahmadnia
Jamali
Jamalzadeh
Jamalnia
Kazemi
Kazemzadeh
Kazemnia
Hosseini
Hosseinzadeh
Hosseinnia

Generate 10 new Persian last names, one per line, using only Latin alphabet:

Ahmadi
Ahmadzadeh
Ahmadnia
Jamali
Jamalzadeh
Jamalnia
Kazemi
Kazemzadeh
Kazemnia
Hosseini
Hosseinnia

