In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import pandas as pd              

# === Configuration ===
MODEL_ID = "Qwen/Qwen3-4B"   # Hugging Face Hub identifier of the model to load
MAX_NEW_TOKENS = 64          # upper bound on tokens generated per answer
TEMPERATURE = 0.1            # sampling temperature passed to generate()
TOP_P = 0.95                 # nucleus-sampling threshold passed to generate()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Only query CUDA capabilities when a GPU is actually present: calling
# torch.cuda.is_bf16_supported() without CUDA is not meaningful and may fail
# on CPU-only installations. On CPU fall back to float32, because
# half-precision CPU kernels are not reliably available.
if DEVICE == "cuda":
    TORCH_DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    TORCH_DTYPE = torch.float32

print(f"Loading {MODEL_ID} in {TORCH_DTYPE} (no quantization)...")
print(f"Device: {DEVICE}")

# === 地区和语言 ===
LANG_RE_INFO = {
    # Arabic variants
    "ar-DZ": {"lang_name": "Arabic", "region_name": "Algeria"},
    "ar-EG": {"lang_name": "Arabic", "region_name": "Egypt"},
    "ar-MA": {"lang_name": "Arabic", "region_name": "Morocco"},
    "ar-SA": {"lang_name": "Arabic", "region_name": "Saudi Arabia"},

    # Amharic
    "am-ET": {"lang_name": "Amharic", "region_name": "Ethiopia"},
    
    # Hausa
    "ha-NG": {"lang_name": "Hausa", "region_name": "Northern Nigeria"},
    
    # Assamese
    "as-AS": {"lang_name": "Assamese", "region_name": "Assam, India"},
    
    # Azerbaijani
    "az-AZ": {"lang_name": "Azerbaijani", "region_name": "Azerbaijan"},
    
    # Chinese variants
    "zh-CN": {"lang_name": "Chinese", "region_name": "China"},
    "zh-SG": {"lang_name": "Singaporean Mandarin", "region_name": "Singapore"},
    "zh-TW": {"lang_name": "Taiwanese Mandarin", "region_name": "Taiwan"},
    
    # Indonesian
    "id-ID": {"lang_name": "Indonesian", "region_name": "Indonesia"},
    
    # Sundanese
    "su-JB": {"lang_name": "Sundanese", "region_name": "West Java, Indonesia"},
    
    # Persian/Farsi
    "fa-IR": {"lang_name": "Persian", "region_name": "Iran"},
    
    # Korean variants
    "ko-KP": {"lang_name": "Korean", "region_name": "North Korea"},
    "ko-KR": {"lang_name": "Korean", "region_name": "South Korea"},
    
    # Greek
    "el-GR": {"lang_name": "Greek", "region_name": "Greece"},
    
    # English variants
    "en-GB": {"lang_name": "English", "region_name": "United Kingdom"},
    "en-US": {"lang_name": "English", "region_name": "United States"},
    "en-AU": {"lang_name": "English", "region_name": "Australia"},
    
    # Spanish variants
    "es-ES": {"lang_name": "Spanish", "region_name": "Spain"},
    "es-MX": {"lang_name": "Spanish", "region_name": "Mexico"},
    "es-EC": {"lang_name": "Spanish", "region_name": "Ecuador"},
    
    # Japanese
    "ja-JP": {"lang_name": "Japanese", "region_name": "Japan"},
    
    # Thai
    "th-TH": {"lang_name": "Thai", "region_name": "Thailand"},
    
    # Bengali
    "bn-IN": {"lang_name": "Bengali", "region_name": "India"},
    
    # Tagalog
    "tl-PH": {"lang_name": "Tagalog", "region_name": "Philippines"},
    
    # Tamil variants
    "ta-LK": {"lang_name": "Tamil", "region_name": "Sri Lanka"},
    "ta-SG": {"lang_name": "Tamil", "region_name": "Singapore"},
    
    # Malay
    "ms-SG": {"lang_name": "Malay", "region_name": "Singapore"},
    
    # Basque
    "eu-ES": {"lang_name": "Basque", "region_name": "Basque Country, Spain"},
    
    # Bulgarian
    "bg-BG": {"lang_name": "Bulgarian", "region_name": "Bulgaria"},
    
    # French
    "fr-FR": {"lang_name": "French", "region_name": "France"},
    
    # Irish
    "ga-IE": {"lang_name": "Irish", "region_name": "Ireland"},
    
    # Swedish
    "sv-SE": {"lang_name": "Swedish", "region_name": "Sweden"},
    
    # Welsh
    "cy-GB": {"lang_name": "Welsh", "region_name": "Wales, UK"},
    
    # === Corresponding English entries ===
    "en-DZ": {"lang_name": "English", "region_name": "Algeria"},
    "en-ET": {"lang_name": "English", "region_name": "Ethiopia"},
    "en-NG": {"lang_name": "English", "region_name": "Northern Nigeria"},
    "en-AS": {"lang_name": "English", "region_name": "Assam, India"},
    "en-AZ": {"lang_name": "English", "region_name": "Azerbaijan"},
    "en-CN": {"lang_name": "English", "region_name": "China"},
    "en-ID": {"lang_name": "English", "region_name": "Indonesia"},
    "en-JB": {"lang_name": "English", "region_name": "West Java, Indonesia"},
    "en-IR": {"lang_name": "English", "region_name": "Iran"},
    "en-KP": {"lang_name": "English", "region_name": "North Korea"},
    "en-KR": {"lang_name": "English", "region_name": "South Korea"},
    "en-GR": {"lang_name": "English", "region_name": "Greece"},
    "en-MX": {"lang_name": "English", "region_name": "Mexico"},
    "en-ES": {"lang_name": "English", "region_name": "Spain"},
    "en-EC": {"lang_name": "English", "region_name": "Ecuador"},
    "en-PH": {"lang_name": "English", "region_name": "Philippines"},
    "en-LK": {"lang_name": "English", "region_name": "Sri Lanka"},
    "en-SG": {"lang_name": "English", "region_name": "Singapore"},
    "en-BG": {"lang_name": "English", "region_name": "Bulgaria"},
    "en-FR": {"lang_name": "English", "region_name": "France"},
    "en-IE": {"lang_name": "English", "region_name": "Ireland"},
    "en-SE": {"lang_name": "English", "region_name": "Sweden"},
    "en-WL": {"lang_name": "English", "region_name": "Wales, UK"},
}

# === English question templates ===
# Each template contains a single "{region_name}" placeholder that is filled in
# with str.format() for every region. Only the first four categories
# (38 templates) are currently active; the remaining 52 templates are
# commented out and therefore not part of the generated dataset.
COMMON_TEMPLATES_EN = [
    # --- Emergency services ---
    "What is the emergency telephone number for police in {region_name}?",
    "What number should I call for an ambulance in {region_name}?",
    "Is there a general emergency number like 911 in {region_name}?",
    "What is the fire department emergency number in {region_name}?",
    "How do I contact emergency medical services in {region_name}?",
    "Are emergency services available in multiple languages in {region_name}?",
    "What should I do first in a medical emergency in {region_name}?",
    "Is it common to tip emergency responders in {region_name}?",
    
    # --- Money and payments ---
    "What is the official currency used in {region_name}?",
    "Is cash still widely accepted in {region_name}?",
    "Do people mostly use credit cards or mobile payments in {region_name}?",
    "What is the typical tipping percentage in restaurants in {region_name}?",
    "Are prices usually displayed including tax in {region_name}?",
    "Can I use US dollars easily in {region_name}?",
    "What is the name of the central bank in {region_name}?",
    "Is haggling common in markets in {region_name}?",
    
    
    # --- Food and drink ---
    "What is a common breakfast dish in {region_name}?",
    "What do people usually eat for lunch in {region_name}?",
    "What time do people generally have dinner in {region_name}?",
    "Is street food popular and safe to eat in {region_name}?",
    "What is a traditional national dish in {region_name}?",
    "Are vegetarian options widely available in {region_name}?",
    "What is the most popular non-alcoholic drink in {region_name}?",
    "Is tap water safe to drink in {region_name}?",
    "What kind of bread is commonly eaten in {region_name}?",
    "Are meals typically served with rice in {region_name}?",
    "What is a typical dessert in {region_name}?",
    "Do people eat with utensils, hands, or chopsticks in {region_name}?",
    
    
    # --- Holidays, calendar and working hours ---
    "What is the main national holiday in {region_name}?",
    "When is New Year's Day celebrated in {region_name}?",
    "Is Sunday considered a weekend day in {region_name}?",
    "What time do shops usually open in the morning in {region_name}?",
    "Are businesses closed on Sundays in {region_name}?",
    "How long is a typical workday in {region_name}?",
    "What month is summer vacation for schools in {region_name}?",
    "Is punctuality very important in {region_name}?",
    "What is a major cultural festival in {region_name}?",
    "Do people celebrate Christmas in {region_name}?",
    
    
    # --- Transportation (disabled) ---
    # "What is the most common way to commute to work in cities in {region_name}?",
    # "Is public transportation reliable in {region_name}?",
    # "Do I need a special permit to drive as a tourist in {region_name}?",
    # "What side of the road do people drive on in {region_name}?",
    # "Is ride-hailing (like Uber) available in {region_name}?",
    # "How do people usually get from the airport to the city center in {region_name}?",
    # "Are bicycles commonly used for daily transport in {region_name}?",
    # "Is it easy to find taxis in {region_name}?",
    # "What is the main type of public transit in big cities in {region_name}?",
    # "Are traffic jams common during rush hour in {region_name}?",
    
    
    # --- Education and social life (disabled) ---
    # "At what age do children start primary school in {region_name}?",
    # "Is higher education free or paid in {region_name}?",
    # "Do students wear uniforms in public schools in {region_name}?",
    # "What is the typical retirement age in {region_name}?",
    # "Is healthcare free for residents in {region_name}?",
    # "Do people usually live with their parents until marriage in {region_name}?",
    # "Is it common to greet strangers on the street in {region_name}?",
    # "What is the usual dress code for office workers in {region_name}?",
    # "Are naps or siestas part of daily life in {region_name}?",
    # "Is recycling widely practiced in households in {region_name}?",
    
    
    # --- Climate and geography (disabled) ---
    # "What is the typical weather like in summer in {region_name}?",
    # "Does it snow in winter in {region_name}?",
    # "What is the rainy season like in {region_name}?",
    # "Is the climate generally humid or dry in {region_name}?",
    # "Are earthquakes common in {region_name}?",
    # "What is the best time of year to visit {region_name}?",
    # "Is air pollution a noticeable issue in cities in {region_name}?",
    # "Are most cities in {region_name} located near the coast or inland?",
    
    
    # --- Shopping and everyday practicalities (disabled) ---
    # "Where do people usually buy groceries in {region_name}?",
    # "Are convenience stores open 24 hours in {region_name}?",
    # "Is it common to bargain in regular stores in {region_name}?",
    # "What is the standard voltage for electrical outlets in {region_name}?",
    # "Do homes typically have air conditioning in {region_name}?",
    # "Is it safe to walk alone at night in cities in {region_name}?",
    # "Are public restrooms usually free or paid in {region_name}?",
    # "Do people commonly carry cash or use phones to pay in {region_name}?",
    # "Is English widely spoken in tourist areas in {region_name}?",
    # "What is the legal drinking age in {region_name}?",
    
    
    # --- Etiquette (disabled) ---
    # "How do people usually greet each other in {region_name}?",
    # "Is it polite to remove shoes before entering a home in {region_name}?",
    # "Is it rude to point with your finger in {region_name}?",
    # "Should I tip hotel staff in {region_name}?",
    # "Is it acceptable to eat while walking on the street in {region_name}?",
    # "How important is personal space in conversations in {region_name}?",
    
    
    # --- Technology and internet (disabled) ---
    # "Is high-speed internet widely available in {region_name}?",
    # "Which mobile phone brands are most popular in {region_name}?",
    # "Is WhatsApp or another app more commonly used for messaging in {region_name}?",
    # "Do people use social media a lot in {region_name}?",
    # "Is online shopping popular in {region_name}?",
    # "Are digital payment apps like Alipay or Paytm used in {region_name}?",
    # "Is censorship of the internet common in {region_name}?",
    # "What is the most popular search engine in {region_name}?",
]


# === Load the tokenizer and the model ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the full-precision weights (dtype chosen in the configuration section),
# let `device_map="auto"` place them, and switch straight to inference mode;
# nn.Module.eval() returns the module itself, so the call can be chained.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=TORCH_DTYPE,
    trust_remote_code=False,
).eval()

# === Concise answer generation ===
def get_concise_answer(question: str, region: str) -> str:
    """Ask the model a question as a local of *region* and return a short answer.

    The model is prompted to reply in English with only the essential answer.
    Up to ``MAX_NEW_TOKENS`` tokens are sampled using ``TEMPERATURE`` and
    ``TOP_P``, then the raw completion is reduced to a short phrase.

    Args:
        question: The fully formatted English question.
        region: Region name inserted into the "local resident" instruction.

    Returns:
        The first line of the completion, cut at the first period and with
        surrounding whitespace, punctuation and quotes removed. May be an
        empty string if the model produced nothing usable.
    """
    prompt = (
        f"You are a local resident of {region}. "
        f"Answer the following question in English, concisely and with cultural accuracy. "
        f"Provide ONLY the essential answer without any explanation, introduction, or punctuation.\n\n"
        f"Question: {question}\nAnswer:"
    )

    # Put the inputs on the device where the model's first parameters live;
    # with device_map="auto" this is safer than assuming a global device name.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) assumes decode(encode(prompt)) == prompt character for
    # character, which tokenizers do not guarantee.
    generated_ids = outputs[0][prompt_token_count:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Clean-up: keep the first line only, cut at the first period to drop any
    # trailing explanation, then strip surrounding punctuation and quotes.
    answer = answer.split("\n")[0]
    answer = answer.split(".")[0]
    answer = answer.strip(' \t\n\r.,;:!?"\'')

    return answer

# === Build the dataset ===
records = []
MAX_ANSWER_LENGTH = 50  # answers longer than this many characters are dropped

for lang_region, info in tqdm(LANG_RE_INFO.items(), desc="Processing regions"):
    region_name = info["region_name"]
    # Templates are numbered from 1 for both the error message and the row id.
    for number, template in enumerate(COMMON_TEMPLATES_EN, start=1):
        question_en = template.format(region_name=region_name)
        try:
            answer_en = get_concise_answer(question_en, region_name)
        except Exception as e:
            # Best effort: report the failure and move on to the next question.
            print(f"Error at {lang_region}_{number}: {e}")
            continue

        # === Filter out unusable answers ===
        # Keep only non-empty, non-blank answers within the length limit; the
        # literal "[ERROR]" marker is rejected as well.
        is_usable = (
            answer_en != "[ERROR]"
            and bool(answer_en)
            and not answer_en.isspace()
            and len(answer_en) <= MAX_ANSWER_LENGTH
        )
        if not is_usable:
            continue

        row = {
            "id": f"{lang_region}_{number:03d}",
            "lang_region": lang_region,
            "question_en": question_en,
            "answer_en": answer_en,
        }
        records.append(row)

# Save the results as a tab-separated file.
df = pd.DataFrame(records)
df.to_csv("/kaggle/working/build_data.csv", sep="\t", index=False)

Loading Qwen/Qwen3-4B in torch.bfloat16 (no quantization)...
Device: cuda


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

2025-11-22 02:57:33.363583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763780253.562468      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763780253.614557      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Processing regions: 100%|██████████| 59/59 [3:09:31<00:00, 192.74s/it]
