In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import pipeline
from tqdm import tqdm
import pandas as pd

# === Configuration ===
# Hugging Face model id, used for both the fill-mask model and its tokenizer.
MODEL_ID = "FacebookAI/xlm-roberta-large"
# NOTE(review): TOP_K is not referenced in the visible code;
# get_fillmask_answer passes top_k=1 directly — confirm whether it is still needed.
TOP_K = 5

# Create the fill-mask pipeline (module-level instance called by get_fillmask_answer).
pipe = pipeline("fill-mask", model=MODEL_ID, tokenizer=MODEL_ID)

# === Regions and languages ===
# Maps a "<language>-<region>" code to a human-readable language name and a
# region name. The generation loop in this file reads only "region_name" (to
# fill the templates); the key itself is stored as "lang_region" and is used
# as the prefix of each record id.
# NOTE(review): some suffixes (e.g. "AS", "JB", "WL") look like
# project-specific region tags rather than ISO country codes — confirm.
LANG_RE_INFO = {
    # Arabic variants
    "ar-DZ": {"lang_name": "Arabic", "region_name": "Algeria"},
    "ar-EG": {"lang_name": "Arabic", "region_name": "Egypt"},
    "ar-MA": {"lang_name": "Arabic", "region_name": "Morocco"},
    "ar-SA": {"lang_name": "Arabic", "region_name": "Saudi Arabia"},

    # Amharic
    "am-ET": {"lang_name": "Amharic", "region_name": "Ethiopia"},
    
    # Hausa
    "ha-NG": {"lang_name": "Hausa", "region_name": "Northern Nigeria"},
    
    # Assamese
    "as-AS": {"lang_name": "Assamese", "region_name": "Assam, India"},
    
    # Azerbaijani
    "az-AZ": {"lang_name": "Azerbaijani", "region_name": "Azerbaijan"},
    
    # Chinese variants
    "zh-CN": {"lang_name": "Chinese", "region_name": "China"},
    "zh-SG": {"lang_name": "Singaporean Mandarin", "region_name": "Singapore"},
    "zh-TW": {"lang_name": "Taiwanese Mandarin", "region_name": "Taiwan"},
    
    # Indonesian
    "id-ID": {"lang_name": "Indonesian", "region_name": "Indonesia"},
    
    # Sundanese
    "su-JB": {"lang_name": "Sundanese", "region_name": "West Java, Indonesia"},
    
    # Persian/Farsi
    "fa-IR": {"lang_name": "Persian", "region_name": "Iran"},
    
    # Korean variants
    "ko-KP": {"lang_name": "Korean", "region_name": "North Korea"},
    "ko-KR": {"lang_name": "Korean", "region_name": "South Korea"},
    
    # Greek
    "el-GR": {"lang_name": "Greek", "region_name": "Greece"},
    
    # English variants
    "en-GB": {"lang_name": "English", "region_name": "United Kingdom"},
    "en-US": {"lang_name": "English", "region_name": "United States"},
    "en-AU": {"lang_name": "English", "region_name": "Australia"},
    
    # Spanish variants
    "es-ES": {"lang_name": "Spanish", "region_name": "Spain"},
    "es-MX": {"lang_name": "Spanish", "region_name": "Mexico"},
    "es-EC": {"lang_name": "Spanish", "region_name": "Ecuador"},
    
    # Japanese
    "ja-JP": {"lang_name": "Japanese", "region_name": "Japan"},
    
    # Thai
    "th-TH": {"lang_name": "Thai", "region_name": "Thailand"},
    
    # Bengali
    "bn-IN": {"lang_name": "Bengali", "region_name": "India"},
    
    # Tagalog
    "tl-PH": {"lang_name": "Tagalog", "region_name": "Philippines"},
    
    # Tamil variants
    "ta-LK": {"lang_name": "Tamil", "region_name": "Sri Lanka"},
    "ta-SG": {"lang_name": "Tamil", "region_name": "Singapore"},
    
    # Malay
    "ms-SG": {"lang_name": "Malay", "region_name": "Singapore"},
    
    # Basque
    "eu-ES": {"lang_name": "Basque", "region_name": "Basque Country, Spain"},
    
    # Bulgarian
    "bg-BG": {"lang_name": "Bulgarian", "region_name": "Bulgaria"},
    
    # French
    "fr-FR": {"lang_name": "French", "region_name": "France"},
    
    # Irish
    "ga-IE": {"lang_name": "Irish", "region_name": "Ireland"},
    
    # Swedish
    "sv-SE": {"lang_name": "Swedish", "region_name": "Sweden"},
    
    # Welsh
    "cy-GB": {"lang_name": "Welsh", "region_name": "Wales, UK"},
    
    # === Corresponding English entries ===
    # English paired with regions that appear above under another language.
    # NOTE(review): not every region above has an English counterpart here
    # (e.g. Egypt, Japan, Thailand) — confirm whether that is intentional.
    "en-DZ": {"lang_name": "English", "region_name": "Algeria"},
    "en-ET": {"lang_name": "English", "region_name": "Ethiopia"},
    "en-NG": {"lang_name": "English", "region_name": "Northern Nigeria"},
    "en-AS": {"lang_name": "English", "region_name": "Assam, India"},
    "en-AZ": {"lang_name": "English", "region_name": "Azerbaijan"},
    "en-CN": {"lang_name": "English", "region_name": "China"},
    "en-ID": {"lang_name": "English", "region_name": "Indonesia"},
    "en-JB": {"lang_name": "English", "region_name": "West Java, Indonesia"},
    "en-IR": {"lang_name": "English", "region_name": "Iran"},
    "en-KP": {"lang_name": "English", "region_name": "North Korea"},
    "en-KR": {"lang_name": "English", "region_name": "South Korea"},
    "en-GR": {"lang_name": "English", "region_name": "Greece"},
    "en-MX": {"lang_name": "English", "region_name": "Mexico"},
    "en-ES": {"lang_name": "English", "region_name": "Spain"},
    "en-EC": {"lang_name": "English", "region_name": "Ecuador"},
    "en-PH": {"lang_name": "English", "region_name": "Philippines"},
    "en-LK": {"lang_name": "English", "region_name": "Sri Lanka"},
    "en-SG": {"lang_name": "English", "region_name": "Singapore"},
    "en-BG": {"lang_name": "English", "region_name": "Bulgaria"},
    "en-FR": {"lang_name": "English", "region_name": "France"},
    "en-IE": {"lang_name": "English", "region_name": "Ireland"},
    "en-SE": {"lang_name": "English", "region_name": "Sweden"},
    "en-WL": {"lang_name": "English", "region_name": "Wales, UK"},
}


# === Fill-mask (cloze) templates ===
# Each template contains one "{region_name}" placeholder and one "<mask>"
# token to be predicted by the fill-mask pipeline.
# The generation loop pairs these with COMMON_TEMPLATES_EN by list position,
# so the order of entries here must match the order of the questions there.
FILLMASK_TEMPLATES = [
    # === Emergency Numbers ===
    "The emergency telephone number for police in {region_name} is <mask>.",
    "The number to call for an ambulance in {region_name} is <mask>.",
    "The general emergency number in {region_name} is <mask>.",
    "The fire department emergency number in {region_name} is <mask>.",
    "Emergency medical services in {region_name} can be contacted at <mask>.",
    
    # === Currency & Payments ===
    "The official currency used in {region_name} is <mask>.",
    "People in {region_name} mostly use <mask> for payments.",
    "The typical tipping percentage in restaurants in {region_name} is <mask>.",
    "Prices in {region_name} are usually displayed <mask> tax.",
    "The name of the central bank in {region_name} is <mask>.",
    
    # === Food & Dining ===
    "A common breakfast dish in {region_name} is <mask>.",
    "People in {region_name} usually eat <mask> for lunch.",
    "People in {region_name} generally have dinner at <mask>.",
    "A traditional national dish in {region_name} is <mask>.",
    "The most popular non-alcoholic drink in {region_name} is <mask>.",
    "The kind of bread commonly eaten in {region_name} is <mask>.",
    "Meals in {region_name} are typically served with <mask>.",
    "A typical dessert in {region_name} is <mask>.",
    "People in {region_name} eat with <mask>.",
    
    # === Holidays & Daily Life ===
    "The main national holiday in {region_name} is <mask>.",
    "New Year's Day is celebrated on <mask> in {region_name}.",
    "Shops in {region_name} usually open at <mask> in the morning.",
    "A typical workday in {region_name} is <mask> hours long.",
    "Summer vacation for schools in {region_name} is in <mask>.",
    "A major cultural festival in {region_name} is <mask>.",
    
    # === Transportation (uncommented ones only if factual) ===
    "People in cities in {region_name} most commonly commute to work by <mask>.",
    "The side of the road people drive on in {region_name} is the <mask>.",
    "People usually get from the airport to the city center in {region_name} by <mask>.",
    "The main type of public transit in big cities in {region_name} is <mask>.",
    
    # === Education & Society ===
    "Children in {region_name} start primary school at the age of <mask>.",
    "The typical retirement age in {region_name} is <mask>.",
    "Students in public schools in {region_name} wear <mask> as uniforms.",
    
    # === Geography & Climate ===
    "The typical summer weather in {region_name} is <mask>.",
    "The rainy season in {region_name} is <mask>.",
    "The climate in {region_name} is generally <mask>.",
    "The best time of year to visit {region_name} is <mask>.",
    "Most cities in {region_name} are located <mask>.",
    
    # === Shopping & Infrastructure ===
    "People in {region_name} usually buy groceries at <mask>.",
    "The standard voltage for electrical outlets in {region_name} is <mask>.",
    "Public restrooms in {region_name} are usually <mask>.",
    "People in {region_name} commonly pay with <mask>.",
    "The legal drinking age in {region_name} is <mask>.",
    
    # === Social Customs ===
    "People in {region_name} usually greet each other by <mask>.",
    "It is polite to <mask> before entering a home in {region_name}.",
    
    # === Technology ===
    "The most popular mobile phone brand in {region_name} is <mask>.",
    "The most commonly used messaging app in {region_name} is <mask>.",
    "The most popular search engine in {region_name} is <mask>.",
]

# English question templates, one per entry of FILLMASK_TEMPLATES.
# The generation loop pairs the two lists by position: the question at index i
# becomes the "question" field for the answer predicted from the fill-mask
# template at the same index, so both lists must keep the same order.
COMMON_TEMPLATES_EN = [
    # === Emergency Numbers ===
    "What is the emergency telephone number for police in {region_name}?",
    "What number should I call for an ambulance in {region_name}?",
    "What is the general emergency number in {region_name}?",
    "What is the fire department emergency number in {region_name}?",
    "How can I contact emergency medical services in {region_name}?",

    # === Currency & Payments ===
    "What is the official currency used in {region_name}?",
    "What do people in {region_name} mostly use for payments?",
    "What is the typical tipping percentage in restaurants in {region_name}?",
    "Are prices in {region_name} usually displayed including or excluding tax?",
    "What is the name of the central bank in {region_name}?",

    # === Food & Dining ===
    "What is a common breakfast dish in {region_name}?",
    "What do people in {region_name} usually eat for lunch?",
    "What time do people in {region_name} generally have dinner?",
    "What is a traditional national dish in {region_name}?",
    "What is the most popular non-alcoholic drink in {region_name}?",
    "What kind of bread is commonly eaten in {region_name}?",
    "What are meals in {region_name} typically served with?",
    "What is a typical dessert in {region_name}?",
    "What do people in {region_name} eat with?",

    # === Holidays & Daily Life ===
    "What is the main national holiday in {region_name}?",
    "When is New Year's Day celebrated in {region_name}?",
    "What time do shops in {region_name} usually open in the morning?",
    "How many hours long is a typical workday in {region_name}?",
    "When is summer vacation for schools in {region_name}?",
    "What is a major cultural festival in {region_name}?",

    # === Transportation ===
    "What do people in cities in {region_name} most commonly commute to work by?",
    "Which side of the road do people drive on in {region_name}?",
    "How do people usually get from the airport to the city center in {region_name}?",
    "What is the main type of public transit in big cities in {region_name}?",

    # === Education & Society ===
    "At what age do children in {region_name} start primary school?",
    "What is the typical retirement age in {region_name}?",
    "What do students in public schools in {region_name} wear as uniforms?",

    # === Geography & Climate ===
    "What is the typical summer weather like in {region_name}?",
    "When is the rainy season in {region_name}?",
    "What is the climate in {region_name} generally like?",
    "What is the best time of year to visit {region_name}?",
    "Where are most cities in {region_name} located?",

    # === Shopping & Infrastructure ===
    "Where do people in {region_name} usually buy groceries?",
    "What is the standard voltage for electrical outlets in {region_name}?",
    "What are public restrooms in {region_name} usually like?",
    "What do people in {region_name} commonly pay with?",
    "What is the legal drinking age in {region_name}?",

    # === Social Customs ===
    "How do people in {region_name} usually greet each other?",
    "What is it polite to do before entering a home in {region_name}?",

    # === Technology ===
    "What is the most popular mobile phone brand in {region_name}?",
    "What is the most commonly used messaging app in {region_name}?",
    "What is the most popular search engine in {region_name}?",
]

# === Answer-cleaning helper ===
def is_valid_answer(ans: str) -> bool:
    """Decide whether a predicted token is usable as an answer.

    An answer is rejected when it is empty, equals the ``"[ERROR]"`` sentinel,
    is longer than 40 characters, consists solely of spaces, punctuation or
    brackets, is a bare function word (article, pronoun, ...), or contains an
    angle bracket, brace, square bracket, pipe or backslash.

    Args:
        ans: Candidate answer text.

    Returns:
        True if the answer passes every check, False otherwise.
    """
    if not ans:
        return False
    if ans == "[ERROR]" or len(ans) > 40:
        return False

    # Text that remains once surrounding spaces/punctuation/brackets are removed.
    core = ans.strip(" .,!?;:\"'()[]{}")

    # Words that carry no information when they are the whole answer.
    filler_words = frozenset((
        "the", "a", "an", "called", "known", "as", "it", "they", "them",
        "their", "this", "that", "these", "those", "one", "some", "any",
        "such", "like", "very",
    ))
    # Markup-like characters are checked against the raw (unstripped) answer.
    forbidden_chars = frozenset("<>{}[]|\\")

    return (
        bool(core)
        and core.lower() not in filler_words
        and forbidden_chars.isdisjoint(ans)
    )

# === Get the fill-mask answer ===
def get_fillmask_answer(template: str, region_name: str) -> str:
    """Fill the template's mask for one region and return the top prediction.

    Args:
        template: Sentence template with a ``{region_name}`` placeholder; it
            is expected to contain the ``<mask>`` token the pipeline predicts.
        region_name: Region name substituted into the template.

    Returns:
        The ``token_str`` of the first prediction with surrounding whitespace
        removed, or the sentinel ``"[ERROR]"`` if the prediction step fails.
    """
    sentence = template.format(region_name=region_name)
    try:
        # Only the single best candidate is requested and used.
        preds = pipe(sentence, top_k=1)
        return preds[0]["token_str"].strip()
    except Exception as exc:
        # Best-effort: one failing prompt must not abort the whole run, but
        # the cause is logged so systematic failures are not silently hidden.
        import logging

        logging.getLogger(__name__).warning(
            "fill-mask prediction failed for %r: %r", sentence, exc
        )
        return "[ERROR]"

# === Generate SFT data ===
# Question templates and fill-mask templates are paired by position, so a
# length mismatch would mis-pair them or crash part-way through the run.
# Fail fast before any model call is made.
if len(FILLMASK_TEMPLATES) != len(COMMON_TEMPLATES_EN):
    raise ValueError(
        "FILLMASK_TEMPLATES and COMMON_TEMPLATES_EN must have the same length: "
        f"{len(FILLMASK_TEMPLATES)} != {len(COMMON_TEMPLATES_EN)}"
    )

sft_records = []

for lang_region, info in tqdm(LANG_RE_INFO.items(), desc="Generating SFT data"):
    region_name = info["region_name"]
    template_pairs = zip(COMMON_TEMPLATES_EN, FILLMASK_TEMPLATES)
    for i, (question_template, mask_template) in enumerate(template_pairs):
        instruction = question_template.format(region_name=region_name)
        answer = get_fillmask_answer(mask_template, region_name)

        # Skip predictions that are errors, filler words or markup-like.
        if not is_valid_answer(answer):
            continue

        clean_answer = answer.strip(" .,!?;:\"'")
        sft_records.append({
            # 1-based, zero-padded template index, e.g. "en-US_001".
            "id": f"{lang_region}_{i + 1:03d}",
            "lang_region": lang_region,
            "question": instruction,
            "output": clean_answer,
        })

# === Save results ===
# Explicit columns keep the header row even when no record passed validation.
df = pd.DataFrame(
    sft_records,
    columns=["id", "lang_region", "question", "output"],
)
# Tab-separated despite the .csv extension.
df.to_csv("/kaggle/working/sft_data.csv", index=False, sep='\t')

2025-11-29 06:58:38.733403: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764399518.963912      21 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764399519.020119      21 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at FacebookAI/xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Generating SFT data:   0%|          | 0/59 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating SFT data: 100%|██████████| 59/59 [01:17<00:00,  1.32s/it]
