In [None]:
from datasets import load_dataset
import re
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment, then
# read the Hugging Face access token (None when HF_TOKEN is not set).
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")

In [None]:
# Load the local CSV file with the generic "csv" builder. The result is
# accessed through its "train" split in the cells below.
dataset = load_dataset("csv", data_files="114/Chinese-General.csv")

In [None]:
# Notebook inspection: display the loaded dataset object.
dataset

In [None]:
# Notebook inspection: display the first row of the "train" split.
dataset["train"][0]

In [None]:
# Notebook inspection: display the column schema of the "train" split.
dataset["train"].features

In [None]:
# Work on the "train" split from here on.
ds = dataset["train"]

# 2) Pattern that detects an http:// or https:// URL anywhere in a field.
url_pat = re.compile(r"https?://")

# Question-type codes mapped to their display labels. Not referenced by
# gen_text_pre_format at the moment (the question-type line is disabled).
type_map = {
    "single": "單選題",
    "multiple": "多選題",
    "filling": "填充題",
}


def gen_text_pre_format(example):
    """Build the ``text_pre_format`` column for one dataset row.

    A row is skipped (its text is the empty string) when any of
    ``article_images``, ``question_images`` or ``answer`` contains a URL.
    Otherwise the text is a newline-joined sequence of label/content lines:
    the article (only when ``article_title`` is truthy), the question, the
    non-empty options A-E, the answer, and the grading criteria (only when
    present).

    Cell values are not assumed to be strings: CSV type inference can turn a
    column into numbers, so values are stringified before the URL check and
    a numeric ``0`` option is kept rather than treated as missing.

    Args:
        example: Mapping of column name to cell value for a single row.

    Returns:
        A dict with the single key ``"text_pre_format"``.
    """
    # Skip rows that reference a URL. "" (rather than None) is returned so
    # the skipped rows are easy to spot in the output column.
    for field in ("article_images", "question_images", "answer"):
        value = example.get(field)
        # str() guards against numeric cells, which re cannot search.
        if value is not None and url_pat.search(str(value)):
            return {"text_pre_format": ""}

    # No URL found: assemble the template line by line.
    parts = []

    def add(label, content):
        # Append the label followed by the content; None becomes "".
        parts.append(label)
        parts.append(str(content) if content is not None else "")

    # NOTE(review): the article is gated on article_title, not on article
    # itself -- confirm that the two columns are always filled together.
    if example.get("article_title"):
        add("前述文章:", example.get("article"))
    # Disabled: question-type line.
    # add("# 題目類型:", type_map.get(example.get("question_type"), "未知類型"))
    add("題目敘述:", example.get("question"))

    # Option block for single/multiple-choice questions. Only None and ""
    # count as "no option", so a numeric 0 option is preserved.
    opts = []
    for opt in ("A", "B", "C", "D", "E"):
        val = example.get(opt)
        if val is not None and val != "":
            opts.append(f"（{opt}）{val}")
    if opts:
        parts.append("題目選項:")
        parts.extend(opts)

    add("答案:", example.get("answer"))

    # Disabled: answer-rate line.
    # if example.get("answer_rate"):
    #     add("答題率:", str(example.get("answer_rate"))+ "%")

    # Per the original note, only fill-in questions carry grading_criteria.
    # NOTE(review): this label has no trailing colon, unlike the others;
    # left unchanged so the emitted text stays the same.
    if example.get("grading_criteria"):
        add("評分標準", example.get("grading_criteria"))

    return {"text_pre_format": "\n".join(parts)}

# 3) Apply the formatter to every row.
# NOTE(review): rows containing URLs are not removed; they stay in the
# dataset with an empty "text_pre_format" -- confirm this is intended.
ds = ds.map(gen_text_pre_format)  # batched=False is the default

# Sanity check of the result.
print(ds.features)          # now includes the new text_pre_format column
print(ds[:5]["text_pre_format"])  # look at the output of the first 5 rows

In [None]:
# Notebook inspection: display the schema after the map step.
ds.features

In [None]:
# Print every generated text in dataset order. Iterating the single column
# replaces the index loop and avoids fetching whole rows to read one field.
for text in ds["text_pre_format"]:
    print(text)

In [None]:
# Upload the processed split to the Hugging Face Hub as a private repository,
# authenticating with the token read from the environment.
ds.push_to_hub("TsukiOwO/TW-GSAT-114-Chinese-General", token=HF_TOKEN, private=True)

In [None]:
# Load the same repository back from the Hub (token passed for access) so
# the uploaded data can be inspected below.
dataset_online = load_dataset("TsukiOwO/TW-GSAT-114-Chinese-General", token=HF_TOKEN)

In [None]:
# Notebook inspection: display the schema of the reloaded "train" split.
dataset_online["train"].features

In [None]:
# Print the generated text of the first row of the reloaded "train" split.
print(dataset_online["train"][0]["text_pre_format"])