# 資料集載入(CSV)與上傳(Hugging Face Hub)

## 套件引入與數值初始化

In [1]:
import os
import sys

# Make the project's `src/` directory (resolved relative to the notebook's
# working directory) importable, so the later `tw_gsat` import resolves.
SRC_DIR = os.path.abspath("../src")

# Keep the cell idempotent: drop an existing entry before re-inserting so that
# re-running the cell never stacks duplicate entries on sys.path, while the
# directory still ends up first in the search order.
if SRC_DIR in sys.path:
    sys.path.remove(SRC_DIR)
sys.path.insert(0, SRC_DIR)

In [2]:
from datasets import load_dataset
from dotenv import load_dotenv
import os
# Load variables from a local `.env` file (if present) into the process
# environment before looking up the token.
load_dotenv()

# Hugging Face access token used for the upload at the end of the notebook;
# this is None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

from tw_gsat.dataset.format import gen_text_pre_format

  from .autonotebook import tqdm as notebook_tqdm


## 檔案載入

In [3]:
# Parse the CSV with the generic "csv" builder. No split is requested, so the
# result is a DatasetDict holding a single `train` split.
dataset = load_dataset(
    path="csv",
    data_files="../data/114/Chinese_General.csv",
)

## 資料集資訊顯示

### 整體概要資訊

In [4]:
# Bare expression so the notebook renders the DatasetDict repr:
# available splits, their column names, and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question_type', 'article', 'article_images', 'question', 'question_images', 'A', 'B', 'C', 'D', 'E', 'answer', 'answer_rate', 'grading_criteria'],
        num_rows: 36
    })
})

### 訓練資料集特徵與資料型態

In [5]:
# Show the train split's schema: each column name mapped to its value type.
dataset["train"].features

{'id': Value('int64'),
 'question_type': Value('string'),
 'article': Value('string'),
 'article_images': Value('string'),
 'question': Value('string'),
 'question_images': Value('string'),
 'A': Value('string'),
 'B': Value('string'),
 'C': Value('string'),
 'D': Value('string'),
 'E': Value('string'),
 'answer': Value('string'),
 'answer_rate': Value('float64'),
 'grading_criteria': Value('string')}

### 首個訓練資料資訊

In [6]:
# Inspect the first row of the train split; integer indexing returns the row
# as a dict of column name -> value (empty CSV cells come back as None).
dataset["train"][0]

{'id': 1,
 'question_type': 'single',
 'article': None,
 'article_images': None,
 'question': '下列「」內的字，讀音前後相同的是：',
 'question_images': None,
 'A': '「舁」出寶貨／吾生須「臾」',
 'B': '切而「啗」之／「諂」詞令色',
 'C': '「迤」邐而行／外「弛」內張',
 'D': '若分「畛」域／暴「殄」天物',
 'E': None,
 'answer': 'A',
 'answer_rate': 42.0,
 'grading_criteria': None}

## 資料格式化

In [7]:
# `dataset` starts out as the DatasetDict returned by load_dataset and is
# rebound here to the formatted train split. Guard the split lookup so that
# re-running this cell does not fail: on a second run `dataset` is already a
# Dataset, and `dataset["train"]` would be treated as a column lookup.
# (DatasetDict is a dict subclass; Dataset is not.)
train_split = dataset["train"] if isinstance(dataset, dict) else dataset

# Apply gen_text_pre_format to every row; the feature listing that follows
# shows this adds a `text_pre_format` string column.
# NOTE(review): a re-run maps over already-formatted rows — this assumes
# gen_text_pre_format simply recomputes/overwrites that column; confirm.
dataset = train_split.map(gen_text_pre_format)

### 顯示格式化後的特徵與資料型態

In [8]:
# Re-check the schema after formatting; compared with the earlier listing it
# gains the `text_pre_format` string column.
dataset.features

{'id': Value('int64'),
 'question_type': Value('string'),
 'article': Value('string'),
 'article_images': Value('string'),
 'question': Value('string'),
 'question_images': Value('string'),
 'A': Value('string'),
 'B': Value('string'),
 'C': Value('string'),
 'D': Value('string'),
 'E': Value('string'),
 'answer': Value('string'),
 'answer_rate': Value('float64'),
 'grading_criteria': Value('string'),
 'text_pre_format': Value('string')}

### 顯示格式化後的 `text_pre_format` 欄位資訊

In [9]:
# Use print() rather than a bare expression so the line breaks in the
# formatted text are rendered instead of being shown as escaped "\n" in a repr.
print(dataset[0]["text_pre_format"])

題目敘述:
下列「」內的字，讀音前後相同的是：
題目選項:
（A）「舁」出寶貨／吾生須「臾」
（B）切而「啗」之／「諂」詞令色
（C）「迤」邐而行／外「弛」內張
（D）若分「畛」域／暴「殄」天物
答案:
A


## 資料上傳

In [10]:
# Upload the formatted dataset to the Hugging Face Hub as a private dataset
# repository, authenticating with the token read from the environment.
# Left as the cell's last expression so the returned CommitInfo is displayed.
dataset.push_to_hub(
    repo_id="TsukiOwO/TW-GSAT-114-Chinese-General",
    private=True,
    token=HF_TOKEN,
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 239.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.42 shards/s]
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/TsukiOwO/TW-GSAT-114-Chinese-General/commit/7740ade0a555651bdc0b2efe239eb2ebd0a27cb3', commit_message='Upload dataset', commit_description='', oid='7740ade0a555651bdc0b2efe239eb2ebd0a27cb3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/TsukiOwO/TW-GSAT-114-Chinese-General', endpoint='https://huggingface.co', repo_type='dataset', repo_id='TsukiOwO/TW-GSAT-114-Chinese-General'), pr_revision=None, pr_num=None)