## __Hugging Faceのdatasetsライブラリについて__

公式: https://huggingface.co/docs/datasets/loading

参考記事: https://note.com/npaka/n/n17ecbd890cd6


In [1]:
!pip install datasets | tail -n 1

Successfully installed datasets-2.8.0 huggingface-hub-0.11.1 multiprocess-0.70.14 responses-0.18.0 urllib3-1.25.11 xxhash-3.1.0


In [33]:
# インポート
from datasets import list_datasets, load_dataset, Dataset, DatasetDict

### __list_datasetsについて__


In [3]:
# List the names of all available datasets

list_datasets()

['acronym_identification',
 'ade_corpus_v2',
 'adversarial_qa',
 'aeslc',
 'afrikaans_ner_corpus',
 'ag_news',
 'ai2_arc',
 'air_dialogue',
 'ajgt_twitter_ar',
 'allegro_reviews',
 'allocine',
 'alt',
 'amazon_polarity',
 'amazon_reviews_multi',
 'amazon_us_reviews',
 'ambig_qa',
 'americas_nli',
 'ami',
 'amttl',
 'anli',
 'app_reviews',
 'aqua_rat',
 'aquamuse',
 'ar_cov19',
 'ar_res_reviews',
 'ar_sarcasm',
 'arabic_billion_words',
 'arabic_pos_dialect',
 'arabic_speech_corpus',
 'arcd',
 'arsentd_lev',
 'art',
 'arxiv_dataset',
 'ascent_kb',
 'aslg_pc12',
 'asnq',
 'asset',
 'assin',
 'assin2',
 'atomic',
 'autshumato',
 'babi_qa',
 'banking77',
 'bbaw_egyptian',
 'bbc_hindi_nli',
 'bc2gm_corpus',
 'beans',
 'best2009',
 'bianet',
 'bible_para',
 'big_patent',
 'billsum',
 'bing_coronavirus_query_set',
 'biomrc',
 'biosses',
 'blbooks',
 'blbooksgenre',
 'blended_skill_talk',
 'blimp',
 'blog_authorship_corpus',
 'bn_hate_speech',
 'bnl_newspapers',
 'bookcorpus',
 'bookcorpusopen'

In [4]:
# Number of available datasets

len(list_datasets())

16391

### __load_datasetについて__

- データセットの読み込み

- 引数 `split` を指定した場合は返り値はDataset

- 引数 `split` を指定しない場合は返り値はDatasetDict（splitが1つだけでもDatasetDictになる）

- 2回目以降の読み込みはキャッシュから読み込まれる

- 引数 `split` の使い方

    - `split="train"`: 訓練データだけ読み込む

    - `split="train[:10%]"`: 訓練データの最初の10%だけを読み込む

    - `split="train[:100]"`: 訓練データの最初の100件だけ読み込む

    - `split="train[:100]+validation[:100]"`: 訓練データの最初の100件と検証データの最初の100件を1つのDatasetにして読み込む

In [5]:
# Load the datasets (no split given, so each call returns a DatasetDict)

squad_dataset = load_dataset("squad") # SQuAD dataset
mrpc_dataset = load_dataset("glue", "mrpc") # MRPC dataset from GLUE

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Show the splits, columns and row counts of the SQuAD DatasetDict
squad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [7]:
# Show the splits, columns and row counts of the MRPC DatasetDict
mrpc_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [8]:
# Training split

mrpc_dataset["train"]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [9]:
# Take a single example from the training split
# Each example is returned as a dict

mrpc_dataset["train"][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [10]:
# Per-column feature information (type of each column)
mrpc_dataset["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

### __CSVからDatasetを作成__

- load_dataset関数を使用する

- 第1引数に`"csv"`を指定する

- `data_files`にファイルパスを指定する方法が3つある

    1. str: 1つのファイルを学習データとして読み込む

    2. List[str]: 複数のファイルを学習データとして読み込む

    3. Dict[str, List[str]]: 学習・検証・テストデータを指定して読み込む

In [11]:
# Option 1: a single file path (str)
load_dataset("csv", data_files="sample_data/california_housing_train.csv")



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dc2a2fee0a557be1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dc2a2fee0a557be1/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value'],
        num_rows: 17000
    })
})

In [12]:
# Option 2: a list of file paths; the files end up together in one "train" split
load_dataset("csv", data_files=[
    "sample_data/california_housing_train.csv",
    "sample_data/california_housing_test.csv"
    ])



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-aef37016d98a9723/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-aef37016d98a9723/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value'],
        num_rows: 20000
    })
})

In [13]:
# Option 3: a dict mapping each split name to its list of files
load_dataset("csv", data_files={
    "train": ["sample_data/california_housing_train.csv"],
    "test": ["sample_data/california_housing_test.csv"]
    })



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dbd099597c032a4b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dbd099597c032a4b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value'],
        num_rows: 17000
    })
    test: Dataset({
        features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value'],
        num_rows: 3000
    })
})

### __JSONからDatasetを作成__

- CSVからDatasetを作成する方法とほとんど同じ

- 変更点は第1引数に`"json"`を指定するだけ


In [14]:
# Load a JSON file: "json" is passed as the first argument instead of "csv"
load_dataset("json", data_files="sample_data/anscombe.json")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-b4b9f6521c2eeacb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-b4b9f6521c2eeacb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Series', 'X', 'Y'],
        num_rows: 44
    })
})

### __辞書からDatasetを作成__

- Dataset.from_dict()を使用する

In [15]:
# Column-oriented mapping: each key is a column name,
# each list holds that column's values (one entry per row).
d = dict(
    id=[0, 1, 2],
    name=["A", "B", "C"],
    age=[10, 20, 30],
)

Dataset.from_dict(d)

Dataset({
    features: ['id', 'name', 'age'],
    num_rows: 3
})

### __DataFrameからDatasetを作成__

- Dataset.from_pandas()を使用する

In [16]:
import pandas as pd

# Read the CSV into a pandas DataFrame and display it
df = pd.read_csv("sample_data/california_housing_train.csv")
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [17]:
# Convert the DataFrame into a Dataset
Dataset.from_pandas(df)

Dataset({
    features: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value'],
    num_rows: 17000
})

### __Datasetの操作__

- `sort()`: 昇順にソート

- `shuffle()`: シャッフル

- `select()`: インデックスを指定してデータを取得

- `filter()`: 条件検索

- `train_test_split()`: 学習データとテストデータに分割

- `map()`: 各データに関数を適用する

- `remove_columns()`: カラムの削除

公式: https://huggingface.co/docs/datasets/process


In [18]:
# Dataset used in the examples below: the MRPC training split
dataset = mrpc_dataset["train"]
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})

In [19]:
# Before sorting
print(dataset["label"][:10])

# Sort by the "label" column
result = dataset.sort("label")
result["label"][:10]

[1, 0, 1, 0, 1, 1, 0, 1, 0, 0]


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [20]:
# Before shuffling:
# check the first 10 values of "idx"
print(dataset["idx"][:10])

# Shuffle with a fixed seed so the resulting order is the same on every run
result = dataset.shuffle(seed=42)
result["idx"][:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 10]


[3977, 2516, 3348, 3711, 3612, 3964, 2135, 2339, 1064, 2240]

In [21]:
# Select rows by passing a list of indices
dataset.select([10, 20, 30])

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3
})

In [22]:
# Filter by a condition: keep rows whose sentence1 is exactly 40 characters long
result = dataset.filter(lambda x: len(x["sentence1"])==40)

# Check the filtered result
print(result[0]["sentence1"])
len(result[0]["sentence1"])

  0%|          | 0/4 [00:00<?, ?ba/s]

Moore had no immediate comment Tuesday .


40

In [23]:
# Filter on the row index: keep only the rows at even positions
result = dataset.filter(
    lambda _example, index: index % 2 == 0,
    with_indices=True,
)

# Compare the filtered size with half of the original size
(len(result), len(dataset) / 2)

  0%|          | 0/4 [00:00<?, ?ba/s]

(1834, 1834.0)

In [29]:
# Apply a function to every example

# Function to apply
def add_prefix(x):
    """Prepend "sentence1: " to the ``sentence1`` field of one example.

    The mapping ``x`` is modified in place and the same object is returned.
    """
    prefixed = "sentence1: " + x["sentence1"]
    x["sentence1"] = prefixed
    return x

# Run add_prefix over the dataset with map()
result = dataset.map(add_prefix)

# Inspect the first example to confirm the prefix was added
result[0]["sentence1"]



'sentence1: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .'

In [31]:
# Remove a column

result = dataset.remove_columns("label")
result

Dataset({
    features: ['sentence1', 'sentence2', 'idx'],
    num_rows: 3668
})

### __DatasetDictの作成__

In [36]:
# Pass a dict that maps split names to Dataset objects.
# Keys other than "train", "validation" and "test" are allowed as well.

DatasetDict({split_name: dataset for split_name in ("train", "validation", "test")})

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
})