In [54]:
from datasets import get_dataset_config_names, load_dataset, DatasetDict
from collections import defaultdict, Counter
import pandas as pd

In [14]:
# Look into the dataset subsets (the config names exposed by "xtreme")

xtreme_subsets = get_dataset_config_names("xtreme")
# One print call, newline-separated: the subset count followed by the container type
print(f"Xtreme has {len(xtreme_subsets)} subsets", type(xtreme_subsets), sep="\n")


Xtreme has 183 subsets
<class 'list'>


In [15]:
# Keep only the subsets whose name starts with "PAN"
panx_subsets = list(filter(lambda name: name.startswith("PAN"), xtreme_subsets))
# One print call, newline-separated: the count followed by the subset names
print(f"Xtreme has {len(panx_subsets)} PAN subsets", panx_subsets, sep="\n")


Xtreme has 40 PAN subsets
['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de', 'PAN-X.el', 'PAN-X.en', 'PAN-X.es', 'PAN-X.et', 'PAN-X.eu', 'PAN-X.fa', 'PAN-X.fi', 'PAN-X.fr', 'PAN-X.he', 'PAN-X.hi', 'PAN-X.hu', 'PAN-X.id', 'PAN-X.it', 'PAN-X.ja', 'PAN-X.jv', 'PAN-X.ka', 'PAN-X.kk', 'PAN-X.ko', 'PAN-X.ml', 'PAN-X.mr', 'PAN-X.ms', 'PAN-X.my', 'PAN-X.nl', 'PAN-X.pt', 'PAN-X.ru', 'PAN-X.sw', 'PAN-X.ta', 'PAN-X.te', 'PAN-X.th', 'PAN-X.tl', 'PAN-X.tr', 'PAN-X.ur', 'PAN-X.vi', 'PAN-X.yo', 'PAN-X.zh']


In [17]:
# Sample the dataset according to the language distribution in Switzerland

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]

# language code -> DatasetDict holding the down-sampled splits
panx_dict = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the PAN-X subset of this language
    langi_dataset = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split, split_dataset in langi_dataset.items():
        # Shuffle with a fixed seed, then keep the first `frac` share of the rows
        n_keep = int(frac * split_dataset.num_rows)
        panx_dict[lang][split] = split_dataset.shuffle(seed=0).select(range(n_keep))


Downloading data:   0%|          | 0.00/1.18M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/590k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/588k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/837k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/423k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/932k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/459k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/464k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/942k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/472k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/472k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [35]:
# A refresher on what a single-language dataset looks like.
# NOTE: `langi_dataset` is the variable left over from the sampling loop, i.e. the
# subset that was loaded last (run that cell first).

print(langi_dataset)
print("\n", langi_dataset["test"])

# Index the row first and the column second: `dataset["column"][0]` reads the whole
# column just to show one item, whereas `dataset[0]` only reads a single row.
first_test_example = langi_dataset["test"][0]
print("\n", first_test_example["tokens"])
print("\n", first_test_example["ner_tags"])
print("\n", first_test_example["langs"])

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

 Dataset({
    features: ['tokens', 'ner_tags', 'langs'],
    num_rows: 10000
})

 ['Shortly', 'afterward', ',', 'an', 'encouraging', 'response', 'influenced', 'him', 'to', 'go', 'to', 'India', ';', 'he', 'arrived', 'at', 'Adyar', 'in', '1884', '.']

 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0]

 ['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en']


In [34]:
# Check the final dataset distribution

# One column per language, holding the size of its down-sampled training split
train_sizes = {code: [panx_dict[code]["train"].num_rows] for code in langs}
df = pd.DataFrame(train_sizes, index=["Number of training examples"])
df

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [41]:
# Check a single element again
de_train = panx_dict["de"]["train"]
element = de_train[0]
for key, value in element.items():
    print(f"{key}: {value}")
print("\n")

# Print the features
for key, value in de_train.features.items():
    print(f"{key}: {value}")
print("\n")


# Extract tags: the inner feature of the `ner_tags` sequence
tags = de_train.features["ner_tags"].feature
print(tags)

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [52]:
# I was confused by the behaviour of map, since it should only see the train, test and validation split level;
# it turns out it is a method of the dataset object that has this convenient behaviour: https://huggingface.co/docs/datasets/about_map_batch

def create_tag_names(batch):
    """Return the string names of the NER tag ids found in ``batch``.

    Args:
        batch: Mapping with a ``"ner_tags"`` entry that iterates over integer tag ids.

    Returns:
        A dict with the single key ``"ner_tags_str"`` mapped to the list of names
        obtained by passing each id to ``tags.int2str`` (``tags`` is the notebook-level
        label feature extracted from the ``ner_tags`` column).
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}
# Run create_tag_names through the map method of the "de" DatasetDict; the result
# (with the added "ner_tags_str" column) is kept as panx_de
panx_de = panx_dict["de"].map(create_tag_names)


In [53]:
# Show the first "de" training example as two aligned rows: tokens and their tag names
de_example = panx_de["train"][0]
token_and_tag_rows = [de_example[column] for column in ("tokens", "ner_tags_str")]
df = pd.DataFrame(token_and_tag_rows, index=["Tokens", "NER Tags"])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
NER Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [59]:
# Final check of the dataset entity frequencies

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    # For every tag starting with "B", keep the part after the first dash (the entity type)
    entity_types = [
        tag.split("-")[1]
        for row in dataset["ner_tags_str"]
        for tag in row
        if tag.startswith("B")
    ]
    # Only touch the defaultdict when something was found, so that a split without
    # any "B" tag does not get an (empty) entry
    if entity_types:
        split2freqs[split].update(entity_types)
pd.DataFrame.from_dict(split2freqs, orient="index")

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


['O',
 'O',
 'O',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'I-LOC',
 'O']